# Scrape the Sacramento Craigslist free-stuff listings into a DataFrame.
import time

import pandas as pd
from bs4 import BeautifulSoup
from splinter import Browser


def scrape():
    """Scrape several listings and return the free stuff found."""
    executable_path = {"executable_path": "/usr/local/bin/chromedriver"}
    browser = Browser("chrome", **executable_path, headless=False)

    # Go to Sacramento CL and navigate to the 1st item of free stuff
    url = "https://sacramento.craigslist.org/d/free-stuff/search/zip"
    browser.visit(url)
    browser.click_link_by_partial_href('https://sacramento.craigslist.org')
    time.sleep(2)

    # Use Splinter to scrape CL and store the data in the DataFrame `stuff`
    stuff = pd.DataFrame(columns=['lat', 'long', 'age', 'title'])
    for x in range(1, 4):
        html = browser.html
        soup = BeautifulSoup(html, 'html.parser')
        title = soup.find(id="titletextonly").text
        age = soup.find('time', class_="date timeago")["datetime"]
        loc = soup.find(id='map')
        lat = loc["data-latitude"]
        long = loc["data-longitude"]
        stuff.loc[x] = [lat, long, age, title]
        time.sleep(2)
        browser.click_link_by_partial_text('next')
    return stuff
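# A minimal usage sketch (assumes chromedriver at the path above and that
# the Craigslist listing pages still match the selectors used in scrape()):
if __name__ == "__main__":
    free_stuff = scrape()
    print(free_stuff.head())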
class TestViews(unittest.TestCase):
    def setUp(self):
        """Test setup"""
        self.browser = Browser("phantomjs")

        # Set up the tables in the database
        Base.metadata.create_all(engine)

        # Create an example user
        self.user = models.User(name="Alice", email="*****@*****.**",
                                password=generate_password_hash("test"))
        session.add(self.user)
        session.commit()

        self.process = multiprocessing.Process(target=app.run)
        self.process.start()
        time.sleep(1)

    def test_add_post(self):
        log = logging.getLogger("unittest.TestCase")

        # Login as Alice
        # self.browser.visit("http://0.0.0.0:8080/login")  # original line
        self.browser.visit("http://127.0.0.1:5000/login")
        self.browser.fill("email", "*****@*****.**")
        self.browser.fill("password", "test")
        button = self.browser.find_by_css("button[type=submit]")
        button.click()
        # self.assertEqual(self.browser.url, "http://0.0.0.0:8080/")  # original line
        # self.assertEqual(self.browser.url, "http://127.0.0.1:5000/")  # ask sam about this line

        # Add a test post
        self.browser.visit("http://127.0.0.1:5000")
        self.browser.click_link_by_partial_href('add')
        self.browser.fill("title", "post test1 title")
        self.browser.fill("content", "post test1 content")
        button = self.browser.find_by_css("button[type=submit]")
        button.click()
        post_found = self.browser.find_by_tag('h1').value
        # cheated here - made template title h2. how do we access? index?
        # post_found = self.browser.find_by_text('post test1 title').value  # didn't work
        log.debug("FIRSTH1= %r", post_found)
        self.assertEqual(post_found, "post test1 title")

    def tearDown(self):
        """Test teardown"""
        # Remove the tables and their data from the database
        self.process.terminate()
        session.close()
        engine.dispose()
        Base.metadata.drop_all(engine)
        self.browser.quit()
class UserTest(StaticLiveServerTestCase):
    def setUp(self):
        check_permissions()
        self.username = "******"
        create_user(self.username)
        self.browser = Browser()
        self.browser.visit(self.live_server_url)

    def test_signup(self):
        signup_url = settings.SIGNUP_URL
        self.browser.click_link_by_partial_href(signup_url)
        username = "******"
        password = "******"
        email = "*****@*****.**"
        signup(self.browser, username, password, email)

        user_exists = exists_user(username)
        self.assertTrue(user_exists)
        user = get_user(username)
        self.assertEquals(user.username, username)
        # self.assertEquals(user.password, password)
        self.assertEquals(user.email, email)

        document_list_url = \
            self.live_server_url + reverse("documents.views.list_documents")
        self.assertEquals(self.browser.url, document_list_url)

        profile_xpath = "/html/body/div/div[1]/div/ul[2]/li[4]/a"
        profile_link = self.browser.find_by_xpath(profile_xpath)
        self.assertEquals(profile_link.value, "@{}".format(username))
        self.browser.quit()

    def test_signin(self):
        login_url = settings.LOGIN_URL
        self.browser.click_link_by_partial_href(login_url)
        username = self.username
        password = self.username
        login(self.browser, username, password)

        document_list_url = \
            self.live_server_url + reverse("documents.views.list_documents")
        self.assertEquals(self.browser.url, document_list_url)

        profile_xpath = "/html/body/div/div[1]/div/ul[2]/li[4]/a"
        profile_link = self.browser.find_by_xpath(profile_xpath)
        self.assertEquals(profile_link.value, "@{}".format(username))
        self.browser.quit()
from splinter import Browser


def get_url_code(auth_url, username, password, login='******'):
    b = Browser(driver_name='chrome')
    b.visit(auth_url)
    b.click_link_by_partial_href("/en/login")
    if login == 'facebook':
        b.click_link_by_partial_href("https://www.facebook.com")
        b.fill_form({'email': username, 'pass': password})
        b.click_link_by_id('loginbutton')
    elif login == 'spotify':
        b.fill_form({'username': username, 'password': password})
        loginbutton = b.find_by_text('Log In')[0]
        loginbutton.click()
    b.visit(auth_url)
    codeurl = b.url
    # the OAuth code is the query parameter between "?code=" and the next "&"
    code = codeurl.split("?code=")[1].split('&')[0]
    b.quit()
    return code
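# Hedged usage sketch; the URL below is a placeholder for an OAuth authorize
# endpoint that redirects back with ?code=..., not a real address:
# code = get_url_code("https://example.com/oauth/authorize?...",
#                     "user", "secret", login='spotify')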
import time

from splinter import Browser


def download_art(title):
    browser = Browser()

    # Visit the Library Genesis article search page
    url = "http://gen.lib.rus.ec/scimag/index.php"
    browser.visit(url)
    article_title = browser.find_by_name('s')
    article_title.fill(title)
    button = browser.find_by_value('Search!')

    # Interact with elements
    button.click()
    # sleep at each step to pace the script against page loads and network speed
    time.sleep(10)
    browser.click_link_by_text('Libgen')
    time.sleep(15)
    browser.click_link_by_partial_href('http://gen.lib.rus.ec/scimag/get.php')
    time.sleep(5)
    browser.quit()
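# Hedged usage sketch (the article title is a placeholder):
# download_art("Some article title")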
import time
from subprocess import check_output

from bs4 import BeautifulSoup
from splinter import Browser

# wait_delay and previous_manuscript_status are defined earlier in the
# original script.


def get_pass():
    # decrypt the stored password with gpg; decode() because check_output
    # returns bytes
    return check_output("gpg -dq ~/rsc_password.gpg",
                        shell=True).decode().strip("\n")


rsc_password_plaintext = get_pass()

b = Browser('chrome', headless=True)
time.sleep(wait_delay)
b.visit('https://mc.manuscriptcentral.com/ee/')
time.sleep(wait_delay)
b.fill('USERID', '*****@*****.**')
time.sleep(wait_delay)
b.fill('PASSWORD', rsc_password_plaintext)
time.sleep(wait_delay)
b.click_link_by_id('logInButton')
time.sleep(wait_delay)
b.click_link_by_partial_href("AUTHOR")
time.sleep(wait_delay)

html_obj = b.html
soup = BeautifulSoup(html_obj, "lxml")
table = soup.find("table", attrs={"class": "table table-striped rt cf"})
row = table.tbody.findAll('tr')[1]
first_column_html = str(row.findAll('td')[1].contents[0])
current_manuscript_status = BeautifulSoup(first_column_html, "lxml").text
time.sleep(wait_delay)
b.quit()

if current_manuscript_status == previous_manuscript_status:
    print('Your manuscript status remains unchanged ....')
class DocTest(StaticLiveServerTestCase):
    def setUp(self):
        fss.remove_tree(settings.MEDIA_ROOT)
        check_permissions()
        set_site(self.live_server_url)
        self.browser = Browser()
        self.browser.visit(self.live_server_url)

        login_url = settings.LOGIN_URL
        self.browser.click_link_by_partial_href(login_url)
        username = '******'
        password = '******'
        create_user(username)
        login(self.browser, username, password)

        upload_url = reverse('documents.views.add_document')
        self.browser.click_link_by_partial_href(upload_url)
        source = 'local'
        docfile = get_abs_path('doctest.pdf')
        language = 'eng'
        public = True
        title = 'test'
        notes = 'test notes'
        upload(self.browser, source, docfile, language, public, title, notes)
        self.browser.is_element_not_present_by_value('ready', 10)

        self.public = public
        self.title = title
        self.notes = notes
        self.document = get_document(title)

    def test_upload_doc_local(self):  # Create
        document_exists = exists_document(self.title)
        self.assertTrue(document_exists)
        self.assertEquals(self.document.public, self.public)
        self.assertEquals(self.document.title, self.title)
        self.assertEquals(self.document.notes, self.notes)

        document_list_url = \
            self.live_server_url + reverse('documents.views.list_documents')
        self.assertEquals(self.browser.url, document_list_url)

        document_xpath = '/html/body/div/div[2]/table/tbody/tr[1]'
        document_tr = self.browser.find_by_xpath(document_xpath)
        document_id = document_tr['data-id']
        self.assertEquals(int(document_id), self.document.id)

        document_title_xpath = '//*[@id="documents_cell"]/span[1]'
        document_title = self.browser.find_by_xpath(document_title_xpath)
        self.assertEquals(document_title.value, self.title)

        profile_xpath = '/html/body/div/div[1]/div/ul[2]/li[4]/a'
        profile_link = self.browser.find_by_xpath(profile_xpath)
        owner_xpath = '/html/body/div/div[2]/table/tbody/tr[1]/td[4]/a'
        owner_link = self.browser.find_by_xpath(owner_xpath)
        self.assertEquals(profile_link.value, owner_link.value)

        status_xpath = '/html/body/div/div[2]/table/tbody/tr/td[5]/div'
        status_div = self.browser.find_by_xpath(status_xpath)
        self.assertEquals(status_div.value, self.document.status)

        numpages_xpath = '/html/body/div/div[2]/table/tbody/tr[1]/td[6]/div'
        numpages_div = self.browser.find_by_xpath(numpages_xpath)
        self.assertEquals(int(numpages_div.value), self.document.page_count)

        privacy_icon_xpath = '//*[@id="privacy"]/i'
        privacy_icon = self.browser.find_by_xpath(privacy_icon_xpath)
        self.assertTrue(privacy_icon.has_class('icon-eye-open'))

        structure = create_structure(self.document)
        root_path = self.document.get_root_path()
        dirs = fss.listdir(root_path)[0]
        for d in dirs:
            dir_path = os.path.join(root_path, d)
            for f in structure['dirs'][d]:
                self.assertIn(f, fss.listdir(dir_path)[1])
        for f in structure['files']:
            self.assertIn(f, fss.listdir(root_path)[1])
        self.browser.quit()

    # def test_upload_doc_dropbox(self):  # Create
    #     pass

    def test_view_doc(self):  # Read
        link_title_xpath = '//*[@id="documents_cell"]/span[1]/a'
        self.browser.find_by_xpath(link_title_xpath).click()
        viewer_title_xpath = ('//*[@id="documentviewer-container"]'
                              '/div/div[1]/div[1]/div[1]/div[2]/h4/a')
        viewer_title = self.browser.find_by_xpath(viewer_title_xpath)
        self.assertEquals(viewer_title.value, self.title)
        self.browser.quit()

    def test_edit_doc(self):  # Update
        edit_xpath = '/html/body/div/div[2]/table/tbody/tr[1]/td[7]/a[3]/i'
        self.browser.find_by_xpath(edit_xpath).click()
        public = False
        title = 'new title'
        notes = 'new notes'
        edit(self.browser, public, title, notes)

        document = get_document(title)
        self.assertEquals(document.public, public)
        self.assertEquals(document.title, title)
        self.assertEquals(document.notes, notes)

        document_list_url = \
            self.live_server_url + reverse('documents.views.list_documents')
        self.assertEquals(self.browser.url, document_list_url)

        document_title_xpath = '//*[@id="documents_cell"]/span[1]'
        document_title = self.browser.find_by_xpath(document_title_xpath)
        self.assertEquals(document_title.value, title)

        privacy_icon_xpath = '//*[@id="privacy"]/i'
        privacy_icon = self.browser.find_by_xpath(privacy_icon_xpath)
        self.assertTrue(privacy_icon.has_class('icon-eye-close'))
        self.browser.quit()

    def test_remove_doc(self):  # Delete
        old_doc_num = len(self.browser.find_by_css('tr.document-row'))
        remove_xpath = '//*[@id="remove"]/i'
        self.browser.find_by_xpath(remove_xpath).click()
        confirm_xpath = '//*[@id="confirm-remove"]/i'
        self.browser.find_by_xpath(confirm_xpath).click()

        document_list_url = \
            self.live_server_url + reverse('documents.views.list_documents')
        self.assertEquals(self.browser.url, document_list_url)
        new_doc_num = len(self.browser.find_by_css('tr.document-row'))
        self.assertEquals(new_doc_num, old_doc_num - 1)
        self.browser.quit()
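# These StaticLiveServerTestCase suites are run through Django's test
# runner, e.g. (a sketch; the app label "documents" is inferred from the
# reverse() calls above):
# python manage.py test documents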
import time

import pandas as pd
from bs4 import BeautifulSoup as bs
from splinter import Browser


def scrape():
    executable_path = {'executable_path': 'C:/Users/osafi/Desktop/BOOT CAMP/12 WEB SCRAPING/Web_Scrapping_Challenge_OS/Missions_to_Mars/chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)

    # NASA Mars news: latest headline and teaser paragraph
    url = "https://mars.nasa.gov/news/"
    browser.visit(url)
    html = browser.html
    time.sleep(5)
    soup = bs(html, 'html.parser')
    mars = soup.find('div', class_="list_text")
    news_title = mars.find('div', class_="content_title").text
    news_p = mars.find('div', class_="article_teaser_body").text
    mars_news = [news_title, news_p]

    # JPL featured image
    image_url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(image_url)
    time.sleep(15)
    browser.click_link_by_id("full_image")
    time.sleep(5)
    html = browser.html
    soup = bs(html, 'html.parser')
    time.sleep(15)
    more_info = soup.find('div', class_="addthis_toolbox addthis_default_style")['addthis:url']
    browser.click_link_by_partial_href(more_info)
    html = browser.html
    soup = bs(html, 'html.parser')
    featured_image = soup.find('img', class_="main_image")['src']
    featured_image_url = "https://www.jpl.nasa.gov" + featured_image

    # Mars weather from Twitter
    weather_url = "https://twitter.com/marswxreport?lang=en"
    browser.visit(weather_url)
    time.sleep(15)
    html = browser.html
    soup = bs(html, 'html.parser')
    results = soup.find('div', class_="css-901oao r-hkyrab r-1qd0xha r-a023e6 r-16dba41 r-ad9z0x r-bcqeeo r-bnwqim r-qvutc0")
    mars_weather = results.find('span').text

    # Mars facts table
    facts_url = "https://space-facts.com/mars/"
    browser.visit(facts_url)
    time.sleep(15)
    facts = pd.read_html(facts_url)
    mars_facts = pd.DataFrame(facts[0])
    mars_facts_string = mars_facts.to_html(header=False, index=False)

    # Hemisphere images
    hemi_url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    browser.visit(hemi_url)
    time.sleep(15)
    html = browser.html
    soup = bs(html, 'html.parser')
    hemisphere_image_urls = []
    results = soup.find('div', class_='result-list')
    hemi_pics = results.find_all('div', class_='item')
    for i in hemi_pics:
        title = i.find('h3').text
        title = title.replace("Enhanced", "")
        href = i.find('a')['href']
        image_url = "https://astrogeology.usgs.gov/" + href
        browser.visit(image_url)
        time.sleep(15)
        html = browser.html
        soup = bs(html, 'html.parser')
        full_size = soup.find('div', class_='downloads')
        img_url = full_size.find('a')['href']
        hemisphere_image_urls.append({'title': title, 'img_url': img_url})

    mars_data = {
        "mars_title": mars_news[0],
        "mars_news": mars_news[1],
        "featured_image": featured_image_url,
        "mars_weather": mars_weather,
        "mars_facts": mars_facts_string,
        "mars_hemis": hemisphere_image_urls,
    }
    browser.quit()
    return mars_data
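# Hedged usage sketch (the chromedriver path inside scrape() is
# machine-specific and will need adjusting):
if __name__ == "__main__":
    mars_data = scrape()
    print(mars_data["mars_title"])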
def scrape():
    # dependencies
    import pandas as pd
    from splinter import Browser
    from bs4 import BeautifulSoup
    from webdriver_manager.chrome import ChromeDriverManager
    from random import randint
    from time import sleep

    # Setup splinter
    executable_path = {'executable_path': ChromeDriverManager().install()}
    browser = Browser('chrome', **executable_path, headless=False)

    # ------------------------------------------------------------------
    # NASA News: latest headline and teaser text
    nasa_news_url = ('https://mars.nasa.gov/news/?page=0&per_page=40'
                     '&order=publish_date+desc%2Ccreated_at+desc&search='
                     '&category=19%2C165%2C184%2C204&blank_scope=Latest')
    browser.visit(nasa_news_url)
    sleep(randint(3, 10))

    # prepare the soup object
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')

    # get the news title
    news_title = soup.find(
        'div', class_='image_and_description_container').find(
        'div', class_='content_title').a.text.strip()

    # get the news teaser
    news_p = soup.find(
        'div', class_='list_text').find(
        'div', class_='article_teaser_body').text

    # ------------------------------------------------------------------
    # JPL featured image
    jpl_img_url = 'https://www.jpl.nasa.gov/images?search=&category=Mars'
    browser.visit(jpl_img_url)
    sleep(randint(3, 10))
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')

    # find the most recent image and click into it
    try:
        partial_href = soup.find(
            'div', class_='SearchResultCard').find(
            'a', class_='group')['href']
        browser.click_link_by_partial_href(partial_href)
        sleep(randint(3, 10))
    except AttributeError as e:
        print(e)

    # re-soup, sleeping so browser and soup catch up to the new page
    sleep(randint(3, 10))
    new_html = browser.html
    sleep(randint(3, 10))
    soup = BeautifulSoup(new_html, 'html.parser')

    # get the featured image url
    featured_image_url = soup.find('aside').find('a')['href']

    # ------------------------------------------------------------------
    # Mars facts table
    mars_facts_url = 'https://space-facts.com/mars/'
    tables = pd.read_html(mars_facts_url)
    mars_facts_df = tables[0]

    # format the df: column names and index
    mars_facts_df.columns = [' ', 'Mars facts']
    mars_facts_df = mars_facts_df.set_index(' ')

    # render to html and remove the \n breaks
    mars_table = mars_facts_df.to_html(
        justify='center', header=False,
        classes=["table-striped", "table-responsive"]).replace(
        'dataframe ', '').replace('border="1"', '').replace('\n', '')

    # ------------------------------------------------------------------
    # Hemisphere images
    hemisphere_img_url = ('https://astrogeology.usgs.gov/search/results'
                          '?q=hemisphere+enhanced&k1=target&v1=Mars')
    browser.visit(hemisphere_img_url)
    sleep(randint(3, 10))
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')

    # build a list of link texts for the 4 hemispheres
    link_list = []
    for item in soup.find_all('div', class_='description'):
        list_item = item.find('a', class_='itemLink').text
        link_list.append(list_item)

    # append each hemisphere's name and image link
    hemisphere_image_urls = []
    for nav_url in link_list:
        # visit the page for this hemisphere's image
        browser.click_link_by_partial_text(nav_url)
        html = browser.html
        soup = BeautifulSoup(html, 'html.parser')

        # find the image url and title
        img_url = soup.find('div', class_='downloads').find('a')['href']
        # strip the trailing ' Enhanced' (last 9 characters) from the title
        img_title = soup.find('section', class_='block metadata').h2.text[:-9]
        hemisphere_image_urls.append({'title': img_title, 'img_url': img_url})
        sleep(randint(1, 3))

        # return to the search-results page before following the next link
        browser.visit(hemisphere_img_url)
        html = browser.html
        soup = BeautifulSoup(html, 'html.parser')
        sleep(randint(1, 3))

    # quit the browser
    browser.quit()

    return_dict = {
        'nasa_news': {'news_title': news_title, 'news_pp': news_p},
        'jpl_featured_image': featured_image_url,
        'mars_facts_table': mars_table,
        'hemisphere_image_urls': hemisphere_image_urls,
    }
    return return_dict
class TestViews(unittest.TestCase):
    def setUp(self):
        """Test setup"""
        self.browser = Browser("phantomjs")
        # Resize browser window to make sure all elements are visible for tests
        self.browser.driver.set_window_size(1920, 1080)

        # Set up the tables in the database
        Base.metadata.create_all(engine)

        self.process = multiprocessing.Process(target=app.run,
                                               kwargs={"port": 8080})
        self.process.start()
        time.sleep(1)

    def test_register_new_user(self):
        self.browser.visit("http://127.0.0.1:8080/create_user")
        self.browser.fill("email", "*****@*****.**")
        self.browser.fill("password", "testpass")
        button = self.browser.find_by_css("button[type=submit]")
        button.click()
        self.assertEqual(self.browser.url, "http://127.0.0.1:8080/login")

    def test_register_user_exists(self):
        self.test_register_new_user()
        self.browser.visit("http://127.0.0.1:8080/create_user")
        self.browser.fill("email", "*****@*****.**")
        self.browser.fill("password", "testpass")
        button = self.browser.find_by_css("button[type=submit]")
        button.click()
        self.assertEqual(self.browser.url, "http://127.0.0.1:8080/create_user")

    def test_login_correct(self):
        self.test_register_new_user()
        self.browser.visit("http://127.0.0.1:8080/login")
        self.browser.fill("email", "*****@*****.**")
        self.browser.fill("password", "testpass")
        button = self.browser.find_by_css("button[type=submit]")
        button.click()
        login_link = self.browser.is_element_present_by_text('Login')
        self.assertFalse(login_link)
        self.assertEqual(self.browser.url, "http://127.0.0.1:8080/fight")

    def test_login_incorrect(self):
        self.test_register_new_user()
        self.browser.visit("http://127.0.0.1:8080/login")
        self.browser.fill("email", "*****@*****.**")
        self.browser.fill("password", "testpass")
        button = self.browser.find_by_css("button[type=submit]")
        button.click()
        login_link = self.browser.is_element_present_by_text('Login')
        self.assertTrue(login_link)
        self.assertEqual(self.browser.url, "http://127.0.0.1:8080/login")

    def test_logout(self):
        self.test_login_correct()
        self.browser.click_link_by_partial_href('logout')
        logout_link = self.browser.is_element_present_by_text('Logout')
        self.assertFalse(logout_link)
        self.assertEqual(self.browser.url, "http://127.0.0.1:8080/")

    def tearDown(self):
        """Test teardown"""
        # Remove the tables and their data from the database
        self.process.terminate()
        session.close()
        engine.dispose()
        Base.metadata.drop_all(engine)
        self.browser.quit()
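# Standard unittest entry point so the suite can be run directly
# (a sketch; the original module may define its own runner):
if __name__ == "__main__":
    unittest.main()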
import time

import pandas as pd
from bs4 import BeautifulSoup
from splinter import Browser


def scrape():
    # browser = init_browser()  # superseded by the explicit Browser() below
    mars_data = {}
    executable_path = {"executable_path": "/usr/local/bin/chromedriver"}
    browser = Browser("chrome", **executable_path, headless=False)

    # NASA Mars news
    url = "https://mars.nasa.gov/news/"
    browser.visit(url)
    html = browser.html
    soup = BeautifulSoup(html, "html.parser")
    news_title = soup.find("div", class_="content_title").text
    news_p = soup.find("div", class_="article_teaser_body").text
    mars_data["news_title"] = news_title
    mars_data["summary"] = news_p

    # JPL featured image: click through to the full-size image and record
    # the final URL
    url2 = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(url2)
    html = browser.html
    soup = BeautifulSoup(html, "html.parser")
    browser.click_link_by_partial_text("FULL IMAGE")
    time.sleep(1)
    browser.click_link_by_partial_text('more info')
    time.sleep(1)
    browser.click_link_by_partial_href('/spaceimages/images')
    featured_image_url = str(browser.url)
    mars_data["featured_image_url"] = featured_image_url

    # Mars weather, scraped from Twitter rather than through the Tweepy API
    # (the disabled Tweepy setup):
    # auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    # auth.set_access_token(access_token, access_token_secret)
    # api = tweepy.API(auth, parser=tweepy.parsers.JSONParser())
    url_weather = "https://twitter.com/marswxreport?lang=en"
    browser.visit(url_weather)
    html_weather = browser.html
    soup = BeautifulSoup(html_weather, "html.parser")
    mars_weather = soup.find(
        "p",
        class_="TweetTextSize TweetTextSize--normal js-tweet-text tweet-text"
    ).text
    mars_data["mars_weather"] = mars_weather

    # Mars facts table
    url3 = 'https://space-facts.com/mars/'
    mars_facts = pd.read_html(url3)
    mars_df = mars_facts[0]
    mars_df.columns = ['Measure', 'Values']
    mars_df = mars_df.set_index('Measure')
    mars_table = mars_df.to_html(classes='marstable').replace('\n', ' ')
    mars_data["mars_table"] = mars_table

    # Draft hemisphere scraper (disabled): visits the USGS Astrogeology
    # search results, clicks each of 'Cerberus', 'Schiaparelli',
    # 'Syrtis Major', and 'Valles Marineris Hemisphere Enhanced' by partial
    # link text, grabs the title and full.jpg download URL, and collects
    # {"title": ..., "img_url": ...} dicts:
    # hemisphere_image_urls = []
    # url4 = ("https://astrogeology.usgs.gov/search/results"
    #         "?q=hemisphere+enhanced&k1=target&v1=Mars")
    # browser.visit(url4)
    # browser.click_link_by_partial_text('Cerberus Hemisphere Enhanced')
    # soup = BeautifulSoup(browser.html, "html.parser")
    # hem1title = soup.find(class_='title').text
    # browser.click_link_by_partial_href('http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif/full.jpg')
    # hemisphere_image_urls.append({"title": hem1title, "img_url": str(browser.url)})
    # ... (repeated for the other three hemispheres)
    # mars_data["mars_hemis"] = hemisphere_image_urls

    return mars_data
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
from splinter import Browser
from webdriver_manager.chrome import ChromeDriverManager


def scrape_mars():
    # set up the browser
    executable_path = {'executable_path': ChromeDriverManager().install()}
    browser = Browser('chrome', **executable_path, headless=False)

    # get title information from the first site
    url = "https://mars.nasa.gov/news/"
    response = requests.get(url)
    soup = bs(response.text, 'html.parser')
    m_titles = soup.find_all('div', class_='content_title')
    # title1 holds the desired news headline
    title1 = m_titles[0].text

    # get paragraph information from the first site
    m_para = soup.find_all('div', class_='rollover_description_inner')
    # para1 holds the desired blurb
    para1 = m_para[0].text

    # get the desired image from the second page
    url2 = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(url2)
    # click the first link on the page
    browser.click_link_by_partial_href('images')
    html = browser.html
    ibisque = bs(html, 'html.parser')
    # collect all the img tags; url1 is the url of the desired image
    img_url = ibisque.find_all("img")
    url1 = img_url[2]['src']

    # get the tables from site number 3
    table_url = 'https://space-facts.com/mars/'
    mt_tables = pd.read_html(table_url)
    # save the desired table, render to html, and strip line-break characters
    mars_df = mt_tables[0]
    mars_html_t = mars_df.to_html(index=False).replace('\n', '')

    # get the Martian hemisphere picture urls and titles
    pics_url = ('https://astrogeology.usgs.gov/search/results'
                '?q=hemisphere+enhanced&k1=target&v1=Mars')
    p_urls = ['Cerberus', 'Schiaparelli', 'Syrtis', 'Valles']
    pic_urls = []
    pic_titles = []
    for p in p_urls:
        # visit the search page and click the hemisphere link
        browser.visit(pics_url)
        browser.click_link_by_partial_text(p)
        # use the html in the open browser
        html3 = browser.html
        chilli = bs(html3, 'html.parser')
        # the image url is in the page's <a> tags
        im_a = chilli.find_all('a')
        pic_urls.append(im_a[5]['href'])
        # the image title
        im_title = chilli.find('h2', class_='title').text
        pic_titles.append(im_title)

    # package the results as a list of dictionaries
    pics_list = []
    for q in range(4):
        pics_list.append({'title': pic_titles[q], 'url': pic_urls[q]})

    # close browser
    browser.quit()

    # store all the data in a dict
    mars_data = {
        'news_headline': title1,
        'news_blurb': para1,
        'daily_img': url1,
        'html_d_table': mars_html_t,
        'hemi_pics': pics_list,
    }
    return mars_data
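# Hedged usage sketch:
if __name__ == "__main__":
    mars = scrape_mars()
    print(mars["news_headline"])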
# Scratchpad: three approaches to clicking the Cerberus hemisphere link.

# splinter selectors tried first:
# browser.find_by_id('product-section')
# browser.click_link_by_class('link_id')
# browser.click_link_by_id("results")

# JavaScript click, translated from the original Java/Selenium snippet
# (`driver` is the selenium webdriver created in the last cell):
elem = driver.find_element(By.XPATH, '//*[@id="product-section"]/div[2]/div[1]/a/img')
driver.execute_script("arguments[0].click();", elem)

#%%
executable_path = {'executable_path': 'chromedriver.exe'}
browser = Browser('chrome', **executable_path, headless=False)
browser.visit(url)  # url is defined earlier in the notebook
# from selenium.webdriver.common.keys import Keys  # if keystrokes are needed
# wait2 = WebDriverWait(driver, 10)
# wait2.until(EC.element_to_be_clickable(...))
browser.click_link_by_partial_href("/search/map/Mars/Viking/cerberus_enhanced")

#%%
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome("C://Users/lcc25/repos/utexas_hmwk_python/marsmission/chromedriver.exe")
driver.get(url)
# element = WebDriverWait(driver, 30).until(EC.element_to_be_clickable(
#     (By.PARTIAL_LINK_TEXT, 'cerberus_enhanced')))
import time

from splinter import Browser

# useragent and recognize_captcha() are defined elsewhere in the
# original module.


class control_google():
    def init_browser(self, email, passwd):
        self.state = 'good'
        self.passwd = passwd
        self.login = email

        # Experiments with disabling images in Chrome; none of these are
        # currently passed to Browser() below.
        param = {
            'chrome.noWebsiteTestingDefaults': True,
            'chrome.prefs': {
                'profile.default_content_settings': {'images': 2},
            },
        }
        from selenium.webdriver.chrome.options import Options
        options = Options()
        # options.add_argument('--allow-running-insecure-content')
        # options.add_argument('--disable-web-security')
        # options.add_argument('--disk-cache-dir=/var/www/cake2.2.4/app/tmp/cache/selenium-chrome-cache')
        # options.add_argument('--no-referrers')
        # options.add_argument('--window-size=1003,719')
        # options.add_argument('--proxy-server=localhost:8118')
        options.add_argument(
            "'chrome.prefs': {'profile.managed_default_content_settings.images': 2}"
        )
        CHROME = {
            "browserName": "chrome",
            "chrome.prefs": {
                "profile.managed_default_content_settings.images": 2
            },
            "chrome.switches": ["disable-images"],
        }
        self.browser = Browser('chrome', user_agent=useragent)
        # self.browser = Browser('chrome', user_agent=useragent, desired_capabilities=CHROME)

        load_page = ('https://accounts.google.com/ServiceLogin'
                     '?btmpl=mobile_tier2&hl=ru&service=mobile')
        self.browser.visit(load_page)
        self.browser.find_by_id('Email').first.fill(email + '@gmail.com')
        self.browser.find_by_id('Passwd').first.fill(passwd)
        self.browser.find_by_id('signIn').first.click()

    def _google_hook(self):
        if self.browser.is_element_present_by_id('358'):
            self.browser.find_by_id('358').first.click()
        if self.browser.is_element_present_by_id('link_dismiss'):
            try:
                self.browser.find_by_id('link_dismiss').first.click()
            except Exception:
                pass
        if 'getstarted' in self.browser.url:
            self.browser.back()
        if self.browser.is_element_present_by_id('link_dismiss'):
            self.browser.find_by_id('link_dismiss').first.click()

    def open_profile(self):
        print('Open light version profile')
        load_page = 'https://plus.google.com/app/basic/%s/about' % self.profile_id
        self.browser.visit(load_page)

    def save_profile(self):
        self.browser.find_by_id('177').first.click()

    def register_google_plus(self, firstName, lastName):
        load_page = 'https://plus.google.com/u/0/?gpsrc=ogpy0&tab=XX'
        self.browser.visit(load_page)
        self.browser.fill('firstName', firstName)
        self.browser.fill('lastName', lastName)
        self.browser.find_by_name('buttonPressed').first.click()
        self.browser.find_by_id('357').first.click()

    def get_profile_id(self):
        load_page = 'https://www.google.com/settings/general-light?ref=/settings/account'
        self.browser.visit(load_page)
        if self.browser.is_element_present_by_xpath('//a[@class="CS"]'):
            profile_link = self.browser.find_by_xpath('//a[@class="CS"]').first
            link_path = profile_link['href']
            return link_path.split('/')[3]
        else:
            return False

    def profile_edit(self, vals):
        self.open_profile()
        print('Click change profile')
        self.browser.find_by_id('59').first.click()
        # Confirm mobile rules
        self._google_hook()
        self.browser.find_by_name('peWork0').first.fill(vals['company'])
        self.browser.find_by_name('peWorkTitle0').first.fill(vals['position'])
        self.browser.find_by_name('peWorkStartYear0').first.fill(vals['year_start'])
        self.browser.find_by_name('peWorkEndYear0').first.fill(vals['year_stop'])
        self.browser.find_by_name('peSchool0').first.fill(vals['university_name'])
        self.browser.find_by_name('peSchoolMajor0').first.fill(vals['field_education_name'])
        self.browser.find_by_name('peSchoolStartYear0').first.fill(vals['going_to_college_year'])
        self.browser.find_by_name('peSchoolEndYear0').first.fill(vals['after_graduation_year'])
        self.browser.find_by_name('pePlaceLived0').first.fill(vals['place_lived'])
        self.browser.find_by_name('pePlaceLivedIsCurrent').first.check()
        self.browser.find_by_name('peGender').first.select("1")
        print('Done profile_edit')
        self.save_profile()

    def change_photo(self, photo_path):
        self.open_profile()
        print('Click change profile')
        self.browser.find_by_id('59').first.click()
        print('Click change photo')
        self.browser.find_by_id('375').first.click()
        # was self.photo_path, which is never set; use the parameter
        self.browser.attach_file('photo_upload_file_name', photo_path)
        print('Done profile_edit')
        self.browser.find_by_id('314').first.click()
        self.save_profile()

    def change_pass(self, old_pass, new_pass):
        print('Open password change page')
        load_page = 'https://accounts.google.com/b/0/EditPasswd?hl=ru'
        self.browser.visit(load_page)
        self.browser.find_by_id('OldPasswd').first.fill(old_pass)
        self.browser.find_by_id('Passwd').first.fill(new_pass)
        self.browser.find_by_id('PasswdAgain').first.fill(new_pass)
        self.browser.find_by_id('save').first.click()
        print('Done change pass')

    def open_full_plus(self):
        print('Open full Google+')
        load_page = 'https://plus.google.com/u/0/'
        self.browser.visit(load_page)

    def open_full_profile(self):
        self.open_full_plus()
        self._google_hook()
        print('Click user icon')
        self.browser.find_by_id('gbi4i').first.click()
        print('Click show profile')
        # self.browser.find_by_id('gbmplp').first.click()
        self.browser.find_by_xpath('//a[@class="gbqfb gbiba gbp1"]').first.click()

    def change_name(self, firstName, lastName):
        self.open_full_plus()
        self.open_full_profile()
        print('Click change name')
        time.sleep(5)
        self.browser.find_by_xpath('//div[@guidedhelpid="profile_name"]').first.click()
        print('Fill values')
        time.sleep(5)
        self.browser.find_by_xpath('//input[@class="l-pR osa g-A-G"]').first.fill(firstName)
        self.browser.find_by_xpath('//input[@class="l-oR Ika g-A-G"]').first.fill(lastName)
        print('Save results')
        self.browser.find_by_xpath(
            '//*[starts-with(@class,"a-f-e c-b c-b-M nVrMHf nZQKMd h019o")]'
        ).first.click()
        print('Confirm')
        self.browser.find_by_name('ok').first.click()

    def youtube_hoock(self):
        if 'ServiceLogin?' in self.browser.url:
            print('ServiceLogin? Hook')
            self.browser.fill('Passwd', self.passwd)
            self.browser.find_by_name('signIn').first.click()
            # self.browser.back()
        if 'create_channel?' in self.browser.url:
            print('create_channel? Hook')
            self.browser.click_link_by_partial_href('create_channel')
            self.browser.fill('username', self.login)
            self.browser.find_by_id('channel_submit').click()
            self.browser.back()
            self.browser.back()
            self.browser.back()
        if 'select_site?' in self.browser.url:
            print('select_site? Hook')
            self.browser.find_by_xpath('//input[@type="submit"]').click()
            self.browser.back()
            self.browser.back()
        if 'switch-profile.g?' in self.browser.url:
            print('switch-profile.g? Hook')
            self.browser.find_by_id('switchButton').click()

    def youtube_like(self, url):
        self.browser.visit(url)
        self.browser.click_link_by_partial_href('action_like=1')
        self.youtube_hoock()  # was called with url, but the hook takes no args
        self.browser.find_by_name('action_rate').click()

    def youtube_dislike(self, url):
        self.browser.visit(url)
        self.browser.click_link_by_partial_href('action_dislike=1')
        self.youtube_hoock()
        self.browser.find_by_name('action_rate').click()

    def youtube_comment(self, url, comment):
        self.browser.visit(url)
        self.browser.click_link_by_partial_href('post_comment')
        self.youtube_hoock()
        try:
            self.browser.click_link_by_partial_href('post_comment')
        except Exception:
            pass
        self.youtube_hoock()
        self.browser.fill('comment', comment)
        self.browser.find_by_name('action_comment').click()
        self.youtube_hoock()

    def youtube_subscribe(self, chane_name):
        load_page = 'http://m.youtube.com/user/%s' % chane_name
        self.browser.visit(load_page)
        self.browser.find_by_name('submit')[1].click()
        self.youtube_hoock()
        try:
            self.browser.find_by_name('submit')[1].click()
        except Exception:
            pass

    def google_friend_connector(self):
        # self.browser.click_link_by_partial_href('post_comment')
        pass

    def blogspot_follow(self, url):
        pass

    def get_capture(self):
        cap_element = self.browser.find_by_xpath('//img[@width="300"]').first
        cap_code = recognize_captcha(cap_element['src'])
        self.browser.fill('recaptcha_response_field', cap_code)

    def blogspot_post_plus(self, url):
        self.browser.visit(url)
        frame_name = self.browser.find_by_xpath('//*[starts-with(@name,"I0_")]')[0]['name']
        print(frame_name)
        with self.browser.get_iframe(frame_name) as iframe:
            # self.browser.find_by_xpath('//span[@class="hAa Qo Bg"]').first.click()
            iframe.find_by_xpath('//span[@class="hAa Qo Bg"]').first.click()

    def blogspot_post(self, url, comment):
        self.browser.visit(url)
        with self.browser.get_iframe('comment-editor') as iframe:
            self.browser.fill('commentBody', comment)
            iframe.find_by_id('postCommentSubmit').click()
        self.youtube_hoock()
        with self.browser.get_iframe('comment-editor') as iframe:
            if iframe.is_element_present_by_id('recaptcha_image'):
                self.get_capture()
                iframe.find_by_id('postCommentSubmit').click()
        if 'showComment=' in self.browser.url:
            return True
        else:
            return False

    def google_post_like(self, url):
        self.browser.visit(url)
        if not self.browser.is_element_present_by_name('stupop'):
            self.browser.find_by_id('162').click()
            return True
        else:
            return False

    def google_post_dislike(self, url):
        self.browser.visit(url)
        if self.browser.is_element_present_by_name('stupop'):
            self.browser.find_by_id('162').click()
            return True
        else:
            return False

    def google_post_comment(self, url, comment):
        self.browser.visit(url)
        self.browser.fill('adcp', comment)
        self.browser.find_by_id('110').click()

    def google_post_share(self, url, comment):
        self.browser.visit(url)
        self.browser.find_by_id('396').click()
        self.browser.fill('rpPostMsg', comment)
        self.browser.find_by_id('253').click()

    def google_profile_join(self, id):
        self.browser.visit('https://plus.google.com/app/basic/%s/' % id)
        self.browser.find_by_id('59').click()
        self.circle_join()

    def circle_join(self):
        self.browser.find_by_name('chcccp')[3].click()
        self.browser.find_by_id('49').click()
        self.browser.reload()

    def google_communities_enter(self, id):
        self.browser.visit('https://plus.google.com/u/0/communities/%s/' % id)
        self._google_hook()

    def google_communities_join(self, id):
        self.google_communities_enter(id)
        if self.browser.is_element_present_by_xpath(
                '//*[starts-with(@class,"a-f-e c-b c-b-La")]'):
            self.browser.find_by_xpath(
                '//*[starts-with(@class,"a-f-e c-b c-b-La")]').first.click()

    def google_communities_post(self, id, mess):
        print('Start communities post')
        self.google_communities_join(id)
        time.sleep(60)
        # for i in self.browser.find_by_xpath('//a[@class="FW9qdb Wk"]'):
        #     print(i['oid'])
        # self.browser.reload()
        self.browser.find_by_xpath('//div[@guidedhelpid="sharebox_textarea"]').first.click()
        self.browser.find_by_xpath('//div[@class="yd editable"]').first.fill(mess)
        self.browser.find_by_xpath('//div[@guidedhelpid="sharebutton"]').click()
        time.sleep(60)
        self.browser.find_by_xpath('//div[@class="a-n Ph Hw"]').first.click()
        print('-' * 30)
        for i in self.browser.find_by_xpath('//a[@class="FW9qdb Wk"]'):
            print(i['oid'])

    def google_people_suggested(self):
        self.browser.visit('https://plus.google.com/app/basic/people/suggested?')
        for i in range(10):
            try:
                self.browser.find_by_xpath('//a[@class="vfc"]').first.click()
                self.circle_join()
            except Exception:
                self.browser.visit('https://plus.google.com/app/basic/people/suggested?')

    def google_grab_comm_members(self, id, qty):
        irr_qty = int((qty - 64) / 20.00) + 3
        print('Irr qty= %d' % irr_qty)
        self.browser.visit('https://plus.google.com/u/0/communities/%s/members' % id)
        ret_arr = []
        # used by the commented-out execute_script call below
        js_del_all_img = """
            var images = document.getElementsByTagName('img');
            while (images.length > 0) {
                images[0].parentNode.removeChild(images[0]);
            }
        """
        for i in range(irr_qty):
            elem_arr = self.browser.find_by_xpath('//div[@class="ib31if"]')
            print('Array len %d' % len(elem_arr))
            print(i)
            print('')
            elem_arr[len(elem_arr) - 2].right_click()
            # self.browser.execute_script(js_del_all_img)
            for elem in elem_arr:
                oid = elem['oid']
                img = self.browser.find_by_xpath('//img[@oid="%s"]' % oid)[0]
                # print(img['src'])
                if oid not in ret_arr:
                    ret_arr.append(oid)
                    print(oid)
        f = open('/tmp/google_oid.txt', 'w')
        for s in ret_arr:
            f.write('<item>' + s + '</item>\n')
        f.close()
        print('Grab done')

    def quit(self):
        self.browser.quit()
import time

from splinter import Browser

# Assumes project-local `credentials` and `config` modules and a
# parse_release_date() helper defined elsewhere.


def add_album_to_rym(args, config_file):
    br = Browser()
    br.visit('https://rateyourmusic.com/account/login')
    time.sleep(3)

    # Login
    br.fill('username', credentials.username)
    br.fill('password', credentials.password)
    br.find_by_id('login_submit').click()
    time.sleep(5)

    (title, artist, tracklist, release, cover) = config.read_config(config_file)

    # if args.update_album:
    #     br.visit(args.rym_album)
    # else:
    if args.add_artist:
        br.visit('https://rateyourmusic.com/artist_add')
        br.fill('lastname', artist)
        br.fill('comments', args.url)
        br.find_by_id('submitbtn').click()
        time.sleep(3)
        br.find_by_text(artist).click()
    else:
        br.visit(args.rym_profile)
    time.sleep(3)
    br.click_link_by_partial_href('/releases/ac?artist_id=')

    # Add data
    br.fill('title', title)
    br.find_by_id('format58').click()
    br.find_by_id('goAdvancedBtn').click()
    tracks_div = br.find_by_id('tracks_adv')
    tracks_text_area = tracks_div.find_by_id('track_advanced')
    tracks_text_area.fill(tracklist)
    br.find_by_id('goSimpleBtn').click()
    br.fill('notes', args.url)
    (year, month, day) = parse_release_date(release)
    release_month_selector = br.find_by_id('month')
    release_month_selector.select(month)
    release_day_selector = br.find_by_id('day')
    release_day_selector.select(day)
    release_year_selector = br.find_by_id('year')
    release_year_selector.select(year)
    br.find_by_id('previewbtn').click()
    br.find_by_id('submitbtn').click()

    # Add cover art
    # coverart_img_element = br.find_by_xpath("//img[@class='coverart_img']")
    # print(coverart_img_element)
    br.click_link_by_partial_href('/images/upload?type=l&assoc_id=')
    br.attach_file('upload_file', cover)
    br.fill('source', args.url)
    br.find_by_id('uploadbutton').click()
    time.sleep(5)
    br.click_link_by_partial_href('javascript:setStatus')

    # Vote for genre
    br.click_link_by_partial_href('/release/')
    time.sleep(3)
    br.click_link_by_partial_href('/rgenre/set?')
    prigen_text_area = br.find_by_xpath("//input[@id='prigen']")
    prigen_text_area.fill('vaporwave')
    prigen_vote_button = br.find_by_xpath("//input[@value='+ propose']").first
    prigen_vote_button.click()

    # Done
    br.click_link_by_partial_href('/release/')
    print("Finished")
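# Hedged sketch of the CLI wiring this function expects; the attribute
# names (add_artist, url, rym_profile) come from the code above, while the
# flag spellings are assumptions:
# import argparse
# parser = argparse.ArgumentParser()
# parser.add_argument("config_file")
# parser.add_argument("--url")
# parser.add_argument("--rym-profile", dest="rym_profile")
# parser.add_argument("--add-artist", dest="add_artist", action="store_true")
# args = parser.parse_args()
# add_album_to_rym(args, args.config_file)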
import logging
import random
import re
import sys
import time
import urllib.parse
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup
from splinter import Browser
# spynner and pyquery.PyQuery are only needed for the non-splinter branch.


class WOS(object):
    """A little module for exporting Web of Science search results into a txt file."""

    def __init__(self, **kwargs):
        """
        Construct a new WOS object given a query, an export file (without
        ".txt"), and optionally a username and a password for
        authentication, e.g.:
        WOS(query="TS=(epigenetic*)", outfile="epigenetic",
            user="******", passw="mypassw")
        """
        # defining params
        self.query = kwargs["query"]
        self.outfile = kwargs["outfile"] + ".tsv"
        # try:
        #     self.user = kwargs["user"]
        #     self.passw = kwargs["passw"]
        # except KeyError:
        #     self.user, self.passw = private
        try:
            self.browser_app = kwargs["browser"]
        except KeyError:
            self.browser_app = "splinter"

        # using the MLV auth server:
        # self.auth_url = "https://apps-webofknowledge-com.fennec.u-pem.fr/WOS_AdvancedSearch_input.do?&product=WOS&search_mode=AdvancedSearch"
        self.auth_url = "http://apps.webofknowledge.com/UA_AdvancedSearch_input.do?&product=UA&search_mode=AdvancedSearch"

        # Firefox driven by splinter, or a spynner fallback
        if self.browser_app == "splinter":
            self.browser = Browser("firefox")
        else:
            self.browser = spynner.Browser()
            self.browser.set_html_parser(PyQuery)
        # self.browser = Browser('zope.testbrowser', ignore_robots=True)

        # Session params
        self.session = None
        self.cookies = {}

        if self.query is None:
            sys.exit("No query provided")
        if "=" not in self.query:
            logging.warning("Syntax is not WOS compliant. Check query syntax.")
            sys.exit("Query Syntax Error")
        if self.outfile is None:
            self.outfile = re.sub(re.compile("[^0-9a-zA-Z]+"), "_", self.query) + ".txt"
        self.run()

    def auth(self):
        """Authenticate through auth_url to get the session id (SID)."""
        if self.browser_app == "splinter":
            self.browser.visit(self.auth_url)
            # self.browser.fill('username', self.user)
            # self.browser.fill('password', self.passw)
            # self.browser.find_by_name("submit").click()
            self.cookies = self.browser.cookies.all()
        else:
            # was self.url, which is never set; auth_url is meant here
            self.browser = self.browser.load(self.auth_url)
            # self.browser.wk_fill('input[id="username"]', self.username)
            # self.browser.wk_fill('input[id="password"]', self.password)
            self.browser.click('input[name="submit"]')
            if "SessionError" in self.session.url:
                self.session.click('a[target="_top"]')
                self.session.wait(random.uniform(1, 3))

        p_url = urlparse(self.browser.url)
        if p_url.netloc == "apps.webofknowledge.com":
            match = re.match(
                re.compile(r"product\=(?P<product>.*?)\&search_mode\=(?P<search_mode>.*?)\&SID=(?P<ssid>.*?)\&preferencesSaved\="),
                str(p_url.query))
            if match is not None:
                self.product = match.group("product")
                self.ssid = match.group("ssid")
                self.search_mode = re.sub("General", "Advanced",
                                          match.group("search_mode"))
                self.search_url = "%s://%s/%s_%s_input.do?product=%s&search_mode=%s&SID=%s" % (
                    p_url.scheme, p_url.netloc, self.product, self.search_mode,
                    self.product, self.search_mode, self.ssid)
                if self.browser_app == "splinter":
                    self.browser.visit(self.search_url)
                else:
                    self.browser.load(self.search_url)
                print(self.browser.url)
                return self
            else:
                return sys.exit("Session id could not be found")
        else:
            logging.info("No redirection to service")
            return sys.exit("Invalid credentials")

    def launch_search(self):
        """Fill the query form found on the advanced search page."""
        logging.info("Launching search")
        if self.browser_app == "splinter":
            self.browser.fill("value(input1)", self.query)
            self.browser.find_by_xpath(
                "/html/body/div[1]/form/div[1]/table/tbody/tr/td[1]/div[2]/div[1]/table/tbody/tr/td[1]/span[1]/input"
            ).click()
            bs = BeautifulSoup(self.browser.html)
        else:
            self.session.wk_fill('textarea[id="value(input1)"]', self.query)
            self.session.click('input[title="Search"]')
            self.session.wait(random.randint(2, 5))
            bs = BeautifulSoup(self.browser.html.encode("utf-8"))

        query_history = bs.find_all("div", {"class": "historyResults"})
        self.nb_search = len(query_history)
        try:
            self.nb_results = int(re.sub(",", "", query_history[0].text))
        except IndexError:
            self.nb_results = int(re.sub(",", "", query_history.text))
        print(self.nb_results)
        logging.warning("Your search \"%s\" gave %i results"
                        % (self.query, self.nb_results))
        logging.info("Your SSID is: %s" % self.ssid)

        if self.nb_results > 0:
            if self.browser_app == "splinter":
                self.browser.click_link_by_partial_href('/summary.do?')
            else:
                self.session.click('a[title="Click to view the results"]',
                                   wait_load=True)
            print(urlparse(self.browser.url).query)
            match = re.search(
                re.compile(r"product=UA&doc\=(?P<doc>.*?)\&qid\=(?P<qid>.*?)&SID"),
                urlparse(self.browser.url).query)
            if match is not None:
                print(match.group())
                self.doc, self.qid = match.group("doc"), match.group('qid')
                print(self.doc, self.qid)
                return self
            else:
                self.doc, self.qid = self.parse_params()
                return self
        else:
            return self

    def load_results(self, markFrom, markTo):
        """Load results 500 by 500, given the number of results."""
        logging.info("loading results")
        r_url = ("https://apps.webofknowledge.com/summary.do?product=UA&doc=1"
                 "&qid=" + self.qid + "&SID=" + self.ssid
                 + "&search_mode=AdvancedSearch")
        post_url = "https://apps.webofknowledge.com/OutboundService.do?action=go&&"
        header = {
            'Host': 'apps.webofknowledge.com',
            'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:29.0) Gecko/20100101 Firefox/29.0',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'fr,fr-fr;q=0.8,en-us;q=0.5,en;q=0.3',
            'Accept-Encoding': 'gzip, deflate',
            'DNT': '1',  # header values must be strings
            'Referer': 'https://apps.webofknowledge.com/summary.do?product=UA&doc=1&qid=%s&SID=%s&search_mode=AdvancedSearch' % (self.qid, self.ssid),
            'Connection': 'keep-alive',
        }
        data = {
            'SID': self.ssid,
            'colName': 'WOS',
            'count_new_items_marked': 0,
            'displayCitedRefs': 'true',
            'displayTimesCited': 'true',
            'fields_selection': 'USAGEIND AUTHORSIDENTIFIERS ACCESSION_NUM FUNDING SUBJECT_CATEGORY JCR_CATEGORY LANG IDS PAGEC SABBR CITREFC ISSN PUBINFO KEYWORDS CITTIMES ADDRS CONFERENCE_SPONSORS DOCTYPE CITREF ABSTRACT CONFERENCE_INFO SOURCE TITLE AUTHORS',
            'filters': 'USAGEIND AUTHORSIDENTIFIERS ACCESSION_NUM FUNDING SUBJECT_CATEGORY JCR_CATEGORY LANG IDS PAGEC SABBR CITREFC ISSN PUBINFO KEYWORDS CITTIMES ADDRS CONFERENCE_SPONSORS DOCTYPE CITREF ABSTRACT CONFERENCE_INFO SOURCE TITLE AUTHORS',
            'format': 'saveToFile',
            'locale': 'en_US',
            'markFrom': markFrom,  # was hard-coded to 1
            'markTo': markTo,
            'mark_from': markFrom,
            'mark_id': 'WOS',
            'mark_to': markTo,
            'mode': 'OpenOutputService',
            'product': 'UA',
            'qid': self.qid,
            'rurl': urllib.parse.quote_plus(r_url),
            'save_options': 'tabMacUnicode',
            'search_mode': 'AdvancedSearch',
            'selectedIds': '',
            'sortBy': 'PY.D;LD.D;SO.A;VL.D;PG.A;AU.A',
            'value(record_select_type)': 'range',
            'viewType': 'summary',
            'view_name': 'WOS-summary',
        }
        r = requests.get(post_url, params=data, headers=header,
                         cookies=self.cookies)
        # the response redirects to http://ets.webofknowledge.com/ETS/ets.do?
        final_r = requests.get(r.url, cookies=self.cookies, stream=True)
        with open(self.outfile, 'a') as f:
            f.write(final_r.text)
        return self.outfile

    def export(self):
        """Write the results into outfile (default is the normalized query)."""
        start_time = time.time()
        open(self.outfile, 'w').close()
        l = list(range(0, self.nb_results, 500))
        l.append(self.nb_results)
        logging.info("Exporting %s results 500 by 500..." % self.nb_results)
        for i, _ in enumerate(l):
            if l[i] + 1 < self.nb_results:
                self.load_results(l[i] + 1, l[i + 1])
        total = time.time() - start_time
        raw_file = open(self.outfile, 'r', encoding='utf-8-sig')
        raw_file_data = raw_file.read()
        nb_occurence = len(raw_file_data.split("\r")) - 2
        logging.info("Query \"%s\" had %d results: %d have been exported"
                     % (self.query, self.nb_results, nb_occurence))
        logging.info("Successfully stored in file: %s\n" % self.outfile)
        logging.info("Execution total time: %.1f seconds" % total)
        return

    def run(self):
        """Generic method that encapsulates the WOS extract process."""
        self.auth()
        self.launch_search()
        self.export()
        self.browser.quit()
        return
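# Usage example taken from the class docstring (user/passw are optional and
# currently unused because authentication is commented out):
# WOS(query="TS=(epigenetic*)", outfile="epigenetic")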
class DocTest(StaticLiveServerTestCase): def setUp(self): fss.remove_tree(settings.MEDIA_ROOT) check_permissions() set_site(self.live_server_url) self.browser = Browser() self.browser.visit(self.live_server_url) login_url = settings.LOGIN_URL self.browser.click_link_by_partial_href(login_url) username = '******' password = '******' create_user(username) login( self.browser, username, password, ) upload_url = reverse('documents.views.add_document') self.browser.click_link_by_partial_href(upload_url) source = 'local' docfile = get_abs_path('doctest.pdf') language = 'eng' public = True title = 'test' notes = 'test notes' upload( self.browser, source, docfile, language, public, title, notes, ) self.browser.is_element_not_present_by_value('ready', 10) self.public = public self.title = title self.notes = notes self.document = get_document(title) def test_upload_doc_local(self): #Create document_exists = exists_document(self.title) self.assertTrue(document_exists) self.assertEquals(self.document.public, self.public) self.assertEquals(self.document.title, self.title) self.assertEquals(self.document.notes, self.notes) document_list_url = \ self.live_server_url + reverse('documents.views.list_documents') self.assertEquals(self.browser.url, document_list_url) document_xpath = '/html/body/div/div[2]/table/tbody/tr[1]' document_tr = self.browser.find_by_xpath(document_xpath) document_id = document_tr['data-id'] self.assertEquals(int(document_id), self.document.id) document_title_xpath = '//*[@id="documents_cell"]/span[1]' document_title = self.browser.find_by_xpath(document_title_xpath) self.assertEquals(document_title.value, self.title) profile_xpath = '/html/body/div/div[1]/div/ul[2]/li[4]/a' profile_link = self.browser.find_by_xpath(profile_xpath) owner_xpath = '/html/body/div/div[2]/table/tbody/tr[1]/td[4]/a' owner_link = self.browser.find_by_xpath(owner_xpath) self.assertEquals(profile_link.value, owner_link.value) status_xpath = '/html/body/div/div[2]/table/tbody/tr/td[5]/div' status_div = self.browser.find_by_xpath(status_xpath) self.assertEquals(status_div.value, self.document.status) numpages_xpath = '/html/body/div/div[2]/table/tbody/tr[1]/td[6]/div' numpages_div = self.browser.find_by_xpath(numpages_xpath) self.assertEquals(int(numpages_div.value), self.document.page_count) privacy_icon_xpath = '//*[@id="privacy"]/i' privacy_icon = self.browser.find_by_xpath(privacy_icon_xpath) self.assertTrue(privacy_icon.has_class('icon-eye-open')) structure = create_structure(self.document) root_path = self.document.get_root_path() dirs = fss.listdir(root_path)[0] files = fss.listdir(root_path)[1] for d in dirs: dir_path = os.path.join(root_path, d) for f in structure['dirs'][d]: self.assertIn(f, fss.listdir(dir_path)[1]) for f in structure['files']: self.assertIn(f, fss.listdir(root_path)[1]) # import time; time.sleep(3) self.browser.quit() # # def test_upload_doc_dropbox(self): #Create # pass def test_view_doc(self): #Read link_title_xpath = '//*[@id="documents_cell"]/span[1]/a' self.browser.find_by_xpath(link_title_xpath).click() viewer_title_xpath = ( '//*[@id="documentviewer-container"]' '/div/div[1]/div[1]/div[1]/div[2]/h4/a' ) viewer_title = self.browser.find_by_xpath(viewer_title_xpath) self.assertEquals(viewer_title.value, self.title) # import time; time.sleep(3) self.browser.quit() def test_edit_doc(self): #Update edit_xpath = '/html/body/div/div[2]/table/tbody/tr[1]/td[7]/a[3]/i' self.browser.find_by_xpath(edit_xpath).click() public = False title = 'new title' notes = 'new notes' edit( 
self.browser, public, title, notes, ) document = get_document(title) self.assertEquals(document.public, public) self.assertEquals(document.title, title) self.assertEquals(document.notes, notes) document_list_url = \ self.live_server_url + reverse('documents.views.list_documents') self.assertEquals(self.browser.url, document_list_url) document_title_xpath = '//*[@id="documents_cell"]/span[1]' document_title = self.browser.find_by_xpath(document_title_xpath) self.assertEquals(document_title.value, title) privacy_icon_xpath = '//*[@id="privacy"]/i' privacy_icon = self.browser.find_by_xpath(privacy_icon_xpath) self.assertTrue(privacy_icon.has_class('icon-eye-close')) # import time; time.sleep(3) self.browser.quit() def test_remove_doc(self): #Delete old_doc_num = len(self.browser.find_by_css('tr.document-row')) remove_xpath = '//*[@id="remove"]/i' self.browser.find_by_xpath(remove_xpath).click() confirm_xpath = '//*[@id="confirm-remove"]/i' self.browser.find_by_xpath(confirm_xpath).click() document_list_url = \ self.live_server_url + reverse('documents.views.list_documents') self.assertEquals(self.browser.url, document_list_url) new_doc_num = len(self.browser.find_by_css('tr.document-row')) self.assertEquals(new_doc_num, old_doc_num - 1) # import time; time.sleep(3) self.browser.quit()
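# The document tests above lean on a `login(...)` helper defined elsewhere; a
# minimal sketch of what such a Splinter helper typically looks like (the
# form field names are assumptions, not taken from the test module):
def login(browser, username, password):
    browser.fill('username', username)
    browser.fill('password', password)
    browser.find_by_css('button[type=submit]').first.click()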
class SearchTest(StaticLiveServerTestCase): def setUp(self): fss.remove_tree(settings.MEDIA_ROOT) check_permissions() set_site(self.live_server_url) self.browser = Browser() self.browser.visit(self.live_server_url) login_url = settings.LOGIN_URL self.browser.click_link_by_partial_href(login_url) username = '******' password = '******' create_user(username) login( self.browser, username, password, ) upload_url = reverse('documents.views.add_document') self.browser.click_link_by_partial_href(upload_url) source = 'local' docfile = get_abs_path('doctest.pdf') language = 'eng' public = True title = 'test' notes = 'test notes' upload( self.browser, source, docfile, language, public, title, notes, ) self.browser.is_element_not_present_by_value('ready', 10) self.title = title import time; time.sleep(1) def test_search_title(self): self.browser.visit(self.live_server_url) title = 'test' driver = self.browser.driver actions = ActionChains(driver) searchbar_xpath = '//*[@id="search"]/div/div/div[2]' searchbar_div = driver.find_element_by_xpath(searchbar_xpath) actions.move_to_element(searchbar_div) actions.click() actions.perform() menu_title_xpath = '/html/body/ul/li[4]/a' menu_title = self.browser.find_by_xpath(menu_title_xpath) menu_title.click() input_title_xpath = \ '//*[@id="search"]/div/div/div[2]/div[2]/div[2]/input' input_title = self.browser.find_by_xpath(input_title_xpath) input_title.type(title + '\r') search_list_url = \ self.live_server_url + '/?title=' + title + '&' self.assertEquals(self.browser.url, search_list_url) summary_xpath = '/html/body/div/div[2]/p/small' summary = self.browser.find_by_xpath(summary_xpath) self.assertEquals(summary.value, '1 documents found') document_img_xpath = '/html/body/div/div[2]/ul/li/a/img' document_img = self.browser.find_by_xpath(document_img_xpath).click() viewer_title_xpath = ( '//*[@id="documentviewer-container"]' '/div/div[1]/div[1]/div[1]/div[2]/h4/a' ) viewer_title = self.browser.find_by_xpath(viewer_title_xpath) self.assertEquals(viewer_title.value, self.title) # import time; time.sleep(3) self.browser.quit() def test_search_text(self): self.browser.visit(self.live_server_url) text = 'download' driver = self.browser.driver actions = ActionChains(driver) searchbar_xpath = '//*[@id="search"]/div/div/div[2]' searchbar_div = driver.find_element_by_xpath(searchbar_xpath) actions.move_to_element(searchbar_div) actions.click() actions.perform() menu_text_xpath = '/html/body/ul/li[3]/a' menu_text = self.browser.find_by_xpath(menu_text_xpath) menu_text.click() input_text_xpath = \ '//*[@id="search"]/div/div/div[2]/div[2]/div[2]/input' input_text = self.browser.find_by_xpath(input_text_xpath) input_text.type(text + '\r') search_list_url = \ self.live_server_url + '/?q=' + text + '&' self.assertEquals(self.browser.url, search_list_url) summary_xpath = '/html/body/div/div[2]/p/small' summary = self.browser.find_by_xpath(summary_xpath) self.assertEquals(summary.value, '1 documents found') page_xpath = '/html/body/div/div[2]/ul/li[1]/div[2]/div/div[2]/a/div' page_div = self.browser.find_by_xpath(page_xpath) self.assertIn(text, page_div.value) document_img_xpath = '/html/body/div/div[2]/ul/li/a/img' document_img = self.browser.find_by_xpath(document_img_xpath).click() viewer_title_xpath = ( '//*[@id="documentviewer-container"]' '/div/div[1]/div[1]/div[1]/div[2]/h4/a' ) viewer_title = self.browser.find_by_xpath(viewer_title_xpath) self.assertEquals(viewer_title.value, self.title) # import time; time.sleep(3) self.browser.quit()
class WOS(object):
    """
    A little module for exporting Web of Science search results into a txt file
    """

    def __init__(self, **kwargs):
        """
        Construct a new WOS object given a query, an export file (without ".isi"),
        a username and a password for authentication,
        e.g. WOS(query="TS=(epigenetic*)", outfile="epigenetic", user="******", passw="mypassw")
        """
        # defining params
        self.query = kwargs["query"]
        self.outfile = kwargs["outfile"] + ".isi"
        try:
            self.user = kwargs["user"]
            self.passw = kwargs["passw"]
        except KeyError:
            self.user, self.passw = private
        try:
            self.browser_app = kwargs["browser"]
        except KeyError:
            self.browser_app = "splinter"
        # using MLV Auth Server
        self.auth_url = "https://apps-webofknowledge-com.fennec.u-pem.fr/WOS_AdvancedSearch_input.do?&product=WOS&search_mode=AdvancedSearch"
        # Firefox Browser
        if self.browser_app == "splinter":
            self.browser = Browser("firefox")
        else:
            self.browser = spynner.Browser()
            self.browser.set_html_parser(PyQuery)
        # Session params
        self.session = None
        self.cookies = {}
        if self.query is None:
            sys.exit("No query provided")
        if "=" not in self.query:
            logging.warning("Syntax is not WOS compliant. Check query syntax")
            sys.exit("Query Syntax Error")
        if self.outfile is None:
            self.outfile = str(re.sub(re.compile("[^0-9a-zA-Z]+"), "_", self.query)) + ".isi"
        if self.user is None and self.passw is None:
            self.user, self.passw = private
        logging.info("WOS search parameters:\n\t- query: %s\n\t- outfile: %s\n\t- user: %s\n\t- password: %s"
                     % (self.query, self.outfile, self.user, self.passw))
        self.run()

    def auth(self):
        """ Authenticate through auth_url to get the session id SID """
        # Loading url
        if self.browser_app == "splinter":
            self.browser.visit(self.auth_url)
            self.browser.fill('username', self.user)
            self.browser.fill('password', self.passw)
            self.browser.find_by_name("submit").click()
            self.cookies = self.browser.cookies.all()
        else:
            # was self.url / self.username / self.password: those attributes do not exist
            self.browser.load(self.auth_url)
            self.browser.wk_fill('input[id="username"]', self.user)
            self.browser.wk_fill('input[id="password"]', self.passw)
            self.browser.click('input[name="submit"]')
            if "SessionError" in self.browser.url:
                self.browser.click('a[target="_top"]')
                self.browser.wait(random.uniform(1, 3))
        p_url = urlparse(self.browser.url)
        if p_url.netloc == "apps-webofknowledge-com.fennec.u-pem.fr":
            match = re.match(
                re.compile("product\=(?P<product>.*?)\&search_mode\=(?P<search_mode>.*?)\&SID=(?P<ssid>.*?)\&preferencesSaved\="),
                str(p_url.query))
            if match is not None:
                self.product = match.group("product")
                self.ssid = match.group("ssid")
                self.search_mode = re.sub("General", "Advanced", match.group("search_mode"))
                self.search_url = "%s://%s/%s_%s_input.do?product=%s&search_mode=%s&SID=%s" % (
                    p_url.scheme, p_url.netloc,
                    self.product, self.search_mode,
                    self.product, self.search_mode, self.ssid)
                if self.browser_app == "splinter":
                    self.browser.visit(self.search_url)
                else:
                    self.browser.load(self.search_url)
                print self.browser.url
                return self
            else:
                return sys.exit("Session Id could not be found")
        else:
            logging.info("No redirection to service")
            return sys.exit("Invalid credentials")

    def launch_search(self):
        """ Fill the query form found in the advanced search page """
        logging.info("Launching search")
        if self.browser_app == "splinter":
            self.browser.fill("value(input1)", self.query)
            self.browser.find_by_xpath("/html/body/div[1]/form/div[1]/table/tbody/tr/td[1]/div[2]/div[1]/table/tbody/tr/td[1]/span[1]/input").click()
            bs = BeautifulSoup(self.browser.html)
        else:
            # was self.session: the spynner browser is stored in self.browser
            self.browser.wk_fill('textarea[id="value(input1)"]', self.query)
            self.browser.click('input[title="Search"]')
            self.browser.wait(random.randint(2, 5))
            bs = BeautifulSoup(self.browser.html.encode("utf-8"))
        query_history = bs.find_all("div", {"class": "historyResults"})
        self.nb_search = len(query_history)
        try:
            self.nb_results = int(re.sub(",", "", query_history[0].text))
        except IndexError:
            self.nb_results = int(re.sub(",", "", query_history.text))
        print self.nb_results
        logging.warning("Your search \"%s\" gave %i results" % (self.query, self.nb_results))
        logging.info("Your SSID is: %s" % self.ssid)
        if self.browser_app == "splinter":
            self.browser.click_link_by_partial_href('/summary.do?')
        else:
            self.browser.click('a[title="Click to view the results"]', wait_load=True)
        print urlparse(self.browser.url).query
        match = re.search(
            re.compile("product=WOS&doc\=(?P<doc>.*?)\&qid\=(?P<qid>.*?)&SID"),
            urlparse(self.browser.url).query)
        if match is not None:
            print match.group()
            self.doc, self.qid = match.group("doc"), match.group('qid')
            print self.doc, self.qid
            return self
        else:
            self.doc, self.qid = self.parse_params()
            return self

    def load_results(self, markFrom, markTo, i):
        """ Load results (markFrom, markTo) 500 by 500 given the nb of results """
        logging.info("loading results")
        r_url = ("https://apps-webofknowledge-com.fennec.u-pem.fr/summary.do?product=WOS&doc=1&qid="
                 + self.qid + "&SID=" + self.ssid + "&search_mode=AdvancedSearch")
        post_url = "https://apps-webofknowledge-com.fennec.u-pem.fr/OutboundService.do?action=go&&"
        header = {
            'Host': 'apps-webofknowledge-com.fennec.u-pem.fr',
            'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:29.0) Gecko/20100101 Firefox/29.0',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'fr,fr-fr;q=0.8,en-us;q=0.5,en;q=0.3',
            'Accept-Encoding': 'gzip, deflate',
            'DNT': '1',
            'Referer': 'https://apps-webofknowledge-com.fennec.u-pem.fr/summary.do?product=WOS&doc=1&qid=%s&SID=%s&search_mode=AdvancedSearch' % (self.qid, self.ssid),
            'Connection': 'keep-alive'
        }
        data = {
            'SID': self.ssid,
            'colName': 'WOS',
            'count_new_items_marked': 0,
            'displayCitedRefs': 'true',
            'displayTimesCited': 'true',
            'fields_selection': 'USAGEIND AUTHORSIDENTIFIERS ACCESSION_NUM FUNDING SUBJECT_CATEGORY JCR_CATEGORY LANG IDS PAGEC SABBR CITREFC ISSN PUBINFO KEYWORDS CITTIMES ADDRS CONFERENCE_SPONSORS DOCTYPE CITREF ABSTRACT CONFERENCE_INFO SOURCE TITLE AUTHORS',
            'filters': 'USAGEIND AUTHORSIDENTIFIERS ACCESSION_NUM FUNDING SUBJECT_CATEGORY JCR_CATEGORY LANG IDS PAGEC SABBR CITREFC ISSN PUBINFO KEYWORDS CITTIMES ADDRS CONFERENCE_SPONSORS DOCTYPE CITREF ABSTRACT CONFERENCE_INFO SOURCE TITLE AUTHORS',
            'format': 'saveToFile',
            'locale': 'en_US',
            'markFrom': 1,
            'markTo': markTo,
            'mark_from': markFrom,
            'mark_to': markTo,
            'mode': 'OpenOutputService',
            'product': 'WOS',  # the original dict listed this key twice
            'qid': self.qid,
            'startYear': '2015',
            'endYear': '2014',
            'rurl': urllib.quote_plus(r_url),
            'save_options': 'othersoftware',
            'search_mode': 'AdvancedSearch',
            'selectedIds': '',
            'sortBy': 'PY.D;LD.D;SO.A;VL.D;PG.A;AU.A',
            'value(record_select_type)': 'range',
            'viewType': 'summary',
            'view_name': 'WOS-summary',
        }
        r = requests.get(post_url, params=data, headers=header, cookies=self.cookies)
        # redirects to "http://ets.webofknowledge.com/ETS/ets.do?..."
        data_directory = self.outfile.split('.isi')[0]
        try:
            os.mkdir("exported_data")
            print "creating directory exported_data"
        except OSError:
            print "exported_data already exists"
        try:
            os.mkdir("exported_data/" + data_directory)
            print "creating directory " + data_directory
        except OSError:
            print data_directory + " already exists"
        final_r = requests.get(r.url, cookies=self.cookies, stream=True)
        outpath = "exported_data/" + data_directory + '/' + data_directory + '_' + str(i) + '.isi'
        with open(outpath, 'w') as f:
            f.write(final_r.text.encode('utf-8'))
        return self.outfile

    def export(self):
        """Write results into outfile (default is the normalized query)"""
        start_time = time.time()
        data_directory = self.outfile.split('.isi')[0]
        l = list(range(0, self.nb_results, 500))
        l.append(self.nb_results)
        logging.info("Exporting %s results 500 by 500..." % self.nb_results)
        for i, n in enumerate(l):
            if l[i] + 1 < self.nb_results:
                self.load_results(l[i] + 1, l[i + 1], str(l[i] + 1) + '-' + str(l[i + 1]))
        total = time.time() - start_time
        logging.info("Query \"%s\" had %d results" % (self.query, self.nb_results))
        logging.info("Successfully stored in directory: exported_data/%s\n" % data_directory)
        logging.info("Execution took %.1f seconds" % total)
        return

    def run(self):
        """ Generic method that encapsulates the WOS extract process """
        self.auth()
        self.launch_search()
        self.export()
        if self.browser_app == "splinter":
            self.browser.quit()
        else:
            self.browser.close()
        return
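# load_results() above requests the export with stream=True but then reads
# `final_r.text`, which buffers the whole body in memory anyway. If the
# exports ever get large, a chunked binary write avoids that (a standalone
# sketch, not part of the WOS class):
import requests

def download_to(url, path, cookies=None, chunk_size=8192):
    resp = requests.get(url, cookies=cookies, stream=True)
    resp.raise_for_status()
    with open(path, 'wb') as fh:
        for chunk in resp.iter_content(chunk_size=chunk_size):
            fh.write(chunk)  # raw bytes: no decode/encode round trip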
def scrape(): #Executable path executable_path = {'executable_path': 'chromedriver.exe'} browser = Browser('chrome', **executable_path, headless=True) #dictionary to store data mars_data = {} #############Nasa News################## #Use splinter module to visit Nasa news url = 'https://mars.nasa.gov/news/' browser.visit(url) #html object html = browser.html #Parse through HTML with Beautiful Soup soup = BeautifulSoup(html, 'html.parser') #Latest headline and blurb news_title = soup.find('div', class_='content_title').text news_p = soup.find("div", class_="article_teaser_body").text #enter into mars_data mars_data["news_title"] = (news_title) mars_data["news_paragraph"] = (news_p) #Print valuesp print(news_title) print(news_p) ############### Featured Image ################ #Url #2 jpl_url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars' browser.visit(jpl_url) #Click on appropriate link button = browser.find_by_id("full_image") button.click() #Click on second appropriate link time.sleep(5) browser.click_link_by_partial_text("more info") #html and Beautiful Soup html = browser.html soup = BeautifulSoup(html, 'html.parser') #Obtain image source image = soup.find("img", class_='main_image').get('src') #Put it all together featured_image_url = f'https://www.jpl.nasa.gov{image}' #enter into mars_data mars_data["featured_image"] = (featured_image_url) #print url print(featured_image_url) ############ Mars Weather ##################### #Url 3 url_3 = "https://twitter.com/marswxreport?lang=en" browser.visit(url_3) #Beautiful soup object html = browser.html soup = BeautifulSoup(html, 'html.parser') #scraping what I need mars_weather = soup.find( "p", class_="TweetTextSize TweetTextSize--normal js-tweet-text tweet-text" ).text #enter into mars_data mars_data["mars_weather"] = (mars_weather) #print weather print(mars_weather) ######### Mars Facts ############ #url 4 url_4 = "https://space-facts.com/mars/" browser.visit(url_4) #obtain table data table = pd.read_html(url_4)[0] #Rename columns renamed_table = table.rename(columns={0: "Mars Profile", 1: "Value"}) #Make an HTML object mars_html = renamed_table.to_html() #removed /n mars_html = mars_html.replace('\n', ' ') #save to mars_data mars_data["mars_facts"] = (mars_html) ############ Mars Hemispheres ########## #url_5 url_5 = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars" browser.visit(url_5) #Cerberus browser.click_link_by_partial_text('Cerberus Hemisphere Enhanced') html = browser.html soup = BeautifulSoup(html, 'html.parser') cerberus_title = soup.find("h2", class_='title').text time.sleep(2) download = browser.find_link_by_partial_text('Sample').first cerberus_url = download['href'] #back to main page url_5 = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars" browser.visit(url_5) #Schiaparelli browser.click_link_by_partial_text('Schiaparelli Hemisphere Enhanced') html = browser.html soup = BeautifulSoup(html, 'html.parser') schiaparelli_title = soup.find("h2", class_='title').text browser.click_link_by_partial_href( 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/schiaparelli_enhanced.tif/full.jpg' ) schiaparelli_url = (str(browser.url)) #back to main page url_5 = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars" browser.visit(url_5) #Syrtis Major browser.click_link_by_partial_text('Syrtis Major Hemisphere Enhanced') html = browser.html soup = BeautifulSoup(html, 'html.parser') syrtis_title = 
soup.find("h2", class_='title').text browser.click_link_by_partial_href( 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/syrtis_major_enhanced.tif/full.jpg' ) syrtis_url = (str(browser.url)) #back to main page url_5 = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars" browser.visit(url_5) #Valles Marineris browser.click_link_by_partial_text('Valles Marineris Hemisphere Enhanced') html = browser.html soup = BeautifulSoup(html, 'html.parser') valles_title = soup.find("h2", class_='title').text browser.click_link_by_partial_href( 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/valles_marineris_enhanced.tif/full.jpg' ) valles_url = (str(browser.url)) #hemisphere_image_urls hemisphere_image_urls = [{ "title": cerberus_title, "img_url": cerberus_url }, { "title": schiaparelli_title, "img_url": schiaparelli_url }, { "title": syrtis_title, "img_url": syrtis_url }, { "title": valles_title, "img_url": valles_url }] #put into mars_data mars_data["mars_hemispheres"] = (hemisphere_image_urls) return mars_data
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
import sys, os

from splinter import Browser

reload(sys)
sys.setdefaultencoding('utf-8')

br = Browser()
file_prefix = 'file://'
dir_name = os.path.dirname(os.path.realpath(__file__))
file_name = "output.html"
full_path = file_prefix + dir_name + '/' + file_name
br.visit(full_path)
br.click_link_by_partial_href('/releases/ac?artist_id=')
browser.find_by_tag('h1')
browser.find_by_name('name')
browser.find_by_text('Hello World!')
browser.find_by_id('firstheader')
browser.find_by_value('query')

# get an element
first_found = browser.find_by_name('name').first
last_found = browser.find_by_name('name').last
second_found = browser.find_by_name('name')[1]

# Get the value of an element
browser.find_by_css('h1').first.value

# Clicking links: each call clicks the first matching link
browser.click_link_by_href('http://www.the_site.com/my_link')
browser.click_link_by_partial_href('my_link')
browser.click_link_by_text('my link')
browser.click_link_by_partial_text('part of link text')
browser.click_link_by_id('link_id')

# Check whether an element is visible or invisible
browser.find_by_css('h1').first.visible

# Fill content
browser.find_by_id('productName').fill(
    'splinter - python acceptance testing for web applications')
browser.fill('q', 'splinter - python acceptance testing for web applications')

# Verify if an element has a given class
browser.find_by_css('.content').first.has_class('content')

# Click a button (the original note ended here; this is the usual pattern)
browser.find_by_name('send').first.click()
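# A tiny end-to-end script tying the calls above together (the page URL and
# field names are hypothetical):
from splinter import Browser

browser = Browser()
browser.visit('http://example.com/search')
browser.fill('q', 'splinter')
browser.find_by_css('button[type=submit]').first.click()
print(browser.find_by_css('h1').first.value)
browser.quit()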
class TestViews(unittest.TestCase): def setUp(self): """ Test setup """ self.browser = Browser("phantomjs") # Set up the tables in the database Base.metadata.create_all(engine) # Create an example user self.user = User(name="Alice", email="*****@*****.**", password=generate_password_hash("test")) session.add(self.user) session.commit() self.process = multiprocessing.Process(target=app.run, kwargs={"port": 8080}) self.process.start() time.sleep(1) def test_login_correct(self): self.browser.visit("http://127.0.0.1:8080/login") self.browser.fill("email", "*****@*****.**") self.browser.fill("password", "test") button = self.browser.find_by_css("button[type=submit]") button.click() self.assertEqual(self.browser.url, "http://127.0.0.1:8080/") def test_login_incorrect(self): self.browser.visit("http://127.0.0.1:8080/login") self.browser.fill("email", "*****@*****.**") self.browser.fill("password", "test") button = self.browser.find_by_css("button[type=submit]") button.click() self.assertEqual(self.browser.url, "http://127.0.0.1:8080/login") def test_add_entry(self): # Login to blog self.test_login_correct() # Add new entry self.browser.visit("http://127.0.0.1:8080/entry/add") self.browser.fill("title", "test post") self.browser.fill("content", "acceptance testing post") self.browser.find_by_css("button[type=submit]").first.click() self.assertEqual(self.browser.url, "http://127.0.0.1:8080/") def test_view_single_entry(self): # Login to blog self.test_login_correct() # Click on top entry title self.browser.visit("http://127.0.0.1:8080/entry/1/") self.assertEqual(self.browser.url, "http://127.0.0.1:8080/entry/1/") def test_edit_entry(self): # Login to blog self.test_login_correct() # Add new entry self.browser.visit("http://127.0.0.1:8080/entry/add") self.browser.fill("title", "test post") self.browser.fill("content", "acceptance testing post") self.browser.find_by_css("button[type=submit]").first.click() # Click edit link on top entry self.browser.click_link_by_partial_href('edit') # Enter new title and contents self.browser.fill("title", "edited test post") self.browser.fill("content", "edited acceptance testing post") self.browser.find_by_css("button[type=submit]").first.click() self.assertEqual(self.browser.url, "http://127.0.0.1:8080/") def test_delete_entry(self): # Login to blog self.test_login_correct() # Add new entry self.browser.visit("http://127.0.0.1:8080/entry/add") self.browser.fill("title", "test post") self.browser.fill("content", "acceptance testing post") self.browser.find_by_css("button[type=submit]").first.click() # Delete entry self.browser.click_link_by_partial_href('delete') button = self.browser.find_by_css("button[type=submit]") button.click() # Make sure browser puts you back on home self.assertEqual(self.browser.url, "http://127.0.0.1:8080/") def test_logout(self): # Login to blog self.test_login_correct() # Click on 'Logout' link self.browser.click_link_by_text('Logout') # Check to see if 'Logout' link is visible self.assertEqual(self.browser.is_element_present_by_text('Logout'), False) # Check to see if 'Login' link is visible self.assertEqual(self.browser.is_element_present_by_text('Login'), True) def tearDown(self): """ Test teardown """ # Remove the tables and their data from the database self.process.terminate() session.close() engine.dispose() Base.metadata.drop_all(engine) self.browser.quit()
def scrape(): # Dependencies from splinter import Browser from bs4 import BeautifulSoup import requests import pandas as pd import pymongo import time import ctypes # An included library with Python install. def Mbox(title, text, style): return ctypes.windll.user32.MessageBoxW(0, text, title, style) mars_data_dict = {} ## (1) NASA Mars News # Scrape the NASA Mars News Site and collect the latest News Title and Paragraph Text. # Assign the text to variables that you can reference later. # URL of page to be scraped url_nz = 'https://mars.nasa.gov/news/' # Retrieve page with the requests module response_nz = requests.get(url_nz) # Create BeautifulSoup object; parse with 'html.parser' soup_nz = BeautifulSoup(response_nz.text, 'lxml') # Examine the results, then determine element that contains sought info #print(soup_nz.prettify()) #time.sleep(2) # Find the latest News Title news_title = soup_nz.find("div", class_="content_title").a.text[1:-1] #print(news_title) # Find the latest News Paragraph Text news_p = soup_nz.find("div", class_="image_and_description_container").a.text[3:-7] #print(news_p) mars_data_dict["news_title"] = news_title mars_data_dict["news_p"] = news_p ## (2) JPL Mars Space Images - Featured Image # Use splinter to navigate the site and find the image url for the current Featured Mars Image # and assign the url string to a variable called featured_image_url. # Make sure to find the image url to the full size .jpg image. # Make sure to save a complete url string for this image. executable_path = {'executable_path': 'chromedriver.exe'} browser = Browser('chrome', **executable_path, headless=False) # URL of page to be scraped url_jpl = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars' browser.visit(url_jpl) time.sleep(2) #dir(browser) browser.click_link_by_id('full_image') time.sleep(2) browser.click_link_by_partial_href("/spaceimages/details.") time.sleep(2) browser.click_link_by_partial_href("/spaceimages/images/largesize") time.sleep(2) featured_image_url = browser.url #print(featured_image_url) mars_data_dict["feat_img"] = featured_image_url browser.quit() ## (3) Mars Weather # Visit the Mars Weather twitter account here and scrape the latest Mars weather tweet from the page. # Save the tweet text for the weather report as a variable called mars_weather. 
# URL of page to be scraped url_tweet = 'https://twitter.com/marswxreport?lang=en' # Retrieve page with the requests module response_tweet = requests.get(url_tweet) # Create BeautifulSoup object; parse with 'html.parser' soup_tweet = BeautifulSoup(response_tweet.text, 'lxml') # Examine the results, then determine element that contains sought info #print(soup_tweet.prettify()) #time.sleep(2) # scrape the latest Mars weather tweet from the page tweets = soup_tweet.find_all("p", class_="TweetTextSize TweetTextSize--normal js-tweet-text tweet-text") for tweet in tweets: find_text = tweet.text.find("InSight sol") if find_text == 0: mars_weather = tweet.text #print(mars_weather) break mars_data_dict["weather"] = mars_weather ## (4) Mars Facts # URL of page to be scraped url_mfacts = 'https://space-facts.com/mars/' # Retrieve page with the requests module response_mfacts = requests.get(url_mfacts) # Create BeautifulSoup object; parse with 'html.parser' soup_mfacts = BeautifulSoup(response_mfacts.text, 'lxml') # Examine the results, then determine element that contains sought info #print(soup_mfacts.prettify()) #time.sleep(2) tables = pd.read_html(url_mfacts)[1] #tables mars_data_dict["mfacts"] = tables tables.to_html("../html/mars_facts.html") ## (5) Mars Hemispheres # Visit the USGS Astrogeology site here to obtain high resolution images for each of Mar's hemispheres. # You will need to click each of the links to the hemispheres in order to find the image url to the full resolution image. # Save both the image url string for the full resolution hemisphere image, # and the Hemisphere title containing the hemisphere name. Use a Python dictionary to store the data using the # keys img_url and title. # Append the dictionary with the image url string and the hemisphere title to a list. 
# This list will contain one dictionary for each hemisphere executable_path = {"executable_path": "chromedriver.exe"} browser = Browser("chrome", **executable_path, headless=False) # URL of page to be scraped url_mhemi = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars" browser.visit(url_mhemi) time.sleep(2) # Image 1 browser.click_link_by_partial_text("Cerberus Hemisphere Enhanced") time.sleep(2) title1 = browser.title.split("|")[0] #print(title1) browser.click_link_by_text("Sample") time.sleep(2) img1_url = browser.windows[1].url #print(img1_url) time.sleep(2) browser.windows[1].close() browser.back() hemi1_dict = {} hemi1_dict["title"] = title1 hemi1_dict["img_url"] = img1_url #hemi1_dict # Image 2 browser.click_link_by_partial_text("Schiaparelli Hemisphere Enhanced") time.sleep(2) title2 = browser.title.split("|")[0] #print(title2) browser.click_link_by_text("Sample") time.sleep(2) img2_url = browser.windows[1].url #print(img2_url) time.sleep(2) browser.windows[1].close() browser.back() hemi2_dict = {} hemi2_dict["title"] = title2 hemi2_dict["img_url"] = img2_url #hemi2_dict # Image 3 browser.click_link_by_partial_text("Syrtis Major Hemisphere Enhanced") time.sleep(2) title3 = browser.title.split("|")[0] #print(title3) browser.click_link_by_text("Sample") time.sleep(2) img3_url = browser.windows[1].url #print(img3_url) time.sleep(2) browser.windows[1].close() browser.back() hemi3_dict = {} hemi3_dict["title"] = title3 hemi3_dict["img_url"] = img3_url #hemi3_dict # Image 4 browser.click_link_by_partial_text("Valles Marineris Hemisphere Enhanced") time.sleep(2) title4 = browser.title.split("|")[0] #print(title4) browser.click_link_by_text("Sample") time.sleep(2) img4_url = browser.windows[1].url #print(img4_url) time.sleep(2) browser.windows[1].close() browser.back() hemi4_dict = {} hemi4_dict["title"] = title4 hemi4_dict["img_url"] = img4_url #hemi4_dict hemisphere_image_urls = [hemi1_dict, hemi2_dict, hemi3_dict, hemi4_dict] #hemisphere_image_urls mars_data_dict["hemi_img"] = hemisphere_image_urls mars_data_dict browser.quit() Mbox("Mission to Mars Completed", "Congratulations!!! You've mined Mars!", 1)
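# Each "Image N" block above repeats the same open-Sample-in-a-new-window
# dance; a sketch of that step as a helper (assumes the same page layout and
# the `time` import from the surrounding script):
import time

def grab_sample_url(browser):
    browser.click_link_by_text("Sample")  # opens the full image in a new window
    time.sleep(2)
    url = browser.windows[1].url          # the Sample window's address
    browser.windows[1].close()
    return url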
def scrape(): import pandas as pd from bs4 import BeautifulSoup as bs import requests from selenium import webdriver from splinter import Browser #we visit the first site and get our title/paragraph text url = "https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest" driver = webdriver.Chrome() driver.get(url) data = driver.page_source driver.quit() soup = bs(data, 'html.parser') news_title = soup.find('div', 'content_title').text news_p = soup.find('div', 'article_teaser_body').text #we visit the second site and navigate to our image page and save the url url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars" browser = Browser('chrome', 'chromedriver.exe', headless=False) browser.visit(url) html = browser.html soup = bs(html, 'html.parser') browser.click_link_by_partial_text('FULL IMAGE') browser.click_link_by_partial_text('more info') browser.click_link_by_partial_href('/spaceimages/images/largesize') featured_image_url = browser.url browser.quit() #we visit our third site and retrieve the text of the latest tweet url = "https://twitter.com/marswxreport?lang=en" response = requests.get(url) soup = bs(response.text, 'html.parser') tweet_text = soup.find('p', 'tweet-text').text #don't forget to remove the image text! If it exists, this will remove it. img_text = soup.find('a', 'u-hidden').text mars_weather = tweet_text.replace(img_text, '') #we visit our fourth site and retrieve the needed table url = "https://space-facts.com/mars/" tables = pd.read_html(url) df = tables[0].rename(columns={0: 'Metric', 1: 'Value'}) df = df.set_index('Metric') html_table = df.to_html() #we visit our final site to retrieve our image urls url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars" browser = Browser('chrome', 'chromedriver.exe', headless=False) browser.visit(url) html = browser.html soup = bs(html, 'html.parser') divs = soup.find_all('div', 'description') hemisphere_image_urls = [] for div in divs: link = div.find('a') browser.visit("https://astrogeology.usgs.gov/" + link['href']) title = link.text.replace(" Enhanced", "") html = browser.html soup = bs(html, 'html.parser') img_url = soup.find('a', text="Sample")['href'] hemisphere_image_urls.append({"title": title, "img_url": img_url}) browser.quit() mars_dict = { 'news_title': news_title, 'news_p': news_p, 'featured_image_url': featured_image_url, 'mars_weather': mars_weather, 'html_table': html_table, 'hemisphere_image_urls': hemisphere_image_urls } return mars_dict
def add_album_to_rym(args, config_file): br = Browser() br.visit('https://rateyourmusic.com/account/login') time.sleep(3) # Login br.fill('username', credentials.username) br.fill('password', credentials.password) br.find_by_id('login_submit').click() time.sleep(5) (title, artist, tracklist, release, cover) = config.read_config(config_file) """ if args.update_album: br.visit(args.rym_album) else: """ if args.add_artist: br.visit('https://rateyourmusic.com/artist_add') #br.fill('lastname', unicode(artist)) br.fill('lastname', artist) br.fill('comments', args.url) br.find_by_id('submitbtn').click() time.sleep(3) br.find_by_text(artist).click() else: br.visit(args.rym_profile) time.sleep(3) br.click_link_by_partial_href('/releases/ac?artist_id=') # Add data #br.fill('title', unicode(title)) br.fill('title', title) br.find_by_id('format58').click() br.find_by_id('goAdvancedBtn').click() tracks_div = br.find_by_id('tracks_adv') tracks_text_area = tracks_div.find_by_id('track_advanced') #tracks_text_area.fill(unicode(tracklist)) tracks_text_area.fill(tracklist) br.find_by_id('goSimpleBtn').click() br.fill('notes', args.url) (year, month, day) = parse_release_date(release) release_month_selector = br.find_by_id('month') release_month_selector.select(month) release_day_selector = br.find_by_id('day') release_day_selector.select(day) release_year_selector = br.find_by_id('year') release_year_selector.select(year) br.find_by_id('previewbtn').click() br.find_by_id('submitbtn').click() # Add cover art """ coverart_img_element = br.find_by_xpath("//img[@class='coverart_img']") print(coverart_im_element) sys.exit(0) """ br.click_link_by_partial_href('/images/upload?type=l&assoc_id=') br.attach_file('upload_file', cover) br.fill('source', args.url) br.find_by_id('uploadbutton').click() time.sleep(5) br.click_link_by_partial_href('javascript:setStatus') # Vote for genre br.click_link_by_partial_href('/release/') time.sleep(3) br.click_link_by_partial_href('/rgenre/set?') prigen_text_area = br.find_by_xpath("//input[@id='prigen']") prigen_text_area.fill('vaporwave') prigen_vote_button = br.find_by_xpath("//input[@value='+ propose']").first prigen_vote_button.click() # Done br.click_link_by_partial_href('/release/') print("Finished")
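# The fixed time.sleep() calls above are brittle: too short and the element is
# not there yet, too long and the run crawls. Splinter can poll for an element
# instead via the wait_time argument (a sketch; the helper name is mine):
def wait_for_id(browser, element_id, timeout=10):
    if not browser.is_element_present_by_id(element_id, wait_time=timeout):
        raise RuntimeError('timed out waiting for #%s' % element_id)

# e.g. instead of time.sleep(3) before clicking the submit button:
# wait_for_id(br, 'submitbtn')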
def scrape():
    scrape_dict = {}
    # Update dictionary with scrape time
    scrape_dict["scrape_time"] = str(datetime.datetime.now())

    # Get most current news story from NASA's mars site
    nasa_news_url = "https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest"
    nasa_html = requests.get(nasa_news_url).text
    nasa_soup = bs(nasa_html, 'lxml')

    # Get first title
    title_results = nasa_soup.find_all('div', class_="content_title")
    title_list = []
    for result in title_results:
        try:
            title = result.find('a').text.strip()
            if title:
                title_list.append(title)
        except Exception as e:
            return e
    news_title = title_list[0]

    # Get first paragraph
    p_results = nasa_soup.find_all('div', class_="rollover_description_inner")
    p_list = []
    for p in p_results:
        try:
            par = p.text.strip()
            if par:
                p_list.append(par)
        except Exception as e:
            return e
    news_p = p_list[0]

    # Update dictionary
    scrape_dict["mars_news_title"] = news_title
    scrape_dict["mars_news_p"] = news_p

    # Create splinter browser instance (raw string so the backslashes in the
    # Windows path are not treated as escape sequences)
    executable_path = {
        'executable_path': r'C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe'
    }
    browser = Browser('chrome', **executable_path)

    # Scrape NASA images page for featured image
    nasa_images_url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(nasa_images_url)
    browser.find_by_css('.button').first.click()
    time.sleep(3)
    browser.find_by_css('.button').last.click()
    partial_link = browser.find_by_css('.download_tiff').last.value.split(" ")[2]
    browser.click_link_by_partial_href(partial_link)
    featured_image_url = browser.url

    # Update dictionary
    scrape_dict["featured_image"] = featured_image_url

    # Use Splinter to scrape USGS for hemisphere images and urls
    usgs_url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    browser.visit(usgs_url)
    link_objects = browser.find_by_css('h3')
    # plain loop instead of the original set comprehension used for its side effect
    hemisphere_list = []
    for link in link_objects:
        hemisphere_list.append(link.value.replace(" Enhanced", ""))
    url_list = []
    for hemisphere in hemisphere_list:
        browser.click_link_by_partial_text(hemisphere)
        image_object = browser.find_by_css('img.wide-image')
        img_url = image_object['src']
        url_list.append(img_url)
        browser.back()
    browser.quit()

    hemisphere_image_urls = []
    for hemisphere, url in zip(hemisphere_list, url_list):
        hemisphere_image_urls.append({"title": hemisphere, "url": url})

    # Update dictionary
    scrape_dict["hemisphere_images"] = hemisphere_image_urls

    # Scrape weather conditions from Mars Weather Twitter
    twitter_url = "https://twitter.com/marswxreport?lang=en"
    twitter_html = requests.get(twitter_url).text
    twitter_soup = bs(twitter_html, 'lxml')
    tweets = twitter_soup.find_all('div', class_="content")
    weather_only_tweets = []
    for tweet in tweets:
        username = tweet.find('span', class_="username u-dir u-textTruncate")
        pic_link = tweet.find('a', class_="twitter-timeline-link u-hidden")
        if username.text == "@MarsWxReport":
            tweet_content = tweet.find(
                'p', class_="TweetTextSize TweetTextSize--normal js-tweet-text tweet-text"
            ).text.strip()
            # Eliminate non-weather tweets
            report_test = tweet_content.split(" ")
            if report_test[0] == "Sol":
                if pic_link is not None:
                    weather_only_tweets.append(tweet_content.replace(pic_link.text, ""))
                else:
                    weather_only_tweets.append(tweet_content)
    mars_weather = weather_only_tweets[0]

    # Update dictionary
    scrape_dict["mars_weather"] = mars_weather

    # Scrape facts table
    facts_url = "https://space-facts.com/mars/"
facts_table = pd.read_html(facts_url) facts_df = facts_table[0] facts_df = facts_df.set_index(0) facts_html = facts_df.to_html(classes="table table-format", border=0, header=False, index_names=False).replace("\n", "") # Update dictionary scrape_dict["mars_facts"] = facts_html return scrape_dict
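# In this kind of project the scrape() functions are typically consumed by a
# small Flask/PyMongo app; a hedged sketch of that usage (the database and
# collection names are mine, not taken from the code above):
import pymongo

client = pymongo.MongoClient('mongodb://localhost:27017')
db = client.mars_db
db.mars.update_one({}, {'$set': scrape()}, upsert=True)
print(db.mars.find_one())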
def scrape():
    # set up Browser
    executable_path = {'executable_path': "chromedriver"}
    browser = Browser('chrome', **executable_path, headless=False)

    # Get Nasa News
    nasa_news = 'https://mars.nasa.gov/news/'
    browser.visit(nasa_news)
    html = browser.html
    soup = bs(html, 'html.parser')
    results = soup.find_all('li', class_="slide")
    # take the first slide; the original iterated over its children, which
    # breaks on bare text nodes
    result = results[0]
    news_title = result.find('div', class_="content_title").text
    news_description = result.find('div', class_="article_teaser_body").text
    news_url = nasa_news + result.a['href']
    time.sleep(1)

    # Collect JPL Image
    jpl = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(jpl)
    html = browser.html
    soup = bs(html, 'html.parser')
    browser.click_link_by_id('full_image')
    time.sleep(2)
    browser.click_link_by_partial_href('/spaceimages/details')
    soup = bs(browser.html, 'html.parser')
    results = soup.find('figure', class_='lede')
    base_url = browser.url[:24]
    img = results.a.img['src']
    featured_img_url = base_url + img
    time.sleep(1)

    # Mars Weather
    weather = 'https://twitter.com/marswxreport?lang=en'
    browser.visit(weather)
    html = browser.html
    soup = bs(html, 'html.parser')
    results = soup.find('div', class_="js-tweet-text-container")
    try:
        results.a.decompose()
    except AttributeError:
        pass
    mars_weather = results.find('p').text
    time.sleep(1)

    # Mars Facts
    space_facts = 'https://space-facts.com/mars/'
    mars_facts = pd.read_html(space_facts)[1].rename(
        columns={0: 'Fact', 1: 'Data'}).to_html(index=False).replace('\n', '')
    time.sleep(1)

    # Mars Hemispheres
    hemispheres = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(hemispheres)
    html = browser.html
    soup = bs(html, 'html.parser')
    # Find list of image tags
    base_url = browser.url[:29]
    results = soup.find_all('div', attrs={'class': 'collapsible results'})[0]
    images = results.find_all('div')
    # iterate through the tags, collect hrefs, navigate to each page and
    # collect the full image link (every other div holds a thumbnail)
    hemisphere_image_urls = []
    for image in range(len(images)):
        if image % 2 == 0:  # `image == 0 or image % 2 == 0` was redundant
            url = base_url + images[image].a['href']
            title = images[image].h3.text
            browser.visit(url)
            time.sleep(1)
            soup = bs(browser.html, 'html.parser')
            result = soup.find_all('ul')[0].find_all('li')[0]
            hemi_url = result.a['href']
            hemisphere_image_urls.append({'title': title, 'img_url': hemi_url})

    facts = {'news_title': news_title,
             'news_description': news_description,
             'news_url': news_url,
             'featured_img_url': featured_img_url,
             'mars_weather': mars_weather,
             'mars_facts': mars_facts,
             'hemi_img_url': hemisphere_image_urls}
    browser.visit('https://i.pinimg.com/originals/49/78/3e/49783e18b9ac11c560362029ba1f3328.jpg')
    return facts
def scrape(): executable_path = {'executable_path': 'chromedriver.exe'} browser = Browser('chrome', **executable_path, headless=False) # Mars News url_1 = "https://mars.nasa.gov/news/" browser.visit(url_1) time.sleep(2) html = browser.html soup = BeautifulSoup(html, 'html.parser') news_title = soup.find('div', class_="content_title").a.text news_p = soup.find('div', class_="article_teaser_body").text # JPL Mars Space Images url_2 = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars" browser.visit(url_2) time.sleep(2) html = browser.html soup = BeautifulSoup(html, 'html.parser') browser.click_link_by_partial_text('FULL IMAGE') time.sleep(5) browser.click_link_by_partial_text('more info') time.sleep(5) browser.click_link_by_partial_href('/spaceimages/images/largesize/') html = browser.html soup = BeautifulSoup(html, 'html.parser') featured_image_url = soup.find('img')["src"] # Mars Weather url_3 = "https://twitter.com/marswxreport?lang=en" browser.visit(url_3) html = browser.html soup = BeautifulSoup(html, 'html.parser') mars_weather = soup.find("p", class_="js-tweet-text").text # Mars Facts url_4 = "https://space-facts.com/mars/" tables = pd.read_html(url_4) df = tables[0] df = df.rename(columns={0: "Category", 1: "Value"}) df = df.set_index("Category", drop=True) del df.index.name table_data = df.to_html() print(table_data) # Mars Hemispheres Mars_Hem = [] url_5 = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars" browser.visit(url_5) html = browser.html soup = BeautifulSoup(html, 'html.parser') Hemis = soup.findAll("div", class_="description") for hemi in Hemis: Name = hemi.a.h3.text print(Name) browser.click_link_by_partial_text(Name) time.sleep(3) browser.click_link_by_partial_text('Open') time.sleep(2) html = browser.html soup = BeautifulSoup(html, 'html.parser') img_src = soup.find('img', class_="wide-image")['src'] img_src_full = f"https://astrogeology.usgs.gov" + img_src print(img_src_full) Name = Name[:-9] post = {"title": Name, "img_url": img_src_full} Mars_Hem.append(post) print(Mars_Hem) browser.click_link_by_partial_text('Close') time.sleep(3) browser.click_link_by_partial_text('Back') return news_title, news_p, featured_image_url, mars_weather, table_data, Mars_Hem
class TestViews(unittest.TestCase): def setUp(self): """ Test setup """ self.browser = Browser("phantomjs") self.browser.driver.set_window_size(1280, 800) # Set up the tables in the database Base.metadata.create_all(engine) # Create an example user self.user = User(name="Alice", email="*****@*****.**", password=generate_password_hash("test")) session.add(self.user) session.commit() self.process = multiprocessing.Process(target=app.run, kwargs={"port": 8080}) self.process.start() time.sleep(1) def test_login_correct(self): self.browser.visit("http://127.0.0.1:8080/login") self.browser.fill("email", "*****@*****.**") self.browser.fill("password", "test") button = self.browser.find_by_css("button[type=submit]") button.click() self.assertEqual(self.browser.url, "http://127.0.0.1:8080/") def test_login_incorrect(self): self.browser.visit("http://127.0.0.1:8080/login") self.browser.fill("email", "*****@*****.**") self.browser.fill("password", "test") button = self.browser.find_by_css("button[type=submit]") button.click() self.assertEqual(self.browser.url, "http://127.0.0.1:8080/login") def test_logout(self): self.test_login_correct() self.browser.find_by_css("button[type=logout]").first.click() self.assertEqual(self.browser.url, "http://127.0.0.1:8080/login") def testAddEntryNotLoggedIn(self): self.test_login_incorrect() #tries to visit entry page self.browser.visit("http://127.0.0.1:8080/entry/add") #redirects to login page self.assertEqual(self.browser.url, "http://127.0.0.1:8080/login?next=%2Fentry%2Fadd") def testAddEntryLoggedIn(self): self.test_login_correct() #visit the add entry page by clicking on button - how to make this work? #self.browser.find_by_css("button[type=add]").first.click() self.browser.visit("http://127.0.0.1:8080/entry/add") self.browser.fill("title", "Add Entry Logged In Test Title") self.browser.fill("content", "Test content for add entry logged in") #find button for add entry and click it button = self.browser.find_by_css("button[type=submit]") button.click() #browser should return to homepage after test entry added self.assertEqual(self.browser.url, "http://127.0.0.1:8080/") def testEditEntryLoggedIn(self): self.test_login_correct() self.browser.visit("http://127.0.0.1:8080/entry/add") self.browser.fill("title", "Edit Entry Logged In Title") self.browser.fill("content", "Edit Entry Logged in content") self.browser.find_by_css("button[type=submit]").first.click() self.browser.click_link_by_partial_href('edit') self.browser.fill("title", "edited title") self.browser.fill("content", "edited content") self.browser.find_by_css("button[type=submit]").first.click() self.assertEqual(self.browser.url, "http://127.0.0.1:8080/") def testDeleteEntryLoggedIn(self): self.test_login_correct() self.browser.visit("http://127.0.0.1:8080/entry/add") self.browser.fill("title", "Test Delete Entry") self.browser.fill("content", "Test content for delete entry") self.browser.find_by_css("button[type=submit]").first.click() self.browser.click_link_by_partial_href('delete') button = self.browser.find_by_css("button[type=submit]") button.click() #browser should return to homepage after delete self.assertEqual(self.browser.url, "http://127.0.0.1:8080/") def tearDown(self): """ Test teardown """ # Remove the tables and their data from the database self.process.terminate() session.close() engine.dispose() Base.metadata.drop_all(engine) self.browser.quit()
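# setUp() above starts the Flask app in a subprocess and sleeps a fixed
# second, hoping the server is up; polling the port is more reliable (a
# sketch, using port 8080 as in the tests):
import socket
import time

def wait_for_port(port, host='127.0.0.1', timeout=10):
    deadline = time.time() + timeout
    while time.time() < deadline:
        try:
            socket.create_connection((host, port), timeout=1).close()
            return
        except socket.error:
            time.sleep(0.1)
    raise RuntimeError('server on port %d never came up' % port)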
class TagTest(StaticLiveServerTestCase): def setUp(self): fss.remove_tree(settings.MEDIA_ROOT) check_permissions() set_site(self.live_server_url) self.browser = Browser() self.browser.visit(self.live_server_url) login_url = settings.LOGIN_URL self.browser.click_link_by_partial_href(login_url) username = '******' password = '******' create_user(username) login( self.browser, username, password, ) upload_url = reverse('documents.views.add_document') self.browser.click_link_by_partial_href(upload_url) source = 'local' docfile = get_abs_path('doctest.pdf') language = 'eng' public = True title = 'test' notes = 'test notes' upload( self.browser, source, docfile, language, public, title, notes, ) self.browser.is_element_not_present_by_value('ready', 10) tag = 'tag' add_tag( self.browser, tag, ) self.tag = tag self.tag_obj = get_tag(tag) def test_add_tag(self): tag_exists = exists_tag(self.tag) self.assertTrue(tag_exists) self.assertEquals(self.tag_obj.name, self.tag) document_list_url = \ self.live_server_url + reverse('documents.views.list_documents') self.assertEquals(self.browser.url, document_list_url) tag_span = self.browser.find_by_css('span.taggit_tag') self.assertEquals(tag_span.value, self.tag) # import time; time.sleep(3) self.browser.quit() def test_add_different_tag(self): old_tag_num = len(self.browser.find_by_css('span.taggit_tag')) tag = 'other' add_tag( self.browser, tag, ) new_tag_num = len(self.browser.find_by_css('span.taggit_tag')) self.assertEquals(new_tag_num, old_tag_num + 1) # import time; time.sleep(3) self.browser.quit() def test_add_same_tag(self): old_tag_num = len(self.browser.find_by_css('span.taggit_tag')) tag = self.tag add_tag( self.browser, tag, ) new_tag_num = len(self.browser.find_by_css('span.taggit_tag')) self.assertEquals(new_tag_num, old_tag_num) # import time; time.sleep(3) self.browser.quit() def test_remove_tag(self): old_tag_num = len(self.browser.find_by_css('span.taggit_tag')) driver = self.browser.driver actions = ActionChains(driver) tag_link = driver.find_element_by_css_selector('#taggit_tags a') actions.move_to_element(tag_link) actions.move_by_offset(25, 10) actions.click() actions.perform() document_list_url = \ self.live_server_url + reverse('documents.views.list_documents') self.assertEquals(self.browser.url, document_list_url) new_tag_num = len(self.browser.find_by_css('span.taggit_tag')) self.assertEquals(new_tag_num, old_tag_num - 1) # import time; time.sleep(3) self.browser.quit()
def scrape(): #dependencies from bs4 import BeautifulSoup as bs import splinter import requests from splinter import Browser import time import pandas as pd from selenium import webdriver import os import pymongo import json #The dictionary mars_facts_data={} #1 #emulate the browser and get the html executable_path = {'executable_path': 'C:/chromedriver/chromedriver.exe'} browser = Browser('chrome', **executable_path, headless=True) #url to visit url='https://mars.nasa.gov/news/' #we need to use the browser to visit the page because there are many elements that do not load until the page is loaded. #requests would only get the raw html. browser.visit(url) html = browser.html soup = bs(html, 'html.parser') news_p =soup.select_one("div.rollover_description_inner") news_title = soup.select_one("div.content_title") news_p = news_p.text news_title = news_title.text mars_facts_data['news_title'] = news_title mars_facts_data['news_paragraph'] = news_p #2 executable_path = {'executable_path': 'C:/chromedriver/chromedriver.exe'} browser = Browser('chrome', **executable_path, headless=True) url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars' browser.visit(url) browser.click_link_by_id('full_image') time.sleep(3) browser.click_link_by_partial_text('more info') time.sleep(3) time.sleep(3) browser.click_link_by_partial_href('/spaceimages/images/') #Download the image and Store response = requests.get(browser.url) if response.status_code == 200: linkname= (browser.url.rsplit('/', 1)[-1]) SaveFile = (f'Resources/{linkname}') with open(SaveFile, 'wb') as f: f.write(response.content) print(browser.url) Space_image_dict = {} Space_image_dict['Url'] = browser.url mars_facts_data['featured_image'] = browser.url #collection.insert_one(Space_image_dict) #3 mars_weather_dict = {} url='https://twitter.com/marswxreport?lang=en' response = requests.get(url) soup = bs(response.text, 'html.parser') mars_weather = soup.find('p', class_='TweetTextSize TweetTextSize--normal js-tweet-text tweet-text').text mars_weather =mars_weather.strip() mars_facts_data['weather'] = mars_weather mars_facts_data #collection.insert_one(mars_weather_dict) #4 url = 'https://space-facts.com/mars/' df = pd.read_html(url) #df = pd.DataFrame(df) df= df[0] df.columns = ['Category', 'Measure'] df.set_index('Category',inplace = True) mars_html_table = df.to_html() mars_html_table = mars_html_table.replace("\n","") mars_facts_data['mars_facts_table'] = mars_html_table return mars_facts_data
def scrape(): #Dependencies from bs4 import BeautifulSoup import requests from splinter import Browser from splinter.exceptions import ElementDoesNotExist import pandas as pd # First URL of page to be scraped url = 'https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest' # Retrieve page with the requests module response = requests.get(url) # Create BeautifulSoup object; parse with 'lxml' soup = BeautifulSoup(response.text, 'lxml') #Retrieve latest news' title and paragraph; store into variables results = soup.find('div', class_='image_and_description_container') news_title = results.find_all('img') news_title = news_title[1]['alt'] news_p = results.find('div', class_='rollover_description_inner').text news_p = news_p.replace('\n', '') #Set up Chrome.exe executable_path = {'executable_path': 'chromedriver.exe'} browser = Browser('chrome', **executable_path, headless=False) #Connect to URL url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars' browser.visit(url) #Prepare to use Beautiful Soup html = browser.html soup = BeautifulSoup(html, 'html.parser') #Push FULL IMAGE button to retrieve the image URL browser.click_link_by_partial_text('FULL IMAGE') #Retrieve image URL results = soup.find('a', class_='button fancybox') feature_image_url = results['data-fancybox-href'] feature_image_url = feature_image_url.replace('medium', 'large') feature_image_url = feature_image_url.replace('ip', 'hires') url_short = url.rsplit('/spaceimages', 1)[0] feature_image_url = url_short + feature_image_url #Now let's retrieve Mars weather url = 'https://twitter.com/marswxreport?lang=en' # Retrieve page with the requests module response = requests.get(url) # Create BeautifulSoup object; parse with 'lxml' soup = BeautifulSoup(response.text, 'lxml') #Find all the tags that contain tweets results = soup.find_all('div', class_='content') for result in results: texto = result.find( 'p', class_='TweetTextSize TweetTextSize--normal js-tweet-text tweet-text' ).text x = texto.find("InSight sol") if x == 0: mars_weather = texto.rsplit('pic.twitter', 1)[0] break #Go for the FACTS table! 
url = 'https://space-facts.com/mars/' #Start retrieving the data from the table table = pd.read_html(url) #Organize pandas df df = table[0] df.columns = ['Description', 'Value'] df.set_index('Description', inplace=True) #Transform to HTML string html_table = df.to_html() #Set up Chrome.exe executable_path = {'executable_path': 'chromedriver.exe'} browser = Browser('chrome', **executable_path, headless=False) #Connect to URL to find photos of Mars Hemispheres url = 'https://astrogeology.usgs.gov/maps/mars-viking-hemisphere-point-perspectives' browser.visit(url) Hemispheres = [ 'valles_marineris', 'syrtis_major', 'schiaparelli', 'cerberus' ] hemisphere_image_urls = [] for Hemisphere in Hemispheres: try: browser.click_link_by_partial_href(Hemisphere + '_enhanced') #Prepare to use Beautiful Soup html = browser.html soup = BeautifulSoup(html, 'html.parser') title = soup.find('h2', class_='title').text #title = title.rsplit(' Enhanced',1)[0] image = soup.find('img', class_='wide-image') image_link = 'https://astrogeology.usgs.gov' + image['src'] d = {'title': title, 'image_url': image_link} hemisphere_image_urls.append(d) except: browser.find_link_by_text('2').first.click() browser.click_link_by_partial_href(Hemisphere + '_enhanced') #Prepare to use Beautiful Soup html = browser.html soup = BeautifulSoup(html, 'html.parser') title = soup.find('h2', class_='title').text #title = title.rsplit(' Enhanced',1)[0] image = soup.find('img', class_='wide-image') image_link = 'https://astrogeology.usgs.gov' + image['src'] d = {'title': title, 'image_url': image_link} hemisphere_image_urls.append(d) results_dict = {'news_title' : news_title, 'news_p' : news_p, 'feature_image_url' : feature_image_url,\ 'mars_weather' : mars_weather, 'html_table' : html_table, 'hemisphere_image_urls' : hemisphere_image_urls} return results_dict
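# The bare `except:` above silently falls through to page 2 on any error;
# checking for the link first makes the pagination fallback explicit and does
# not mask real failures (a sketch reusing the same splinter calls; the
# helper name is mine):
def open_hemisphere(browser, slug):
    href = slug + '_enhanced'
    if not browser.find_link_by_partial_href(href):   # empty ElementList is falsy
        browser.find_link_by_text('2').first.click()  # paginate, then retry
    browser.click_link_by_partial_href(href)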
def scrape(): #NEWS nasaUrl = 'https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest' html = requests.get(nasaUrl) soup = bs(html.text, 'html5lib') news_title = soup.find_all(class_='content_title')[0].text news_p = soup.find_all(class_='rollover_description_inner')[0].text #Featured Image executable_path = {'executable_path': 'chromedriver.exe'} browser = Browser('chrome', **executable_path, headless=False, wait_time=5) marsUrl = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars' browser.visit(marsUrl) time.sleep(5) browser.click_link_by_partial_text('FULL IMAGE') time.sleep(5) browser.click_link_by_partial_text('more info') browser.click_link_by_partial_href('/jpeg') featuredImageUrl = browser.find_by_css('img')['src'] #Mars Weather twitterUrl = 'https://twitter.com/marswxreport?lang=en' html = requests.get(twitterUrl) soup = bs(html.text, 'html5lib') marsWeather = soup.find_all(class_='TweetTextSize')[0].text #Mars Facts marsFactsUrl = 'https://space-facts.com/mars/' df = pd.read_html(marsFactsUrl)[0] df = df.rename(columns={0: 'Description', 1: 'Value'}) df = df.set_index('Description') df = df.to_dict() marsfactsdict = df['Value'] #Hemispheres marsHemispheresUrl = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars' browser.visit(marsHemispheresUrl) hemiList = [] marsHemispheresUrl = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars' browser.visit(marsHemispheresUrl) browser.click_link_by_partial_text('Cerberus Hemisphere') img_url = browser.find_by_css('img[class = wide-image]')['src'] title = browser.find_by_css('h2[class = title]').text hemiList.append(dict({'title': title, 'img_url': img_url})) marsHemispheresUrl = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars' browser.visit(marsHemispheresUrl) browser.click_link_by_partial_text('Schiaparelli Hemisphere') img_url = browser.find_by_css('img[class = wide-image]')['src'] title = browser.find_by_css('h2[class = title]').text hemiList.append(dict({'title': title, 'img_url': img_url})) marsHemispheresUrl = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars' browser.visit(marsHemispheresUrl) browser.click_link_by_partial_text('Syrtis Major Hemisphere') img_url = browser.find_by_css('img[class = wide-image]')['src'] title = browser.find_by_css('h2[class = title]').text hemiList.append(dict({'title': title, 'img_url': img_url})) marsHemispheresUrl = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars' browser.visit(marsHemispheresUrl) browser.click_link_by_partial_text('Valles Marineris Hemisphere') img_url = browser.find_by_css('img[class = wide-image]')['src'] title = browser.find_by_css('h2[class = title]').text hemiList.append(dict({'title': title, 'img_url': img_url})) scrapedDict = { 'news_title': news_title, 'news_p': news_p, 'featured_image': featuredImageUrl, 'weather': marsWeather, 'facts': marsfactsdict, 'hemispheres': hemiList } return scrapedDict
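# The four copy-pasted hemisphere blocks above collapse into a loop (same
# selectors, just parameterized over the hemisphere names):
hemiList = []
for name in ['Cerberus', 'Schiaparelli', 'Syrtis Major', 'Valles Marineris']:
    browser.visit(marsHemispheresUrl)
    browser.click_link_by_partial_text(name + ' Hemisphere')
    hemiList.append({
        'title': browser.find_by_css('h2[class = title]').first.text,
        'img_url': browser.find_by_css('img[class = wide-image]').first['src'],
    })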
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup
from splinter import Browser


def scrape():
    # A web-scraping function for the latest news on Mars.
    # Results are collected into a Python dictionary.
    scrape_rsult = {}

    # ### NASA Mars News
    # Scrape the NASA Mars News site
    url_NASA = "https://mars.nasa.gov/news"
    r = requests.get(url_NASA)  # send a request to the url
    time.sleep(1)
    data = r.text  # turn the response into text
    soup = BeautifulSoup(data, "html.parser")  # parse the text as html

    # Collect the latest news title and paragraph text, and assign the
    # text to variables that can be referenced later.
    soup_div = soup.find(class_="slide")  # within div in body, within <ul>, <li class="slide">
    soup_news = soup_div.find_all('a')  # search by anchor

    # Get the latest news title
    NASA_latest_t = soup_news[1].get_text().strip()
    scrape_rsult["Nasa_latest_title"] = NASA_latest_t

    # Get the paragraph: first find the url of the latest news article
    soup_p = soup_div.find_all('a', href=True)
    soup_p_url = soup_p[0]['href']  # only the url of the latest article

    # Scrape the href of the first news article
    url = "https://mars.nasa.gov/"
    news_url = url + soup_p_url
    r = requests.get(news_url)
    time.sleep(1)
    data = r.text
    soup = BeautifulSoup(data, "html.parser")
    soup_para = soup.find(class_='wysiwyg_content')
    soup_para = soup_para.find_all('p')

    # Save the text of the paragraphs to a list
    NASA_latest_p = []
    for entry in soup_para:
        paragraph = entry.get_text().strip()
        NASA_latest_p.append(paragraph)
    # NASA_latest_p is the list of paragraphs from the latest news article
    scrape_rsult["Nasa_latest_paragraph"] = NASA_latest_p

    # ### JPL Mars Space Images - Featured Image
    # Visit the url for JPL's Featured Space Image
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)
    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url)
    time.sleep(3)

    # Use splinter to navigate the site and find the image url for the
    # current Featured Mars Image. The featured images are under a list
    # element of the slide class; '>' signifies a child element.
    browser.find_by_css('li.slide>a.fancybox').first.click()
    time.sleep(1)
    # Click the 'more info' button (caution: the 'share' button is under
    # a similar but different class)
    browser.find_by_css('div.buttons>a.button').first.click()
    time.sleep(1)

    # Assign the url string to a variable called `featured_image_url`.
    # Get both the full-size .jpg and an 800x600 image for the webpage.
    html = browser.html
    soup = BeautifulSoup(html, "html.parser")

    # Full-size jpg (to be linked if the image is clicked)
    feat_full_img_soup = soup.find(class_="main_image")
    feat_full_img = feat_full_img_soup.get('src')

    # Smaller 800x600 jpg (to be displayed on the webpage); uses splinter
    # instead of Beautiful Soup. The link opens in a second window, so
    # switch over to it, save its url, then close it.
    browser.click_link_by_partial_href('800x600.jpg')
    browser.windows.current = browser.windows[1]
    featured_image_url = browser.url
    browser.windows[1].close()

    # Save the two urls
    ori_url = 'https://www.jpl.nasa.gov'
    feat_full_img = ori_url + feat_full_img
    # feat_full_img is https://www.jpl.nasa.gov + the url of the full-sized
    # featured image; featured_image_url is the smaller 800x600 image that
    # will be featured on the webpage
    scrape_rsult["featured_image_url"] = featured_image_url
    scrape_rsult['feat_full_img'] = feat_full_img

    # ### Mars Weather
    # Visit the Mars Weather twitter account
    # (https://twitter.com/marswxreport?lang=en) and scrape the latest
    # Mars weather tweet. Save the tweet text as `mars_weather`.
    url = 'https://twitter.com/marswxreport?lang=en'
    r = requests.get(url)
    time.sleep(1)
    data = r.text
    soup = BeautifulSoup(data, 'html.parser')
    mars_tweets = soup.find(class_='stream-items js-navigable-stream')
    mars_tweets = mars_tweets.find(class_="js-tweet-text-container")
    mars_weather = mars_tweets.p.text
    # mars_weather is the <p> text of the latest tweet from the handle
    scrape_rsult["mars_weather_tweet"] = mars_weather

    # ### Mars Facts
    # Visit the Mars Facts webpage (http://space-facts.com/mars/) and use
    # Pandas to scrape the table of facts about the planet, including
    # diameter, mass, etc.
    facts_url = 'http://space-facts.com/mars/'
    all_facts_df = pd.read_html(facts_url)  # returns a list of dataframes
    all_facts_df = all_facts_df[0]

    # Use Pandas to convert the data to an HTML table string
    facts_html = all_facts_df.to_html(header=False, index=False, justify='left')
    scrape_rsult["mars_facts_table"] = facts_html

    # ### Mars Hemispheres
    # Visit the USGS Astrogeology site to obtain high-resolution images
    # for each of Mars's hemispheres.
    url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(url)
    time.sleep(3)

    # Click each hemisphere link to find the url of the full-resolution
    # image. (An earlier version collected the unique hrefs with Beautiful
    # Soup and visited them directly with browser.visit(url), but the
    # instructions are interpreted here as requiring splinter to click the
    # links in the browser.)
    # Caution: splinter can only click a link by the exact wording of its
    # text -- e.g. browser.click_link_by_partial_text('Cerberus Hemisphere')
    # will fail to find lower-case 'cerberus'.

    # Use Beautiful Soup to search the browser html for the <h3> headers,
    # which contain the hemisphere names.
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    headers_soup = soup.find_all('h3')

    # For each header, click the link associated with it and get img_url
    hemisphere_image_urls = []
    for header in headers_soup:
        # Start at the origin url for the Mars hemisphere section
        window = browser.windows[0]  # the current (first) window
        browser.visit(url)
        time.sleep(2)  # wait 2 secs for the browser to load

        # Get the title, dropping the " Enhanced" suffix before the dict
        # is appended
        title = header.text
        title = title.replace(" Enhanced", "")
        browser.click_link_by_partial_text(title)
        time.sleep(2)  # again, wait 2 secs for the browser to load

        # 'Sample' opens the full-resolution image in a new window: switch
        # to it, save the url, then switch back
        browser.click_link_by_text('Sample')
        browser.windows.current = browser.windows[1]
        img_url = browser.url
        browser.windows.current = window
        hemisphere_image_urls.append({'title': title, 'img_url': img_url})
        window.close_others()  # close the other windows to keep the browser tidy

    # hemisphere_image_urls is a list of dicts of img_url and hemisphere title
    scrape_rsult["hemispheres"] = hemisphere_image_urls

    return scrape_rsult
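As the caution in the hemisphere section notes, click_link_by_partial_text matches link text case-sensitively. When the casing is unknown, one workaround is an XPath that lower-cases the link text with translate() before matching. A hypothetical sketch (click_link_icase is an illustrative helper, and 'cerberus' an illustrative search term, neither from the example above):

# Sketch: case-insensitive partial-text link click via XPath translate().
def click_link_icase(browser, text):
    upper = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    lower = 'abcdefghijklmnopqrstuvwxyz'
    xpath = ("//a[contains(translate(., '{}', '{}'), '{}')]"
             .format(upper, lower, text.lower()))
    browser.find_by_xpath(xpath).first.click()

click_link_icase(browser, 'cerberus')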
class UserTest(StaticLiveServerTestCase):
    def setUp(self):
        check_permissions()
        self.username = '******'
        create_user(self.username)
        self.browser = Browser()
        self.browser.visit(self.live_server_url)

    def test_signup(self):
        signup_url = settings.SIGNUP_URL
        self.browser.click_link_by_partial_href(signup_url)

        username = '******'
        password = '******'
        email = '*****@*****.**'
        signup(self.browser, username, password, email)

        user_exists = exists_user(username)
        self.assertTrue(user_exists)

        user = get_user(username)
        self.assertEqual(user.username, username)
        # self.assertEqual(user.password, password)
        self.assertEqual(user.email, email)

        document_list_url = \
            self.live_server_url + reverse('documents.views.list_documents')
        self.assertEqual(self.browser.url, document_list_url)

        profile_xpath = '/html/body/div/div[1]/div/ul[2]/li[4]/a'
        profile_link = self.browser.find_by_xpath(profile_xpath)
        self.assertEqual(profile_link.value, '@{}'.format(username))

        self.browser.quit()

    def test_signin(self):
        login_url = settings.LOGIN_URL
        self.browser.click_link_by_partial_href(login_url)

        username = self.username
        password = self.username
        login(self.browser, username, password)

        document_list_url = \
            self.live_server_url + reverse('documents.views.list_documents')
        self.assertEqual(self.browser.url, document_list_url)

        profile_xpath = '/html/body/div/div[1]/div/ul[2]/li[4]/a'
        profile_link = self.browser.find_by_xpath(profile_xpath)
        self.assertEqual(profile_link.value, '@{}'.format(username))

        self.browser.quit()
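The signup() and login() helpers called above are defined elsewhere in the test suite. Assuming the forms use plain username/password/email field names and a standard submit button (an assumption, since the templates are not shown), such helpers might look like:

# Hypothetical helpers matching the calls in UserTest; the field names and
# the submit-button selector are assumptions about the signup/login forms.
def signup(browser, username, password, email):
    browser.fill('username', username)
    browser.fill('email', email)
    browser.fill('password', password)
    browser.find_by_css('button[type=submit]').first.click()

def login(browser, username, password):
    browser.fill('username', username)
    browser.fill('password', password)
    browser.find_by_css('button[type=submit]').first.click()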