def mars_Images():
    from splinter import Browser
    import time

    image_dict = {}
    browser = Browser('chrome', headless=False)
    url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(url)
    time.sleep(3)
    browser.click_link_by_id("full_image")
    elem = browser.find_link_by_partial_href("PIA")
    image_url = elem['href']
    browser.quit()

    # Open the detail page in a fresh browser and grab the full-size image link
    browser2 = Browser('chrome', headless=False)
    browser2.visit(image_url)
    elem = browser2.find_link_by_partial_href("/spaceimages/images")
    featured_image_url = elem['href']
    browser2.quit()

    image_dict["featured_image_url"] = featured_image_url
    return image_dict
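# A minimal sketch (not from any snippet in this collection) of a defensive
# wrapper around splinter's find_link_by_partial_href, which returns an
# ElementList that may be empty while the page is still rendering. The helper
# name, retry count, and delay are assumptions for illustration.
def first_link_href(browser, partial_href, retries=3, delay=1):
    import time
    for _ in range(retries):
        links = browser.find_link_by_partial_href(partial_href)
        if len(links) > 0:
            return links.first['href']
        time.sleep(delay)  # give the page a moment before retrying
    return None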
def scrape(pokemon):
    from splinter import Browser
    from bs4 import BeautifulSoup as bs
    import time

    url = f'https://bulbapedia.bulbagarden.net/wiki/{pokemon}_(Pok%C3%A9mon)'

    # Open browser
    browser = Browser('chrome')
    browser.visit(url)

    # Turn webpage into html
    html = browser.html
    soup = bs(html, 'lxml')

    # Find the picture of the Pokemon and click on it until it's just the .png file
    browser.execute_script("window.scrollTo(0, 400);")
    browser.find_link_by_partial_href(f'{pokemon}.png').click()
    time.sleep(2)
    browser.execute_script("window.scrollTo(0, 400);")
    browser.find_by_id('file').click()
    time.sleep(2)
    pokemon_url = browser.url

    # Store it in a dictionary
    pokemon_image = {'name': pokemon, 'url': pokemon_url}
    browser.quit()
    return pokemon_image


# `csv` is assumed to be a DataFrame loaded elsewhere in the original module
big_ol_pokemon_list = csv['Name']
def scrape(pokemon):
    from bs4 import BeautifulSoup as bs
    from splinter import Browser

    url = f'https://bulbapedia.bulbagarden.net/wiki/{pokemon}_(Pok%C3%A9mon)'

    # Open browser
    browser = Browser('chrome', headless=True)
    browser.visit(url)

    # Turn webpage into html
    html = browser.html
    soup = bs(html, 'lxml')

    # Find the picture of the Pokemon and click on it until it's just the .png file
    browser.find_link_by_partial_href(f'{pokemon}.png').click()
    browser.find_by_id('file').click()
    pokemon_url = browser.url

    # Store it in a dictionary
    pokemon_image = {'URL': pokemon_url}
    browser.quit()
    return pokemon_image
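# Hypothetical driver for the scrape(pokemon) helper above; the starter list
# is an assumption, not part of the original source.
for name in ['Bulbasaur', 'Charmander', 'Squirtle']:
    print(scrape(name))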
def find_hemisperes(name):
    # `executable_path` and `hemisphere_image_urls` are expected to come from
    # the enclosing scope (see the full scrape() version of this helper below)
    browser = Browser('chrome', **executable_path, headless=False)
    url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(url)
    browser.click_link_by_partial_text(name)
    links_found = browser.find_link_by_partial_href(name.split()[0].lower())
    url = links_found['href']
    dic = {"title": f"{name} Hemisphere", "img_url": url}
    hemisphere_image_urls.append(dic)
    browser.quit()
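# Hypothetical call pattern for find_hemisperes, mirroring how the full
# scrape() later in this collection drives the same helper.
for hemisphere in ['Cerberus', 'Schiaparelli', 'Syrtis Major', 'Valles Marineris']:
    find_hemisperes(hemisphere)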
def scrape():
    # setup; nasa_url, jpl_url, twitter_url and facts_url are assumed to be
    # module-level constants in the original source
    response = {}
    executable_path = {'executable_path': os.path.join(
        "C:/", "Users", "kling", "UNCC Data Analytics", "chromedriver.exe")}
    options = webdriver.ChromeOptions()
    options.add_argument('--ignore-certificate-errors')
    options.add_argument("--test-type")
    browser = Browser('chrome', **executable_path, headless=False)

    # retrieve text news about mars
    browser.visit(nasa_url)
    time.sleep(5)

    # get first article and follow link
    browser.find_link_by_partial_href('/news/')[0].click()
    html = browser.html
    soup = bs(html, "html.parser")
    news_title = soup.find("title").text
    news_title = news_title.strip('\n')

    # Get all paragraphs from article, strip tags and add them together into one block of text
    news_p = soup.find_all("p")
    paragraph = news_p[1].get_text()
    response['title'] = news_title
    response['paragraph'] = paragraph

    # get featured image
    driver = webdriver.Chrome(chrome_options=options)
    driver.get(jpl_url)
    driver.find_element_by_partial_link_text("FULL IMAGE").click()
    time.sleep(2)
    images = driver.find_elements_by_class_name('fancybox-image')
    for image in images:
        image_url = image.get_attribute('src')
        print(image_url)
    response['featured_img'] = image_url
    # img = requests.get(image_url)                     # fetch image
    # with open('featured_image.jpg', 'wb') as writer:  # open for writing in binary mode
    #     writer.write(img.content)                     # write the image

    # Retrieve weather data
    driver = webdriver.Chrome(chrome_options=options)
    driver.get(twitter_url)
    tweet = driver.find_element_by_css_selector('p.tweet-text').text
    response['weather'] = tweet

    # Get Mars facts
    facts = pd.read_html(facts_url)[0]
    response['facts'] = facts.to_html()
    return response
def scrape(username, password):
    # login, extract_table_info and add_details are helpers defined elsewhere
    # in the original module
    apartments = []
    browser = Browser('chrome', headless=True)
    login(browser, username, password)
    browser.click_link_by_text('Lgh')
    links = browser.find_link_by_partial_href('https://nya.boplats.se/objekt/1hand/')
    for l in links:
        apartments.append(extract_table_info(browser, l))
    for a in apartments:
        add_details(browser, a)
    browser.quit()
    return apartments
def getRoutes(start, end): browser = Browser(driver_name="firefox") browser.visit('https://www.hopstop.com/search?xfr=cityscape') print(browser.url) browser.fill('address1', str(start)) browser.fill('address2', str(end)) browser.find_by_name('get_dirs').click() print(browser.url) if browser.is_text_present('Did you mean?'): browser.click_link_by_href("#") if browser.is_text_present('Did you mean?'): browser.click_link_by_href("#") browser.click_link_by_href("#") links = browser.find_link_by_partial_href("/station?stid") results = [] for link in links: results.append(link.value) return results
def getRoutes(start, end): browser = Browser(driver_name="firefox") browser.visit("https://www.hopstop.com/search?xfr=cityscape") print(browser.url) browser.fill("address1", str(start)) browser.fill("address2", str(end)) browser.find_by_name("get_dirs").click() print(browser.url) if browser.is_text_present("Did you mean?"): browser.click_link_by_href("#") if browser.is_text_present("Did you mean?"): browser.click_link_by_href("#") browser.click_link_by_href("#") links = browser.find_link_by_partial_href("/station?stid") results = [] for link in links: results.append(link.value) return results
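# Hypothetical invocation of getRoutes above; the addresses are placeholders.
for station in getRoutes('Times Square, New York', 'Union Square, New York'):
    print(station)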
def JPL_image():
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)
    url_jpl = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url_jpl)
    browser.find_link_by_partial_text('FULL IMAGE').click()
    # give the image gallery up to two seconds to load
    browser.is_element_not_present_by_id('images', wait_time=2)
    browser.find_link_by_partial_text('more info').click()
    link = browser.find_link_by_partial_href('largesize')
    image_url = link.html.split("=")[-1].lstrip('"')
    image_url = image_url.rstrip('">')
    featured_image_url = 'https://www.jpl.nasa.gov' + image_url
    return featured_image_url
def scrape(author):
    print("\nRETRIEVING DATA FOR:", author, "\n")
    print("\nINITIALIZING CRAWLER\n")

    # Visit URL
    browser = Browser(driver_name='chrome', headless=True)  # headless=False will show the browser navigation
    url = "https://scholar.google.com.br/"
    browser.visit(url)
    browser.fill('q', author)

    # Find and click the 'search' button
    button = browser.find_by_name('btnG')
    time.sleep(1)  # needs to sleep for the button to become active
    button.click()

    # If the profile doesn't exist, stop.
    profile_check = browser.html
    if "feather-72.png" not in profile_check:
        print("\nERROR: PROFILE DOES NOT EXIST. PLEASE CHECK YOUR QUERY OR TYPE ANOTHER NAME.\n")
        return

    # Find and click the first profile link (if the profile exists).
    # NOTE: this span was redacted ("******") in the source; the click and the
    # "show more" lookup below reconstruct the apparent intent.
    button = browser.find_link_by_partial_href('citations?user=')
    button.click()

    # Keep clicking the "show more" button until it becomes disabled
    button = browser.find_by_id('gsc_bpf_more')
    check_button = browser.evaluate_script('document.getElementById("gsc_bpf_more").disabled')
    while check_button == False:
        time.sleep(1)
        check_button = browser.evaluate_script('document.getElementById("gsc_bpf_more").disabled')
        button.click()

    # get html
    soup = BeautifulSoup(browser.html, 'html.parser')
    soup.findAll("td", {"class": "gsc_a_t"})

    print("\nBUILDING PAPERS DICTIONARY.\n")
    papers = []
    table = soup.find("table", id="gsc_a_t")
    for tr in table.find_all('tr')[2:]:
        for td in tr.find_all("td", {"class": "gsc_a_t"}):
            paper = {}
            text = re.sub("[\'\"]", "", tr.find("a", {"class": "gsc_a_at"}).get_text()).strip()  # avoid SQL syntax errors
            paper['title'] = text
            authors = tr.find("div", {"class": "gs_gray"}).get_text().split(',')[:5]
            authors = [a for a in authors if a != "..."]  # in some cases, the 4th author might be ...
            authors = [a.strip().upper() for a in authors]  # strip leading spaces from names and normalize case
            authors = [re.sub("[\'\"]", "", a) for a in authors]  # avoid SQL syntax errors
            paper['authors'] = authors
            papers.append(paper)
    return papers
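# Hypothetical usage of the Scholar scraper above; the author name is a placeholder.
papers = scrape('A. Einstein')
if papers:
    print(len(papers), 'papers found')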
def scrape(): print("scrape_mars scrape rtn") #tk moved imports to here #Imports from splinter import Browser from bs4 import BeautifulSoup as bs import pandas as pd import requests import time import re #tk moved to hetk #def init_browser(): executable_path = {"executable_path": "chromedriver.exe"} #tk return Browser("chrome", **executable_path, headless=True) browser = Browser("chrome", **executable_path, headless=False) #tk browser = init_browser() mars_data_scrape = {} mars_news = 'https://mars.nasa.gov/news/' browser.visit(mars_news) time.sleep(2) html = browser.html news_soup = bs(html, 'html.parser') #Data Scrape print("#Data Scrape") news_title = news_soup.find('div', class_='content_title').get_text() news_p = news_soup.find('div', class_='article_teaser_body').get_text() time.sleep(2) mars_data_scrape["data1"] = news_title mars_data_scrape["data2"] = news_p #Paths print("#Paths") #executable_path = {"executable_path": "chromedriver"} #browser = Browser("chrome", **executable_path, headless=True) jpl_url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars' browser.visit(jpl_url) browser.click_link_by_partial_text('FULL IMAGE') time.sleep(2) browser.click_link_by_partial_text('more info') time.sleep(2) browser.click_link_by_partial_text('.jpg') #Soup print("#Soup") html = browser.html jpl_soup = bs(html, 'html.parser') featured_img_url = jpl_soup.find('img').get('src') mars_data_scrape["image"] = featured_img_url #Weather print("#Weather") weather_url = 'https://twitter.com/marswxreport?lang=en' html = requests.get(weather_url) beautiful_soup = bs(html.text, 'html.parser') #tk mars_weather = weather_soup.find_all(string=re.compile("Sol"), #tk class_ = "TweetTextSize TweetTextSize--normal js-tweet-text tweet-text")[0].text #tk mars_data_scrape["weather"] = mars_weather #SpaceFacts print("#SpaceFacts") mars_facts_url = 'https://space-facts.com/mars/' table_df = pd.read_html(mars_facts_url)[0] table_df.columns = ["description", "value"] table_df = table_df.set_index('description', drop=True) mars_data_scrape["table"] = table_df.to_html() # In[35]: print("#In35") #executable_path = {"executable_path": "chromedriver.exe"} #browser = Browser("chrome", **executable_path, headless=True) hem_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars' browser.visit(hem_url) html = browser.html hem_soup = bs(html, 'html.parser') #Final print("#Final") hem_img_urls = [] hem_dict = { 'title': [], 'img_url': [], } x = hem_soup.find_all('h3') for i in x: t = i.get_text() title = t.strip('Enhanced') browser.click_link_by_partial_text(t) hem_url = browser.find_link_by_partial_href('download')['href'] hem_dict = {'title': title, 'img_url': hem_url} hem_img_urls.append(hem_dict) browser.back() mars_data_scrape["hemispheres"] = hem_img_urls #tk added print print(mars_data_scrape) return mars_data_scrape
# Fragment from a larger scenario-editing routine. `scenario_exists` is a
# hypothetical stand-in for a condition that was cut from this excerpt; the
# first branch updates an existing scenario, the else branch creates a new one.
if scenario_exists:
    element = browser.find_by_name('avc').first
    element.select('4')
    browser.fill('sic', '7759')
    browser.fill('totalSale', '10')
    igArray = ['99', '98', '95', '90', '87', '85', '83', '80',
               '77', '75', '73', '70', '65', '60', '40', '30']
    for x in range(16):
        element = browser.find_by_name('obligorIgCode').first
        element.select(igArray[x])
        browser.find_by_name('UpdateButton').first.click()
        pdData()
        browser.driver.save_screenshot(typeaName + countryList[conIndex] + '_' + igArray[x] + '.png')
        browser.find_link_by_partial_href('/CNETCORP/cpmScenarios.do').first.click()
else:
    # create the new scenario
    if cType == 0:
        typeaName = 'Existing_Scotia_Public_'
    elif cType == 1:
        typeaName = 'Existing_Scotia_Private_'
    else:
        typeaName = 'Non_Scotia_Public_'
    browser.find_by_name('addScenario').first.click()
    browser.fill('scName', typeaName + countryList[conIndex])
    browser.type('scEffDate', '\b\b\b\b\b\b\b\b\b\b')  # clear the date field
    browser.type('scEffDate', '2014-10-31')
    browser.find_by_name('update').first.click()
def scrape():
    # Set up path for browser
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)

    # Open URL
    url = 'https://mars.nasa.gov/news/'
    browser.visit(url)

    # Grab latest headline
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    news_title = soup.find('div', class_='image_and_description_container').find(
        'div', class_='content_title').find('a').text
    news_p = soup.find(class_='article_teaser_body').text

    # Navigate to JPL Mars featured image
    jpl_url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(jpl_url)
    browser.find_by_id('full_image').click()

    # Navigate to image page
    browser.find_link_by_partial_text('more info').click()

    # Get to fullsize image
    browser.find_link_by_partial_href('/spaceimages/images').click()

    # Scrape image URL
    featured_image_url = browser.url

    # Get Mars facts with Pandas
    facts_url = 'https://space-facts.com/mars/'
    tables = pd.read_html(facts_url)

    # Slice off other tables
    df = tables[0]
    df = df.rename(columns={0: 'Description', 1: 'Mars'})

    # Convert to HTML
    facts_table = df.to_html()

    # Get Mars hemisphere pictures
    hemi_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(hemi_url)
    soup = BeautifulSoup(browser.html, 'html.parser')
    img_title = [x.text for x in soup.find_all('h3')]

    images = []
    for counter in range(len(img_title)):
        browser.find_by_css('img.thumb')[counter].click()
        images.append(browser.find_by_text('Sample')['href'])
        browser.back()

    hemisphere_image_urls = [
        {'title': title, 'img_url': img}
        for title, img in zip(img_title, images)
    ]

    browser.quit()
    return {
        'headline': news_title,
        'article_detail': news_p,
        'feat_img': featured_image_url,
        'table': facts_table,
        'hemisphere_imgs': hemisphere_image_urls
    }
def get_data():
    # 1 - NASA news *** USING BROWSER = SPLINTER ***
    browser = Browser('chrome')
    url = "https://mars.nasa.gov/news/"
    browser.visit(url)

    # HTML object
    mars_html = browser.html
    # Parse HTML
    soup = bs(mars_html, "html.parser")

    # Collect News Title and Paragraph
    news_title = soup.find("div", class_="content_title").text.strip()
    print(news_title)
    news_paragraph = soup.find('div', class_="article_teaser_body").text
    print(news_paragraph)

    # Close the browser after scraping
    browser.quit()

    # 2 - JPL Mars Space Images - Featured Image
    browser = Browser('chrome')
    image_url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(image_url)

    # navigate to link
    browser.click_link_by_partial_text('FULL IMAGE')
    time.sleep(2)
    browser.click_link_by_partial_text('more info')
    image_html = browser.html
    image_soup = bs(image_html, "html.parser")
    image_path = image_soup.find('figure', class_='lede').a['href']
    featured_image_url = "https://www.jpl.nasa.gov" + image_path
    print(featured_image_url)
    browser.quit()

    # 3 - Mars Weather
    browser = Browser('chrome')
    url = 'https://twitter.com/marswxreport?lang=en'
    browser.visit(url)
    html = browser.html
    weather_soup = bs(html, 'html.parser')
    mars_weather = weather_soup.find(
        "p", class_="TweetTextSize TweetTextSize--normal js-tweet-text tweet-text"
    ).text.strip()
    print(mars_weather)
    browser.quit()

    # 5 - Mars Hemispheres
    hemisphere_img_urls = []
    url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    browser = Browser("chrome")
    browser.visit(url)
    home_page = browser.html

    # HTML & Parsing
    hemispheres_soup = bs(home_page, "html.parser")
    results = hemispheres_soup.find_all("h3")

    for result in results:
        title = result.text
        # drop the trailing " Enhanced" from the title
        title = title[:-9]
        print(title)
        browser.click_link_by_partial_text(title)
        img_url = browser.find_link_by_partial_href("download")["href"]
        print(img_url)
        hemisphere_img_urls.append({"title": title, "img_url": img_url})
        browser.visit(url)
    browser.quit()

    mars_data = {
        "title": news_title,
        "content": news_paragraph,
        "featured_image_url": featured_image_url,
        "latest_weather": mars_weather,
        "image_data": hemisphere_img_urls,
    }

    # mars_collection is assumed to be a pymongo collection created elsewhere
    existing = mars_collection.find_one()
    if existing:
        mars_data['_id'] = existing['_id']
    mars_collection.save(mars_data)
    return mars_data
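# A hedged side note (a sketch, not from the original): pymongo's
# Collection.save() used above is deprecated; replace_one with upsert=True
# achieves the same "insert or update the single document" behavior.
def store_mars_data(mars_collection, mars_data):
    mars_collection.replace_one({}, mars_data, upsert=True)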
def bundle(argv):
    username = ""
    password = ""
    all = False
    overwrite = False
    skip = False
    try:
        # NOTE: the long-option list and the option loop header were redacted
        # ("******") in the source; this reconstruction follows the
        # short-option string "hu:p:aso".
        opts, args = getopt.getopt(argv, "hu:p:aso", ["username=", "password="])
    except getopt.GetoptError:
        print_help()
        exit(2)
    for opt, arg in opts:
        if opt == "-h":
            print_help()
            exit()
        elif opt == "-a":
            download_all_warning()
            all = True
        elif opt == "-o":
            overwrite = True
        elif opt == "-s":
            skip = True
        elif opt in ("-u", "--username"):
            username = arg
        elif opt in ("-p", "--password"):
            password = arg
    del argv, args, opts

    if not username:
        username = input("Username (email): ")
        if not username:
            print("Empty, exiting.")
            exit(2)
    if not password:
        password = input("Password: ")  # redacted span, reconstructed
        if not password:
            print("Empty, exiting.")
            exit(2)

    browser = Browser()
    print("Logging in...")
    browser.visit('https://bundleofholding.com/user/login')
    browser.fill('users_email', username)
    browser.fill('password', password)
    browser.find_by_name('submit').click()
    if (len(browser.find_by_css("div.logged-in")) > 0):
        # if browser.is_text_present("Wizard's Cabinet"):
        print("Getting lists...")
        browser.visit('https://bundleofholding.com/download/list')
    else:
        print("Failed to log in.")
        if (input("Quit browser? */n ") != "n"):
            browser.quit()
        exit()

    bListBox = browser.find_by_id('overview')
    bListList = bListBox.find_by_tag('a')
    bundles = []
    for e in bListList:
        bundles.append((e.value, e['href']))
    del bListBox, bListList

    bundle_count = len(bundles)
    item = 0
    vault = {}
    for b in bundles:
        item += 1
        print("\tFile list {0} of {1}.".format(item, bundle_count))
        vault[b[0]] = []
        browser.visit(b[1])
        bLinks = browser.find_link_by_partial_href('file_id')
        # todo: get file list with file sizes if possible
        # problem: not all pages have "core-bundle" element; older ones are uglier
        # xpath span/a?
        for e in bLinks:
            vault[b[0]].append((e.value, e['href']))
    del item, bLinks
    print("\n\n")

    # Chose to make command line parameter only to help reinforce FAQ.
    # if not all:
    #     download_all_warning()
    #     totalfiles = 0
    #     for bundle, files in vault.items():
    #         totalfiles += len(files)
    #     if (input("There are {0} bundles with a total of {1} files. Download all? y/* ".format(len(vault), totalfiles)) == "y"):
    #         all = True
    #     del totalfiles

    rx = re.compile("[^\w _()'-]+")
    cookies = browser.cookies.all()
    currentBundle = 1
    for bundle, files in vault.items():
        length = len(files)
        print("({2}/{3}) {0} has {1} files.".format(bundle, length, currentBundle, bundle_count))
        # todo: accept input (text file?) of accept list rather than just 'all', or maybe command line for one bundle to fetch
        if (all or input("\tDownload? y/* ") == "y"):
            print("\t...Downloading {0}".format(bundle))
            p = rx.sub("", bundle)
            os.makedirs(p, exist_ok=True)
            currentFile = 1
            for f in files:
                # fn = rx.sub("", f[0])  # Or, assume remote's fine.
                print("\t\t({0}/{1}) - {2}".format(currentFile, length, f[0]), end="")
                fn = p + "/" + f[0]
                if (os.path.isfile(fn)):
                    if (not skip and (overwrite or input("\tExists. Overwrite? y/* ") == "y")):
                        print("Overwrite.")
                    else:
                        print("Skip.")
                        continue
                r = requests.get(f[1], cookies=cookies, stream=True)
                # idiom taken from a stack overflow result
                with open(fn, 'wb') as fd:
                    for chunk in r.iter_content(1000000):
                        if chunk:
                            fd.write(chunk)
                            fd.flush()
                            print('.', end='')
                            sys.stdout.flush()
                print()
                currentFile += 1
        currentBundle += 1
    del rx, cookies
    print("\n")
    browser.quit()
    exit()
def scrape():
    mars_info = {}
    executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
    browser = Browser('chrome', **executable_path, headless=False)

    url = 'https://mars.nasa.gov/news/'
    browser.visit(url)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    news_title = soup.find("div", class_="content_title").text
    news_date = soup.find("div", class_="list_date").text
    news_p = soup.find("div", class_="article_teaser_body").text

    # Dictionary entry from MARS NEWS
    mars_info['news_paragraph'] = news_p
    mars_info['news_title'] = news_title
    mars_info['news_date'] = news_date

    # Visit the url for JPL Featured Space Image
    url2 = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(url2)
    time.sleep(1)
    html2 = browser.html
    soup = BeautifulSoup(html2, 'html.parser')
    image = soup.find("img", class_="thumb")["src"]

    # Make sure to find the image url to the full size `.jpg` image.
    img_jpl = "https://www.jpl.nasa.gov" + image
    mars_info['img_jpl'] = img_jpl
    print(img_jpl)

    # visit the mars weather report twitter and scrape the latest tweet
    urlt = 'https://twitter.com/marswxreport?lang=en'
    browser.visit(urlt)
    time.sleep(1)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    mars_weather = soup.find(
        "p", class_="TweetTextSize TweetTextSize--normal js-tweet-text tweet-text"
    ).text

    # create dictionary entry
    mars_info['mars_weather'] = mars_weather

    # visit space facts and scrape the mars facts table
    url_fact = "https://space-facts.com/mars/"
    mars_facts = pd.read_html(url_fact)
    mars_df = mars_facts[0]
    mars_df.columns = ['Description', 'Value']
    mars_df.set_index('Description', inplace=True)

    # dictionary entry
    mars_info['mars_facts'] = mars_df

    # scrape images of Mars' hemispheres from the USGS site
    urlmars = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(urlmars)
    time.sleep(1)
    htmlm = browser.html
    soup = BeautifulSoup(htmlm, 'html.parser')

    # loop through and collect entries
    img_urls = []
    results = soup.find_all('h3')
    for r in results:
        text = r.get_text()
        title = text.strip('Enhanced')
        browser.click_link_by_partial_text(text)
        img_url = browser.find_link_by_partial_href('download')['href']
        img_urls.append({'title': title, 'img_url': img_url})
        browser.back()

    # create dictionary entry with the full list of hemisphere images
    mars_info['img_dict'] = img_urls
    print(img_urls)
    return mars_info
def scrape():
    # In[3]:

    # 1.1 MARS NEWS ------------------------------
    # get latest news from nasa mars exploration page at https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest
    mars_news_url = 'https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest'

    # set up a Browser to get access to js stuff
    executable_path = {"executable_path": "/chromedriver"}
    browser = Browser("chrome", **executable_path, headless=False)

    # In[4]:

    # visit the website
    browser.visit(mars_news_url)

    # In[5]:

    nasa_news = browser.html
    soup_nasa_news = bs(nasa_news, 'html.parser')
    nasa_news_title = soup_nasa_news.find('div', class_='content_title').text.strip()
    # nasa_news_teaser = soup_nasa_news.find('div', class_="artlce_teaser_body").text.strip()
    nasa_news_teaser = soup_nasa_news.find('div', class_='article_teaser_body').text
    # .find('li', class_='slide').find('div', class_='list_text')
    # print(nasa_news_title)
    # print(nasa_news_teaser)

    # In[6]:

    # 1.2 JPL Mars space images
    # Visit the url for JPL Featured Space Image https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars.
    # Use splinter to navigate the site and find the image url for the current Featured Mars Image and assign the url string to a variable called featured_image_url.
    # Make sure to find the image url to the full size .jpg image.
    # Make sure to save a complete url string for this image.
    nasa_image_url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(nasa_image_url)

    # In[7]:

    button = browser.find_by_id('full_image')
    button.click()

    # In[8]:

    button1 = browser.find_by_text('more info ')
    button1.click()

    # In[9]:

    featured_image_url = browser.find_link_by_partial_href('spaceimages/images')
    # jpl_image = browser.html
    # soup_jpl_image = bs(jpl_image, 'html.parser')
    # soup_jpl_image
    featured_image_url = featured_image_url['href']

    # In[10]:

    # Mars Weather
    # Visit the Mars Weather twitter account https://twitter.com/marswxreport?lang=en and scrape the latest Mars weather tweet from the page.
    # Save the tweet text for the weather report as a variable called mars_weather.
    mars_weather_url = 'https://twitter.com/marswxreport?lang=en'
    browser.visit(mars_weather_url)

    # In[14]:

    html = browser.html
    parsed_tweet = bs(html, 'html.parser')
    mars_weather = parsed_tweet.find('p', class_='tweet-text').text
    # print(mars_weather)

    # In[15]:

    # Mars Facts
    # Visit the Mars Facts webpage https://space-facts.com/mars/ and use Pandas to scrape the table containing facts about the planet including Diameter, Mass, etc.
    # Use Pandas to convert the data to a HTML table string.
    mars_facts_url = 'https://space-facts.com/mars/'
    browser.visit(mars_facts_url)

    # In[17]:

    mars_df = pd.read_html(mars_facts_url)
    # print(mars_df)

    # In[19]:

    mars_facts_df = mars_df[1]
    mars_facts_df = mars_facts_df.to_html()

    # In[35]:

    # Mars Hemispheres
    # Visit the USGS Astrogeology site to obtain high resolution images for each of Mars's hemispheres.
    # You will need to click each of the links to the hemispheres in order to find the image url to the full resolution image.
    # Save both the image url string for the full resolution hemisphere image, and the Hemisphere title containing the hemisphere name.
    # Use a Python dictionary to store the data using the keys img_url and title.
    # Append the dictionary with the image url string and the hemisphere title to a list. This list will contain one dictionary for each hemisphere.
    base_hem_html = 'https://astrogeology.usgs.gov/'  # used later
    mars_hem_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(mars_hem_url)

    # In[36]:

    html = browser.html
    hemisphere_parsed = bs(html, "html.parser")

    # In[37]:

    browser.click_link_by_partial_text('Cerberus Hemisphere Enhanced')
    # wait -- i feel like there should be a "wait" command or something
    time.sleep(1)
    html = browser.html
    page_parsed = bs(html, 'html.parser')

    # In[40]:

    cerberus_image = page_parsed.find('img', class_='wide-image').get('src')
    cerberus_img_html = base_hem_html + cerberus_image
    cerberus_title = page_parsed.find('h2', class_='title').text
    # print(cerberus_img_html)
    # print(cerberus_title)

    # In[45]:

    # rinse-repeat Schiaparelli
    browser.visit(mars_hem_url)
    time.sleep(1)

    # In[46]:

    browser.click_link_by_partial_text('Schiaparelli Hemisphere Enhanced')
    time.sleep(1)
    html = browser.html
    page_parsed = bs(html, 'html.parser')

    # In[47]:

    schiaparelli_image = page_parsed.find('img', class_='wide-image').get('src')
    schiaparelli_img_html = base_hem_html + schiaparelli_image
    schiaparelli_title = page_parsed.find('h2', class_='title').text

    # In[48]:

    # rinse-repeat Syrtis
    browser.visit(mars_hem_url)
    time.sleep(1)

    # In[50]:

    browser.click_link_by_partial_text('Syrtis Major Hemisphere Enhanced')
    time.sleep(1)
    html = browser.html
    page_parsed = bs(html, 'html.parser')

    # In[51]:

    syrtis_image = page_parsed.find('img', class_='wide-image').get('src')
    syrtis_img_html = base_hem_html + syrtis_image
    syrtis_title = page_parsed.find('h2', class_='title').text

    # In[52]:

    # rinse-repeat Valles
    browser.visit(mars_hem_url)
    time.sleep(1)

    # In[54]:

    browser.click_link_by_partial_text('Valles Marineris Hemisphere Enhanced')
    time.sleep(1)
    html = browser.html
    page_parsed = bs(html, 'html.parser')

    # In[55]:

    valles_image = page_parsed.find('img', class_='wide-image').get('src')
    valles_img_html = base_hem_html + valles_image
    valles_title = page_parsed.find('h2', class_='title').text

    # In[57]:

    # bring it all together in a dict
    hs_title_img_final = [
        {"title": cerberus_title, "img_src": cerberus_img_html},
        {"title": schiaparelli_title, "img_src": schiaparelli_img_html},
        {"title": syrtis_title, "img_src": syrtis_img_html},
        {"title": valles_title, "img_src": valles_img_html}
    ]
    # print(hs_title_img_final)

    # In[39]:

    # I could probably loop the above section for all hemispheres, but I can't
    # think of how to do it at the moment (see the sketch after this function):
    # hs_titles = []
    # hs_urls = []
    # img_title_loc = hemisphere_parsed.find_all('a', class_='h3')
    # for x in img_title_loc:
    #     hs_titles.append(hemisphere_parsed.find('h3').text)
    #     hs_urls.append(base_hem_html + hemisphere_parsed.find('a', class_='href'))

    # make dictionary out of all collected data for later use in flask app
    mars_info = {"nasa_news_title": nasa_news_title,
                 "nasa_news_teaser": nasa_news_teaser,
                 "featured_image_url": featured_image_url,
                 "mars_weather_url": mars_weather_url,
                 "mars_weather": mars_weather,
                 "mars_facts_df": mars_facts_df,
                 "hs_title_img_final": hs_title_img_final}

    browser.quit()
    return mars_info
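# A sketch of the loop suggested in the comment near the end of scrape() above;
# it rebuilds the four hemisphere dicts generically. The function name is an
# assumption, and it expects the same browser and urls used in that function.
def collect_hemispheres(browser, mars_hem_url, base_hem_html):
    import time
    from bs4 import BeautifulSoup as bs
    results = []
    browser.visit(mars_hem_url)
    titles = [h3.get_text() for h3 in bs(browser.html, 'html.parser').find_all('h3')]
    for t in titles:
        browser.visit(mars_hem_url)
        time.sleep(1)
        browser.click_link_by_partial_text(t)
        time.sleep(1)
        page = bs(browser.html, 'html.parser')
        results.append({
            'title': page.find('h2', class_='title').text,
            'img_src': base_hem_html + page.find('img', class_='wide-image').get('src'),
        })
    return results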
def scrape():
    url = 'https://mars.nasa.gov/news/'
    response = requests.get(url)
    soup = bs(response.text, 'lxml')
    title = soup.find('div', class_='content_title')
    article_title = title.text
    paragraph = soup.find('div', class_='rollover_description_inner')
    article_paragraph = paragraph.text

    browser = Browser('chrome')
    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url)
    time.sleep(5)
    browser.find_by_id('full_image').click()
    time.sleep(5)
    browser.find_link_by_partial_href('spaceimages/details.php?').click()
    time.sleep(5)
    browser.find_link_by_partial_href('hires.jpg').click()
    featured_image_url = browser.url

    url = 'https://twitter.com/marswxreport?lang=en'
    browser.visit(url)
    twitter = browser.html
    soup = bs(twitter, 'lxml')
    tweets = soup.find_all('p', class_='TweetTextSize')
    i = 0
    tweet = tweets[i].text.split('pic.twitter.com')[0]
    while not tweet.startswith('InSight sol 1'):
        i += 1
        tweet = tweets[i].text.split('pic.twitter.com')[0]
    mars_weather = tweet

    url = 'https://space-facts.com/mars/'
    response = requests.get(url)
    soup = bs(response.text, 'html.parser')
    html_string = soup.find('table')
    html_table_string = pd.read_html(str(html_string))
    html_table_string = str(html_table_string)

    url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(url)
    hemispheres = ['Cerberus Hemisphere', 'Schiaparelli Hemisphere',
                   'Syrtis Major Hemisphere', 'Valles Marineris Hemisphere']
    hemisphere_images = []
    i = 0
    for hemisphere in hemispheres:
        browser.find_link_by_partial_text(' Hemisphere')[i].click()
        time.sleep(2)
        browser.find_link_by_text("Original")
        time.sleep(2)
        url = browser.url
        # url = url + '.tif'
        # url = url[:4] + url[5:]
        # url = url.replace('search/map', 'download')
        hemisphere_images.append({'url': url, 'title': hemisphere})
        browser.back()
        time.sleep(2)
        i += 1

    mars_dict = {
        'NASA Mars News': {
            'Article Title': article_title,
            'Article Paragraph': article_paragraph
        },
        'JPL Featured Image': featured_image_url,
        'Mars Weather': mars_weather,
        'Mars Facts': html_table_string,
        'Mars Hemispheres': hemisphere_images
    }
    return mars_dict
def Mars_Hemispheres():
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=True)
    hemisphere_url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    browser.visit(hemisphere_url)
    html = browser.html
    soup = BeautifulSoup(html, "html.parser")
    img_header = soup.find_all("h3")
    title_list = []
    imgs_url_list = []
    for i in img_header:
        try:
            title = i.get_text()
            browser.click_link_by_partial_text(title)
            imgs_url = browser.find_link_by_partial_href('download')['href']
            title_list.append(title)
            imgs_url_list.append(imgs_url)
            browser.visit(hemisphere_url)
            print('-----------')
            print(title)
            print(imgs_url)
            # Click the 'Next' button on each page
            # try:
            #     browser.click_link_by_partial_text('next')
        except:
            print("Scraping Complete")
    hemisphere_dict = [{"title": title_list[i], "img_url": imgs_url_list[i]}
                       for i in range(len(title_list))]
    return hemisphere_dict


def scrape():
    # Mars_Weather() is assumed to be defined elsewhere in the original module
    mars_w = {"weather": Mars_Weather()}
    return mars_w


####################################################################################
# Create an instance of our Flask app.
app = Flask(__name__)

# Create connection variable (credentials and host were redacted in the source;
# the pymongo client / database setup that followed the redaction is lost)
conn = 'mongodb://*****:*****@...'

@app.route('/')
def index():
    # Store the entire team collection in a list
    teams = list(db.team.find())
    print(teams)
    # Return the template with the teams list passed in
    return render_template('index.html', teams=teams)

if __name__ == "__main__":
    app.run(debug=True)

##############################################################
app = Flask(__name__)

# Use flask_pymongo to set up mongo connection (URI redacted in the source;
# the PyMongo(app) call is a reconstruction of the usual flask_pymongo idiom)
app.config["MONGO_URI"] = "mongodb://*****:*****@..."
mongo = PyMongo(app)

@app.route("/")
def index():
    listings = mongo.db.listings.find_one()
    return render_template("index.html", listings=listings)

@app.route("/scrape")
def scraper():
    listings = mongo.db.listings
    listings_data = scrape_craigslist.scrape()
    listings.update({}, listings_data, upsert=True)
    return redirect("/", code=302)

if __name__ == "__main__":
    app.run(debug=True)
### Key to cracking this problem was use of the code below
### link_text = soup.find(class_="description").find('h3').get_text()
### browser.click_link_by_partial_text(link_text)
### Thanks to Dylan for helping me to crack it....
### Then I tested it with the line below to get the link from the next page
### browser.find_link_by_partial_href('download')['href']

# In[1037]:

hemisphere_image_urls = []
capture_text = soup.find_all('h3')
for i in capture_text:
    y = i.get_text()
    val1 = y.strip('Enhanced')
    browser.click_link_by_partial_text(y)
    val2 = browser.find_link_by_partial_href('download')['href']
    hemisphere_image_urls.append({'title': val1, 'img_url': val2})
    # img_url.append(browser.find_link_by_partial_href('download')['href'])
    browser.visit(url)

# In[1038]:

hemisphere_image_urls

# In[1039]:

browser.quit()
def scrape():
    final_dct = {}  # the original appears to rely on a module-level dict

    # Get latest news title and text
    executable_path = {'executable_path': ChromeDriverManager().install()}
    browser = Browser('chrome', **executable_path, headless=False)
    url = "https://mars.nasa.gov/news/"
    browser.visit(url)
    time.sleep(3)
    html = browser.html
    soup = bs(html, 'html.parser')
    # print(soup.prettify())
    content_title_div = soup.find('div', class_='list_text')
    news_title = content_title_div.find('div', class_='content_title').a.text
    news_p = content_title_div.find('div', 'article_teaser_body').text
    final_dct['news_title'] = news_title
    final_dct['news_p'] = news_p

    # Get featured img url
    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url)
    browser.click_link_by_id("full_image")
    time.sleep(10)
    html = browser.html
    soup = bs(html, 'html.parser')
    img = soup.find('img', class_='fancybox-image')
    img_src = img['src']
    featured_image_url = 'https://www.jpl.nasa.gov' + img_src
    final_dct["featured_img_url"] = featured_image_url

    # Get Mars data table
    url = "https://space-facts.com/mars/"
    d = pd.read_html(url)
    df = pd.DataFrame({})
    df['Attribute'] = d[1]['Mars - Earth Comparison']
    df['Mars'] = d[1]['Mars']
    table_html = df.to_html()
    final_dct['table_html'] = table_html

    # Get hemisphere title and img url
    url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    browser.visit(url)
    links = browser.find_link_by_partial_href('enhanced')
    final_links = []
    for i in links:
        if final_links.count(i['href']) == 0:
            final_links.append(i['href'])

    hemisphere_img_urls = []
    main_url = 'https://astrogeology.usgs.gov'
    for i in final_links:
        browser.visit(i)
        html = browser.html
        soup = bs(html, 'html.parser')
        img = soup.find('img', class_='wide-image')
        img_src = img['src']
        img_url = main_url + img_src
        hemisphere_title = soup.find('h2', class_='title').text
        final_title = hemisphere_title.rsplit(' ', 1)[0]
        hemisphere_img_urls.append({"title": final_title, "img_url": img_url})
    final_dct['hemisphere_img_urls'] = hemisphere_img_urls

    return final_dct
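# A lighter alternative (a sketch, not from the original) to the fixed
# time.sleep() calls above: splinter can poll for an element for up to
# wait_time seconds before giving up. The selector reuses the fancybox
# image class from the function above.
if browser.is_element_present_by_css('img.fancybox-image', wait_time=10):
    html = browser.html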
import pandas as pd
from bs4 import BeautifulSoup as bs
from splinter import Browser
import requests
import time

hemisphere_dictionary = []

USGS_url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
browser = Browser("chrome", headless=False)
browser.visit(USGS_url)
time.sleep(3)

home = browser.html
USGSsoup = bs(home, "html.parser")
headings = USGSsoup.find_all("h3")

for heading in headings:
    title = heading.text
    print(title)
    browser.click_link_by_partial_text(title)
    time.sleep(3)
    img_url = browser.find_link_by_partial_href("download")["href"]
    print(img_url)
    hemisphere_dictionary.append({"Image": title, "URL": img_url})
    time.sleep(3)
    browser.visit(USGS_url)

print(hemisphere_dictionary)
def query_iPfam(pdb_structures_query):
    #
    # open browser
    #
    br = Browser()
    url = 'http://www.ipfam.org/search/keyword'
    br.visit(url)

    #
    # Search pdb structures vs. interactions
    #
    # make a search query with all the pdb structures
    br.find_by_css("#keywords")[0].fill(pdb_structures_query)
    br.find_by_css("input.button").click()
    # all structure interactions
    br.find_by_css(".lozenge > ul:nth-child(2) > li:nth-child(3) > input:nth-child(1)").click()
    # all ligand interactions
    # ...
    # click "show all"
    br.find_by_css("input.button:nth-child(3)").click()
    # show 100 entries
    br.find_by_css("#pdb_matches_table_length > label:nth-child(1) > select:nth-child(1)").first.select("-1")

    # grab all structures and their interaction links
    count = 0
    pdb_to_url = []
    while True:
        count += 1
        try:
            pdb_id = br.find_by_css("#pdb_matches_table > tbody:nth-child(2) > tr:nth-child(" + str(count) + ") > td:nth-child(1) > a:nth-child(1)").first.text
            pdb_url = br.find_by_css("#pdb_matches_table > tbody:nth-child(2) > tr:nth-child(" + str(count) + ") > td:nth-child(1) > a:nth-child(1)").first['href']
            pdb_to_url.append((pdb_id, pdb_url))
        except exceptions.ElementDoesNotExist:
            break

    #
    # obtain interactions per pdb
    #
    print "obtaining interactions for each pdb structure..."
    pdb_to_interactions = {}
    interaction_to_url = {}
    for pdb, url in pdb_to_url:
        print "pdb structure: " + pdb
        br.visit(url)
        interaction_status = br.find_by_css("div.lozenge:nth-child(1) > dl:nth-child(3) > dd:nth-child(2) > p:nth-child(1) > label:nth-child(2)").first.text
        n_family_interactions = int(interaction_status.replace("Family (", "").replace(")", ""))
        if n_family_interactions > 0:
            print "\t\t" + str(n_family_interactions) + " interactions found"
            br.find_by_value("fam_int").first.click()  # click family interactions
            family_interactions = br.find_link_by_partial_href("/fam_int/")  # @todo: test if this is a correct matcher
            for interaction in family_interactions:
                interaction_url = interaction['href']
                a, b = interaction_url.split("/fam_int/")
                a_pfam_id = a.split("/family/")[1]
                b_pfam_id = b.split("/sequence")[0]
                interaction_neat = (a_pfam_id, b_pfam_id)
                # e.g. RVP-to-RVP
                print "\t\t\tinteraction: " + interaction_neat[0] + "-to-" + interaction_neat[1] + " url: " + interaction['href']
                interaction_to_url[interaction_neat] = interaction['href']
                if pdb_to_interactions.has_key(pdb):
                    pdb_to_interactions[pdb].append(interaction_neat)
                else:
                    pdb_to_interactions[pdb] = [interaction_neat]
        else:
            print "\t\t" + str(n_family_interactions) + " interactions found"
            pdb_to_interactions[pdb] = []

    #
    # save interactions data
    #
    # pickle.dump(pdb_to_interactions, open("./data/pdb_to_interactions.p", "wb"))
    # pickle.dump(interaction_to_url, open("./data/interaction_to_url.p", "wb"))

    #
    # determine which pdb protein structures interact
    # Note: problem, we do not know which of the interacting pfams belong to the native protein
    #
    return pdb_to_interactions, interaction_to_url
hemisphere_image_urls = []

# Parse the resulting html with soup
html = browser.html
hem_soup = soup(html, 'html.parser')

# Retrieve the image urls and titles for each hemisphere.
# Find all titles
titles = hem_soup.find_all('h3')
for i in titles:
    t = i.get_text()
    title = t.strip()
    browser.click_link_by_partial_text(t)
    href = browser.find_link_by_partial_href('_enhanced.tif/full.jpg')['href']
    img_url = f'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars/{href}'
    hemisphere_image_urls.append({'title': title, 'img_url': img_url})
    browser.visit(url)  # `url` is assumed to hold the search-results page

# Print the list that holds the dictionary of each image url and title.
hemisphere_image_urls

# Quit the browser
browser.quit()
# In[12]:

browser.find_by_id('full_image').click()

# In[14]:

browser.find_by_text('more info ').click()

# In[15]:

feature_img_link = browser.find_link_by_partial_href('photojournal.jpl.nasa.gov/jpeg')
print(feature_img_link)

# In[16]:

feature_img_link = feature_img_link['href']
print(feature_img_link)

# In[17]:

# Retrieve page with the requests module
response_mars_weather = requests.get(url_Mars_weather)
browser.find_by_name('addScenario').first.click()
browser.fill('scName', countryTypeList[conIndex] + typeaName + igType)
browser.type('scEffDate', '\b\b\b\b\b\b\b\b\b\b')
browser.type('scEffDate', '2015-10-31')
browser.find_by_name('update').first.click()
browser.find_link_by_text('Obligor').first.click()

# choose the companyType type
element = browser.find_by_name('companyType').first
element.select(str(cType))
browser.fill('obligorName', companyName)
browser.find_by_name('ObligorSearch').first.click()
browser.find_link_by_partial_href('javascript:refPortResult')[0].click()

# select "B-III counterparty type" to be "corporate"
element = browser.find_by_name('counterPartyType').first
element.select('1')

# select "Classification re Asset Value Correlation" to be "Non-Financial Institution (N)"
element = browser.find_by_name('avc').first
element.select('4')

# select proper IG according to the IG type
if igType == 'orig':
    pass
else:
    element = browser.find_by_name('obligorIgCode').first
    # eleVal = element.find_element_by_xpath("//option[@selected='selected']").first.value
    eleVal = element.find_by_xpath('option[@selected="selected"]').first.value
class Youtube:
    def __init__(self):
        self.log = common.Logger()
        self.settings = common.Settings()
        self.comment_generator = common.CommentGenerator()  # comment generator
        self.browser = Browser('chrome')
        self.subscriptions = []  # our subscriptions
        self.comments = common.Comments2()  # comment database
        common.Subscription.url_list = []  # list of our subscription urls
        self.sleep_time_after_visit = 5
        self.our_channel_url = u'https://www.youtube.com/channel/{}'.format(
            self.settings.get_parameter('address'))  # our channel
        self.max_subscribers_amount = 1000  # subscribe only if the subscriber count is below this number
        # file listing the channels we harvest subscription candidates from
        with open('channels.txt', 'r') as f:
            buffer = f.read()
        self.channels_list = buffer.split()
        self.channels_list = filter(bool, self.channels_list)
        self.channels_list = filter(lambda x: not x[0] == '#', self.channels_list)
        # self.channels_list = [x for x in self.channels_list if not x[0] == '#']
        self.all_channel_mode = True
        self.re_is_cyrillic = regex.compile('[\p{IsCyrillic}]', regex.UNICODE)
        self.comment_not_russian = 'not russian title!'
        self.comment_errors_counter = 0

    def login(self):
        browser = self.browser
        browser.visit('https://accounts.google.com')
        browser.fill('Email', self.settings.get_parameter('login'))
        button = browser.find_by_id('next')
        button.click()
        browser.fill('Passwd', self.settings.get_parameter('password'))
        button = browser.find_by_id('signIn')
        button.click()
        self.log.info('login ok')
        time.sleep(self.sleep_time_after_visit)

    def get_subscriptions(self):
        self.browser.visit('https://www.youtube.com/subscription_manager')
        time.sleep(self.sleep_time_after_visit)
        del self.subscriptions[:]
        links = self.browser.find_link_by_partial_href('/channel/')
        for link in links:
            # if link.visible:
            link_url = link['href']
            if not link_url in common.Subscription.url_list and not self.our_channel_url in link_url:
                self.subscriptions.append(common.Subscription(link_url, link.value))
                # print link.value
                # link.click()
        # self.subscriptions.reverse()
        return links

    def get_user_subscribers(self, user_url):
        if self.all_channel_mode:
            user_url = user_url.url
        self.browser.visit(user_url + '/channels')
        time.sleep(self.sleep_time_after_visit)
        self.log.info('open user {}'.format(user_url))
        links = self.browser.find_link_by_partial_href('/channel/')
        user_subs = []
        for link in links:
            # if link.visible:
            # print link.find_by_id('href').first
            if link['dir'] == 'ltr' and 'yt-ui-ellipsis' in link['class']:
                # print link.value, link['href'], link['class']
                subs_url = link['href']
                if not subs_url == self.our_channel_url:
                    user_subs.append(subs_url)
        return user_subs

    def get_subscribers_amount(self):
        # elements = self.browser.find_by_xpath('//*[@id="c4-primary-header-contents"]/div/div/div[2]/div/span[2]/span[1]')
        # elements = self.browser.find_by_xpath('//*[@id="c4-primary-header-contents"]/div/div/div[2]/div/span/span[1]')
        # //*[@id="watch7-subscription-container"]/span/span[2]
        elements = self.browser.find_by_id('c4-primary-header-contents')
        spans = elements.find_by_tag('span')
        amount = 0
        for span in spans:
            if span['class'] == 'yt-subscription-button-subscriber-count-branded-horizontal subscribed yt-uix-tooltip':
                amount_str = span['title'].replace(unichr(160), '')
                # print map(ord, list(amount_str))
                amount = int(amount_str)
        return amount

    def open_user_page(self, user_url):
        self.browser.visit(user_url)
        time.sleep(self.sleep_time_after_visit)
        subs = self.get_subscribers_amount()
        return subs

    def open_user_videos_page(self, user_url):
        links = self.browser.find_link_by_partial_href('/videos')
        for link in links:
            if link.visible:
                self.log.info('open videos list {}'.format(link['href']))
                link.click()
                break
        time.sleep(self.sleep_time_after_visit)

    def open_last_user_video(self, user_url, not_commented=True):
        self.open_user_videos_page(user_url)
        links = self.browser.find_link_by_partial_href('watch?')
        url_found = False
        for link in links:
            # if link.visible:
            # print link.find_by_id('href').first
            # if link['dir'] == 'ltr' and 'yt-ui-ellipsis' in link['class']:
            url = link['href']
            # print link.value, url, link['class']
            if 'yt-uix-sessionlink' in link['class'] and not self.comments.is_video_commented(url):
                self.log.info('open video {}'.format(url))
                # link.click()
                self.browser.visit(url)
                url_found = True
                break
        if not url_found:
            return ''
        else:
            time.sleep(self.sleep_time_after_visit)
            return url

    def find_user_of_current_video(self):
        path = '//*[@id="watch7-user-header"]/a'
        elements = self.browser.find_by_xpath(path)
        return elements.first['href']

    def press_like(self):
        path_notlike = '//*[@id="watch8-sentiment-actions"]/span/span[2]/button'
        path_like = '//*[@id="watch8-sentiment-actions"]/span/span[1]/button'
        elements = self.browser.find_by_xpath(path_like)
        b = elements.first
        # print b['title']
        if b.visible:
            b.click()

    def press_subscribe(self):
        path = '//*[@id="watch7-subscription-container"]/span/button[1]'
        elements = self.browser.find_by_xpath(path)
        b = elements.first
        already_subscribed = False
        if b['data-is-subscribed']:
            # print 'already subscribed!'
            already_subscribed = True
        else:
            if b.visible:
                b.click()
        return already_subscribed
        # //*[@id="watch7-subscription-container"]/span/span[1]
        # //*[@id="c4-primary-header-contents"]/div/div/div[2]/div/span[2]/span[1]

    def have_it_cyrillic_letters(self, buffer):
        return not len(regex.findall(self.re_is_cyrillic, buffer)) == 0

    def comment_on_video(self):
        url = self.browser.driver.current_url
        if not 'watch?' in url:
            self.log.error('not video page!')
            return ''
            # raise Exception('not video page!')
        user = self.find_user_of_current_video()
        self.subscriptions.insert(0, common.Subscription(user, ''))  # add the user to our subscriptions
        if self.comments.is_user_commented(user):
            print 'user already commented {}'.format(user)
        if self.comments.is_video_commented(url):
            self.log.error('video already commented!')
            return ''
        # print self.get_subscribers_amount()

        # check whether the title contains Cyrillic letters
        title = self.browser.title
        if not self.have_it_cyrillic_letters(title):
            msg = self.comment_not_russian
            self.log.error(msg)
            return msg

        time.sleep(5)
        # raise Exception('like')
        self.browser.driver.execute_script("window.scrollTo(0, 350)")
        time.sleep(10)
        # browser.find_by_tag('html').first.type(Keys.PAGE_DOWN)

        elements_mode = 0  # different comment-box / button layouts
        elements = self.browser.find_by_id('yt-comments-sb-standin')
        if len(elements) == 0:  # comments are disabled
            elements = self.browser.find_by_xpath('//*[@id="comment-section-renderer"]/div[1]/div[2]')
            if len(elements) == 0:
                msg = 'Cannot find field for comment!'
                self.log.error(msg)
                raise Exception(msg)
                # return ''
            elements_mode = 1
        if elements.first.visible:
            elements.first.click()
        else:
            raise Exception('Comment element not visible!')
        time.sleep(3)
        print 'elements mode', elements_mode

        # write the comment
        if elements_mode == 0:
            elements = self.browser.find_by_xpath('//*[@id="yt-comments-sb-container"]/div[2]/div[1]/div[1]')
        else:
            elements = self.browser.find_by_xpath('//*[@id="comment-simplebox"]/div[1]')
        if len(elements) == 0:
            raise Exception('Comment element not found!')
        elements.first.click()
        comment_text = self.comment_generator.get_comment()
        try:
            elements.first.fill(comment_text)
        except:
            msg = 'Error when filling comment!'
            self.log.error(msg)
            self.comment_errors_counter += 1
            if self.comment_errors_counter > 5:
                raise Exception(msg)
            return ''
        # keys = elements.first.type(comment_text, slowly=True)  # stopped working, raises an exception in the loop
        # for key in keys:
        #     pass

        # press the submit button
        if elements_mode == 0:
            elements = self.browser.find_by_xpath('//*[@id="yt-comments-sb-container"]/div[2]/div[1]/div[3]/button[2]')
        else:
            elements = self.browser.find_by_xpath('//*[@id="comment-simplebox"]/div[3]/button[2]')
        if len(elements) == 0:
            raise Exception('Cannot find send comment button!')
        elements.first.click()
        time.sleep(3)
        # print elements.first.text
        # self.comments.add(url)
        self.log.info(u'comment video {}'.format(comment_text))

        # subscribe
        subscribed_before = self.press_subscribe()
        if subscribed_before:  # we were already subscribed
            self.log.error('already subscribed!')
            # return ''
        time.sleep(3)

        # press like
        self.press_like()
        return comment_text

    def get_channel_list(self):
        if not self.all_channel_mode:
            channel_list = self.channels_list
        else:
            channel_list = self.subscriptions
        return channel_list

# subs_cache = []
def scrape():
    # handle Mars News
    news_url = 'https://mars.nasa.gov/news'
    chromedriver = "/usr/local/bin/chromedriver"
    os.environ["webdriver.chrome.driver"] = chromedriver
    driver = webdriver.Chrome(chromedriver)
    driver.get(news_url)
    time.sleep(5)
    html = driver.page_source
    news_soup = BeautifulSoup(html, 'lxml')
    news_results = news_soup.find_all('li', class_="slide")
    text = news_results[0].find_all('a')
    news_title = text[1].text
    news_p = text[0].find('div', class_="rollover_description_inner").text
    driver.close()

    # scrape JPL featured image
    executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
    browser = Browser('chrome', **executable_path, headless=False)
    image_url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(image_url)
    browser.click_link_by_partial_text('FULL IMAGE')
    time.sleep(5)
    browser.click_link_by_partial_text('more info')
    links_found = browser.find_link_by_partial_href('images/largesize')
    featured_image_url = links_found['href']
    browser.quit()

    # scrape Mars weather
    weather_url = 'https://twitter.com/marswxreport?lang=en'
    weather_response = requests.get(weather_url)
    weather_soup = BeautifulSoup(weather_response.text, 'html.parser')
    weather_results = weather_soup.find_all('div', class_="js-tweet-text-container")
    mars_weather = weather_results[0].find('p').text

    # scrape Mars facts
    facts_url = 'https://space-facts.com/mars/'
    tables = pd.read_html(facts_url)
    df = tables[0]
    html_table = df.to_html(header=None, index=False)
    html_table = html_table.replace('\n', '')

    # scrape Mars Hemispheres
    hemisphere_image_urls = []

    # define a function to scrape full resolution image link using splinter
    def find_hemisperes(name):
        browser = Browser('chrome', **executable_path, headless=False)
        url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
        browser.visit(url)
        browser.click_link_by_partial_text(name)
        links_found = browser.find_link_by_partial_href(name.split()[0].lower())
        url = links_found['href']
        dic = {"title": f"{name} Hemisphere", "img_url": url}
        hemisphere_image_urls.append(dic)
        browser.quit()

    hemisperes_list = ['Cerberus', 'Schiaparelli', 'Syrtis Major', 'Valles Marineris']
    for hemispere in hemisperes_list:
        find_hemisperes(hemispere)
        time.sleep(2)

    scrape_dic = {
        "news_title": news_title,
        "news_p": news_p,
        "featured_image": featured_image_url,
        "weather": mars_weather,
        "facts": html_table,
        "hemispheres": hemisphere_image_urls
    }
    return scrape_dic
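# Hypothetical driver code for the scrape() above; it only prints the
# top-level keys of the returned dictionary.
if __name__ == '__main__':
    mars_data = scrape()
    for key in mars_data:
        print(key)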
"""
@author: JIANGHOU
"""
import time
from splinter import Browser
from selenium import webdriver

browser = Browser()
url = 'http://geo.ckcest.cn/'
browser.visit(url)
# log in manually

# remote-sensing imagery
browser.visit('http://geo.ckcest.cn/scientific/InternationalData/list.html')
# then jump to a page of results
linkspage = browser.find_link_by_partial_href('#')
paperurls = []
time.sleep(3)
papers = browser.find_link_by_partial_href('remotedetail.html')
browserurl = browser.url
for j in range(0, 10):  # len(papers)
    papers[j].click()
    time.sleep(3)
    window = browser.windows[1]
    paperurls.append(window.url)
    window.close()
for i in range(1, 10):
    browser.visit(paperurls[i])
    time.sleep(15)
    remotelist = browser.find_by_id('remotelist')
    within_elements = remotelist.first.find_by_tag('a')
def line_login(browser, user_name, password, code):
    """
    Log in to LINE automatically, enter the gift-card code passed as a
    parameter, and charge it. Returns the result of the charge.
    :param browser: browser instance
    :param user_name: login username
    :param password: login password
    :param code: gift-card code
    :return: charge result
    """
    # open the login page (note: this overrides the browser passed in)
    browser = Browser('firefox')
    url = 'https://store.line.me/home/'
    browser.visit(url)

    # log in
    login_submit = browser.find_link_by_partial_href('login')
    if login_submit:
        login_submit.click()
    else:
        html_code = browser.html
        return {
            'code': 4,
            'message': "サイト上に問題が発生しました。(サイトがアクセスできない、またはネットが遅すぎる可能性があります。)",
            'htmlcode': html_code
        }

    username_input_field = browser.find_by_id('id')
    password_input_field = browser.find_by_id('passwd')
    login_submit = browser.find_by_value('Login')
    if username_input_field and password_input_field and login_submit:
        username_input_field.fill(user_name)
        password_input_field.fill(password)
        login_submit.click()
    else:
        html_code = browser.html
        return {
            'code': 4,
            'message': "サイト上に問題が発生しました。(サイトがアクセスできない、またはネットが遅すぎる可能性があります。)",
            'htmlcode': html_code
        }

    # check whether a login captcha is shown
    # captcha_image_field = browser.find_by_css('img.FnCaptchaImg')

    # check the email address / password
    login_alert_field = browser.find_by_css('p.mdMN02Txt')
    if browser.is_element_present_by_css('p.mdMN02Txt'):
        result = login_alert_field.value
        if result.find(unicode('The password you have entered is invalid, or you have not registered your email address with LINE.')) != -1:
            html_code = browser.html
            return {
                'code': 2,
                'message': 'メールアドレスまたはパスワードが正しくありません。',
                'htmlcode': html_code
            }

    # move to the charge screen
    browser.find_by_text('Charge').click()
    browser.windows.current = browser.windows[1]
    browser.find_by_id('70002').click()
    browser.execute_script("charge(this); return false;")

    # enter the code and charge
    code_input_field = browser.find_by_id('FnSerialNumber')
    code_input_field.fill(code)
    time.sleep(9000)
    browser.execute_script("javascript:doCharge(this);return false;")
    result = browser.find_by_css('p.mdLYR11Txt01').value
    browser.quit()
    return result
hemises = hemis_soup.find_all('h3')

# Append a dictionary with the image url string and the hemisphere title to a list.
hemisphere_image_urls = []
for hemis in hemises:
    # create a fresh dict each iteration so the list entries don't all alias one object
    hemis_dict = {}
    hemis_dict["title"] = hemis.text.strip('Enhanced')
    # Click on the link with the corresponding hemisphere
    try:
        browser.click_link_by_partial_text(hemis.text)
    except ElementDoesNotExist:
        print(f"{hemis.text} Image doesn't exist")
    # Scrape the image url string
    hemis_dict["img_url"] = browser.find_link_by_partial_href('download')['href']
    hemisphere_image_urls.append(hemis_dict)
    browser.visit(url_hemis)

print(hemisphere_image_urls)

# In[ ]:
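# Why the dict is created inside the loop above: appending the same dict object
# repeatedly stores references to a single object, so every list entry ends up
# holding the last values written. A minimal standalone sketch:
records = []
shared = {}
for n in ('a', 'b'):
    shared['name'] = n
    records.append(shared)
print(records)  # [{'name': 'b'}, {'name': 'b'}] -- both entries alias one dict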
import requests
import pandas as pd
from bs4 import BeautifulSoup as bs
from splinter import Browser


def scrape_info():
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)

    # Scraping NASA Mars News
    # Scrape the NASA Mars News Site (https://mars.nasa.gov/news/) and
    # collect the latest News Titles and Paragraph Text. Assign the
    # text to variables that can be referenced later.
    source = requests.get('https://mars.nasa.gov/news/').text
    soup = bs(source, 'html.parser')
    article = soup.find_all('div', class_='content_title')
    news_title0 = article[0].a.text
    news_title1 = article[1].a.text
    news_title2 = article[2].a.text
    description = soup.find_all('div', class_="rollover_description_inner")
    news_p0 = description[0].text
    news_p1 = description[1].text
    news_p2 = description[2].text

    # Scraping JPL Mars Space Images - Featured Image
    # Visit the url for the JPL Featured Space Image
    # (https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars)
    # and use splinter to navigate the site.
    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url)
    try:
        browser.click_link_by_id('full_image')
    except:
        browser.click_link_by_partial_text('FULL IMAGE')
    else:
        print("Scraping Full Image Complete")
    check = 0
    try:
        links_found = browser.find_link_by_partial_href('spaceimages/details')
        url2 = links_found[0]["href"]
        browser.click_link_by_partial_text('more info')
        links_found2 = browser.find_link_by_partial_href('spaceimages/images/largesize')
        f1 = links_found2[0]["href"]
        check = 1
    except:
        # fall back to visiting the details page directly; note that
        # url2 is unbound here if the find above failed before assigning it
        browser.visit(url2)
        links_found3 = browser.find_link_by_partial_href('spaceimages/images/largesize')
        f2 = links_found3[0]["href"]
    else:
        print("Scraping More Info Complete")
    if check == 1:
        featured_image_url = f1
    else:
        featured_image_url = f2

    # Mars Weather
    # Visit the Mars Weather twitter account
    # (https://twitter.com/marswxreport?lang=en), scrape the latest Mars
    # weather tweet, and save its text as `mars_weather`.
    source3 = requests.get('https://twitter.com/marswxreport?lang=en').text
    soup = bs(source3, 'html.parser')
    tweets = soup.find_all('p', class_="TweetTextSize TweetTextSize--normal js-tweet-text tweet-text")
    mars_weather = tweets[0].text

    # Mars Facts
    # Returns (mars_facts_table)
    facts = pd.read_html("https://space-facts.com/mars/")
    mars_facts_df = facts[1]
    mars_facts_df.columns = ['Description', 'Value']
    mars_facts_df.set_index("Description", inplace=True)
    mars_facts_table = mars_facts_df.to_html()
    mars_facts_table = mars_facts_table.replace('\n', '')

    # Mars Hemispheres
    # Returns hemisphere image titles and urls
    hemi = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    browser.visit(hemi)
    html = browser.html
    # Get titles for all four Mars pictures
    soup = bs(html, 'html.parser')
    hemi_class = soup.find_all('h3')
    cerberus_title = hemi_class[0].text
    schiaparelli_title = hemi_class[1].text
    syrtis_title = hemi_class[2].text
    valles_title = hemi_class[3].text
    # Get Cerberus information
    browser.click_link_by_partial_text('Cerberus Hemisphere Enhanced')
    link1 = browser.find_link_by_partial_text('Original')
    cerberus_link = link1[0]["href"] + "/full.jpg"
    browser.back()
    # Get Schiaparelli information
    browser.click_link_by_partial_text('Schiaparelli Hemisphere Enhanced')
    link2 = browser.find_link_by_partial_text('Original')
    schiaparelli_link = link2[0]["href"] + "/full.jpg"
    browser.back()
    # Get Syrtis Major information
    browser.click_link_by_partial_text('Syrtis Major Hemisphere Enhanced')
    link3 = browser.find_link_by_partial_text('Original')
    syrtis_link = link3[0]["href"] + "/full.jpg"
    browser.back()
    # Get Valles Marineris information
    browser.click_link_by_partial_text('Valles Marineris Hemisphere Enhanced')
    link4 = browser.find_link_by_partial_text('Original')
    valles_link = link4[0]["href"] + "/full.jpg"
    browser.back()

    marsdata = {
        "news_title0": news_title0,
        "description0": news_p0,
        "news_title1": news_title1,
        "description1": news_p1,
        "news_title2": news_title2,
        "description2": news_p2,
        "JPL_link": featured_image_url,
        "weather_tweet": mars_weather,
        "facts_table": mars_facts_table,
        "title1": cerberus_title,
        "img_url1": cerberus_link,
        "title2": schiaparelli_title,
        "img_url2": schiaparelli_link,
        "title3": syrtis_title,
        "img_url3": syrtis_link,
        "title4": valles_title,
        "img_url4": valles_link
    }
    # Close the browser after scraping
    browser.quit()
    # Return results
    return marsdata
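# Hypothetical smoke test for scrape_info(), not in the original; the
# key names come from the marsdata dictionary above.
if __name__ == "__main__":
    data = scrape_info()
    for key in ("JPL_link", "weather_tweet", "img_url1", "img_url4"):
        print(key, "->", data[key][:80])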
"""
import time
from splinter import Browser
from selenium import webdriver

browser = Browser()
url = 'http://geo.ckcest.cn/'
browser.visit(url)
# log in manually
# literature download
browser.visit('http://geo.ckcest.cn/scientific/literature/techdoc_v.html')
# then jump to a page number
linkspage = browser.find_link_by_partial_href('#')
paperurls = []
for i in range(16, 19):
    page = linkspage[i]
    page.click()
    time.sleep(3)
    papers = browser.find_link_by_partial_href('techdoc_papers.html')
    browserurl = browser.url
    for j in range(0, len(papers)):
        papers[j].click()
        time.sleep(3)
        window = browser.windows[1]
        paperurls.append(window.url)
        window.close()
for i in range(1, len(paperurls)):
    browser.visit(paperurls[i])
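# Sketch, not in the original: both CKCEST scripts above repeat the
# same click / switch-to-new-window / record-url / close dance, so it
# can be factored into one helper. Uses the same windows[1] index the
# scripts rely on.
import time

def url_from_new_window(browser, link, settle=3):
    link.click()
    time.sleep(settle)           # let the new window load
    window = browser.windows[1]  # the freshly opened window
    url = window.url
    window.close()
    return url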
import time
import pandas as pd
from bs4 import BeautifulSoup as bs
from splinter import Browser
# mars_collection is assumed to be a pymongo collection defined at module level


def get_data():
    result = {}  # holds the scraped pieces; was used below without being initialized

    # 1 - NASA news *** USING BROWSER = SPLINTER ***
    browser = Browser('chrome')
    url = "https://mars.nasa.gov/news/"
    # go to url
    browser.visit(url)
    # HTML object
    html = browser.html
    # Parse HTML with BeautifulSoup
    soup = bs(html, "html.parser")
    # Collect News Title and Paragraph
    result["news_title"] = soup.find("div", class_="content_title").text.strip()
    result["news_p"] = soup.find('div', class_="article_teaser_body").text
    # Close the browser after scraping
    browser.quit()

    # 2 - JPL Mars Space Images - Featured Image *** USING BROWSER = SPLINTER ***
    browser = Browser('chrome')
    jpl_url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"  # stray ')' removed from the url
    # go to url
    browser.visit(jpl_url)
    # navigate to link
    time.sleep(5)
    browser.click_link_by_partial_text('FULL IMAGE')
    time.sleep(5)
    browser.click_link_by_partial_text('more info')
    image_html = browser.html
    jpl_soup = bs(image_html, "html.parser")
    image_path = jpl_soup.find('figure', class_='lede').a['href']
    result["featured_image_url"] = "https://www.jpl.nasa.gov/" + image_path
    # Close the browser after scraping
    browser.quit()

    # 3 - Mars Weather *** USING BROWSER = SPLINTER ***
    browser = Browser('chrome')
    url = 'https://twitter.com/marswxreport?lang=en'
    browser.visit(url)
    time.sleep(5)
    html = browser.html
    weather_soup = bs(html, 'html.parser')
    result["mars_weather"] = weather_soup.find(
        "p", class_="TweetTextSize TweetTextSize--normal js-tweet-text tweet-text"
    ).text.strip()
    # Close the browser after scraping
    browser.quit()

    # 4 - Mars Facts
    url = "https://space-facts.com/mars/"
    marsFacts = pd.read_html(url)
    facts = marsFacts[0]
    facts.columns = ['fact', 'Number']
    facts = facts.set_index('fact')['Number'].to_dict()
    result['facts'] = facts

    # 5 - Mars Hemispheres
    hemisphere_img_urls = []
    url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    browser = Browser("chrome")
    browser.visit(url)
    time.sleep(5)
    home_page = browser.html
    # HTML & parsing
    hemispheres_soup = bs(home_page, "html.parser")
    results = hemispheres_soup.find_all("h3")
    for r in results:
        title = r.text  # was `result.text`, which clobbered the result dict
        # title without the word "Enhanced"
        title = title[:-9]
        browser.click_link_by_partial_text(title)
        img_url = browser.find_link_by_partial_href("download")["href"]
        hemisphere_dicts = {"title": title, "img_url": img_url}
        hemisphere_img_urls.append(hemisphere_dicts)
        browser.visit(url)
    # Close the browser after scraping
    browser.quit()
    result["hemisphere_img_urls"] = hemisphere_img_urls

    # originally these keys referenced undefined locals (news_p,
    # featured_image_url, mars_weather); pull them from result instead
    mars_data = {
        "title": result["news_title"],
        "content": result["news_p"],
        "featured_image_url": result["featured_image_url"],
        "latest_weather": result["mars_weather"],
        "image_data": hemisphere_img_urls,
    }
    existing = mars_collection.find_one()
    if existing:
        mars_data['_id'] = existing['_id']
    mars_collection.save(mars_data)
    return mars_data
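# Sketch: pymongo's Collection.save(), used at the end of get_data(),
# is deprecated and removed in pymongo 4. An equivalent upsert with
# replace_one/insert_one, assuming the same mars_collection handle:
def store_mars_data(mars_data):
    existing = mars_collection.find_one()
    if existing:
        mars_collection.replace_one({'_id': existing['_id']}, mars_data)
    else:
        mars_collection.insert_one(mars_data)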
    # (fragment: this block is one branch of a scenario-exists check
    # whose `if` header falls outside this excerpt)
    element = browser.find_by_name('avc').first
    element.select('4')
    browser.fill('sic', '7759')
    browser.fill('totalSale', '10')
    igArray = ['99', '98', '95', '90', '87', '85', '83', '80', '77', '75', '73', '70', '65', '60', '40', '30']
    for x in range(16):
        element = browser.find_by_name('obligorIgCode').first
        element.select(igArray[x])
        browser.find_by_name('UpdateButton').first.click()
        browser.driver.save_screenshot(typeaName + countryList[conIndex] + '_' + igArray[x] + '.png')
        browser.find_link_by_partial_href('/CNETCORP/cpmScenarios.do').first.click()
else:
    # create the new scenario
    if cType == 0:
        typeaName = 'Existing_Scotia_Public_'
    elif cType == 1:
        typeaName = 'Existing_Scotia_Private_'
    else:
        typeaName = 'Non_Scotia_Public_'
    browser.find_by_name('addScenario').first.click()
    browser.fill('scName', typeaName + countryList[conIndex])
    browser.type('scEffDate', '\b\b\b\b\b\b\b\b\b\b')  # clear the date field with backspaces
    browser.type('scEffDate', '2014-10-31')
    browser.find_by_name('update').first.click()
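# Sketch, assuming the same 'scEffDate' field: splinter's fill()
# replaces the field's current value, so the ten-backspace dance above
# collapses to a single call.
browser.fill('scEffDate', '2014-10-31')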
def getRoutes(start, end):
    browser = Browser(driver_name="firefox")
    browser.visit('https://www.hopstop.com/search?xfr=cityscape')
    print(browser.url)
    browser.fill('address1', str(start))
    browser.fill('address2', str(end))
    browser.find_by_name('get_dirs').click()
    print(browser.url)
    if browser.is_text_present('Did you mean?'):
        print("better at least get here")
        # browser.click_link_by_href("#")
        for link in browser.find_link_by_href("#"):
            print("Okay")
            if link.visible:
                print(link.text)
                browser.click_link_by_text(link.text)
                break
        browser.click_link_by_href("#")
    links = browser.find_link_by_partial_href("/station?stid")
    results = []
    for link in links:
        results.append(link.value)
    browser.quit()
    return results
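# Hypothetical call to getRoutes(); the two addresses are placeholders,
# and the station names printed depend on HopStop's markup matching the
# selectors above.
stations = getRoutes("350 5th Ave, New York", "1 Wall St, New York")
for name in stations:
    print(name)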