class TrackListScraper(object):
    """Scrape 1001tracklists.com for each artist's EDC set of a given year.

    Uses a splinter Chrome ``Browser``; callers must eventually trigger
    ``execute_full_scrape`` (which quits the browser) or quit it themselves.
    """

    def __init__(self, artists, year):
        # artists: iterable of artist-name strings; year: string (e.g. '2014')
        self.browser = Browser('chrome')
        self.artists = artists
        self.year = year
        self.browser.visit('http://1001tracklists.com')

    def execute_full_scrape(self):
        """Scrape every artist and return {artist: tracklist-or-None}."""
        artist_tracklists = {}
        for artist in self.artists:
            artist_tracklists[artist] = self.scrape_per_artist(artist)
        self.browser.quit()
        return artist_tracklists

    def scrape_per_artist(self, artist):
        """Execute the same scrape but instead using the python splinter library.

        Returns a list of [artist, trackname] pairs, or None when the
        search result link cannot be found.
        """
        self.browser.fill('main_search', artist + ' edc ' + self.year)
        self.browser.find_by_id('btn_search').first.click()
        try:
            self.browser.click_link_by_partial_text('2014-06-')
            return self.get_track_list_for_set(artist)
        except ElementDoesNotExist:
            # Search produced no matching set page; treat as "no tracklist".
            return None

    def get_track_list_for_set(self, artist):
        """Collect track strings from the current set page.

        Writes the raw track names to a per-artist file and returns the
        parsed [artist, trackname] pairs.
        """
        soup = BeautifulSoup(self.browser.html)
        track_values = soup.find_all('div', class_='trackValue')
        track_strings = []
        # FIX: use a context manager so the file is closed even on error,
        # and avoid shadowing the builtin name `file`.
        with open('tracklist-' + artist + '-edc' + self.year, 'w') as outfile:
            for track in track_values:
                if track.a:
                    track_string = track.a.string
                    outfile.write(track_string)
                    # track details in format [artist, trackname]
                    track_strings.append(self.parse_track_string(track_string))
        return track_strings

    def parse_track_string(self, track_string):
        """Split 'Artist - Track' on '-' and strip whitespace from each part."""
        return [part.strip() for part in track_string.strip().split('-')]
def xfinity(browser=None):
    """Walk the Xfinity hotspot complimentary sign-up flow.

    Creates a phantomjs Browser when none is supplied. Returns early if
    google.com loads, i.e. we already have connectivity.
    """
    if not browser:
        print("Making browser...")
        browser = Browser('phantomjs')

    # Connectivity probe: a captive portal would redirect us elsewhere.
    print("Trying google.com...")
    browser.visit('http://google.com/')
    if 'google.' in browser.url:
        print("google.com connected :)")
        return

    print("Sign up...")
    browser.click_link_by_partial_text('Sign up')

    print("Filling form...")
    browser.select("rateplanid", "spn")
    browser.check('spn_terms')
    for field_name, field_value in (('spn_postal', '12345'),
                                    ('spn_email', '*****@*****.**')):
        browser.fill(field_name, field_value)

    print("Submitting...")
    sleep(3)  # it did not work without the sleeps
    browser.find_by_css('.startSessionButton').type(' \n')
    sleep(7)
    browser.ensure_success_response()
    print(browser.screenshot())
def scrape_info():
    """Scrape Mars data from several sites and return it as one dict.

    Returns a dict with keys 'article' (NASA news records), 'weather'
    (latest Mars weather tweet text), 'featured_image' (JPL image URL),
    'mars_facts' (facts-table records) and 'mars_hems' (hemisphere
    title/img_url dicts).
    """
    from selenium.webdriver.chrome.options import Options
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    #Because the search results at the URL are from Javascript use Selenium to scrape the data
    #URL for NASA Mars News website. This show 40 articles from a search of the criteria "Latest" and "All Categories".
    #Results of the search are generated by Javascript so not viewable in the webpage HTML
    url_mars_news = 'https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest'
    #Initialize lists to store Selenium objects
    dates = []
    titles = []
    summarys = []
    #Use Selenium to get the needed fields from the JS results
    #XPath for tags were found by right-clicking on the tag in the Chrome Inspector tool the Copy XPath
    driver = webdriver.Chrome(options=chrome_options)
    driver.get(url_mars_news)
    #Add a delay to give the scraper time to acquire the data
    time.sleep(10)
    dates = driver.find_elements_by_xpath('//*[@id="page"]/div[3]/div/article/div/section/div/ul/li[*]/div/div/div[1]')
    titles = driver.find_elements_by_xpath('//*[@id="page"]/div[3]/div/article/div/section/div/ul/li[*]/div/div/div[2]/a')
    summarys = driver.find_elements_by_xpath('//*[@id="page"]/div[3]/div/article/div/section/div/ul/li[*]/div/div/div[3]')
    # create empty array to store text data extracted from Selenium objects
    date_lst = []
    title_lst = []
    summary_lst = []
    news_url_lst = []
    # loop over results and extract text from Selenium objects, add to each list
    for date in dates:
        article_date = date.text
        date_lst.append(article_date)
    for title in titles:
        article_title = title.text
        title_lst.append(article_title)
        href = title.get_attribute('href')
        news_url_lst.append(href)
    for summary in summarys:
        article_summary = summary.text
        summary_lst.append(article_summary)
    #Make dataframe of NASA Mars Latest News Articles
    nasa_mars_articles_df = pd.DataFrame(list(zip(date_lst, title_lst, summary_lst,
                                                  news_url_lst)), columns =['Date', 'Title', 'Summary', 'URL'])
    driver.quit()
    #Convert to dictionary and confirm results of the scraping
    nasa_mars_articles_dict = nasa_mars_articles_df.to_dict('records')
    #Setup Splinter Browsder and target URL
    executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
    browser = Browser('chrome', **executable_path, headless=True)
    url_jpl = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    #Go to URL and navigate to page with full size image.
    browser.visit(url_jpl)
    browser.click_link_by_partial_text('FULL IMAGE')
    browser.click_link_by_partial_text('more info')
    #Grab the HTM from the webpage with the full size image which contains the link to that image
    html = browser.html
    browser.quit()
    #Use BeautifulSoup to parse the HTML
    soup = BeautifulSoup(html, 'html.parser')
    #Find the image tag for the main image
    main_img = soup.find('img', class_='main_image')
    #Extract the source link for the image
    main_img_url = main_img['src']
    #Build the full URL to the full size featured image
    main_img_url_full = 'https://www.jpl.nasa.gov'+main_img_url
    #Use Selenium because Twitter tweets are populated by JS
    url_mars_tweet = 'https://twitter.com/marswxreport?lang=en'
    driver = webdriver.Chrome(options=chrome_options)
    driver.get(url_mars_tweet)
    time.sleep(1)
    #Find the first Twitter post from "Mars Weather" as there are other non-weather posts in this thread
    # NOTE(review): this loop has no upper bound and indexes [0] on the
    # find_elements result — if no 'InSight' tweet exists it will loop until
    # an IndexError; consider a retry cap. Flagged only, not changed here.
    find_weather = True
    x=1
    while find_weather:
        mars_weather_tweet_obj = driver.find_elements_by_xpath('//*[@id="react-root"]/div/div/div[2]/main/div/div/div/div[1]/div/div/div/div/div[2]/section/div/div/div/div['+str(x)+']/div/div/div/div/article/div/div[2]/div[2]/div[2]/div[1]/div/span')
        x+=1
        #Extract the text of the tweet and replace line breaks
        mars_weather_tweet = mars_weather_tweet_obj[0].text.replace('\n',', ')
        lead_string = mars_weather_tweet[0:7]
        #Posts from Mars Weather start with the string 'InSight'
        if lead_string=='InSight':
            find_weather=False
    #Close browser
    driver.quit()
    #Send Pandas to read tables from URL
    mars_facts_url = 'https://space-facts.com/mars/'
    mars_facts = pd.read_html(mars_facts_url)
    #Grab the first table of facts, add column headings
    mars_facts_df = mars_facts[0]
    mars_facts_df.columns = ['Parameter', 'Fact']
    #Write as HTML table
    #mars_facts_df.to_html('mars_facts_table.html', index=False)
    #Convert df to dictionary
    mars_facts_dict = mars_facts_df.to_dict('records')
    #Check results
    mars_facts_dict
    #Setup Splinter Browsder and target URL
    executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
    browser = Browser('chrome', **executable_path, headless=True)
    mars_hemis_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    #Go to URL that summarizes the Mars hemispheres.
    browser.visit(mars_hemis_url)
    #Grab the HTML
    html2 = browser.html
    browser.quit()
    #Use BeautifulSoup to parse the HTML
    soup2 = BeautifulSoup(html2, 'html.parser')
    #Find the URL tag for each hemisphere's separate page
    hemi_links = soup2.find_all('a', class_='itemLink')
    #Build a list of the full URL for each hemisphere's separate page so we can go there to find the link to download the full size image.
    full_urls = []
    for link in hemi_links:
        full_url = 'https://astrogeology.usgs.gov/'+link['href']
        full_urls.append(full_url)
    #Remove duplicates from the URL list
    full_urls = list(dict.fromkeys(full_urls))
    #Setup Splinter browser
    executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
    browser = Browser('chrome', **executable_path, headless=True)
    #Initialize the list of dictionaries that will hold each hemisphere's title and link to full size image download
    mars_hems_dict_lst = []
    #For each hemispher URL
    for i in full_urls:
        #Go to the individual webpage of that hemisphere
        browser.visit(i)
        #Grab the HTML
        html3 = browser.html
        #Use BeautifulSoup to parse the HTML
        soup3 = BeautifulSoup(html3, 'html.parser')
        # NOTE(review): this matches the anchor whose text is 'Sample' —
        # i.e. the sample JPG link, not the original TIFF; confirm intent.
        image_link = soup3.find('a', string='Sample')
        image_link = image_link['href']
        #Find the title or name of the hemisphere
        image_title = soup3.find('h2', class_='title')
        #Remove unneeded wording at the end of the title
        image_title = image_title.text.replace(' Enhanced', '')
        #Create a dictionary of the title and link for that hemisphere
        temp_dict = {'title': image_title, 'img_url': image_link}
        #Add the dictionary to the list
        mars_hems_dict_lst.append(temp_dict)
    browser.quit()
    mars_data = {
        'article': nasa_mars_articles_dict,
        'weather': mars_weather_tweet,
        'featured_image': main_img_url_full,
        'mars_facts': mars_facts_dict,
        'mars_hems' : mars_hems_dict_lst
    }
    return mars_data
def scrape():
    """Scrape Mars news, JPL featured image, weather tweet, facts table
    and hemisphere photos; return everything in one dictionary.

    Returns a dict with keys 'mars_news', 'top_img_url', 'mars_weather',
    'html_table' and 'hemisphere_image_urls'.
    """
    #Mars News
    #define path & set up browser
    executable_path = {'executable_path': 'chromedriver'}
    browser = Browser('chrome', **executable_path, headless=False)
    url = 'https://mars.nasa.gov/news/'
    browser.visit(url)
    time.sleep(2)  # allow the JS-rendered article list to load
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    titles = soup.find_all('div', class_="content_title")
    news_title = titles[0].text.strip()
    print(news_title)
    p_texts = soup.find_all('div', class_="article_teaser_body")
    news_p = p_texts[0].text.strip()
    print(news_p)
    dates = soup.find_all('div', class_="list_date")
    news_date = dates[0].text.strip()
    print(news_date)
    mars_news = {
        "news_title": news_title,
        "news_p": news_p,
        "news_date": news_date
    }
    print(mars_news)
    # FIX: quit before launching the next browser — previously each section
    # started a fresh chromedriver and never closed the previous one.
    browser.quit()

    #JPL Mars Space Images - Featured Image
    #define path & set up browser
    executable_path = {'executable_path': 'chromedriver'}
    browser = Browser('chrome', **executable_path, headless=False)
    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url)
    time.sleep(2)
    #navigate to top image
    browser.click_link_by_partial_text('FULL IMAGE')
    time.sleep(2)
    #set up beautiful soup for new page
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    #extract top image url
    top_img = soup.find('img', class_="fancybox-image")
    top_img_url = 'https://www.jpl.nasa.gov' + top_img["src"]
    print(top_img_url)
    browser.quit()  # FIX: release this browser too

    #Mars Weather
    # URL of page to be scraped
    url = 'https://twitter.com/marswxreport?lang=en'
    # Retrieve page with the requests module
    response = requests.get(url)
    #create soup object
    soup = BeautifulSoup(response.text, 'html.parser')
    mars_weather = soup.find(
        'p', class_="TweetTextSize TweetTextSize--normal js-tweet-text tweet-text"
    ).text.strip()
    print(mars_weather)

    #Mars Facts
    # Use Pandas to scrape the table containing facts about the planet
    # including Diameter, Mass, etc., then convert it to an HTML string.
    url = 'https://space-facts.com/mars/'
    tables = pd.read_html(url)
    mars_df = tables[0]
    html_table = mars_df.to_html(na_rep=" ", index=False, header=False)
    print(html_table)

    #Mars Hemispheres
    #define path & set up browser
    executable_path = {'executable_path': 'chromedriver'}
    browser = Browser('chrome', **executable_path, headless=False)
    url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(url)
    time.sleep(2)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    time.sleep(2)
    items = soup.find_all('div', class_="item")
    titles = []
    img_urls = []
    hemisphere_image_urls = []
    for i in items:
        #scrape title (drop the trailing ' Enhanced' word)
        img_title = i.find('h3').get_text()
        title = img_title.rsplit(' ', 1)[0]
        titles.append(title)
        #scrape hemisphere detail-page url
        detail = i.find('a')['href']
        detail_url = 'https://astrogeology.usgs.gov' + detail
        #go to detail_url
        browser.visit(detail_url)
        time.sleep(1)
        html = browser.html
        soup = BeautifulSoup(html, 'html.parser')
        #scrape img_url: first link in the 'downloads' list
        downloads = soup.find('div', class_="downloads")
        ul = downloads.find('ul')
        li = ul.find_all('li')
        img = li[0]
        img_url = img.find('a')['href']
        img_urls.append(img_url)
        hemisphere_image_urls.append({"title": title, "img_url": img_url})
        #go back to original url for the next item
        browser.visit(url)
    print(hemisphere_image_urls)
    browser.quit()  # FIX: was never quit, leaking the driver process

    scrape_dict = {
        "mars_news": mars_news,
        "top_img_url": top_img_url,
        "mars_weather": mars_weather,
        "html_table": html_table,
        "hemisphere_image_urls": hemisphere_image_urls
    }
    print(scrape_dict)
    return scrape_dict
def scrape():
    """Scrape Mars news headline, featured image, facts table and
    hemisphere images; return them in a single dictionary.

    Returns a dict with keys 'latest_headline', 'news_p',
    'featured_image_url', 'mars_table_html' and 'hemi_list'.
    """
    #import dependancies (kept function-local, as the original did)
    from bs4 import BeautifulSoup
    import pandas as pd
    from splinter import Browser
    import requests
    import time

    #mars news
    url = 'https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest'
    #set up chromedriver
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)
    browser.visit(url)
    html = browser.html
    time.sleep(2)  # give the JS-rendered results time to load
    #scrape html
    soup = BeautifulSoup(html, 'html.parser')
    #get latest headline
    latest_headline = soup.find_all('li', class_='slide')[0].find(
        'div', class_='content_title').text
    news_p = soup.find('div', class_='article_teaser_body').text
    # FIX: quit each browser when its section is done — previously every
    # section launched a new chromedriver and never closed the old one.
    browser.quit()

    #Scrape image
    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=True)
    browser.visit(url)
    #click on featured image
    browser.click_link_by_partial_text('FULL IMAGE')
    #click on more info
    browser.click_link_by_partial_text('more info')
    #scrape html to get picture link name
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    #click to full image jpg link
    image_link = soup.find('aside', class_='image_detail_module').find_all(
        'div', class_='download_tiff')[1].find('a').text
    browser.click_link_by_partial_text(image_link)
    #get the url as string
    featured_image_url = browser.url
    browser.quit()  # FIX

    #scrape Mars Facts
    url = 'https://space-facts.com/mars/'
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=True)
    browser.visit(url)
    #scrape tables using pandas
    tables = pd.read_html(browser.html)
    #get stats table into a data frame
    mars_table_df = tables[0]
    #get html for that table
    mars_table_html = mars_table_df.to_html()
    browser.quit()  # FIX

    #scrape hemispheres
    url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=True)
    browser.visit(url)
    #create beautiful soup object for scraping
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    #get list of hemispheres
    hemispheres = soup.find('div', class_='collapsible results').find_all(
        'div', class_='item')
    hemi_list = []
    base_url = 'https://astrogeology.usgs.gov'
    for hemisphere in hemispheres:
        mars_dict = {}
        link = hemisphere.find('div', class_='description').a['href']
        title = hemisphere.find('div', class_='description').find('h3').text
        browser.visit(base_url + link)
        time.sleep(2)
        html = browser.html
        soup = BeautifulSoup(html, 'html.parser')
        img_url = soup.find('div', class_='downloads').find('a', target='_blank')['href']
        mars_dict['title'] = title
        mars_dict['img_url'] = img_url
        hemi_list.append(mars_dict)
    browser.quit()  # FIX

    mars_info_dict = {
        'latest_headline': latest_headline,
        'news_p': news_p,
        'featured_image_url': featured_image_url,
        'mars_table_html': mars_table_html,
        'hemi_list': hemi_list
    }
    return mars_info_dict
def scrape():
    """Scrape Mars news, featured image, weather tweet, facts table and
    hemisphere images; return everything in one dictionary.

    Returns a dict with keys 'News', 'Featured Image', 'Weather',
    'Facts' and 'Hemispheres'.
    """
    mars_dict = {}
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)

    #NASA Mars news
    url = 'https://mars.nasa.gov/news/'
    browser.visit(url)
    html = browser.html
    soup = bs(html, 'html.parser')
    news_title = soup.find('div', class_='content_title').text
    news_p = soup.find('div', class_='article_teaser_body').text
    mars_dict['News'] = {'Title': news_title, 'Description': news_p}

    #JPL Mars Images
    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url)
    browser.click_link_by_partial_text('FULL IMAGE')
    time.sleep(2)  # let the lightbox render before the next click
    browser.click_link_by_partial_text('more info')
    html = browser.html
    soup = bs(html, 'html.parser')
    mars_image = soup.find('img', class_='main_image')['src']
    feat_image_url = 'https://www.jpl.nasa.gov' + mars_image
    mars_dict['Featured Image'] = feat_image_url

    #Mars Weather
    url = 'https://twitter.com/marswxreport?lang=en'
    browser.visit(url)
    html = browser.html
    soup = bs(html, 'html.parser')
    mars_weather = soup.find_all('div', class_='content')
    indicators = ['Sol', 'InSight']
    # FIX: initialize so a page-layout change (no matching tweet) cannot
    # raise NameError when we store the result below.
    weather_text = ''
    for tweet in mars_weather:
        twit_user = tweet.find('a', class_='account-group')['data-user-id']
        # '786939553' is the id of the Mars Weather account
        if twit_user == '786939553':
            weather_text = tweet.find('p', class_='tweet-text').text
            if weather_text.split()[0] in indicators:
                break
    mars_dict['Weather'] = weather_text
    print(weather_text)

    #Mars Data
    url = 'http://space-facts.com/mars/'
    tables = pd.read_html(url)
    df = tables[0]
    web_table = df.to_html(classes='table', index=False)
    mars_dict['Facts'] = web_table

    #Mars Hemispheres
    #First url stopped working, page was changed or deleted, or is down
    #url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    url = 'https://astrogeology.usgs.gov/maps/mars-viking-hemisphere-point-perspectives'
    browser.visit(url)
    html = browser.html
    soup = bs(html, 'html.parser')
    hemispheres = soup.find_all('a', class_='item')
    hemis_array = []
    url_front = 'https://astrogeology.usgs.gov'
    # Indices skipped on the results page — presumably duplicate anchors
    # for the same hemisphere; TODO confirm against the live page.
    skip = [0, 2, 4, 6]
    # FIX: enumerate replaces the manual iter_num counter (same skips).
    for iter_num, item in enumerate(hemispheres):
        if iter_num in skip:
            continue
        item_dict = {}
        text_header = item.find('h3').text
        item_dict['Title'] = text_header
        link = item['href']
        full_url = url_front + link
        browser.visit(full_url)
        html = browser.html
        soup = bs(html, 'html.parser')
        big_link = soup.find('img', class_='wide-image')['src']
        item_dict['img_url'] = url_front + big_link
        hemis_array.append(item_dict)
        browser.back()
    mars_dict['Hemispheres'] = hemis_array

    # FIX: quit the browser so the chromedriver process is not leaked
    browser.quit()
    return mars_dict
def scrape():
    """Scrape Mars news, featured image link, weather tweet, facts table
    and hemisphere photos; return them in one dictionary.

    Returns a dict with keys 'newsTitle', 'newsDesciption' (sic — kept
    for callers), 'jplImage', 'weather', 'dataTable', 'hemispherePhotos'.
    """
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)

    # Latest NASA Mars news headline + teaser
    url = 'https://mars.nasa.gov/news/'
    browser.visit(url)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    news_title = soup.find(class_='content_title').find('a').text
    news_description = soup.find(class_='article_teaser_body').text

    # JPL featured image link
    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    img_link_inc = soup.find(class_="default floating_text_area ms-layer"
                             ).find('a')['data-fancybox-href']
    img_link = "https://www.jpl.nasa.gov" + img_link_inc

    # Latest Mars weather tweet
    url = 'https://twitter.com/marswxreport?lang=en'
    browser.visit(url)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    weather_link = soup.find(
        class_='TweetTextSize TweetTextSize--normal js-tweet-text tweet-text'
    ).text

    # Mars facts: pandas parses the page's HTML tables directly
    url = 'http://space-facts.com/mars/'
    browser.visit(url)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    data_table = pd.read_html(url)

    # Hemisphere photos: click each known title, grab the first download link
    hemi_index_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(hemi_index_url)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    hemispheres = [
        'Cerberus Hemisphere Enhanced', 'Schiaparelli Hemisphere Enhanced',
        'Syrtis Major Hemisphere Enhanced',
        'Valles Marineris Hemisphere Enhanced'
    ]
    hemisphere_photos = []
    for items in hemispheres:
        hemdict = {}
        browser.click_link_by_partial_text(items)
        html = browser.html
        soup = BeautifulSoup(html, 'html.parser')
        hemdict['img_url'] = soup.find(class_='downloads').find('a')['href']
        hemdict['title'] = items
        # return to the index page for the next click
        browser.visit(hemi_index_url)
        html = browser.html
        hemisphere_photos.append(hemdict)

    # FIX: the browser was never quit, leaking the chromedriver process
    browser.quit()
    output = {
        'newsTitle': news_title,
        'newsDesciption': news_description,
        'jplImage': img_link,
        'weather': weather_link,
        'dataTable': data_table,
        'hemispherePhotos': hemisphere_photos
    }
    return output
def scrape():
    """Scrape Mars news, featured image, weather tweet, facts and
    hemisphere images; return everything in one dictionary.

    Returns a dict with keys 'news', 'feature_img', 'weather', 'facts'
    and 'hemi_img'.
    """
    # Import dependencies (kept function-local, as the original did) ------------
    from splinter import Browser
    from bs4 import BeautifulSoup as bs
    import requests
    import time
    import pandas as pd

    # set up Splinter -----------------------------------------------------------
    executable_path = {'executable_path': 'chromedriver'}
    browser = Browser('chrome', **executable_path, headless=False)

    # 1. NASA Mars News ---------------------------------------------------------
    ## Scrape the NASA Mars News Site (https://mars.nasa.gov/news) and collect
    ## the latest News Title and Paragraph Text; assign to variables for later.
    #! can't use requests library here, because the news are rendered by js
    #! after page load; requests.get would only return pre-render contents
    # 1.1 Retrieve page with splinter
    url_news = "https://mars.nasa.gov/news"
    browser.visit(url_news)
    html = browser.html
    # 1.2 Get the first news from html retrieved
    bsoup = bs(html, 'html.parser')
    # reach the container of the first news
    li = bsoup.find("li", class_="slide")
    news_t = li.find("div", class_="content_title").text  # title
    news_p = li.find("div", class_="article_teaser_body").text  # paragraph
    news_link = url_news.replace("/news", "") + li.find(
        "div", class_="content_title").a[
        "href"]  # link to the news (added to base url)
    news_date = li.find("div", class_="list_date").text  # date

    # 2. JPL Mars Space Images - Featured Image ---------------------------------
    url_img = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(url_img)
    browser.click_link_by_partial_text('FULL IMAGE')
    # ---clicking the more info button directly sometimes errors with
    # ---"element not visible"; waiting until it is visible takes time, so
    # ---the workaround is to grab the href and visit it instead of clicking
    href = browser.find_link_by_partial_text("more info")[0]["href"]
    browser.visit(href)
    browser.find_by_css(".main_image").click()
    # store the image url
    featured_image_url = browser.url

    # 3. Mars Weather -----------------------------------------------------------
    # 3.1 Retrieve page using requests (tweets are server-rendered enough here)
    url_twitter = "https://twitter.com/marswxreport?lang=en"
    html = requests.get(url_twitter).text
    # 3.2 Get the weather post from html retrieved
    bsoup = bs(html, "html.parser")
    # all tweets are under ol
    ol = bsoup.find(id="stream-items-id")
    lis = ol.findAll("li")
    # find the first tweet with weather info (criterion: contains 'hPa')
    mars_weather = ""
    for li in lis:
        tweet = li.find("div", class_="js-tweet-text-container").p.text
        # FIX: the original used `if tweet.find("hPa"):` — str.find returns
        # -1 (truthy!) when absent, so nearly every tweet matched. Use `in`.
        if "hPa" in tweet:
            mars_weather = tweet
            break

    # 4. Mars Facts -------------------------------------------------------------
    ## Use pandas to scrape the facts table (Diameter, Mass, etc.)
    url_fact = "https://space-facts.com/mars/"
    tables = pd.read_html(url_fact)
    facts = tables[0]
    # store data in a list of lists
    facts = facts.values.tolist()

    # 5. Mars Hemispheres -------------------------------------------------------
    ## Visit the USGS Astrogeology site to obtain high resolution images
    ## for each of Mars's hemispheres.
    # 5.1 Retrieve the html with splinter
    url_hemi = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    browser.visit(url_hemi)
    html = browser.html
    # 5.2 Get the urls needed from the html retrieved
    bsoup = bs(html, "html.parser")
    items = bsoup.findAll("div", class_="item")
    hemisphere_image_urls = []  # initialize list
    for item in items:
        title = item.find("h3").text  # title
        url = "https://astrogeology.usgs.gov/" + item.find(
            "div", class_="description").a["href"]  # picture-details page url
        browser.visit(url)
        img_url = browser.find_link_by_text("Sample")[0][
            "href"]  # url to the full-size picture
        hemisphere_image_urls.append({
            "title": title,
            "img_url": img_url
        })  # append a dictionary to the hemisphere_image_urls list

    # FIX: quit the browser so the chromedriver process is not leaked
    browser.quit()

    # store data scraped into a dictionary --------------------------------------
    data = {
        "news": {
            "title": news_t,
            "body": news_p,
            "link": news_link,
            "date": news_date
        },
        "feature_img": featured_image_url,
        "weather": mars_weather,
        "facts": facts,
        "hemi_img": hemisphere_image_urls
    }
    print(data)  # print to console
    return data
def scrape():
    """Scrape Mars news, featured image, weather tweet, facts tables and
    hemisphere images; return a nested dict of the results.

    Side effects: writes 'templates/mars_earth.html' and
    'templates/mars_facts.html', and quits the browser before returning.
    The '# In[n]:' markers are remnants of the notebook this was
    exported from.
    """
    full_scrape = {}
    # Splinter connection to chromedriver
    # NOTE(review): absolute, machine-specific driver path — consider
    # making this configurable.
    executable_path = {'executable_path' : '/home/erick/Documents/Personal/Bootcamp/Week12 - Web Scrapping/Mission-to-Mars/chromedriver'}
    browser = Browser("chrome", **executable_path, headless=False)
    # # NASA Mars News
    # # Section to scrap the NASA Mars webpage.
    # In[3]:
    url_mars = "https://mars.nasa.gov"
    mars_news = "/news"
    browser.visit(url_mars + mars_news)
    # In[4]:
    # HTML object
    html = browser.html
    # Parse HTML with Beautiful Soup
    soup = BeautifulSoup(html, 'html.parser')
    li_slide = soup.find_all('li', class_="slide")
    # Parallel lists: one entry per article slide on the news page.
    date = []
    title = []
    url_news = []
    url_img = []
    description = []
    for item in li_slide:
        title.append(item.find("div", class_="content_title").text)
        url_news.append(url_mars + item.find("div", class_="content_title").a['href'])
        url_img.append(url_mars + item.find("div", class_="list_image").img['src'])
        date.append(item.find("div", class_="list_date").text)
        description.append(item.find("div", class_="article_teaser_body").text)
    full_scrape['NASA Mars News'] = {}
    full_scrape['NASA Mars News']['title'] = title
    full_scrape['NASA Mars News']['url_news'] = url_news
    full_scrape['NASA Mars News']['url_img'] = url_img
    full_scrape['NASA Mars News']['date'] = date
    full_scrape['NASA Mars News']['description'] = description
    # for x in range(5):
    #     try:
    #         browser.click_link_by_partial_text('MORE')
    #     except:
    #         print("No more pages")
    # # JPL Mars Space Images
    # In[5]:
    url_jpl = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url_jpl)
    # In[6]:
    # HTML object
    html = browser.html
    # Parse HTML with Beautiful Soup
    soup = BeautifulSoup(html, 'html.parser')
    day_descr = soup.find("h1", class_="media_feature_title").get_text(strip=True)
    # NOTE(review): bare except silently swallows all errors from the click
    # (the message suggests it is meant as "link not present" handling).
    try:
        browser.click_link_by_partial_text('FULL IMAGE')
    except:
        print('Already on page')
    time.sleep(3)
    # HTML object
    html = browser.html
    # Parse HTML with Beautiful Soup
    soup = BeautifulSoup(html, 'html.parser')
    day_img = soup.find('img', class_="fancybox-image")['src']
    day_img_url = 'https://www.jpl.nasa.gov/' + day_img
    full_scrape['JPL Mars Space Images'] = {}
    full_scrape['JPL Mars Space Images']['img_description'] = day_descr
    full_scrape['JPL Mars Space Images']['img_url'] = day_img_url
    # # Mars Weather
    # In[7]:
    url_weather = 'https://twitter.com/marswxreport?lang=en'
    browser.visit(url_weather)
    # In[8]:
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    mars_weather = soup.find('p', class_="TweetTextSize TweetTextSize--normal js-tweet-text tweet-text").get_text()
    mars_weather  # no-op expression left over from the notebook
    full_scrape['Mars Weather'] = {}
    full_scrape['Mars Weather']['weather'] = mars_weather
    # # Mars Facts
    # In[9]:
    url_facts = 'https://space-facts.com/mars/'
    fact = pd.read_html(url_facts)
    # fact[0].to_html("templates/table1.html")
    mars_earth = fact[0]
    mars_earth = mars_earth.set_index('Mars - Earth Comparison')
    mars_earth.to_html("templates/mars_earth.html")
    # In[10]:
    # fact[1].to_html("templates/table2.html")
    mars_facts = fact[1]
    mars_facts = mars_facts.set_index(0)
    mars_facts.to_html("templates/mars_facts.html")
    # # Mars Hemispheres
    # In[11]:
    url_hemispheres = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(url_hemispheres)
    # In[12]:
    # HTML object
    html = browser.html
    # Parse HTML with Beautiful Soup
    soup = BeautifulSoup(html, 'html.parser')
    hemisphere_image_urls = []
    a = soup.find_all("div", class_='description')
    for i in a:
        d = {}
        d['title'] = i.h3.text
        # link.append(i.a['href'])
        # title.append(i.h3.text)
        try:
            browser.click_link_by_partial_text(i.h3.text)
        except:
            print('Already on page')
        time.sleep(3)
        # HTML object
        html = browser.html
        # Parse HTML with Beautiful Soup
        soup = BeautifulSoup(html, 'html.parser')
        d['img_url'] = 'https://astrogeology.usgs.gov' + soup.find('img', class_='wide-image')['src']
        # img.append(soup.find('img', class_='wide-image')['src'])
        hemisphere_image_urls.append(d)
        browser.back()
    browser.quit()
    hemisphere_image_urls  # no-op expression left over from the notebook
    full_scrape['Mars Hemispheres'] = hemisphere_image_urls
    # print(full_scrape)
    return full_scrape
def scrape():
    """Scrape Mars news, the featured image, and hemisphere data.

    Returns:
        dict with keys ``title``, ``paragraph``, ``img_link``,
        ``hemisphere_1``..``hemisphere_4`` and ``url1``..``url4``
        (same keys as the original implementation).
    """
    # BUG FIX: the original created three separate Chrome instances and
    # never called quit() on any of them (chromedriver process leak).
    # One browser is reused for every page and closed in `finally`.
    executable_path = {'executable_path': ChromeDriverManager().install()}
    browser = Browser('chrome', **executable_path, headless=False)
    mars = {}
    try:
        # --- Latest Mars news: a plain GET is enough for the text ---
        url = 'https://mars.nasa.gov/news/'
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        titles = []
        paragraphs = []
        for result in soup.find_all('div', class_='slide'):
            titles.append(result.find('div', class_='content_title').text.strip())
            paragraphs.append(result.find('div', class_='rollover_description_inner').text.strip())
        mars["title"] = titles[0]
        mars["paragraph"] = paragraphs[0]

        # --- Featured image ---
        url = 'https://data-class-jpl-space.s3.amazonaws.com/JPL_Space/index.html'
        browser.visit(url)
        soup = BeautifulSoup(browser.html, 'html.parser')
        link = soup.find(class_='headerimage fade-in')['src']
        mars["img_link"] = f"https://data-class-jpl-space.s3.amazonaws.com/JPL_Space/{link}"

        # --- Hemisphere titles ---
        url = ('https://astrogeology.usgs.gov/search/results'
               '?q=hemisphere+enhanced&k1=target&v1=Mars')
        browser.visit(url)
        soup = BeautifulSoup(browser.html, 'html.parser')
        name_lists = [h3.text for h3 in soup.find_all('h3')]
        for i, name in enumerate(name_lists[:4], start=1):
            mars[f'hemisphere_{i}'] = name

        # --- Hemisphere image urls ---
        # The original's urls/urls[2:-2] juggling only ever used the first
        # download link of each detail page; grab exactly that link.
        img_url = []
        for name in name_lists[:4]:
            browser.click_link_by_partial_text(name)
            soup = BeautifulSoup(browser.html, 'html.parser')
            downloads = soup.find(class_='downloads')
            img_url.append(downloads.find('a')['href'])
            browser.back()
        for i, link in enumerate(img_url, start=1):
            mars[f'url{i}'] = link
    finally:
        # Always release the chromedriver process, even on a scrape failure.
        browser.quit()
    return mars
def scrape():
    """Scrape Mars news, featured image, weather, facts and hemispheres.

    Returns:
        dict with keys ``title``, ``paragraph``, ``print_image_url``,
        ``mars_weather``, ``mars_df`` (HTML table string) and
        ``mars_hemi`` (list of {title, img_url} dicts).
    """
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)

    # --- NASA Mars News ---
    url = ('https://mars.nasa.gov/news/?page=0&per_page=40'
           '&order=publish_date+desc%2Ccreated_at+desc'
           '&search=&category=19%2C165%2C184%2C204&blank_scope=Latest')
    browser.visit(url)
    soup = bs(browser.html, 'html.parser')
    # BUG FIX: the original wrapped these two lookups in `for result in soup:`,
    # recomputing the identical values once per top-level node and relying on
    # loop-variable leakage; compute them once.
    title = soup.find_all("div", class_="content_title")[1].text
    paragraph = soup.find_all("div", class_="rollover_description_inner")[0].text

    # --- JPL featured image ---
    Space_images = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(Space_images)
    browser.click_link_by_partial_text('FULL IMAGE')
    browser.click_link_by_partial_text('more info')
    soup = bs(browser.html, 'html.parser')
    image = soup.find_all('figure', class_='lede')
    print_image_url = 'https://www.jpl.nasa.gov/' + image[0].a['href']

    # --- Mars weather (latest tweet) ---
    mars_twitter = 'https://twitter.com/marswxreport?lang=en'
    browser.visit(mars_twitter)
    time.sleep(4)  # tweets are injected by JS; give the page time to render
    soup = bs(browser.html, 'html.parser')
    mars_weather = soup.find_all(
        'article', class_="css-1dbjc4n r-1loqt21 r-18u37iz r-1ny4l3l r-o7ynqc r-6416eg"
    )[0].text.strip().replace('Mars Weather@MarsWxReport·19hInSight ', '')

    # --- Mars facts table ---
    mars_facts = pd.read_html('https://space-facts.com/mars/')
    mars_df = mars_facts[0]
    mars_df.columns = ['Descriptions', 'Value']

    # --- Mars hemispheres: titles + thumbnail links from the results page ---
    Hemi_Url = ('https://astrogeology.usgs.gov/search/results'
                '?q=hemisphere+enhanced&k1=target&v1=Mars')
    browser.visit(Hemi_Url)
    soup = bs(browser.html, 'html.parser')
    results = soup.find_all('div', class_="collapsible results")
    image_names = [name.text for name in results[0].find_all('h3')]
    links = []
    for thumbnail in results[0].find_all('a'):
        if thumbnail.img:
            links.append('https://astrogeology.usgs.gov' + thumbnail['href'])

    # Visit each detail page for the full-resolution image url.
    full_imgs = []
    for link_url in links:
        browser.visit(link_url)
        soup = bs(browser.html, 'html.parser')
        wide = soup.find_all('img', class_='wide-image')
        full_imgs.append('https://astrogeology.usgs.gov/' + wide[0]['src'])

    mars_df_dict = [
        {'title': hemi_title, 'img_url': img}
        for hemi_title, img in zip(image_names, full_imgs)
    ]

    Mars_scrape_dict = {
        "title": title,
        "paragraph": paragraph,
        "print_image_url": print_image_url,
        "mars_weather": mars_weather,
        "mars_df": mars_df.to_html(),
        "mars_hemi": mars_df_dict,
    }
    browser.quit()
    return Mars_scrape_dict
def scrape():
    """Scrape Mars news, featured image, weather, facts and hemispheres.

    Returns:
        dict with keys ``News_Title``, ``Paragraph_Text``,
        ``Most_Recent_Mars_Image``, ``Mars_Weather``,
        ``mars_facts_table`` and ``mars_h``.
    """
    # --- NASA Mars News (a plain GET is enough for the text) ---
    url = 'https://mars.nasa.gov/news/'
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    news_title = soup.find('div', class_='content_title').text
    news_p = soup.find('div', class_='rollover_description_inner').text

    # --- JPL featured image (needs a real browser to click through) ---
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)
    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url)
    browser.click_link_by_partial_text('FULL IMAGE')
    browser.click_link_by_partial_text('more info')
    jpl_soup = BeautifulSoup(browser.html, 'html.parser')
    img_url = jpl_soup.find('img', class_='main_image').get('src')
    feature_image_url = "https://www.jpl.nasa.gov" + img_url
    browser.quit()

    # --- Mars weather: first tweet containing "Sol " ---
    url = 'https://twitter.com/marswxreport?lang=en'
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    tweets = soup.find_all('div', class_="js-tweet-text-container")
    mars_weather = None  # robust default: original left this unbound if no match
    # ROBUSTNESS: original indexed range(20) and crashed on short tweet lists.
    for tweet in tweets[:20]:
        t = tweet.text
        if "Sol " in t:
            mars_weather = t
            break

    # --- Mars facts table ---
    url = 'https://space-facts.com/mars/'
    tables = pd.read_html(url)
    df = tables[0]
    df.columns = ['Profile', 'Data']
    # BUG FIX: the original assigned into an undefined `mission_to_mars`
    # dict (NameError) and discarded the result of str.replace (strings
    # are immutable); the cleaned table now goes into the returned dict.
    html_table = df.to_html().replace('\n', '')
    df.to_html('mars_table.html')

    # --- Mars hemispheres ---
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)
    url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(url)
    soup = BeautifulSoup(browser.html, 'html.parser')
    hemisphere_image_urls = []
    products = soup.find('div', class_='result-list')
    for hemisphere in products.find_all('div', class_='item'):
        title = hemisphere.find('div', class_='description')
        title_text = title.a.text.replace(' Enhanced', '')
        browser.click_link_by_partial_text(title_text)
        detail_soup = BeautifulSoup(browser.html, 'html.parser')
        image = detail_soup.find('div', class_='downloads').find('ul').find('li')
        hemisphere_image_urls.append({'title': title_text, 'img_url': image.a['href']})
        browser.click_link_by_partial_text('Back')

    mars_data = {
        "News_Title": news_title,
        "Paragraph_Text": news_p,
        "Most_Recent_Mars_Image": feature_image_url,
        "Mars_Weather": mars_weather,
        "mars_facts_table": html_table,
        "mars_h": hemisphere_image_urls,
    }
    browser.quit()
    # BUG FIX: the original built mars_data but never returned it.
    return mars_data
def scrape():
    """Scrape the JPL featured image, Mars weather, facts and hemispheres.

    Returns:
        dict with keys ``image_URL``, ``Mars_weather``, ``Mars_table``
        and ``Hemisphere_info``.
    """
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)

    # --- JPL featured image ---
    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url)
    browser.click_link_by_partial_text('FULL IMAGE')
    time.sleep(2)  # page needs a pause or the next click runs too fast
    browser.click_link_by_partial_text('more info')
    soup2 = bs(browser.html, 'html.parser')
    image = soup2.find('img', class_='main_image')
    featured_image_url = 'https://www.jpl.nasa.gov' + image.get('src')
    time.sleep(2)
    browser.quit()

    # --- Mars weather: latest tweet text ---
    url = 'https://twitter.com/marswxreport?lang=en'
    response = requests.get(url)
    soup = bs(response.text, 'html.parser')
    results = soup.find_all('div', class_='js-tweet-text-container')
    mars_tweet = results[0].text

    # --- Mars facts table ---
    mars_facts_url = 'https://space-facts.com/mars/'
    # BUG FIX: the original called pd.read_html(url) while `url` still held
    # the twitter address; `mars_facts_url` was defined but never used.
    tables = pd.read_html(mars_facts_url)
    df = tables[0]
    df.set_index(0, inplace=True)
    # BUG FIX: the original discarded the result of str.replace.
    html_table = df.to_html().replace('\n', '')
    df.to_html('mars_table.html')

    # --- Mars hemispheres: visit each detail page for the wide image ---
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)
    url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(url)
    hemisphere_info = []
    hyperlinks = ['Cerberus Hemisphere Enhanced',
                  'Schiaparelli Hemisphere Enhanced',
                  'Syrtis Major Hemisphere Enhanced',
                  'Valles Marineris Hemisphere Enhanced']
    for hyperlink in hyperlinks:
        browser.click_link_by_partial_text(hyperlink)
        soup = bs(browser.html, 'html.parser')
        image = soup.find('img', class_='wide-image')
        image_url = 'https://astrogeology.usgs.gov' + image.get('src')
        results = soup.find('h2', class_="title").text
        hemisphere_info.append({'title': results, 'img_url': image_url})
        time.sleep(1)
        browser.back()
    browser.quit()

    mars_info = {
        "image_URL": featured_image_url,
        "Mars_weather": mars_tweet,
        # BUG FIX: the original referenced an undefined mars_table();
        # return the rendered facts table instead.
        "Mars_table": html_table,
        "Hemisphere_info": hemisphere_info,
    }
    return mars_info
class SurfThread(threading.Thread):
    """Thread that simulates a human surfing session in Firefox.

    Visits a random selection of sites read from text files and interacts
    with them via splinter plus `xte` keystroke/mouse injection.
    Python 2 code (print statements, str.decode).

    NOTE(review): this block was reconstructed from a whitespace-collapsed
    source; statement nesting (especially in run() and __twitterSomething())
    is a best-effort reading — verify against the original file.
    """

    def __init__(self, hoehe, breite, _format):
        # hoehe/breite: screen height/width in pixels; _format: e.g. "16:9".
        threading.Thread.__init__(self)
        self.seiten = []        # homepage URLs to visit (filled by __readData)
        self.words = []         # search terms (filled by __readData)
        self.toWait = None      # seconds to wait after visiting a page
        self.elemNo = None      # index into self.seiten
        self.wordNo = None      # index into self.words
        self.clickNo = None     # number of random clicks per page
        self.clickX = None
        self.clickY = None
        self.back = None        # random flag; 1 means "go back" sometimes
        self.changeTabs = None
        self.__browser = Browser("firefox", profile=constants.profile)
        time.sleep(5)
        #self.__maximizeWindow()
        #time.sleep(5)
        # NOTE(review): these are CLASS attributes, shared by all instances.
        SurfThread.timer = False
        SurfThread.hoehe = hoehe
        SurfThread.breite = breite
        SurfThread._format = _format

    def __readData(self):
        # read homepages to visit
        surfListe = open("/home/steffi/Dokumente/surfListe.txt", "rb")
        for line in surfListe:
            self.seiten.append(line)
        surfListe.close()
        # read words for search in google, wikipedia, amazon, youtube
        keyWords = open("/home/steffi/Dokumente/keyWords.txt", "rb").readlines()
        for line in keyWords:
            self.words.append(line.decode("utf-8"))
        #keyWords.close(),
        print "data read"

    def run(self):
        """Main loop: visit 2-5 random pages, click around, then shut Firefox
        down and copy the generated places.sqlite history into the profile."""
        self.__readData()
        rand = random.randint(2,5)
        for i in range(0, rand):
            print "noch "+ str(i) +" mal"
            print "TIMER:" +str(SurfThread.timer)
            if SurfThread.timer == False :
                self.__generateRandom()
                print "visit: "+self.seiten[self.elemNo]
                self.__visitHomepage( self.seiten[self.elemNo].strip())
                print "clickNo: "+ str(self.clickNo)
                print "towait = "+ str(self.toWait)
                time.sleep(self.toWait)
                # NOTE(review): inner loop reuses (shadows) the outer `i`.
                for i in range(self.clickNo):
                    time.sleep(random.randrange(5,10))
                    if i % 2 == 0:
                        self.__generateRandomClick()
                    if i == 2:
                        self.__pageDown()
                        time.sleep(random.randrange(1,5))
                    if i == (self.clickNo-1):
                        self.__pageBottom()
                        time.sleep(random.randrange(2,10))
                    if i%2 == 0 and self.back == 1:
                        self.__goBack()
                        time.sleep(random.randrange(2,10))
        # Copy the browsing history out of the temporary webdriver profile
        # into the persistent profile, then close Firefox and clean up.
        path = self.__browser.driver.firefox_profile.profile_dir
        print path
        os.remove(constants.profile+'/places.sqlite')
        shutil.copyfile(path+'/places.sqlite', constants.profile+'/places.sqlite')
        self.__closeWindow()
        shutil.rmtree(path)
        #os.rmdir(path)
        print "Firefox beendet"

    def starte(self):
        # Convenience entry point: runs the session synchronously
        # (bypasses Thread.start(), so no new thread is spawned).
        self.run()

    def __generateRandom(self):
        # Roll fresh random parameters for the next page visit.
        self.toWait = random.randrange(5,45)
        self.elemNo = random.randrange(0,len(self.seiten))
        self.clickNo = random.randrange(2,7)
        self.back = random.randrange(0,10)
        self.wordNo = random.randrange(0, len(self.words))

    def __generateRandomClick(self):
        # Move the mouse to a random on-screen position and left-click via xte.
        self.clickX = random.randrange(100,constants.BREITE - 50) #1366
        self.clickY = random.randrange(50,constants.HOEHE-50) #768
        command = "mousemove "+ str(self.clickX) + " "+ str(self.clickY)
        print command
        subprocess.call(["xte", command])
        subprocess.call(["xte", "mouseclick 1"])

    def __followLink(self, text, index=0):
        # Click a link whose text contains `text`.
        # NOTE(review): subscripting the click() result likely raises the
        # TypeError that the handler below swallows — confirm intent.
        if index == None:
            index = 0
        try:
            self.__browser.click_link_by_partial_text(text)[index]
        except ElementDoesNotExist:
            print "Element does not exist"
        except TypeError:
            print "Type Error"
        except Exception as e:
            # NOTE(review): "str" + exception raises TypeError in Python 2.
            print "nix passiert" + e

    def __visitGooglePage(self, url):
        # Search Google for a random keyword and follow a result link.
        print "google"
        self.__browser.visit(url)
        time.sleep(random.randrange(2,15))
        searchWord = str(self.words[self.wordNo]).strip().decode("utf-8")
        print searchWord
        self.__fillInput('q', searchWord)
        time.sleep(random.randrange(2,15))
        self.__findElementAndClick("btnG", "name", None)
        subprocess.call(["xte", "key Return"])
        wordSplit = str(searchWord).split(" ")
        time.sleep(random.randrange(10,30))
        #baaaad practice
        try:
            self.__followLink(wordSplit[0], self.wordNo%10)
        except Exception:
            try:
                self.__followLink(wordSplit[1], self.wordNo%10)
            except Exception:
                pass

    def __visitHomepage(self, url):
        # Dispatch to the site-specific visit routine based on the URL.
        clickNoMod4 = self.clickNo % 4
        toWaitMod4 = self.toWait % 4
        if "google" in url:
            self.__visitGooglePage(url)
        elif "wikipedia" in url:
            self.__visitWikipediaPage(url)
        elif "amazon" in url:
            self.__visitAmazonPage(url)
        elif "ebay" in url:
            self.__visitEbayPage(url)
        elif "youtube" in url:
            print "youtube"
            self.__watchYoutubeVideo(url)
        elif "facebook" in url:
            print "facebook"
            self.__visitFacebook(url)
        elif "twitter" in url:
            print "twitter"
            self.__twitterSomething(url)
        else:
            try:
                self.__browser.visit(url)
            except Exception as e:
                print e
                pass

    def __goBack(self):
        # Browser history: one step back.
        self.__browser.back()

    def shutdown(self):
        # Flip the shared timer flag and end the Firefox session.
        # NOTE(review): changeTimer() is defined elsewhere in the module.
        print "setze timer um und beende firefox"
        changeTimer()

    def __fillInput(self, _id, _input):
        # Fill the form field named `_id` with `_input`; swallow any error.
        try:
            self.__browser.fill(_id, _input)
        except Exception as e:
            print e.message
            pass

    def __findElementAndClick(self, name, identifier, index):
        # Find an element by name or id and click it.
        # default the index if none was passed
        if index == None:
            index = 0
        # look up the element
        try:
            if identifier == "name":
                button = self.__browser.find_by_name(name)[index]
            elif identifier == "id":
                # NOTE(review): this assigns the bound `click` method instead
                # of calling it, so button.click() below fails and is caught
                # by the generic handler — confirm intended behavior.
                button = self.__browser.find_by_id(name).click
            button.click()
        except (exceptions.ElementDoesNotExist, ElementNotVisibleException, URLError):
            print "ElementDoesnotExist OR ElementNotVisible OR URLError"
            pass
        except Exception as e:
            print e
            pass

    def __closeWindow(self):
        # Close Firefox via Ctrl+Q keystroke injection.
        time.sleep(3)
        subprocess.call(["xte", "keydown Control_L"])
        #subprocess.call(["xte", "keydown Shift_L"])
        subprocess.call(["xte", "key q"])
        #subprocess.call(["xte", "keyup Shift_L"])
        subprocess.call(["xte", "keyup Control_L"])
        print "Fenster geschlossen"

    def __maximizeWindow(self):
        # Maximize the window via Ctrl+F10 keystroke injection.
        time.sleep(2)
        subprocess.call(["xte", "keydown Control_L"])
        subprocess.call(["xte", "key F10"])
        subprocess.call(["xte", "keyup Control_L"])
        print "Fenster maximiert"

    def __pageDown(self):
        # Scroll one page down.
        time.sleep(3)
        subprocess.call(["xte", "key Page_Down"])

    def __pageBottom(self):
        # Jump to the bottom of the page.
        subprocess.call(["xte", "key End"])

    def __watchYoutubeVideo(self, url):
        # Search YouTube for a random keyword and click a result thumbnail
        # at a screen position estimated from the monitor geometry.
        self.__browser.visit(url)
        time.sleep(random.randrange(2,15))
        searchWord = str(self.words[self.wordNo]).strip().decode("utf-8")
        print searchWord
        self.__fillInput('search_query', searchWord)
        time.sleep(random.randrange(2,15))
        subprocess.call(["xte", "key Return"])
        time.sleep(random.randrange(2,15))
        # only for 16:9 monitors
        index = None
        breite = 0
        if SurfThread._format == "16:9":
            # candidate y coordinates of the first four result rows
            index = [int(SurfThread.hoehe // 4.59), int(SurfThread.hoehe // 3.04), int(SurfThread.hoehe // 2.22), int(SurfThread.hoehe // 1.77)]
            breite = int(SurfThread.breite//4.74)
        else:
            index = [int(SurfThread.hoehe // 4.10), int(SurfThread.hoehe // 2.19), int(SurfThread.hoehe // 1.54), int(SurfThread.hoehe // 1.28)]
            breite = int(SurfThread.breite//2.15)
        #self.__followLink(searchWord, None)
        #235 1 - 355 2 - 4853
        rand = random.randint(0, (len(index)-1))
        subprocess.call(["xte", "mousemove "+ str(breite) + " " +str(index[rand])])
        time.sleep(random.randrange(2,15))
        subprocess.call(["xte", "mouseclick 1"])
        time.sleep(5)
        print "mousemove + anschauen"
        # width/height measured from top-left
        #subprocess.call(["xte", "mousemove "+ str(int(SurfThread.breite//3.17)) + " " + str(int(SurfThread.hoehe//3.2225))])
        #time.sleep(2)
        subprocess.call(["xte", "mouseclick 1"])
        # TODO: allow more time
        time.sleep(random.randrange(2,45))

    def __visitWikipediaPage(self, url):
        # Search Wikipedia for a random keyword and follow a result link.
        print "wikipedia"
        self.__browser.visit(url)
        time.sleep(2)
        searchWord = str(self.words[self.wordNo]).strip().decode("utf-8")
        print searchWord
        self.__fillInput('search', searchWord)
        time.sleep(random.randrange(2,15))
        subprocess.call(["xte", "key Return"])
        wordSplit = str(searchWord).split(" ")
        time.sleep(2)
        #baaaad practice
        try:
            self.__followLink(wordSplit[0], self.wordNo%10)
        except Exception:
            try:
                self.__followLink(wordSplit[1], self.wordNo%10)
            except Exception:
                pass

    def __visitAmazonPage(self, url):
        # Search Amazon for a random keyword and follow a result link.
        print "amazon"
        self.__browser.visit(url)
        time.sleep(random.randrange(2,15))
        searchWord = str(self.words[self.wordNo]).strip().decode("utf-8")
        print searchWord
        self.__fillInput('field-keywords', searchWord+'\n')
        time.sleep(2)
        subprocess.call(["xte", "key Return"])
        wordSplit = str(searchWord).split(" ")
        time.sleep(random.randrange(2,15))
        #baaaad practice
        try:
            self.__followLink(wordSplit[0], self.wordNo%10)
        except Exception:
            try:
                self.__followLink(wordSplit[1], self.wordNo%10)
            except Exception:
                pass

    def __visitEbayPage(self, url):
        # Search eBay by typing the keyword via xte and follow a result link.
        print "ebay"
        self.__browser.visit(url)
        time.sleep(random.randrange(2,15))
        searchWord = str(self.words[self.wordNo]).strip().decode("utf-8")
        print searchWord
        self.__typeWord(searchWord)
        time.sleep(random.randrange(2,15))
        subprocess.call(["xte", "key Return"])
        wordSplit = str(searchWord).split(" ")
        time.sleep(random.randrange(2,15))
        #baaaad practice
        self.__followLink(wordSplit[0], self.wordNo%10)

    def __visitFacebook(self, url):
        # Visit Facebook; log in via the credentials from `constants`
        # when the account name is not visible on the page.
        print "facebook"
        self.__browser.visit(url)
        time.sleep(random.randrange(2,15))
        # log in if necessary
        if self.__browser.is_text_present(constants.FB_USER) == False:
            print "noch nicht eingeloggt"
            self.__fillInput('email', constants.FB_EMAIL)
            time.sleep(2)
            self.__fillInput('pass', constants.FB_PW)
            time.sleep(2)
            subprocess.call(["xte", "key Return"])
            time.sleep(5)

    def __twitterSomething(self, url):
        # Visit Twitter, log in with xte-typed credentials if needed, then
        # post a random tweet from the module-level `twittertext` list.
        print "twitter"
        self.__browser.visit(url)
        time.sleep(random.randrange(2,15))
        # TODO: if the start page is not visible, log in
        if self.__browser.is_text_present('Startseite') == False:
            print "noch nicht eingeloggt"
            '''name = self.__browser.find_by_name('session[username_or_email]').first
            if name != None:
                print "name gefunden"
                name.click()
                time.sleep(3)
                self.__typeWord('steffi_spam')
                passW = self.__browser.find_by_id('signin-password').first
                passW.click()
                time.sleep(3)
                self.__typeWord('steffispam')'''
            #self.__fillInput("session[username_or_email]", "*****@*****.**")
            #time.sleep(2)
            #self.__fillInput('signin-pass', "steffispam")
            #self.__fillInput('signin-pass', "session[password]")
            #time.sleep(2)
            #subprocess.call(["xte", "key Return"])
            #time.sleep(5)
            # this works, 13.5.13: tab into the login fields and type
            time.sleep(random.randrange(2,15))
            subprocess.call(["xte", "key Tab"])
            time.sleep(3)
            subprocess.call(["xte", "key Tab"])
            time.sleep(3)
            subprocess.call(["xte", "key Tab"])
            time.sleep(random.randrange(2,15))
            self.__typeWord(constants.TWITTER_USER)
            subprocess.call(["xte", "key Tab"])
            time.sleep(2)
            self.__typeWord(constants.TWITTER_PW)
            time.sleep(2)
            subprocess.call(["xte", "key Return"])
            time.sleep(random.randrange(2,15))
        '''
        self.__followLink("Kleine Zeitung")
        # time.sleep(5)
        # self.back()
        # self.__followLink("ORF Sport")
        # time.sleep(5)
        # self.back()'''
        self.__followLink("Startseite")
        time.sleep(3)
        print "input twitter"
        field = self.__browser.find_by_id("tweet-box-mini-home-profile").first
        field.click()
        print "geklickt"
        # NOTE(review): `twittertext` is a module-level list defined elsewhere.
        self.__typeWord(twittertext[random.randrange(0,len(twittertext)-1)])
        time.sleep(random.randrange(2,15))
        subprocess.call(["xte", "key Tab"])
        time.sleep(2)
        subprocess.call(["xte", "key Return"])
        print "tweet gepostet"

    def __typeWord(self, word):
        # Type `word` character by character via xte, mapping special
        # characters through the module-level keySyms/upKeys/altGrKeys tables.
        spell = ""
        for i in range(0, len(word)):
            #special character
            if spell == "/":
                spell = "/"+word[i]
            else:
                spell = word[i]
            # TODO: algorithm to decide whether special or normal chars come first
            if spell == "@":
                subprocess.call(["xte", "keydown Control_L"])
                subprocess.call(["xte", "key at"])
                subprocess.call(["xte", "keyup Control_L"])
            # special character
            elif spell not in string.ascii_letters:
                spell = keySyms[spell]
                # special character requiring Shift
                if spell in upKeys:
                    subprocess.call(["xte", "keydown Shift_L"])
                    subprocess.call(["xte", "key "+spell])
                    subprocess.call(["xte", "keyup Shift_L"])
                # special character requiring AltGr
                elif spell in altGrKeys:
                    subprocess.call(["xte", "keydown Alt_R"])
                    subprocess.call(["xte", "key "+spell])
                    subprocess.call(["xte", "keyup Alt_R"])
                else:
                    subprocess.call(["xte", "key "+spell])
            # NOTE(review): unreachable — "ß" is not an ASCII letter, so the
            # branch above handles it first; confirm before relying on it.
            elif spell == "ß":
                spell = "question"
                subprocess.call(["xte", "key "+spell])
            else:
                subprocess.call(["xte", "key "+spell])
def mars_scrape():
    """Scrape Mars news, featured image, weather and facts.

    NOTE(review): reconstructed from a whitespace-collapsed source — the
    nesting of marsHemisphere/get_high_res_url below is a best-effort
    reading; verify against the original. As written, mars_scrape() itself
    returns None and never calls marsHemisphere().
    """
    # --- Mars news (plain GET; no browser needed for the text) ---
    url = "https://mars.nasa.gov/news/"
    # Retrieve page with the requests module
    html = requests.get(url)
    # Create BeautifulSoup object; parse with 'html.parser'
    soup = BeautifulSoup(html.text, 'html.parser')
    # Get title & description
    news_title = soup.find('div', 'content_title', 'a').text
    news_p = soup.find('div', 'rollover_description_inner').text
    # In[6]: (leftover notebook cell — expression has no effect)
    news_title

    # --- JPL Mars Space Images - Featured Image ---
    url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    # Setting up splinter
    executable_path = {'executable_path': 'chromedriver'}
    browser = Browser('chrome', **executable_path)
    browser.visit(url)
    # Moving through the pages; sleeps let each page render before clicking
    time.sleep(5)
    browser.click_link_by_partial_text('FULL IMAGE')
    time.sleep(5)
    browser.click_link_by_partial_text('more info')
    time.sleep(5)
    # Create BeautifulSoup object; parse with 'html.parser'
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    # Get featured image
    results = soup.find('article')
    extension = results.find('figure', 'lede').a['href']
    link = "https://www.jpl.nasa.gov"
    featured_image_url = link + extension

    # --- Mars Weather: first tweet text from the stream ---
    mars_weather_url = 'https://twitter.com/marswxreport?lang=en'
    browser.visit(mars_weather_url)
    time.sleep(1)
    mars_weather_html = browser.html
    mars_weather_soup = BeautifulSoup(mars_weather_html, 'html.parser')
    tweets = mars_weather_soup.find('ol', class_='stream-items')
    mars_weather = tweets.find('p', class_="tweet-text").text
    print(mars_weather)

    # --- Mars Facts: scrape the facts table into a DataFrame ---
    mars_facts_url = 'https://space-facts.com/mars/'
    browser.visit(mars_facts_url)
    time.sleep(1)
    mars_facts_html = browser.html
    mars_facts_soup = BeautifulSoup(mars_facts_html, 'html.parser')
    fact_table = mars_facts_soup.find('table', class_='tablepress tablepress-id-mars')
    column1 = fact_table.find_all('td', class_='column-1')
    column2 = fact_table.find_all('td', class_='column-2')
    facets = []
    values = []
    for row in column1:
        facet = row.text.strip()
        facets.append(facet)
    for row in column2:
        value = row.text.strip()
        values.append(value)
    mars_facts = pd.DataFrame({"Facet": facets, "Value": values})
    # NOTE(review): this rebinding discards the raw page html captured above.
    mars_facts_html = mars_facts.to_html(header=False, index=False)
    mars_facts

    # --- Mars Hemispheres ---
    # NOTE(review): defined but never called here, and it references an
    # undefined `driver` (selenium-style API) instead of `browser` — this
    # would raise NameError if invoked; confirm intent.
    def marsHemisphere():
        hemisphere_url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
        driver.get(hemisphere_url)
        html = driver.page_source
        soup = BeautifulSoup(html, 'html.parser')
        mars_hemisphere_list = []
        products = soup.find("div", class_="result-list")
        hemispheres = products.find_all("div", class_="item")
        for hemisphere in hemispheres:
            title = hemisphere.find("h3").text
            title = title.replace("Enhanced", "")
            end_link = hemisphere.find("a")["href"]
            image_url = "https://astrogeology.usgs.gov/" + end_link
            mars_hemisphere_list.append({"title": title, "img_url": image_url})

        def get_high_res_url(some_url):
            # Fetch a detail page and return its first .tif download link.
            response = requests.get(some_url)
            soup = BeautifulSoup(response.text, 'html.parser')
            links = soup.find_all("a")
            tifs = [j for j in links if ".tif" in j.attrs.get('href')]
            return tifs[0].get('href')

        updated_photos = []
        for data in mars_hemisphere_list:
            link_to_check = data.get('img_url')
            title = data.get('title')
            final_image_url = get_high_res_url(link_to_check)
            updated_photos.append({'Title': title, 'Url': final_image_url})
        return updated_photos
def scrape():
    """Scrape NASA news, the JPL featured image, Mars facts and hemispheres.

    Returns:
        dict with keys ``news_title``, ``news_paragraph``,
        ``featured_image``, ``facts`` (HTML table string) and
        ``hemispheres`` (list of {link, Title} dicts).
    """
    browser = Browser('chrome')
    try:
        # --- NASA Mars News ---
        Nasa_news_url = 'https://mars.nasa.gov/news/'
        browser.visit(Nasa_news_url)
        soup_nasa = BeautifulSoup(browser.html, 'html.parser')
        news_titles = soup_nasa.find_all('div', class_="content_title")[0].text
        news_paragraphs = soup_nasa.find_all('div', class_="article_teaser_body")[0].text

        # --- JPL featured image ---
        url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
        browser.visit(url)
        time.sleep(5)  # let the page render before clicking the lightbox
        browser.find_by_id('full_image').click()
        time.sleep(5)
        browser.click_link_by_partial_text('more info')
        soup = BeautifulSoup(browser.html, 'html.parser')
        url_image_find = soup.find('img', class_='main_image').get("src")
        featured_image_url = 'https://www.jpl.nasa.gov' + url_image_find

        # --- Mars facts table ---
        mars_facts_df = pd.read_html('https://space-facts.com/mars/')[2]
        mars_facts_df.columns = ["Details", "Measures"]
        mars_facts_html = mars_facts_df.to_html()

        # --- Mars hemispheres ---
        # BUG FIX: the original URL had a stray ')' appended, and the page
        # html was parsed BEFORE browser.visit() ran (stale/unused soup).
        url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
        browser.visit(url)
        web_links = browser.find_by_css("a.product-item h3")
        web_list = []
        for i in range(len(web_links)):
            web_hemispheres = {}
            # Re-query after each back(); the old element handles go stale.
            browser.find_by_css("a.product-item h3")[i].click()
            web_hemispheres["link"] = browser.find_link_by_text('Sample').first["href"]
            web_hemispheres["Title"] = browser.find_by_css('h2.title').text
            web_list.append(web_hemispheres)
            browser.back()

        # BUG FIX: the original computed everything and returned nothing.
        return {
            "news_title": news_titles,
            "news_paragraph": news_paragraphs,
            "featured_image": featured_image_url,
            "facts": mars_facts_html,
            "hemispheres": web_list,
        }
    finally:
        browser.quit()
# Collect a {title, img_url} dict for each Mars hemisphere listed on the
# current page.
# NOTE(review): top-level script code — `browser`, `url` and the `soup`
# name (a BeautifulSoup alias) must be defined earlier in the file, and the
# browser is assumed to be sitting on the hemisphere search-results page;
# confirm against the preceding cells.
hemisphere_image_urls = []
hem_dict = {}
# Parse the resulting html with soup
html = browser.html
hem_soup = soup(html, 'html.parser')
# Write code to retrieve the image urls and titles for each hemisphere.
# Find all titles
titles = hem_soup.find_all('h3')
for i in titles:
    t = i.get_text()
    title = t.strip()
    # Follow the hemisphere's detail page by its title text.
    browser.click_link_by_partial_text(t)
    # Grab the full-resolution JPEG download link from the detail page.
    href = browser.find_link_by_partial_href('_enhanced.tif/full.jpg')['href']
    img_url = f'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars/{href}'
    hem_dict = {'title': title, 'img_url': img_url}
    hemisphere_image_urls.append(hem_dict)
    # Return to the results page (url comes from earlier in the script).
    browser.visit(url)
# Print the list that holds the dictionary of each image url and title.
hemisphere_image_urls
# Quit the browser
browser.quit()
def scrape():
    """Scrape Mars news, featured image, weather, facts and hemisphere
    images; return everything as one dict.

    NOTE(review): relies on module-level ``init_browser``, ``bs``,
    ``time``, ``requests``, ``pd`` and ``Browser`` being available.
    """
    browser=init_browser()
    # NASA Mars News scraping:
    # Visit the NASA Mars news website and parse result HTML with BeautifulSoup
    news_url = 'https://mars.nasa.gov/news/'
    browser.visit(news_url)
    html = browser.html
    soup = bs(html, 'html.parser')
    # find articles
    article=soup.find_all('div', class_='list_text')
    # collect and save the latest news title and paragraph
    news_title = article[0].find('div', class_='content_title').text
    news_p=article[0].find('div', class_='article_teaser_body').text
    # JPL Mars Space Images -- featured image URL scraping
    JPL_url='https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(JPL_url)
    # use splinter's click_link_by_partial_text method to press the button
    browser.click_link_by_partial_text('FULL IMAGE')
    # wait 10 s before clicking once more on the "more info" button
    time.sleep(10)
    browser.click_link_by_partial_text('more info')
    # parse the resulting HTML with BeautifulSoup
    html1 = browser.html
    soup1 = bs(html1, 'html.parser')
    rel_img_path=soup1.find('img',class_='main_image').get('src')
    img_url="https://www.jpl.nasa.gov"+rel_img_path
    # Mars Weather
    # fetch the weather Twitter page with requests and parse with bs4
    weather_url='https://twitter.com/marswxreport?lang=en'
    twitter_response=requests.get(weather_url)
    soup2=bs(twitter_response.text,'html.parser')
    weather_twitter=soup2.find('div', class_="js-tweet-text-container")
    mars_weather=weather_twitter.find('p','tweet-text').text
    # Mars Facts
    # Visit the Mars Facts webpage and use pandas to scrape the facts table
    facts_url='https://space-facts.com/mars/'
    mars_facts_df=pd.read_html(facts_url)[0]
    mars_facts_df.columns=['Description','Value']
    facts_df=mars_facts_df.set_index('Description')
    # Use pandas to convert the data to an HTML table string.
    facts_html=facts_df.to_html()
    # Mars Hemispheres
    # visit the USGS Astrogeology site for high-resolution hemisphere images
    Hemisph_url='https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(Hemisph_url)
    Hemisph_html=browser.html
    Hemisph_soup=bs(Hemisph_html,'html.parser')
    # collect the titles of all four hemispheres into a list
    hemisph_names=[]
    results=Hemisph_soup.find('div',class_='collapsible results')
    hemisphs=results.find_all('h3')
    for title in hemisphs:
        hemisph_names.append(title.text)
    # Mac users: set executable path and open a fresh Chrome browser
    # NOTE(review): this replaces the init_browser() instance without
    # quitting it — the first chromedriver process is leaked here.
    executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
    browser = Browser('chrome', **executable_path, headless=False)
    image_urls=[]
    for name in hemisph_names:
        # revisit the search-results page before following each hemisphere
        Hemisph_url='https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
        browser.visit(Hemisph_url)
        # click through to the hemisphere's own page by its link text
        browser.click_link_by_partial_text(name)
        # wait 25 s for the page to load before parsing
        time.sleep(25)
        html_1 = browser.html
        soup_1 = bs(html_1, 'html.parser')
        # the "downloads" box's first anchor is the full-resolution image
        image_url1=soup_1.find('div', class_='downloads').find('a')['href']
        image_urls.append({"title":name,"img_url":image_url1})
    # store all of the scraped data in one dictionary
    mars_data={
        "news_title":news_title,
        "news_paragraph":news_p,
        "featured_image":img_url,
        "weather":mars_weather,
        "facts":facts_html,
        "hemispheres":image_urls
    }
    # close the browser after the scrape
    browser.quit()
    return mars_data
def scrape():
    """Scrape Mars news, featured image, weather, facts and hemisphere
    images; return the collected values as one dict.

    Returns:
        dict with keys News_Title, Paragraph_Text, Most_Recent_Mars_Image,
        Mars_Weather, mars_h (list of {"title", "img_url"} dicts).
    """
    # --- Latest news: plain requests + html5lib is enough for this page ---
    marsinfo_url = 'https://mars.nasa.gov/news'
    response = requests.get(marsinfo_url)
    soup = BeautifulSoup(response.text, 'html5lib')
    marstitle = soup.find('div', class_='content_title').text
    marspar = soup.find('div', class_='rollover_description_inner').text.strip('\n\r\t": ')

    # --- The featured image needs a real browser to click through ---
    executable_path = {'executable_path': 'chromedriver'}
    browser = Browser('chrome', **executable_path, headless=False)
    try:
        imageurl = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
        browser.visit(imageurl)
        browser.click_link_by_partial_text('FULL IMAGE')
        browser.click_link_by_partial_text('more info')
        mars_image = browser.find_by_tag("figure").first.find_by_tag("a")["href"]

        # --- Mars weather: most recent tweet from @MarsWxReport ---
        mars_twitter = requests.get("https://twitter.com/marswxreport?lang=en")
        mars_twittersoup = BeautifulSoup(mars_twitter.text, 'html.parser')
        mars_twitterreport = mars_twittersoup.find_all('div', class_="js-tweet-text-container")
        mars_weather = mars_twitterreport[0].text

        # --- Facts table -> HTML string via pandas ---
        mars_facts = requests.get("https://space-facts.com/mars/")
        mars_space_facts = pd.read_html(mars_facts.text)
        table = mars_space_facts[0]
        table.set_index(0, inplace=True)
        # NOTE: computed but not returned — kept to match the original contract
        facts_html = table.to_html()

        # --- Hemisphere full-resolution images ---
        images = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
        images = requests.get(images)
        soup = BeautifulSoup(images.text, "html.parser")
        images_links = soup.find_all('div', class_='item')
        images_url = 'https://astrogeology.usgs.gov'
        hemisphere_urls = []
        for img in images_links:
            img_title = img.find('h3').text
            img_url = img.find('a', class_='itemLink product-item')['href']
            browser.visit(images_url + img_url)
            img_html = browser.html
            soup = BeautifulSoup(img_html, 'html.parser')
            fullimg_url = images_url + soup.find('img', class_='wide-image')['src']
            hemisphere_urls.append({"title": img_title, "img_url": fullimg_url})
    finally:
        # Bug fix: the original never quit the browser and leaked one
        # chromedriver process per call.
        browser.quit()

    mars_data = {
        "News_Title": marstitle,
        "Paragraph_Text": marspar,
        "Most_Recent_Mars_Image": mars_image,
        "Mars_Weather": mars_weather,
        "mars_h": hemisphere_urls
    }
    return mars_data
def scrape():
    """Scrape Mars news, featured image, weather, facts and hemisphere
    images; return a dict keyed by section name.

    Returns:
        dict with keys News_Title, News_Paragraph, Featured_Img_URL,
        Mars_Weather, Mars_Facts, Hemisphere_Image_URLs.
    """
    scraped_data = {}
    # One browser instance is reused for every page that needs navigation.
    executable_path = {'executable_path': 'chromedriver'}
    browser = Browser('chrome', **executable_path, headless=False)
    try:
        # --- Latest news title and teaser paragraph ---
        url = 'https://mars.nasa.gov/news/'
        browser.visit(url)
        soup = bs(browser.html, 'html.parser')
        scraped_data['News_Title'] = soup.find('div', class_='content_title').text
        scraped_data['News_Paragraph'] = soup.find('div', class_='rollover_description').text

        # --- JPL featured image: click through to the full-size page ---
        url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
        browser.visit(url)
        browser.click_link_by_partial_text('FULL IMAGE')
        time.sleep(4)  # let the intermediate view load before the next click
        browser.click_link_by_partial_text('more info')
        soup = BeautifulSoup(browser.html, "html.parser")
        image = soup.find('figure', class_='lede').a['href']
        scraped_data['Featured_Img_URL'] = 'https://www.jpl.nasa.gov' + image

        # --- Current Mars weather from @MarsWxReport ---
        url = 'https://twitter.com/marswxreport?lang=en'
        time.sleep(3)
        browser.visit(url)
        soup = BeautifulSoup(browser.html, "html.parser")
        scraped_data['Mars_Weather'] = soup.find(
            'p',
            class_='TweetTextSize TweetTextSize--normal js-tweet-text tweet-text'
        ).text

        # --- Facts table, rebuilt as a DataFrame then rendered to HTML ---
        url = 'https://space-facts.com/mars/'
        browser.visit(url)
        soup = BeautifulSoup(browser.html, "html.parser")
        fact_table = soup.find('table', {'class': 'tablepress tablepress-id-p-mars'})
        col_1 = []
        col_2 = []
        for row in fact_table.find_all('tr'):
            cells = row.find_all('td')
            col_1.append(cells[0].text)
            col_2.append(cells[1].text)
        facts_df = pd.DataFrame({'facts': col_1, 'values': col_2})
        scraped_data['Mars_Facts'] = facts_df.to_html()

        # --- Hemisphere titles and full-resolution image links ---
        url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
        browser.visit(url)
        soup = BeautifulSoup(browser.html, 'html.parser')
        titles = [h3.text for h3 in soup.find_all('h3')]
        links = []
        for title in titles:
            browser.click_link_by_partial_text(title)
            time.sleep(1)
            soup = BeautifulSoup(browser.html, 'html.parser')
            link_addr = soup.find('img', class_='wide-image')
            links.append('https://astrogeology.usgs.gov' + link_addr.attrs['src'])
            browser.back()
        scraped_data['Hemisphere_Image_URLs'] = [
            {'title': title, 'img_url': link} for title, link in zip(titles, links)
        ]
    finally:
        # Bug fix: the original never quit the browser, leaking one
        # chromedriver process per call. (Also removed the dead
        # `hemisphere_image_urls = {}` and debug prints.)
        browser.quit()
    return scraped_data
def scrape_all():
    """Scrape the NASA news feed, JPL featured image, Mars fact table and
    hemisphere images; return one dict with all results.

    Returns:
        dict with keys latest_title, latest_description, featured_image,
        mars_fact_table, hemispheres.
    """
    executable_path = {"executable_path": "chromedriver.exe"}
    browser = Browser('chrome', **executable_path, headless=True)

    # --- Latest news: title text and teaser description ---
    url = "https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest"
    browser.visit(url)
    time.sleep(3)  # give the dynamic article list time to render
    soup = bs(browser.html, 'html.parser')
    results = soup.find('div', class_='image_and_description_container')
    title = results.find('div', class_='content_title')
    title_text = title.a.text
    description = results.find('div', class_='article_teaser_body').text

    # --- Featured image: click through to the large-image page ---
    url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(url)
    time.sleep(2)
    browser.click_link_by_id('full_image')
    time.sleep(2)
    browser.click_link_by_partial_text('more info')
    soup = bs(browser.html, 'html.parser')
    image_link = soup.find('img', class_='main_image')['src']
    featured_image_url = ("https://www.jpl.nasa.gov" + image_link)

    # --- Fact table straight into pandas, then back out as styled HTML ---
    url = "https://space-facts.com/mars/"
    tables = pd.read_html(url)
    mars_facts = tables[0]
    mars_facts.columns = ['Facts', 'Mars']
    mars_facts.set_index('Facts', inplace=True)
    fact_table = mars_facts.to_html(classes="table table-striped")

    # --- Hemisphere titles + full-resolution image URLs ---
    url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    browser.visit(url)
    time.sleep(2)
    soup = bs(browser.html, 'html.parser')
    # Bug fix: the original ran two back-to-back find_all() calls and the
    # first one ('a', class_='description') was dead code — only the div
    # lookup was ever used.
    results = soup.find_all('div', class_='description')
    hemispheres = []
    for result in results:
        url_full = ("https://astrogeology.usgs.gov" + result.a['href'])
        browser.visit(url_full)
        time.sleep(2)
        url_soup = bs(browser.html, 'html.parser')
        url_results = url_soup.find('img', class_="wide-image")['src']
        img_url = ("https://astrogeology.usgs.gov" + url_results)
        title = url_soup.find('h2', class_='title').text
        hemispheres.append({"title": title, "img_url": img_url})

    data = {
        "latest_title": title_text,
        "latest_description": description,
        "featured_image": featured_image_url,
        "mars_fact_table": fact_table,
        "hemispheres": hemispheres
    }
    browser.quit()
    return data
def scrape():
    """Scrape Mars news, featured image, weather, facts and hemisphere
    images; return one dict of results.

    NOTE(review): each section opens a brand-new Browser without quitting
    the previous one, so chromedriver processes accumulate; nothing is
    quit before return either.
    """
    #-------------------
    ## NASA Mars News
    #-------------------
    #set up url
    url = 'https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest'
    #set up splinter browser
    executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
    browser = Browser('chrome', **executable_path, headless=False)
    #visit url
    browser.visit(url)
    #pull html + needed info
    html = browser.html
    soup = bs(html, 'html.parser')
    news_title = soup.find('div', class_="content_title").text
    news_p = soup.find('div', class_='article_teaser_body').text
    #----------------------------------------
    ## JPL Mars Space Images - Featured Image
    #----------------------------------------
    #set up url
    jpl_url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    #set up browser and visit url
    browser = Browser('chrome', **executable_path, headless=False)
    browser.visit(jpl_url)
    #navigate to the required html-page
    browser.click_link_by_partial_text('FULL IMAGE')
    #wait 5 s before clicking once more on the "more info" button
    time.sleep(5)
    browser.click_link_by_partial_text('more info')
    #pull html + needed info from the web-page
    jpl_html = browser.html
    soup = bs(jpl_html, 'html.parser')
    result = soup.find('figure', class_='lede')
    featured_image_path = result.a['href']
    featured_image_url = f'https://www.jpl.nasa.gov/{featured_image_path}'
    #----------------
    ## Mars Weather
    #----------------
    #set up + visit url
    weather_url = 'https://twitter.com/marswxreport?lang=en'
    browser = Browser('chrome', **executable_path, headless=False)
    browser.visit(weather_url)
    #pull html + needed info from the web-page
    # NOTE(review): this loop re-parses the same page five times without
    # reloading it, so every iteration sees identical HTML.
    for x in range(1, 6):
        weather_html = browser.html
        soup = bs(weather_html, 'html.parser')
        results = soup.find_all('div', class_='js-tweet-text-container')
        #get tweets that consist of only weather info
        tweets = []
        errors = []
        for result in results:
            try:
                mars_weather = result.find('p', {
                    'data-aria-label-part': '0'
                }).text
                # keep only real weather tweets (they mention 'daylight')
                if 'daylight' in mars_weather:
                    tweets.append(mars_weather)
            except AttributeError as e:
                errors.append(e)
    #get the latest tweet
    # NOTE(review): raises IndexError if no tweet contained 'daylight'.
    mars_weather = tweets[0]
    #----------------
    ## Mars Facts
    #----------------
    #set up + visit url
    facts_url = 'https://space-facts.com/mars/'
    browser = Browser('chrome', **executable_path, headless=False)
    browser.visit(facts_url)
    #pull html + needed info from the web-page
    facts_html = browser.html
    soup = bs(facts_html, 'html.parser')
    data = soup.find('table', class_='tablepress tablepress-id-mars')
    #get only table rows
    table_data = data.find_all('tr')
    #extract needed info from the table
    keys = []
    values = []
    for x in table_data:
        col_1 = x.find('td', class_="column-1").text
        col_2 = x.find('td', class_="column-2").text
        keys.append(col_1)
        values.append(col_2)
    #create a dictionary from keys and values
    dictionary = dict(zip(keys, values))
    #create a dataframe from the dictionary
    mars_df = pd.DataFrame.from_dict(dictionary, orient='index', columns=['Values'])
    #convert dataframe into html
    mars_html = mars_df.to_html()
    #--------------------
    ## Mars Hemispheres
    #--------------------
    #set up and visit url
    hem_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser = Browser('chrome', **executable_path, headless=False)
    browser.visit(hem_url)
    #pull html + get all the links that hold img urls
    hem_html = browser.html
    soup = bs(hem_html, 'html.parser')
    hem_results = soup.find('div', class_="collapsible results")
    hemispheres = hem_results.find_all('a')
    #get all the titles + img urls
    hem_links = []
    hem_titles = []
    for a in hemispheres:
        hem_link = a['href']
        hem_title = a.text
        hem_links.append(f'https://astrogeology.usgs.gov{hem_link}')
        hem_titles.append(hem_title)
    #get only unique values for titles and img urls
    # NOTE(review): pop(0) assumes the first sorted entry is the empty
    # string from the image-only anchors, and the later pairing of
    # titles[i] with img_results[i] relies on both sorts agreeing —
    # fragile if the page markup changes.
    titles = list(set(hem_titles))
    titles.pop(0)
    titles.sort()
    links = list(set(hem_links))
    links.sort()
    #get large size img urls
    img_results = []
    for a in links:
        browser.visit(a)
        time.sleep(5)
        img_html = browser.html
        soup = bs(img_html, 'html.parser')
        # first <li> anchor inside the downloads box is the full-size image
        img_result = soup.find('div', class_="downloads").find('li').a['href']
        img_results.append(img_result)
    #create a list of dictionaries of titles and img_results
    hemisphere_image_urls = []
    hemisphere_image_urls.append({
        "title": titles[0],
        "img_url": img_results[0]
    })
    hemisphere_image_urls.append({
        "title": titles[1],
        "img_url": img_results[1]
    })
    hemisphere_image_urls.append({
        "title": titles[2],
        "img_url": img_results[2]
    })
    hemisphere_image_urls.append({
        "title": titles[3],
        "img_url": img_results[3]
    })
    #store all of the scraped data in one dictionary
    mars_dict = {
        "id": 1,
        "news_title": news_title,
        "news_p": news_p,
        "featured_image_url": featured_image_url,
        "mars_weather": mars_weather,
        "mars_html": mars_html,
        "hemisphere_images": hemisphere_image_urls
    }
    return mars_dict
def scrape():
    """Scrape Mars news, featured image, weather, facts and hemisphere
    images; return one dict of results.

    NOTE(review): three separate Browser instances are opened and none is
    ever quit — chromedriver processes leak on every call.
    """
    #MARS NEWS
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)
    url = "https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest"
    browser.visit(url)
    html = browser.html
    news_soup = bs(html, "html.parser")
    # index [1]: skip the first "content_title" div (site chrome), take the article's
    news_title = news_soup.find_all("div", class_ = "content_title")[1].text
    news_p = news_soup.find("div", class_ = 'article_teaser_body').text
    #JPL IMAGES
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)
    url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(url)
    html = browser.html
    jpg_soup = bs(html, "html.parser")
    jpg_container = jpg_soup.find("div", class_="carousel_items")
    # the image URL is embedded in the article's inline style attribute:
    # background-image: url('...'); — split on the single quotes to get it
    image_url = jpg_container.find("article")["style"]
    url_clean = image_url.split("'")[1]
    jpl_base_url = "https://www.jpl.nasa.gov"
    feat_image_url = jpl_base_url + url_clean
    #print(feat_image_url)
    #MARS WEATHER
    # Fetch the tweet text with plain requests + a short class prefix
    # instead of matching Twitter's long auto-generated CSS class names.
    url = 'https://twitter.com/marswxreport?lang=en'
    response = requests.get(url)
    twit_soup = bs(response.text, 'html.parser')
    mars_w = twit_soup.find_all('p', class_="TweetTextSize")
    for tweet in mars_w:
        # drop the embedded link before checking/keeping the text
        # NOTE(review): raises AttributeError if a tweet has no <a> tag
        tweet.find('a').extract()
        if 'InSight sol' in tweet.text:
            mars_weather = tweet.text
            break
    # bare expression: notebook-style echo, no effect in a script
    mars_weather
    #MARS FACTS
    url = "https://space-facts.com/mars/"
    mars_table = pd.read_html(url)
    mars_table = mars_table[0]
    mars_table.columns = ["Parameter", "Value"]
    mars_table
    mars_Tstring = mars_table.to_html()
    mars_Tstring
    #HEMISPHERES
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)
    url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    response = requests.get(url)
    hemi_soup = bs(response.text, 'html.parser')
    hemi_img_urls = []
    hemi_dict = {}
    hemispheres = hemi_soup.find_all('div', class_="description")
    #splinter through each hemisphere's detail page
    for hemisphere in hemispheres:
        # NOTE(review): .text of the whole description div includes the
        # teaser paragraph, not just the link text — presumably
        # click_link_by_partial_text still matches; verify.
        title = hemisphere.text
        browser.visit(url)
        browser.click_link_by_partial_text(title)
        html = browser.html
        hemi_soup_img = bs(html, 'html.parser')
        # first <li> anchor on the detail page links the full-size image
        img_url = hemi_soup_img.find('li').a['href']
        hemi_dict["title"] = title
        hemi_dict["img_url"] = img_url
        hemi_img_urls.append(hemi_dict)
        hemi_dict = {}
    scrape_output = {
        "news_title": news_title,
        "news_p": news_p,
        "featured_image": feat_image_url,
        "mars_weather": mars_weather,
        "mars_facts": mars_Tstring,
        "hemispheres": hemi_img_urls
    }
    return scrape_output
    #print("run-it")
def scrape_info():
    """Scrape Mars news, featured image, weather, facts and hemisphere
    images; return one dict of results.

    Returns a dict with keys: news_title, news_p, featured_image_url,
    weather_report, mars_facts_html, hemisphere_list.
    """
    # ## Get Mars News
    executable_path = {"executable_path" : "chromedriver.exe"}
    browser = Browser("chrome", **executable_path, headless=False)
    url = 'https://mars.nasa.gov/news/'
    browser.visit(url)
    html = browser.html
    soup = bs(html, 'html.parser')
    news_title = soup.find("div", class_="content_title").text
    news_p = soup.find("div", class_="article_teaser_body").text
    # ## Get Mars Featured Image
    url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(url)
    # click through the carousel to reach the full-size image page
    browser.click_link_by_partial_text("FULL IMAGE")
    time.sleep(3)  # let the intermediate view load before the next click
    browser.click_link_by_partial_text("more info")
    html = browser.html
    soup = bs(html, 'html.parser')
    featured_image = soup.find("figure", class_="lede")
    print(featured_image)
    featured_image_url = "https://www.jpl.nasa.gov" + featured_image.find("a")["href"]
    print(featured_image_url)
    # ## Get Mars Weather
    url = 'https://twitter.com/marswxreport?lang=en'
    response = requests.get(url)
    soup = bs(response.text, 'lxml')
    def getText(parent):
        # join only the direct text children, skipping nested tags (links etc.)
        return ''.join(parent.find_all(text=True, recursive=False)).strip()
    result = soup.find("p", class_="tweet-text")
    weather_report = getText(result)
    print(weather_report)
    # ## Get Mars Facts
    url = "https://space-facts.com/mars/"
    response = requests.get(url)
    soup = bs(response.text, "lxml")
    # the facts table uses column-1 for labels and column-2 for values
    result_labels = soup.find_all("td", class_="column-1")
    result_values = soup.find_all("td", class_="column-2")
    result_labels_text = []
    result_values_text = []
    for rlabel in result_labels:
        result_labels_text.append(rlabel.text)
    for rvalue in result_values:
        result_values_text.append(rvalue.text)
    mars_df = pd.DataFrame({"Stats": result_labels_text, "Values": result_values_text})
    mars_df.set_index("Stats",inplace=True)
    mars_facts_html = mars_df.to_html()
    # ## Get Hemisphere Images
    url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    browser.visit(url)
    hemisphere_list = []
    # partial link texts for the four hemisphere product pages
    hemispheres = ["Cerberus", "Schiaparelli", "Syrtis Major", "Valles Marineris"]
    for x in range(0,4):
        browser.click_link_by_partial_text(hemispheres[x])
        html = browser.html
        soup = bs(html, 'html.parser')
        img_url = "https://astrogeology.usgs.gov" + (soup.find("img", class_="wide-image")["src"])
        title = (soup.find("h2", class_="title").text)
        hemisphere_dict = {"title": title, "img_url":img_url}
        hemisphere_list.append(hemisphere_dict)
        # return to the search results before following the next link
        browser.back()
    browser.quit()
    # Store data in a dictionary
    mars_data = {
        "news_title": news_title,
        "news_p": news_p,
        "featured_image_url": featured_image_url,
        "weather_report" : weather_report,
        "mars_facts_html" : mars_facts_html,
        "hemisphere_list" : hemisphere_list
    }
    return mars_data
def scrape():
    """Scrape Mars news, featured image, weather, facts and hemisphere
    images; return one dict of results.

    Returns:
        dict with keys latest_news_title, latest_news_teaser, feat_img_url,
        mars_weather, mars_facts, hemispheres_url.
    """
    # Function-local imports preserved from the original implementation.
    from bs4 import BeautifulSoup as bs
    from splinter import Browser
    import pandas as pd

    # Start splinter
    executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
    browser = Browser('chrome', **executable_path, headless=False)
    try:
        # --- Latest NASA Mars news title and teaser ---
        nasa_url = 'https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest'
        browser.visit(nasa_url)
        soup = bs(browser.html, 'html.parser')
        latest_news_title = soup.find_all('div', class_='content_title')[0].text
        latest_news_teaser = soup.find_all('div', class_='article_teaser_body')[0].text

        # --- JPL featured image, read from the fancybox button attribute ---
        jpl_mars_url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
        browser.visit(jpl_mars_url)
        soup = bs(browser.html, 'html.parser')
        feat_img_path = soup.find_all(
            'a', class_='button fancybox')[0]['data-fancybox-href']
        feat_img_url = 'https://www.jpl.nasa.gov' + feat_img_path

        # --- Current weather tweet ---
        mars_weather_url = 'https://twitter.com/marswxreport?lang=en'
        browser.visit(mars_weather_url)
        soup = bs(browser.html, 'html.parser')
        mars_weather = soup.find_all(
            'p', class_="TweetTextSize TweetTextSize--normal js-tweet-text tweet-text"
        )[0].text

        # --- Facts table -> {label: value} dict via pandas ---
        mars_facts_url = 'https://space-facts.com/mars/'
        browser.visit(mars_facts_url)
        soup = bs(browser.html, 'html.parser')
        facts_table = soup.find_all('table')
        facts_df = pd.read_html(str(facts_table))[0]
        facts_dict = {row[0]: row[1] for row in facts_df.itertuples(index=False)}

        # --- Hemisphere HD pictures ---
        mars_hemi_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
        base_url = 'https://astrogeology.usgs.gov'
        browser.visit(mars_hemi_url)
        soup = bs(browser.html, 'html.parser')
        items = soup.find_all('div', class_='item')
        button_texts = [item.h3.text for item in items]
        hems_url = []
        for button_text in button_texts:
            browser.click_link_by_partial_text(button_text)
            soup = bs(browser.html, 'html.parser')
            img_url = soup.find_all('img', class_='wide-image')[0]['src']
            img_data = {}
            img_data['title'] = soup.find_all('h2', class_='title')[0].text
            img_data['img_url'] = base_url + img_url
            hems_url.append(img_data)
            # The hemisphere page carries a literal "Back" link to the results.
            browser.click_link_by_partial_text('Back')
    finally:
        # Bug fix: the original never closed the browser, leaking one
        # chromedriver process per call.
        browser.quit()

    mars_info_dict = {}
    mars_info_dict['latest_news_title'] = latest_news_title
    mars_info_dict['latest_news_teaser'] = latest_news_teaser
    mars_info_dict['feat_img_url'] = feat_img_url
    mars_info_dict['mars_weather'] = mars_weather
    mars_info_dict['mars_facts'] = facts_dict
    mars_info_dict['hemispheres_url'] = hems_url
    return mars_info_dict
teaser_body_content = soup.find(class_='article_teaser_body') latest_title = news_content_title.find('a').get_text() teaser_body = teaser_body_content.text teaser_body # Part 2: Getting featured image with use of splinter url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars' browser.visit(url) #visit img url & saving the html browser.visit(url) featured_image = browser.find_by_id('full_image') featured_image.click() time.sleep(5) more_info = browser.click_link_by_partial_text('more info') #more_info.click() html = browser.html img_soup = bs(html, 'lxml') featured_image = img_soup.find('figure', class_='lede') # print(featured_image) latest_image = "https://www.jpl.nasa.gov" + featured_image.find('a')['href'] # Twitter weather url = 'https://twitter.com/marswxreport?lang=en' browser.visit(url) time.sleep(random.random() * 3) html = browser.html tweet_soup = bs(html, 'lxml')
def mars_hemi():
    """Scrape title + full-resolution image URL for the four Mars hemispheres.

    Returns:
        list of four dicts {'img_url': ..., 'title': ...} in the order
        Cerberus, Schiaparelli, Syrtis Major, Valles Marineris.
    """
    # Windows users
    executable_path = {'executable_path': 'chromedriver.exe'}
    # Bug fix: the original passed a project directory as the second
    # positional argument to Browser() and never used the prepared
    # executable_path dict — splinter expects the driver path via
    # **executable_path keyword arguments.
    browser = Browser('chrome', **executable_path, headless=False)

    # 1. Use browser to visit the hemisphere URL
    url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(url)

    hemi_url = 'https://astrogeology.usgs.gov'
    results = []
    # The four copy-pasted sections of the original differed only in the
    # link text, so they are collapsed into one loop.
    for i, link_text in enumerate(['Cerberus', 'Schiaparelli', 'Syrtis', 'Valles']):
        if i:
            # return to the search results before following the next link
            browser.back()
        browser.click_link_by_partial_text(link_text)
        page_soup = soup(browser.html, 'html.parser')
        # find title
        title = page_soup.find("h2", class_='title').text
        # find the relative image url and prepend the base url
        img = page_soup.find('img', class_='wide-image')
        results.append({'img_url': hemi_url + img['src'], 'title': title})

    # Bug fix: the original never quit the browser (driver process leak).
    browser.quit()
    return results
def Scrape():
    """Function to scrape OFO history by:
    1) Initiating ChromeDriver
    2) Navigating to relevant SoCal Gas html sites (high and low OFO history)
    3) Pulling, reformatting, and exporting tabular data in csv file format

    NOTE(review): the browser is never quit, and the final `return`
    appears to sit inside the URL loop — if so, only the first (high-OFO)
    page is ever scraped and exported; confirm intended indentation.
    """
    # Define the path to ChromeDriver, initiate the Browser instance
    executable_path = {"executable_path": "C:/Users/LBro/Desktop/chromedriver.exe"}
    browser = Browser("chrome", **executable_path, headless=True)
    print("CHROME DRIVER NOW RUNNING...")
    # The two OFO history pages to visit and scrape
    ofo_list = ["https://scgenvoy.sempra.com/#nav=/Public/ViewExternalOFO.getOFO",
                "https://scgenvoy.sempra.com/#nav=/Public/ViewExternalLowOFO.getLowOFO"]
    # For each page: visit it, click through to the event-history view, and
    # derive the file-name stem used for the CSV export
    for ofo_i in ofo_list:
        print(ofo_i)
        browser.visit(ofo_i)
        time.sleep(2)
        if 'Low' in ofo_i:
            browser.click_link_by_partial_text('Low OFO/EFO Event History')
            file_name = 'lowofo'
        else:
            browser.click_link_by_partial_text('High OFO Event History')
            file_name = 'highofo'
        # Obtain all the html on the site:
        # 'table' is everything, 'header_rows' are the year headers,
        # 'ledger_data' is the table body content
        time.sleep(5)
        html = browser.html
        soup = bs(html, 'html.parser')
        table = soup.find('table', {'class': 'ledger_table'})
        header_rows = table.find_all("td", {"class": "header_row"})
        ledger_data = table.find_all("td", {"class": "ledger_data"})
        # Pull the header (year) cells into a list, stripping the
        # non-breaking-space padding ('\xa0')
        headers_raw = []
        for each in header_rows:
            headers_raw.append(each.text)
        headers = []
        for each in headers_raw:
            header_rows_cleaned = each.strip('\xa0')
            headers.append(header_rows_cleaned)
        # Same for the body cells
        ledger_raw = []
        for each in ledger_data:
            ledger_raw.append(each.text)
        ledger = []
        for each in ledger_raw:
            header_rows_cleaned = each.strip('\xa0')
            ledger.append(header_rows_cleaned)
        # Keep only the most recent year's data (the left-hand column):
        # the cells arrive row-major, so starting at position 1 (position 0
        # is blank) and stepping by the column count picks out the first
        # column of every row.
        ofo_final = []
        for each in ledger[1::len(headers)]:
            print(each)
            ofo_final.append(each)
        # Convert to a DataFrame keyed by the most recent year header,
        # drop NA rows, then split each comma-joined cell into columns
        df = pd.DataFrame({headers[0]: ofo_final})
        df = df.dropna(axis=0, how='any')
        df = df[str(headers[0])].str.split(',', expand=True)
        # Column 0 is "Month Day"; build a full Date from it plus the year
        # header, and drop rows whose Month_Day entry is blank
        df = df.rename(columns={0: 'Month_Day', 1: 'Stage', 2: 'Percent'})
        df['Date'] = df['Month_Day'] + ", " + str(headers[0])
        df = df[df['Month_Day'] != '']
        df = df.drop(columns=['Month_Day'])
        # Export the file
        # NOTE(review): to_csv(path) returns None, so this function always
        # returns None; its useful effect is the CSV written to disk.
        return df.to_csv(file_name + str(headers[0]) + '.csv', index=False)
#print(news_title) #print(news_p) #----------------------------------------------------------------------------------------------------# #SECTION 2 # URL of page to be scraped url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars' browser.visit(url) time.sleep(2) #Scrape Page html = browser.html soup = BeautifulSoup(html, 'html.parser') browser.click_link_by_partial_text('FULL IMAGE') html = browser.html soup = BeautifulSoup(html, 'html.parser') img_tag = soup.find('img', class_='fancybox-image') try: image_relative_path = img_tag['src'] except: html = browser.html soup = BeautifulSoup(html, 'html.parser') img_tag = soup.find('img', class_='fancybox-image') image_relative_path = img_tag['src'] featured_image_url = 'https://www.jpl.nasa.gov' + image_relative_path
def scrape():
    """Scrape Mars fact tables and USGS hemisphere image links, inserting
    the full-size image URLs into a MongoDB collection.

    NOTE(review): this function appears damaged by credential redaction --
    the `conn` line below is not valid Python, and `x`, `tables` and `db`
    are used without any visible definition. Presumably the redacted region
    created the Mongo client (`db`), read `tables` via pandas.read_html and
    iterated over tweets; confirm against version control before relying on
    the comments here.
    """
    # Initialize PyMongo to work with MongoDBs
    conn = 'mongodb://*****:*****@MarsWxReport':
    # print('NO')
    #collection.insert_one(post)
    # browser.quit()
    x += 1  # NOTE(review): `x` is never initialised in the visible code
    # browser.click_link_by_partial_text('Next')
    #except (ElementDoesNotExist):
    url_facts = 'https://space-facts.com/mars/'
    # NOTE(review): no-op expression; `tables` presumably came from
    # pd.read_html(url_facts) in the redacted region
    type(tables)
    # Comparison table (Mars vs Earth) -- second table on the page
    df_facts = tables[1]
    #df.columns = ['Equatorial Diameter', 'Polar Diameter', 'Mass', 'Moons',
    # 'Orbit Distance', 'Orbit Period', 'Surface Temperature', 'First Record',
    # 'Recorded By']
    df_facts.columns = ['Comparision', 'Mars', 'Earth']
    df_facts.head()  # no-op outside a notebook
    # Mars-only facts table -- first table on the page
    df_factd = tables[0]
    #df_factd.columns = ['Equatorial Diameter', 'Polar Diameter', 'Mass', 'Moons',
    # 'Orbit Distance', 'Orbit Period', 'Surface Temperature', 'First Record',
    # 'Recorded By']
    df_factd.columns = ['data_name', 'mars_data']
    df_factd.head(9)  # no-op outside a notebook
    # Render the facts table as an HTML fragment
    html_fact_table = df_factd.to_html()
    html_fact_table  # no-op (notebook leftover)
    # Launch chromedriver for the USGS hemisphere pages
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)
    url_mars_img = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(url_mars_img)
    html = browser.html
    soup_mars_img = bs(html, 'html.parser')
    # Each hemisphere search result is a <div class="item">
    img_links = soup_mars_img.find_all('div', class_='item')
    img_links  # no-op (notebook leftover)
    img_link_news = []
    # Build an absolute URL for each hemisphere detail page
    for img_link in img_links:
        next_url = img_link.find('a')['href']
        # next_url = link
        print(next_url)
        # url_list.append(book_url)
        print('https://astrogeology.usgs.gov/' + next_url)
        long_next_url = ('https://astrogeology.usgs.gov' + next_url)
        img_link_news.append(long_next_url)
        print('---------new link-----------------------')
    print(img_link_news)
    print('-----------begin large image---------------------')
    # NOTE(review): `db` is not defined in the visible code -- the PyMongo
    # client presumably lives in the redacted region above
    collection2 = db.large_image_mars
    # Visit each detail page, open the 'Sample' image and store its link
    for img_link_new in img_link_news:
        # for x in range(1, 1):
        browser.visit(img_link_new)
        browser.click_link_by_partial_text('Sample')
        html = browser.html
        # print(html)
        soup_largeimage = bs(html, 'html.parser')
        # print(soup_largeimage)
        # The 'downloads' div's first anchor holds the full-size image href
        bigger_image = soup_largeimage.find(
            'div', class_='downloads').find('a')['href']
        #.find_by_text('Sample')
        post2 = {
            'href': bigger_image,
        }
        collection2.insert_one(post2)
        print(bigger_image)
        print('--------end large image------------------------')
def scrape():
    """Scrape Mars news, the JPL featured image, the latest weather tweet,
    the space-facts table and the four USGS hemisphere images.

    Returns:
        dict with keys 'news_title', 'news_p', 'featuredimageurl',
        'mars_weather', 'spacefacttable', 'imagelinks'.
    """
    # set up connection
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=True)

    # visit nasa news site
    nasa_url = 'https://mars.nasa.gov/news/'
    browser.visit(nasa_url)
    html = browser.html
    nasasoup = BeautifulSoup(html, 'html.parser')

    # find most recent news title and description (first 'slide' element)
    result = nasasoup.find_all(class_="slide")
    news_title = result[0].find('h3').text
    news_p = result[0].find(class_='rollover_description_inner').text

    # visit jpl.nasa site
    nasa_url2 = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(nasa_url2)
    html = browser.html
    nasasoup2 = BeautifulSoup(html, 'html.parser')

    # the full-size featured-image URL lives in the 'data-fancybox-href'
    # attribute of the '#full_image' element
    featuredimageurl = 'https://www.jpl.nasa.gov' + nasasoup2.select('#full_image')[0]['data-fancybox-href']

    # visit twitter and grab the most recent weather tweet
    twitterfeed_url = 'https://twitter.com/marswxreport?lang=en'
    browser.visit(twitterfeed_url)
    html = browser.html
    twittersoup = BeautifulSoup(html, 'html.parser')
    mars_weather = twittersoup.find('p', class_="TweetTextSize").text

    # visit space-facts.com; read the first table via pandas and convert it
    # straight back to HTML for rendering
    spacefacts_url = 'https://space-facts.com/mars/'
    browser.visit(spacefacts_url)
    html = browser.html
    spacefacttabledf = pd.read_html(html)[0]
    spacefacttable = spacefacttabledf.to_html(index=False)

    # visit usgs.gov; grab hemisphere name and img_url for each of the
    # four hemispheres
    usgs_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(usgs_url)
    imagelinks = []
    for x in range(4):
        # re-query the links every pass: the handles go stale after back()
        links = browser.find_link_by_partial_text('Enhanced')
        browser.click_link_by_partial_text(links[x].text)
        html = browser.html
        imagesoup = BeautifulSoup(html, 'html.parser')
        result = imagesoup.find('a', text='Sample')
        hemistring = imagesoup.find('h2').text
        # strip the trailing ' Enhanced' (9 chars) from the page title
        imagelinks.append({'title': hemistring[:-9],
                           'img_url': result.attrs['href']})
        browser.back()

    # BUGFIX: the browser was never closed, leaking a chromedriver process
    # on every call
    browser.quit()

    output = {'news_title': news_title, 'news_p': news_p,
              'featuredimageurl': featuredimageurl,
              'mars_weather': mars_weather,
              'spacefacttable': spacefacttable,
              'imagelinks': imagelinks}
    return output
def scrape():
    """Scrape Mars news, the JPL featured image, the Mars facts table and
    the four hemisphere images into a single dictionary.

    Returns:
        dict with keys 'News Title', 'News Para.', 'Featured Image URL',
        'Mars Table', 'Hemispheres'. On a news-scrape failure the news
        entries are None but the dict is still returned.
    """
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)

    mars_dict = {}

    # ----- NASA Mars News -------------------------------------------------
    url1 = '''https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=
19%2C165%2C184%2C204&blank_scope=Latest'''
    browser.visit(url1)
    html1 = browser.html
    soup1 = BeautifulSoup(html1, 'html.parser')

    try:
        step1 = soup1.select_one(
            'div.image_and_description_container div.list_text')
        # find news title
        news_title = step1.find("div", class_="content_title").text
        # find news paragraph
        news_p = step1.find("div", class_="article_teaser_body").text
    except AttributeError:
        # BUGFIX: the original bare `except: return None, None` made the
        # function return a tuple on failure but a dict on success; keep
        # the return type stable and record the failure as None values.
        # (AttributeError is what .find/.text raise on a missing element.)
        news_title = None
        news_p = None

    mars_dict['News Title'] = news_title
    mars_dict["News Para."] = news_p

    # ----- JPL Mars Space Images - Featured Image -------------------------
    url2 = '''https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'''
    browser.visit(url2)
    browser.click_link_by_partial_text('FULL IMAGE')
    time.sleep(3)
    browser.click_link_by_partial_text('more info')
    time.sleep(3)
    html2 = browser.html
    soup2 = BeautifulSoup(html2, 'html.parser')
    # the full-size image link is the anchor inside <figure class="lede">
    image_url = soup2.find('figure', class_="lede").a['href']
    featured_image_url = 'https://www.jpl.nasa.gov' + image_url
    mars_dict['Featured Image URL'] = featured_image_url

    # ----- Mars Facts -----------------------------------------------------
    url3 = 'https://space-facts.com/mars/'
    mars_table = pd.read_html(url3)
    # first table on the site; promote row 0 to the header then drop it
    df = mars_table[0]
    df.columns = df.iloc[0]
    df = df[1:]
    html_table = df.to_html(index=False)
    # remove escape sequences so the fragment is a single line
    html_table = html_table.replace('\n', '')
    mars_dict['Mars Table'] = html_table

    # ----- Mars Hemispheres -----------------------------------------------
    url4 = '''https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'''
    browser.visit(url4)
    # find titles and image urls and build the dictionary
    titles = browser.find_by_css('a.product-item h3')
    hemi_list = []
    for i in range(len(titles)):
        hemi_dict = {}
        # re-query each pass: element handles go stale after back()
        browser.find_by_css('a.product-item h3')[i].click()
        sample = browser.find_by_text('Sample')
        image_url = sample['href']
        hemi_dict['Title'] = browser.find_by_css('h2.title').text
        hemi_dict['ImageURL'] = image_url
        hemi_list.append(hemi_dict)
        browser.back()
        print("---")
        print(hemi_dict['Title'])
        print(image_url)
    mars_dict['Hemispheres'] = hemi_list

    # BUGFIX: close the browser so chromedriver doesn't leak
    browser.quit()
    return mars_dict
def scrape():
    """Scrape Mars news, featured image, weather tweet, facts table and
    hemisphere images.

    Returns:
        dict with keys 'mars_news_title', 'mars_news_text',
        'featured_image_url', 'mars_weather', 'mars_facts',
        'hemisphere_list'.
    """
    # ===========================================
    # declare dictionary for all results
    all_dict = {
        "mars_news_title": "",
        "mars_news_text": "",
        "featured_image_url": "",
        "mars_weather": "",
        "mars_facts": "",
        "hemisphere_list": ""
    }

    # =========================================== Mars news
    mars_news_url = "https://mars.nasa.gov/news/"
    response = requests.get(mars_news_url)
    soup = bs(response.text, "html.parser")
    # each news item is a <div class="slide">; index 0 is the latest
    results = soup.find_all('div', class_="slide")
    mars_news_title = results[0].find(
        "div", class_="content_title").find("a").text.strip()
    print(mars_news_title)
    mars_news_text = results[0].find(
        "div", class_="rollover_description_inner").text.strip()
    print(mars_news_text)
    all_dict["mars_news_title"] = mars_news_title
    all_dict["mars_news_text"] = mars_news_text

    # =========================================== featured image
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)
    mars_image_url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(mars_image_url)
    # open the fancybox overlay holding the full-size image
    button = browser.click_link_by_partial_text("FULL IMAGE")
    # Otherwise, this code cannot run in one flow; please blame Splinter
    time.sleep(1)
    soup = bs(browser.html, "html.parser")
    whatever = soup.find("img", {"class": "fancybox-image"})
    print(type(whatever))
    featured_image_url = "https://www.jpl.nasa.gov" + whatever["src"]
    print(featured_image_url)
    browser.quit()
    all_dict["featured_image_url"] = featured_image_url

    # =========================================== Mars weather
    mars_weather_url = "https://twitter.com/marswxreport?lang=en"
    response = requests.get(mars_weather_url)
    soup = bs(response.text, "html.parser")
    results = soup.find_all('div', class_="js-tweet-text-container")
    # BUGFIX: initialise so a page with no tweet containers cannot raise
    # NameError at the assignment below
    mars_weather = ""
    for result in results:
        # get rid of the unwanted tail (the embedded timeline link)
        trash = result.find("a", class_="twitter-timeline-link")
        # BUGFIX: tweets without a timeline link made extract() crash on None
        if trash is not None:
            trash.extract()
        # now get the "pure" output
        mars_weather = result.find("p", class_="js-tweet-text").text.strip()
        # if it's a valid weather tweet
        if "InSight" in mars_weather:
            print(mars_weather)
            break
    all_dict["mars_weather"] = mars_weather

    # =========================================== Mars facts
    mars_facts_url = "https://space-facts.com/mars/"
    tables = pd.read_html(mars_facts_url)
    table = tables[0]
    # change name of columns
    table.columns = ['Parameter', 'Value']
    mars_facts = table.to_html()
    all_dict["mars_facts"] = mars_facts

    # =========================================== hemispheres
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)
    mars_hemis_url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    browser.visit(mars_hemis_url)
    # each hemisphere thumbnail acts as the button into its detail page
    buttons = browser.find_by_css('img[class="thumb"]')
    buttons_length = len(buttons)
    button = buttons[0]
    dict_list = []
    for i in range(buttons_length):
        button.click()
        soup = bs(browser.html, "html.parser")
        img_title = soup.find('h2', class_="title").text.strip()
        img_url = soup.find('a', target="_blank")['href']
        dict_list.append({"title": img_title, "img_url": img_url})
        # go back one level; element handles go stale, so re-query them
        browser.back()
        buttons = browser.find_by_css('img[class="thumb"]')
        # idiom fix: `i + 1 in range(buttons_length)` was a roundabout
        # bounds check -- compare directly
        if i + 1 < buttons_length:
            button = buttons[i + 1]
    browser.quit()
    all_dict["hemisphere_list"] = dict_list

    print(all_dict)
    return all_dict
def _scrape_hemisphere(browser, url, link_text):
    """Open one USGS hemisphere page via `link_text` and return the
    full-resolution image href from its 'downloads' section (helper for
    Scrape)."""
    browser.visit(url)
    # the page is slow to render; the original code slept 5s at each step
    time.sleep(5)
    browser.click_link_by_partial_text(link_text)
    time.sleep(5)
    soup = BeautifulSoup(browser.html, 'html.parser')
    return soup.find('div', 'downloads').a['href']


def Scrape():
    """Scrape Mars news, featured image, weather, facts and the four
    hemisphere images.

    Returns:
        dict with keys 'news_title', 'news_p', 'featured_image_url',
        'mars_weather', 'profile_html', 'hemisphere_image_urls'.
    """
    print("COMMENCING SCRAPE")
    print("----------------------------------")
    # Empty dictionary
    mars_dict = {}

    # ## NASA Mars News
    url = "https://mars.nasa.gov/news/"
    executable_path = {"executable_path": "/usr/local/bin/chromedriver"}
    browser = Browser("chrome", **executable_path, headless=False)
    browser.visit(url)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    # Get title & description
    news_title = soup.find('div', 'content_title', 'a').text
    news_p = soup.find('div', class_='article_teaser_body').text
    mars_dict["news_title"] = news_title
    mars_dict["news_p"] = news_p
    print("NEWS TITLE & DESCRIPTION ACQUIRED")
    # BUGFIX: every section used to open a new Browser and never quit any,
    # leaking one chromedriver process per section per call
    browser.quit()

    # ## JPL Mars Space Images
    url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    executable_path = {"executable_path": "/usr/local/bin/chromedriver"}
    browser = Browser('chrome', **executable_path, headless=True)
    browser.visit(url)
    # Moving through the pages
    time.sleep(5)
    browser.click_link_by_partial_text('FULL IMAGE')
    time.sleep(5)
    browser.click_link_by_partial_text('more info')
    time.sleep(5)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    # Get featured image: the anchor inside <figure class="lede">
    results = soup.find('article')
    extension = results.find('figure', 'lede').a['href']
    link = "https://www.jpl.nasa.gov"
    featured_image_url = link + extension
    mars_dict["featured_image_url"] = featured_image_url
    print("FEATURED IMAGE ACQUIRED")

    # ## Mars Weather (reuse the same browser session)
    mars_weather_url = 'https://twitter.com/marswxreport?lang=en'
    browser.visit(mars_weather_url)
    time.sleep(5)
    html_weather = browser.html
    soup = BeautifulSoup(html_weather, "html.parser")
    # latest weather tweet is the first text node mentioning "Sol"
    mars_weather = soup.find(string=re.compile("Sol"))
    print(mars_weather)
    mars_dict["mars_weather"] = mars_weather
    print("WEATHER ACQUIRED")
    browser.quit()

    # ## Mars Facts (plain HTTP, no browser needed)
    url = "https://space-facts.com/mars/"
    html = requests.get(url)
    soup = BeautifulSoup(html.text, 'html.parser')
    mars_profile = {}
    results = soup.find('tbody').find_all('tr')
    # Storing profile information
    for result in results:
        key = result.find('td', 'column-1').text.split(":")[0]
        value = result.find('td', 'column-2').text
        mars_profile[key] = value
    profile_df = pd.DataFrame([mars_profile]).T.rename(columns={0: "Value"})
    profile_df.index.rename("Description", inplace=True)
    # Converting to html (joined onto a single line)
    profile_html = "".join(profile_df.to_html().split("\n"))
    mars_dict["profile_html"] = profile_html
    print("FACTS ACQUIRED")

    # ## Mars Hemispheres
    # The four copy-pasted sections are collapsed into one loop over
    # (link text, title) pairs, sharing a single browser.
    url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    executable_path = {"executable_path": "/usr/local/bin/chromedriver"}
    browser = Browser('chrome', **executable_path, headless=True)
    hemispheres = [
        ('Valles Marineris Hemisphere Enhanced', 'Valles Marineris Hemisphere'),
        ('Cerberus Hemisphere Enhanced', 'Cerberus Hemisphere'),
        ('Schiaparelli Hemisphere Enhanced', 'Schiaparelli Hemisphere'),
        ('Syrtis Major Hemisphere Enhanced', 'Syrtis Major Hemisphere'),
    ]
    hemisphere_image_urls = [
        {"title": title,
         "img_url": _scrape_hemisphere(browser, url, link_text)}
        for link_text, title in hemispheres
    ]
    browser.quit()
    mars_dict["hemisphere_image_urls"] = hemisphere_image_urls
    print("HEMISPHERE IMAGES ACQUIRED")
    print("----------------------------------")
    print("SCRAPING COMPLETED")
    return mars_dict
def scrape_mars():
    """Scrape Mars news, weather, featured image, facts table and
    hemisphere images.

    Returns:
        dict with keys 'news_title', 'news_paragraph', 'weather', 'image',
        'facts_table', 'hemispheres'.
    """
    from bs4 import BeautifulSoup
    from splinter import Browser
    import pandas as pd
    import selenium
    import time
    executable_path = {"executable_path": "chromedriver.exe"}
    browser = Browser("chrome", **executable_path, headless=False,
                      incognito=True)

    # scraping news
    url = 'https://mars.nasa.gov/news/'
    browser.visit(url)
    time.sleep(3)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    # scraping the latest news title
    news_title = soup.find('ul', class_='item_list ').find('li', class_='slide').find('div', class_='content_title')\
        .find('a').get_text()
    # scrapping latest news paragraph
    news_p = soup.find('ul', class_='item_list').find(
        'li', class_='slide').find('div', class_='article_teaser_body').get_text()
    print(news_p)
    print(news_title)

    # scraping weather
    url = 'https://twitter.com/marswxreport?lang=en'
    browser.visit(url)
    time.sleep(3)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    # BUGFIX: the original `'Sol' and 'high' and 'low' and 'pressure'`
    # evaluated to just 'pressure' (chained `and` returns the last truthy
    # operand), so any tweet mentioning "pressure" matched. Require ALL
    # keywords, as the construction clearly intended.
    weather_features = ('Sol', 'high', 'low', 'pressure')
    # BUGFIX: initialise so no matching tweet means None instead of NameError
    mars_weather = None
    all_weather_tweets = soup.find_all(
        'li', class_="js-stream-item stream-item stream-item ")
    for tweets in all_weather_tweets:
        tweet_text = tweets.find(
            'div', class_='js-tweet-text-container').find('p').text
        if all(feature in tweet_text for feature in weather_features):
            mars_weather = tweet_text
            break
    print(mars_weather)

    # scraping featured image
    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url)
    time.sleep(3)
    browser.find_by_css('div[class="default floating_text_area ms-layer"]').find_by_css('footer')\
        .find_by_css('a[class="button fancybox"]').click()
    time.sleep(3)
    browser.find_by_css('div[id="fancybox-lock"]').find_by_css('div[class="buttons"]')\
        .find_by_css('a[class="button"]').click()
    featured_image_url = browser.find_by_css('div[id="page"]').find_by_css('section[class="content_page module"]')\
        .find_by_css('figure[class="lede"]').find_by_css('a')['href']
    print(featured_image_url)

    # scraping facts
    url = 'http://space-facts.com/mars/'
    tables = pd.read_html(url)
    df = tables[0]
    df.columns = ['Description', 'Value']
    df = df.set_index('Description')
    mars_info_table = df.to_html()
    print(mars_info_table)

    # scraping hemispheres
    url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(url)
    time.sleep(3)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    hemispheres = soup.find('div', class_='collapsible results').find_all(
        'div', class_='item')
    hemisphere_image_urls = []
    for i in range(len(hemispheres)):
        title = hemispheres[i].find('div', class_="description").find('h3').text
        browser.find_by_css('div[class="collapsible results"]').find_by_css('div[class="item"]')[i]\
            .find_by_css('div[class="description"]').find_by_css('a').click()
        # BUGFIX: initialise per iteration so a page without an 'Original'
        # link cannot silently reuse the previous iteration's url
        img_url = None
        for img in browser.find_by_css('div[class="downloads"]').find_by_css(
                'a'):
            if 'Original' in img.text:
                img_url = img['href']
        browser.click_link_by_partial_text('Back')
        dic = {'title': title, 'img_url': img_url}
        hemisphere_image_urls.append(dic)
        time.sleep(3)
    print(hemisphere_image_urls)

    scrape_dic = {
        'news_title': news_title,
        'news_paragraph': news_p,
        'weather': mars_weather,
        'image': featured_image_url,
        'facts_table': mars_info_table,
        'hemispheres': hemisphere_image_urls
    }
    browser.quit()
    return scrape_dic
# Demo script: drive Baidu search with splinter.
# Relies on `browser` (a splinter Browser) created earlier in the script.
# CONSISTENCY FIX: this block used Python-2-only `print x` statements while
# the rest of the file uses Python 3 `print(...)` calls; converted so the
# file parses under Python 3 (single-argument `print(x)` also behaves the
# same under Python 2).
browser.visit('http://www.baidu.com')
print(browser.url)
print(browser.title)
print(browser.html)

# Input search text
browser.fill('wd', '12306')
# Press the search button
button = browser.find_by_id('su')
button.click()

# Interacting with elements in the page
# (the find_* method returns a list of all found elements)
# (If an element is not found, the find_* methods return an empty list.
# But if you try to access an element in this list,
# the method will raise splinter.exceptions.ElementDoesNotExist )
# [1] Get value of an element
content_left = browser.find_by_id('content_left')
print(len(content_left))
print(content_left[0].value)

# [2] Clicking links
browser.click_link_by_partial_text(u'铁道部火车票网上订票唯一官网 - 铁路客户服务中心')

# Close the browser
import time
time.sleep(10)
browser.quit()