/
scrape_mars.py
181 lines (111 loc) · 5.01 KB
/
scrape_mars.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
from splinter import Browser
from bs4 import BeautifulSoup
import requests
import pandas as pd
import time
def scrape():
    """Scrape Mars data from four sources and return it as one dictionary.

    Collected data:
      * latest NASA Mars news title + teaser paragraph
      * JPL featured Mars image URL
      * Mars facts table rendered as an HTML string
      * title + full-resolution image URL for each Mars hemisphere

    Returns:
        dict: keys 'news_title', 'news_detail', 'featured_img_url',
        'mars_facts_html', 'hemisphere_image_urls' (list of dicts with
        'title' and 'img_url').
    """
    executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
    browser = Browser('chrome', **executable_path, headless=False)
    try:
        news_title, news_p = _scrape_news(browser)
        featured_image_url = _scrape_featured_image(browser)
        mars_facts_html = _scrape_mars_facts()
        hemisphere_image_urls = _scrape_hemispheres(browser)
    finally:
        # Always release the browser/chromedriver process, even when a
        # scrape step fails — the original code leaked it on any exception.
        browser.quit()

    return {
        'news_title': news_title,
        'news_detail': news_p,
        'featured_img_url': featured_image_url,
        'mars_facts_html': mars_facts_html,
        'hemisphere_image_urls': hemisphere_image_urls,
    }


def _scrape_news(browser):
    """Return (title, teaser) of the most recent NASA Mars news article."""
    news_url = ('https://mars.nasa.gov/news/?page=0&per_page=40'
                '&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest')
    browser.visit(news_url)
    time.sleep(2)  # crude wait for the JS-rendered article list to load
    soup = BeautifulSoup(browser.html, 'html.parser')
    # Index [1] skips the page-header div that shares class "content_title".
    news_title = soup.body.find_all('div', class_="content_title")[1].text.strip()
    news_p = soup.body.find_all('div', class_="article_teaser_body")[0].text.strip()
    return news_title, news_p


def _scrape_featured_image(browser):
    """Return the absolute URL of JPL's current featured Mars image."""
    image_url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(image_url)
    time.sleep(2)
    # Absolute xpath is brittle; it targets the "FULL IMAGE" footer link.
    my_xpath = '/html/body/div[1]/div/div[3]/section[1]/div/div/article/div[1]/footer/a'
    browser.find_by_xpath(my_xpath)[0].click()
    # NOTE(review): click_link_by_partial_text is deprecated in newer
    # splinter releases (use browser.links.find_by_partial_text) — kept
    # for compatibility with the pinned version; confirm before upgrading.
    browser.click_link_by_partial_text('more info')
    soup = BeautifulSoup(browser.html, 'html.parser')
    relative_href = soup.find_all('figure', class_='lede')[0].a['href']
    return 'https://www.jpl.nasa.gov' + relative_href


def _scrape_mars_facts():
    """Return the Mars facts table as an HTML string (no header/index)."""
    facts_url = 'https://space-facts.com/mars/'
    # pd.read_html returns every <table> on the page; the first is the
    # Mars measurement/value table.
    table_df = pd.read_html(facts_url)[0]
    return table_df.to_html(header=False, index=False)


def _scrape_hemispheres(browser):
    """Return a list of {'title', 'img_url'} dicts, one per hemisphere."""
    # Note the inconsistent url; alternate mirror if the site is down:
    # https://web.archive.org/web/20181114171728/https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars
    hemispheres_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(hemispheres_url)
    time.sleep(2)
    hemisphere_image_urls = []
    # Re-query the links on every pass: navigating away and back
    # invalidates the previously found elements.
    for i in range(len(browser.find_by_css('a.product-item h3'))):
        hemisphere = {}
        browser.find_by_css('a.product-item h3')[i].click()
        hemisphere['title'] = browser.find_by_css("h2.title").text
        # The 'Sample' anchor links directly to the full-size image.
        sample_elem = browser.find_link_by_text('Sample').first
        hemisphere['img_url'] = sample_elem['href']
        hemisphere_image_urls.append(hemisphere)
        # Navigate back to the results page for the next iteration.
        browser.back()
    return hemisphere_image_urls
#
# xpaths = [
# '/html/body/div[1]/div[1]/div[2]/section/div/div[2]/div[1]/div/a',
# '/html/body/div[1]/div[1]/div[2]/section/div/div[2]/div[2]/div/a',
# '/html/body/div[1]/div[1]/div[2]/section/div/div[2]/div[3]/div/a',
# '/html/body/div[1]/div[1]/div[2]/section/div/div[2]/div[4]/div/a'
# ]
# hem_title = []
# hem_url = []
# mars_hem_title_url = []
# for path in xpaths :
# results = browser.find_by_xpath(path)
# img = results[0]
# img.click()
# html = browser.html
# soup = BeautifulSoup(html, 'html.parser')
# title = soup.find('h2', class_ = 'title').text
# hem_title.append(title)
# hem = soup.find('div', class_='downloads')
# hem_result = hem
# img_url = hem_result.find('a')['href']
# hem_url.append(img_url)
# mars_hem_title_url.append({'title': title, 'img_url': img_url})
# browser.visit(hemispheres_url)
# browser.quit()
# #Store results in dictionary
# notebook_dict = {}
# notebook_dict = {
# 'article_title': news_title,
# 'article_paragraph': news_p,
# 'mars_image': featured_image_url,
# 'mars_data_table': mars_table_df,
# 'hemisphere_image_urls': mars_hem_title_url}
# print(f"index 0 {notebook_dict['article_title']}")
# print(f"index 1 {notebook_dict['article_paragraph']}")
# print(f"index 2 {notebook_dict['mars_image']}")
# print(f"index 3 {notebook_dict['mars_data_table']}")
# print(f"index 4 {notebook_dict['hemisphere_image_urls']}")
# return notebook_dict