def get_nearby_places_for_site(national_site):
    # Look up the site's coordinates first; (0, 0) signals a failed lookup.
    coordinates = get_site_coordinates(national_site)
    if coordinates[0] == 0:
        print(">>> UNABLE TO RETRIEVE NEARBY PLACES")
        return None
    location = "{},{}".format(coordinates[0], coordinates[1])

    site = "GOOGLE"
    topic = "nearby " + national_site
    cache = Cache(cache_file)

    base2 = "https://maps.googleapis.com/maps/api/place/nearbysearch/json?"
    params_d2 = {
        "key": google_places_key,
        "location": location,
        "radius": 10000,
    }

    # Use the cached response if we have one; otherwise hit the Nearby Search API.
    UID = create_id(site, topic)
    nearby_response = cache.get(UID)
    if nearby_response is None:
        nearby_response = requests.get(base2, params_d2).text
        cache.set(UID, nearby_response)

    results = json.loads(nearby_response)["results"]
    nearby_list = []
    for result in results:
        name = result["name"]
        latitude = result["geometry"]["location"]["lat"]
        longitude = result["geometry"]["location"]["lng"]
        nearby_list.append(NearbyPlace(name, latitude, longitude))
    return nearby_list
def get_nearby_places_for_site(national_site):
    cache_file = "part2_nearbysearch.json"
    cache = Cache(cache_file)
    base = "https://maps.googleapis.com/maps/api/place/nearbysearch/json?"

    location = get_location_for_site(national_site)
    if location is None:
        # No coordinates were found, so there is nothing to search around.
        return []

    params_diction = {}
    params_diction["key"] = google_places_key
    params_diction["location"] = location
    params_diction["radius"] = 10000

    # Cache key built from the request parameters.
    identifier = (base + params_diction["key"] + params_diction["location"]
                  + str(params_diction["radius"]))
    response = cache.get(identifier)
    if response is None:
        response = json.loads(requests.get(base, params_diction).text)
        cache.set(identifier, response, 10)

    nearby_list = []
    for nearby in response["results"]:
        name = nearby["name"]
        lat = nearby["geometry"]["location"]["lat"]
        lng = nearby["geometry"]["location"]["lng"]
        nearby_list.append(NearbyPlace(name, lat, lng))
    return nearby_list
def get_site_coordinates(national_site):
    site = "GOOGLE"
    topic = national_site
    cache = Cache(cache_file)

    base1 = "https://maps.googleapis.com/maps/api/place/findplacefromtext/json?"
    params_d = {}
    params_d["key"] = google_places_key
    params_d["input"] = national_site
    params_d["inputtype"] = "textquery"
    params_d['fields'] = 'geometry,formatted_address'
    # params_d["locationbias"] = "point:lat,lng"

    # Fetch from the Find Place API only when the response is not cached.
    UID = create_id(site, topic)
    get_data = cache.get(UID)
    if get_data is None:
        get_data = requests.get(base1, params_d).text
        cache.set(UID, get_data)

    site_data = json.loads(get_data)
    try:
        place = site_data['candidates'][0]
        latitude = place['geometry']['location']['lat']
        longitude = place['geometry']['location']['lng']
        site_coordinates = (latitude, longitude)
    except (KeyError, IndexError):
        # Fall back to (0, 0), which callers treat as "no coordinates found".
        site_coordinates = (0, 0)
        print("Sorry! There was an error retrieving coordinates for {}. "
              "We will not be able to list its nearby places or map it."
              .format(national_site))
    return site_coordinates
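# The snippets above rely on a create_id(site, topic) helper that is not shown
# in this section. The sketch below is one plausible, assumed implementation of
# that cache-key builder; any string that is unique per (site, topic) pair
# would work just as well.
def create_id(site, topic):
    # e.g. create_id("GOOGLE", "nearby Yellowstone") -> "GOOGLE_NEARBY_YELLOWSTONE"
    return "{}_{}".format(site, topic).replace(" ", "_").upper()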
def website_scraping_and_cache(url):
    # Return a BeautifulSoup object for the page, fetching it only on a cache miss.
    cache = Cache("national_sites.json")
    result = cache.get(url)
    if not result:
        result = requests.get(url).text
        cache.set(url, result, 30)
    return BeautifulSoup(result, 'html.parser')
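# Every snippet in this section assumes a Cache class with get(key) and
# set(key, value, expiry) methods backed by a JSON file. The class below is a
# minimal sketch of such a cache (the expiry argument is assumed to be a number
# of days); the project's real implementation may differ.
import json
import os
from datetime import datetime, timedelta


class Cache:
    def __init__(self, filename):
        self.filename = filename
        if os.path.exists(filename):
            with open(filename) as f:
                self.data = json.load(f)
        else:
            self.data = {}

    def get(self, key):
        entry = self.data.get(key)
        if entry is None:
            return None
        # Treat entries past their expiration timestamp as missing.
        if datetime.now() > datetime.fromisoformat(entry["expires"]):
            return None
        return entry["value"]

    def set(self, key, value, expire_in_days=7):
        expires = datetime.now() + timedelta(days=expire_in_days)
        self.data[key] = {"value": value, "expires": expires.isoformat()}
        with open(self.filename, "w") as f:
            json.dump(self.data, f)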
def get_sites_for_state(state_abbr):
    cache_file = "part1.json"
    url_to_scrape = "https://www.nps.gov/state/{}/index.htm".format(state_abbr)
    cache = Cache(cache_file)
    if cache.get(url_to_scrape) is None:
        html_text = requests.get(url_to_scrape).text
        cache.set(url_to_scrape, html_text, 10)
    soup = BeautifulSoup(cache.get(url_to_scrape), features='html.parser')
    parks = soup.find(id="list_parks").find_all(class_='clearfix')

    # For each National Site, collect the site name, site type, description,
    # and the physical (or mailing) address from its detail page.
    national_park_list = []
    for park in parks:
        site_name = park.find('h3').text
        site_type = park.find('h2').text
        site_desc = park.find('p').text
        address_url = park.find_all('a')[2].get('href')

        # The address lives on a separate page, cached in its own file.
        address_cache = Cache("part1_address.json")
        if address_cache.get(address_url) is None:
            html_text = requests.get(address_url).text
            address_cache.set(address_url, html_text, 10)
        soup_add = BeautifulSoup(address_cache.get(address_url),
                                 features='html.parser')
        address_street = soup_add.find(itemprop='streetAddress').text
        address_city = soup_add.find(itemprop='addressLocality').text
        address_state = soup_add.find(itemprop='addressRegion').text
        address_zip = soup_add.find(itemprop='postalCode').text

        national_park_list.append(
            NationalSite(site_type, site_name, site_desc, address_street,
                         address_city, address_state, address_zip))
    return national_park_list
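# Example usage of the scraper above. This assumes the seven-argument
# NationalSite constructor it calls stores the values as attributes such as
# .name and .type; the exact attribute names are an assumption, since that
# class variant is not shown in this section.
if __name__ == "__main__":
    michigan_sites = get_sites_for_state("mi")
    for site in michigan_sites:
        print("{} ({})".format(site.name, site.type))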
def get_sites_for_state(state_abbr):
    # `site`, `cache_file`, and `project_dictionary` are module-level values
    # defined elsewhere in this file.
    state = state_abbr
    topic = state
    cache = Cache(cache_file)
    base = project_dictionary[state]

    UID = create_id(site, topic)
    state_response = cache.get(UID)
    if state_response is None:
        state_response = requests.get(base).text
        cache.set(UID, state_response)

    return state_process(state_response)
def get_nearby_places_for_site(national_site):
    nearby_places_list = []  # the result list that stores the nearby places
    lng, lat = get_geolocation_info(national_site)
    national_site.lat = lat
    national_site.lng = lng

    if lng is None and lat is None:
        print("There is no geolocation info for " + str(national_site) + ".")
    else:
        params_dic = {
            "key": google_places_key,
            "location": str(lat) + "," + str(lng),
            "radius": 10000
        }
        unique_identifier = params_unique_combination(nearbysearch_base_url,
                                                      params_dic)
        cache = Cache("nearby_places.json")
        places_json = cache.get(unique_identifier)
        if not places_json:
            result = requests.get(nearbysearch_base_url, params=params_dic)
            places_json = json.loads(result.text)
            cache.set(unique_identifier, places_json, 30)

        # Build NearbyPlace objects; skip coordinates that the API omits.
        for place in places_json.get("results", []):
            place_class = NearbyPlace(place['name'])
            try:
                place_class.lat = place['geometry']['location']['lat']
                place_class.lng = place['geometry']['location']['lng']
            except KeyError:
                pass
            nearby_places_list.append(place_class)
    return nearby_places_list
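# The nearby-search functions above construct NearbyPlace objects in two ways:
# NearbyPlace(name, lat, lng) and NearbyPlace(name) with .lat/.lng assigned
# afterwards. The class is not defined in this section; the sketch below
# supports both calling styles, and anything beyond the attribute names used
# above is an assumption.
class NearbyPlace:
    def __init__(self, name, lat=None, lng=None):
        self.name = name
        self.lat = lat
        self.lng = lng

    def __str__(self):
        return "{} ({}, {})".format(self.name, self.lat, self.lng)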
def get_location_for_site(national_site):
    cache_file = "part2_textsearch.json"
    cache = Cache(cache_file)
    base = "https://maps.googleapis.com/maps/api/place/textsearch/json?"

    params_diction = {}
    params_diction["query"] = "{},{}".format(national_site.name,
                                             national_site.type)
    params_diction["key"] = google_places_key

    # Cache key built from the query string and API key.
    identifier = base + params_diction["query"] + params_diction["key"]
    response = cache.get(identifier)
    if response is None:
        response = json.loads(requests.get(base, params_diction).text)
        cache.set(identifier, response, 10)

    try:
        lat = str(response["results"][0]["geometry"]["location"]["lat"])
        lng = str(response["results"][0]["geometry"]["location"]["lng"])
        return lat + ',' + lng
    except (KeyError, IndexError):
        # No usable result for this site.
        return None
def get_geolocation_info(national_site):
    params_dic = {
        "key": google_places_key,
        "address": national_site.name + " " + national_site.type
    }
    unique_identifier = params_unique_combination(geolocation_base_url,
                                                  params_dic)
    cache = Cache("geolocation_info.json")
    geolocation_json = cache.get(unique_identifier)
    if not geolocation_json:
        result = requests.get(geolocation_base_url, params=params_dic)
        geolocation_json = json.loads(result.text)
        cache.set(unique_identifier, geolocation_json, 30)

    try:
        geolocation = geolocation_json["results"][0]['geometry']['location']
        lng = geolocation['lng']
        lat = geolocation['lat']
    except (KeyError, IndexError):
        # The geocoding request returned no results for this site.
        lng = None
        lat = None
    return lng, lat
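# get_geolocation_info and the nearby-search variant above cache responses
# under a key built by params_unique_combination(base_url, params_dict), which
# is not defined in this section. A plausible sketch is below: sorting the
# keys ensures the same parameters always produce the same identifier. The
# exact separators are an assumption.
def params_unique_combination(baseurl, params_d):
    alphabetized_keys = sorted(params_d.keys())
    res = []
    for k in alphabetized_keys:
        res.append("{}-{}".format(k, params_d[k]))
    return baseurl + "_".join(res)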
import requests
import json
from datetime import datetime
from bs4 import BeautifulSoup
import plotly
import plotly.plotly as py
import plotly.graph_objs as go

#################################
# PART 1: SCRAPING DATA FROM TA #
#################################
CACHE_FNAME = "top_destination.json"
url_to_scrape = "https://www.tripadvisor.com/TravelersChoice-Destinations"
cache_html = Cache(CACHE_FNAME)

# Scrape the Travelers' Choice page only if it is not already cached.
if cache_html.get(url_to_scrape) is None:
    print("DATA NOT IN CACHE, SCRAPING FROM URL NOW")
    html_text = requests.get(url_to_scrape).text
    cache_html.set(url_to_scrape, html_text, 10)

soup = BeautifulSoup(cache_html.get(url_to_scrape), features="html.parser")

# Each destination is wrapped in a <div class="mainName"> element.
mainnames = soup.find_all("div", class_="mainName")
destination_name_lst = []
for mainname in mainnames:
    name = mainname.find("a").text
    destination_name_lst.append(name)

# Keep only the city portion of each "City, Country" string.
city_name_lst = []
for items in destination_name_lst:
    city_name = items.split(",")[0]
    city_name_lst.append(city_name)
        project_dictionary[t.attrs['href'][7:9]] = ("https://www.nps.gov"
                                                    + t.attrs["href"])
    return project_dictionary


#################################
#   CONFIG & RUN LIST SCRAPE   #
#################################
cache_file = "NPS.json"
site = "NPS"
topic = "states"
cache = Cache(cache_file)
base = "https://www.nps.gov/index.htm"

# Fetch the NPS home page (which links every state page) unless it is cached.
UID = create_id(site, topic)
response = cache.get(UID)
if response is None:
    response = requests.get(base).text
    cache.set(UID, response)
process(response)


#####################################
## NATIONAL SITE CLASS
#####################################
class NationalSite():
    def __init__(self, type, name, desc, address, url=None):
        self.type = type
def process(response):
    # Parse the state list page for each site's name, type, description, and
    # detail-page URL, then scrape each detail page for its mailing address.
    site_lst = []
    soup = BeautifulSoup(response, 'html.parser')
    national_site_container = soup.find_all(
        'div', class_='col-md-9 col-sm-9 col-xs-12 table-cell list_left')

    # First pass: pull the basic fields for every site on the page.
    park_info = []
    for container in national_site_container:
        name = container.h3.text
        site_type = container.h2.text
        desc = container.p.text
        url = ("https://www.nps.gov" + container.h3.a.get('href')
               + "index.htm")
        park_info.append((name, site_type, desc, url))

    # Second pass: look at each URL and scrape that page for the address.
    cache_address = Cache("nps_address.json")
    for name, site_type, desc, url in park_info:
        UID = create_id_sites(url)
        response2 = cache_address.get(UID)
        if response2 is None:
            response2 = requests.get(url).text
            cache_address.set(UID, response2, 100)
        soup2 = BeautifulSoup(response2, "html.parser")

        try:
            address_street = soup2.find(
                attrs={"itemprop": "streetAddress"}).text.replace('\n', '')
            address_city = soup2.find(
                attrs={"itemprop": "addressLocality"}).text
            address_state = soup2.find(
                attrs={"itemprop": "addressRegion"}).text
            address_zip = soup2.find(
                attrs={"itemprop": "postalCode"}).text.strip()
        except AttributeError:
            # Some site pages have no structured address markup.
            address_street = "Not found"
            address_city = "Not found"
            address_state = "Not found"
            address_zip = "Not found"

        address = "{}, {}, {} {}".format(address_street, address_city,
                                         address_state, address_zip)
        # Create a NationalSite instance (signature from the class above) and
        # append it to the result list.
        site_lst.append(NationalSite(site_type, name, desc, address, url))

    return site_lst
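# process() caches each detail page under create_id_sites(url), another helper
# that is not shown in this section. Since the page URL is already unique per
# site, a minimal sketch (an assumption, not the project's actual helper) can
# simply return it unchanged:
def create_id_sites(url):
    # The detail-page URL is unique per site, so it can serve as the cache key.
    return url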