import requests, json, csv
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd  # helps convert data into tabular structure
from urllib.request import urlopen  # new module meets 1/6 requirements
import re  # second module
# tools that will make it easier to build on things
from advanced_expiry_caching import Cache

START_URL = "https://en.wikipedia.org/wiki/List_of_Asian_countries_by_area"
FILENAME = "countries.json"

# So I can use 1 (one) instance of the Cache tool -- just one for my whole program,
# even though I'll get data from multiple places
PROGRAM_CACHE = Cache(FILENAME)  # assuming constants exist as such


# use a tool to build functionality here
def access_page_data(url):
    data = PROGRAM_CACHE.get(url)
    if not data:
        data = requests.get(url).text
        PROGRAM_CACHE.set(url, data)  # default here with the Cache.set tool is that it will expire in 7 days, which is probs fine, but something to explore
    return data


#######
main_page = access_page_data(START_URL)
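# Every script in this collection imports advanced_expiry_caching.Cache but the module
# itself is never shown. The class below is only a minimal sketch of the interface these
# scripts assume -- a file-backed cache keyed by URL, with get/set and an expire_in_days
# default of 7. The real module may generate identifiers or store entries differently.
import json, datetime


class Cache:
    def __init__(self, filename):
        self.filename = filename
        try:
            with open(filename, 'r') as f:
                self.diction = json.load(f)
        except (FileNotFoundError, json.JSONDecodeError):
            self.diction = {}

    def get(self, identifier):
        record = self.diction.get(identifier)
        if not record:
            return None
        if datetime.datetime.now() > datetime.datetime.fromisoformat(record["expires"]):
            return None  # treat expired entries as cache misses
        return record["data"]

    def set(self, identifier, data, expire_in_days=7):
        expires = datetime.datetime.now() + datetime.timedelta(days=expire_in_days)
        self.diction[identifier] = {"data": data, "expires": expires.isoformat()}
        with open(self.filename, 'w') as f:
            json.dump(self.diction, f)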
from flask_sqlalchemy import SQLAlchemy
from flask import Flask, render_template, session, redirect, url_for, flash, request
from wtforms import Form, TextField, TextAreaField, validators, StringField, SubmitField, BooleanField
import pandas as pd
import csv, json
from advanced_expiry_caching import Cache  # use tool from the other file for caching
import requests, os
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from sqlalchemy.orm import relationship
import time

########## scraping data from the website: states, topics, activities
FILENAME = "allinfo_parks.json"  # saved in variable with convention of all-caps constant
program_cache = Cache(FILENAME)  # create a cache -- stored in a file of this name

url = "https://www.nps.gov/findapark/advanced-search.htm?p=1&v=0"
# url can act as identifier for caching in a scraping situation -- it IS frequently unique here, unlike in query requests

data = program_cache.get(url)
if not data:
    data = requests.get(url).text
    program_cache.set(url, data, expire_in_days=1)

soup = BeautifulSoup(data, "html.parser")  # html.parser string argument tells BeautifulSoup that it should work in the nice html way
states = soup.find_all(id="form-park")
activities = soup.find_all(id="form-activity")
topics = soup.find_all(id="form-topic")
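# A minimal sketch of how option values could be pulled out of the three result sets above.
# It assumes the advanced-search form lists its choices as <input> elements with a "value"
# attribute inside those containers; the actual nps.gov markup may differ, so the selector
# here is an assumption rather than the site's confirmed structure.
def option_values(result_set):
    values = []
    for container in result_set:
        for inp in container.find_all("input"):  # hypothetical: choices rendered as <input> tags
            if inp.get("value"):
                values.append(inp["value"])
    return values


state_options = option_values(states)
activity_options = option_values(activities)
topic_options = option_values(topics)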
from bs4 import BeautifulSoup  # need beautifulsoup for scraping
import requests, json  # need these to access data on the internet and deal with structured data in my cache
from advanced_expiry_caching import Cache  # use tool from the other file for caching
import csv

states_abbr = ['al', 'ak', 'az', 'ar', 'ca', 'co', 'ct', 'dc', 'de', 'fl', 'ga', 'hi', 'id', 'il', 'in', 'ia',
               'ks', 'ky', 'la', 'me', 'md', 'ma', 'mi', 'mn', 'ms', 'mo', 'mt', 'ne', 'nv', 'nh', 'nj', 'nm',
               'ny', 'nc', 'nd', 'oh', 'ok', 'or', 'pa', 'ri', 'sc', 'sd', 'tn', 'tx', 'ut', 'vt', 'va', 'wa',
               'wv', 'wi', 'wy']

for i in range(len(states_abbr)):
    url = "https://www.nps.gov/state/%s/index.htm" % states_abbr[i]
    FILENAME = "%s_national.json" % states_abbr[i]  # saved in variable with convention of all-caps constant
    program_cache = Cache(FILENAME)  # create a cache -- stored in a file of this name

    data = program_cache.get(url)
    if not data:
        data = requests.get(url).text  # get the text attribute from the Response that requests.get returns -- and save it in a variable. This should be a bunch of html and stuff
        program_cache.set(url, data, expire_in_days=1)  # just 1 day here / for an example in class
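# A sketch of how each cached state page could be parsed once it is stored above.
# The div.list_left / h3 / h2 / h4 selectors mirror the ones used in scrapeNPS()
# elsewhere in this collection; applying them here is an assumption about the page layout.
def parks_for_state(abbr):
    program_cache = Cache("%s_national.json" % abbr)
    url = "https://www.nps.gov/state/%s/index.htm" % abbr
    data = program_cache.get(url)
    if not data:
        data = requests.get(url).text
        program_cache.set(url, data, expire_in_days=1)
    soup = BeautifulSoup(data, "html.parser")
    parks = []
    for park in soup.find_all('div', {'class': 'list_left'}):
        parks.append({
            'name': str(park.find('h3').string),
            'type': str(park.find('h2').string),
            'location': str(park.find('h4').string),
        })
    return parks


# Example: parks_for_state('mi') would return a list of dicts for Michigan's parks.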
import json
import requests  # needed for requests.get below
from bs4 import BeautifulSoup
from advanced_expiry_caching import Cache
from flask import Flask, render_template, session, redirect, url_for  # tools that will make it easier to build on things
# from flask_sqlalchemy import SQLAlchemy  # handles database stuff for us - need to pip install flask_sqlalchemy in your virtual env, environment, etc to use this and run this
from db import db
from db_models import President, Religon, Education

# Constants
FNAME = "example_json_file.json"
START_URL = "https://millercenter.org/president"
CSV_FILE = "example_csv_table.csv"

PROGRAM_CACHE = Cache(FNAME)


# Function which either gets data from the cache or makes a new get request, then returns the data
def scrape_function(some_url):
    # print("getting data")
    data = PROGRAM_CACHE.get(some_url)  # .get matches the Cache interface used by the other scripts in this collection
    if not data:
        print("MAKING NEW REQUEST")
        data = requests.get(some_url).text
        PROGRAM_CACHE.set(some_url, data)
    return data


# scrape the main page and turn the data into a BeautifulSoup object
main_page = scrape_function(START_URL)
main_soup = BeautifulSoup(main_page, features="html.parser")
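# A minimal sketch of pulling the individual president page links out of main_soup.
# It assumes the index page links to each president with "/president/" in the href;
# that path pattern is an assumption about millercenter.org, not something confirmed above.
president_urls = []
for link in main_soup.find_all('a', href=True):
    if '/president/' in link['href']:
        full_url = ("https://millercenter.org" + link['href']) if link['href'].startswith('/') else link['href']
        if full_url not in president_urls:
            president_urls.append(full_url)
# Each URL could then be fed back through scrape_function() to cache and parse the detail pages.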
from bs4 import BeautifulSoup
import requests
from advanced_expiry_caching import Cache


def scrapeNPS():
    ##### CACHE
    FILENAME = "nps_cache.json"
    program_cache = Cache(FILENAME)

    url = "https://www.nps.gov" + "/index.htm"
    data = program_cache.get(url)
    if not data:
        data = requests.get(url).text
        program_cache.set(url, data, expire_in_days=1)
    soup = BeautifulSoup(data, "html.parser")

    ##### Get all state links
    state_lst = []
    for link in soup.find_all('a'):
        if '/state/' in link.get('href', ''):  # .get avoids a KeyError on anchors without an href
            # print(link['href'])
            state_lst.append(link['href'])

    ##### Creating a new CSV called 'park_info'
    new_file = open('park_info.csv', 'w', encoding='utf8')
    new_file.write('name,type,location,description,state')
    new_file.write('\n')

    for states in state_lst:
        ##### Cache by state
        name = states.split("/")
        cache_each_state = "nps_cache_" + name[2] + ".json"
        program_cache = Cache(cache_each_state)
        url = "https://www.nps.gov" + states
        data = program_cache.get(url)
        if not data:
            data = requests.get(url).text
            program_cache.set(url, data, expire_in_days=1)
        soup = BeautifulSoup(data, "html.parser")

        ##### Scrape state's name and all parks
        state = soup.find("h1", "page-title").string  # take the string once, outside the park loop
        park_list = soup.find_all('div', {'class': 'list_left'})  # renamed from 'list' to avoid shadowing the builtin
        for park in park_list:
            park_name = str(park.find('h3').string)
            park_type = str(park.find('h2').string)
            loc = str(park.find('h4').string)
            des = str(park.find('p').string)
            des = des.replace('\n', ' ')
            des = des.replace('"', "'")
            row_string = '"{}","{}","{}","{}","{}"'.format(park_name, park_type, loc, des, state)
            new_file.write(row_string)
            new_file.write('\n')
    new_file.close()

    ##### Save all States info and save as a csv
    new_state_file = open('states.csv', 'w', encoding='utf8')
    new_state_file.write('state,abbreviation,url')
    new_state_file.write('\n')
    for states in state_lst:
        name = states.split("/")
        abbr = name[2].upper()
        url = "https://www.nps.gov" + states
        data = requests.get(url).text
        soup = BeautifulSoup(data, "html.parser")

        ##### Scrape state's name
        state = soup.find("h1", "page-title").string
        row_string = '"{}","{}","{}"'.format(state, abbr, url)  # one row per state
        new_state_file.write(row_string)
        new_state_file.write('\n')
    new_state_file.close()
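# A small usage example: run the scraper and read the resulting park_info.csv back in.
# The column names come from the header written above; csv.DictReader is used here since
# it is already in the standard library.
import csv

if __name__ == "__main__":
    scrapeNPS()
    with open('park_info.csv', encoding='utf8') as f:
        rows = list(csv.DictReader(f))
    print(len(rows), "parks scraped")
    if rows:
        print(rows[0]['name'], '-', rows[0]['state'])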
from matplotlib.offsetbox import (OffsetImage, AnnotationBbox)  # importing helpers that display an image on a plot
from flask import Flask, render_template, session, redirect, url_for
from flask_sqlalchemy import SQLAlchemy
import requests  # needed for requests.get below
from advanced_expiry_caching import Cache  # needed to build the cache below

# Part 1: scraping from Rotten Tomatoes
# Part 2: REST API for OMDB
# Part 3: Creating class models for database

###############################################################################################
############# Webscraping functions to get and cache all Rotten Tomatoes data #################
###############################################################################################

FILENAME = "rt_cache.json"  # saved in variable with convention of all-caps constant
program_cache = Cache(FILENAME)  # creating a cache

url = "https://www.rottentomatoes.com/franchise/marvel_cinematic_universe/"

data = program_cache.get(url)
if not data:  # use the .get function from the Cache class to see if we can get this data from the cache -- do we already have data associated with this url? if not,
    # make a request to get the data from the internet -- all the junk at that page
    data = requests.get(url).text  # get the text attribute from the Response that requests.get returns -- and save it in a variable. This should be a bunch of html and stuff
    # print(data)  # to prove it - this will print out a lot
    # set data in cache:
    program_cache.set(url, data, expire_in_days=7)  # new marvel movie is coming out so I want to have the scores updated at least every week
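# A hedged sketch of a next step: parse the cached franchise page and collect links to
# the individual movie pages. Rotten Tomatoes movie pages live under "/m/", but whether
# the franchise page exposes them as plain <a> tags is an assumption about its markup.
from bs4 import BeautifulSoup

franchise_soup = BeautifulSoup(data, "html.parser")
movie_links = []
for a in franchise_soup.find_all('a', href=True):
    if a['href'].startswith('/m/'):
        full = "https://www.rottentomatoes.com" + a['href']
        if full not in movie_links:
            movie_links.append(full)
# Each link could then be cached and scraped the same way as the franchise page above.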
from bs4 import BeautifulSoup
import requests, json
from advanced_expiry_caching import Cache
import re

FILENAME = "park_cache.json"
program_cache = Cache(FILENAME)

states = [
    "AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DC", "DE", "FL", "GA", "HI", "ID", "IL", "IN", "IA", "KS",
    "KY", "LA", "ME", "MD", "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ", "NM", "NY", "NC",
    "ND", "OH", "OK", "OR", "PA", "RI", "SC", "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI", "WY"
]


def parse_parks():
    '''
    1. cache the content from www.nps.gov
    '''
    for i in range(len(states)):
        eachState = states[i]
        url = "https://www.nps.gov/state/" + eachState + "/index.htm"
        # check if data from this url already exists in cache, if not request.get, then put in cache
        data = program_cache.get(url)
        if not data:
            data = requests.get(url).text
            program_cache.set(url, data, expire_in_days=10)

    '''
# -*- coding: utf-8 -*-
from decimal import Decimal
from bs4 import BeautifulSoup
import requests, json, csv
from advanced_expiry_caching import Cache

###########################################################################################################
## CACHING DATA

FILENAME = "cfb_cache.json"
URL = "https://www.sports-reference.com/cfb/years/2018.html"
URL_into_conf = "https://www.sports-reference.com"  # THIS URL WILL BE USED TO CONCATENATE WITH THE href OF EACH CONFERENCE.
URL_overall = "https://www.sports-reference.com/cfb/years/2018-ratings.html"

PROGRAM_CACHE = Cache(FILENAME)  # create a cache -- stored in a file of this name


## THIS FUNCTION CHECKS FOR DATA IN THE CACHE FILE OR EXECUTES THE REQUEST TO THE WEB PAGE WHEN THE CACHE FILE IS EMPTY
def access_page_data(url):
    data = PROGRAM_CACHE.get(url)  # .get() METHOD IS IMPORTED FROM advanced_expiry_caching; IT GENERATES THE UNIQUE IDENTIFIER.
    if not data:
        data = requests.get(url).text  # THIS .get() IS FROM THE requests LIBRARY.
        # print(data)
        PROGRAM_CACHE.set(url, data)  # default here with the Cache.set tool is that it will expire in 7 days, which is probs fine, but something to explore
    return data
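# A brief usage example: pull the 2018 season page through the cache and hand it to
# BeautifulSoup. Looking for <table> elements is an assumption about the page layout
# (sports-reference pages are generally table-heavy), not something guaranteed above.
season_html = access_page_data(URL)
season_soup = BeautifulSoup(season_html, "html.parser")
tables = season_soup.find_all("table")
print("tables found on the 2018 season page:", len(tables))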
from bs4 import BeautifulSoup
import requests, json, csv, re, random
from advanced_expiry_caching import Cache
from flask import Flask, render_template, session, redirect, url_for
from flask_sqlalchemy import SQLAlchemy
from PIL import Image

FILENAME = "dogs_cache.json"
program_cache = Cache(FILENAME)

url = "https://www.petwave.com/Dogs/Breeds.aspx"
data = requests.get(url).text
soup = BeautifulSoup(data, features="html.parser")
# print(soup.prettify())  # nice for investigation

all_urls = soup.findAll('div', attrs={'class': 'pw-rid-small-headline'})
for headline_div in all_urls:  # renamed from 'url' so the loop variable doesn't shadow the index-page url above
    links = headline_div.findAll('a')
    for a in links:
        new_url = "https://www.petwave.com" + a['href']
        # cache all the breed pages into a json file
        data = program_cache.get(new_url)
        # print(new_url)
        if not data:
            data = requests.get(new_url).text
            program_cache.set(new_url, data)

try:
    cache_file = open(FILENAME, 'r')
    cache_contents = cache_file.read()
    cache_diction = json.loads(cache_contents)
    cache_file.close()
except (FileNotFoundError, json.JSONDecodeError):
    cache_diction = {}  # if the cache file is missing or unreadable, start with an empty dict
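# A hedged sketch of iterating over what was just cached. The exact JSON layout that
# advanced_expiry_caching writes is an assumption here: entries may be raw HTML strings
# or dicts that wrap the HTML alongside expiration metadata, so both cases are handled.
for identifier, entry in cache_diction.items():
    html = entry.get("data") if isinstance(entry, dict) else entry
    if not html:
        continue
    breed_soup = BeautifulSoup(html, features="html.parser")
    title = breed_soup.find('title')
    print(identifier, "->", title.string if title else "no <title> found")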