import requests, json, csv
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd  # helps convert data into a tabular structure
from urllib.request import urlopen  # new module (meets 1 of 6 module requirements)
import re  # second module
# tools that will make it easier to build on things
from advanced_expiry_caching import Cache

START_URL = "https://en.wikipedia.org/wiki/List_of_Asian_countries_by_area"
FILENAME = "countries.json"

# Use a single Cache instance for the whole program, even though data will come from multiple places
PROGRAM_CACHE = Cache(FILENAME)


# Helper built on the Cache tool: return the page data for a url, from the cache when possible
def access_page_data(url):
    data = PROGRAM_CACHE.get(url)
    if not data:
        data = requests.get(url).text
        PROGRAM_CACHE.set(
            url, data
        )  # default here with the Cache.set tool is that it will expire in 7 days, which is probs fine, but something to explore
    return data


#######

main_page = access_page_data(START_URL)
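# For reference, a minimal sketch of what the Cache class imported above from
# advanced_expiry_caching might look like: a JSON-file-backed store keyed by URL,
# with each entry carrying an expiry timestamp (7-day default, matching the comment
# in access_page_data). The real module may differ -- this is illustrative only.
import json, os, datetime

class Cache:
    def __init__(self, filename):
        self.filename = filename
        # load any existing cache file, otherwise start empty
        if os.path.isfile(filename):
            with open(filename, 'r') as f:
                self.diction = json.load(f)
        else:
            self.diction = {}

    def get(self, identifier):
        # return cached data only if the entry exists and has not expired
        entry = self.diction.get(identifier)
        if entry and datetime.datetime.now().timestamp() < entry['expire_at']:
            return entry['data']
        return None

    def set(self, identifier, data, expire_in_days=7):
        # store the data with an expiry timestamp and write the file back out
        expire_at = (datetime.datetime.now()
                     + datetime.timedelta(days=expire_in_days)).timestamp()
        self.diction[identifier] = {'data': data, 'expire_at': expire_at}
        with open(self.filename, 'w') as f:
            json.dump(self.diction, f)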
from flask_sqlalchemy import SQLAlchemy
from flask import Flask, render_template, session, redirect, url_for, flash, request
from wtforms import Form, TextField, TextAreaField, validators, StringField, SubmitField, BooleanField
import pandas as pd
import csv, json
from advanced_expiry_caching import Cache  # use tool from the other file for caching
import requests, os
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from sqlalchemy.orm import relationship
import time

########## Scraping data from the website: states, topics, activities
FILENAME = "allinfo_parks.json"  # saved in variable with convention of all-caps constant
program_cache = Cache(
    FILENAME)  # create a cache -- stored in a file of this name

url = "https://www.nps.gov/findapark/advanced-search.htm?p=1&v=0"  #url can act as identifier for caching in a scraping situation -- it IS frequently unique here, unlike in query requests

data = program_cache.get(url)
if not data:
    data = requests.get(url).text
    program_cache.set(url, data, expire_in_days=1)

soup = BeautifulSoup(
    data, "html.parser"
)  # html.parser string argument tells BeautifulSoup that it should work in the nice html way
states = soup.find_all(id="form-park")
activities = soup.find_all(id="form-activity")
topics = soup.find_all(id="form-topic")
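# A sketch of how the form elements found above could be reduced to plain text
# values, assuming each container holds <option> tags (the exact markup of the
# nps.gov search form is an assumption here, not something verified in this file).
state_options = []
for element in states:
    for option in element.find_all('option'):
        state_options.append(option.text.strip())
# print(state_options)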
from bs4 import BeautifulSoup # need beautifulsoup for scraping
import requests, json # need these to access data on the internet and deal with structured data in my cache
from advanced_expiry_caching import Cache # use tool from the other file for caching
import csv


states_abbr= ['al', 'ak', 'az', 'ar', 'ca', 'co', 'ct', 'dc', 'de', 'fl', 'ga', 'hi',
          'id', 'il', 'in', 'ia', 'ks', 'ky', 'la', 'me', 'md', 'ma', 'mi', 'mn',
          'ms', 'mo', 'mt', 'ne', 'nv', 'nh', 'nj', 'nm', 'ny', 'nc', 'nd', 'oh',
          'ok', 'or', 'pa', 'ri', 'sc', 'sd', 'tn', 'tx', 'ut', 'vt', 'va', 'wa', 'wv', 'wi', 'wy']


for i in range(len(states_abbr)):
    url = "https://www.nps.gov/state/%s/index.htm" % states_abbr[i]
    FILENAME = "%s_national.json" %states_abbr[i]# saved in variable with convention of all-caps constant
    program_cache = Cache(FILENAME) # create a cache -- stored in a file of this name
    data = program_cache.get(url)

    if not data:
        data = requests.get(url).text # get the text attribute from the Response that requests.get returns -- and save it in a variable. This should be a bunch of html and stuff
        program_cache.set(url, data, expire_in_days=1)  # expire after just 1 day for this class example
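    # A sketch of a possible next step inside this loop, assuming the state pages
    # use the same markup that the scrapeNPS example later in this file relies on
    # (each park in a <div class="list_left"> with its name in an <h3>).
    state_soup = BeautifulSoup(data, "html.parser")
    for park in state_soup.find_all('div', {'class': 'list_left'}):
        heading = park.find('h3')
        if heading:
            print(states_abbr[i], heading.string)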
import requests, json  # requests is needed below by scrape_function
from bs4 import BeautifulSoup
from advanced_expiry_caching import Cache
from flask import Flask, render_template, session, redirect, url_for # tools that will make it easier to build on things
# from flask_sqlalchemy import SQLAlchemy # handles database stuff for us - need to pip install flask_sqlalchemy in your virtual env, environment, etc to use this and run this
from db import db
from db_models import President, Religon, Education



#Constants
FNAME = "example_json_file.json"
START_URL = "https://millercenter.org/president"
CSV_FILE = "example_csv_table.csv"

PROGRAM_CACHE = Cache(FNAME)
# Function that either gets data from the cache or makes a new GET request, then returns the data

def scrape_function(some_url):
    # print ("getting data")
    data = PROGRAM_CACHE.get(some_url)
    if not data:
        print("MAKING NEW REQUEST")
        data = requests.get(some_url).text
        PROGRAM_CACHE.set(some_url, data)
    return data

# scrape main page and turn data into BeautifulSoup object
main_page = scrape_function(START_URL)
main_soup = BeautifulSoup(main_page, features="html.parser")
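# A hedged sketch of a possible next step: collect president links from the main
# page, mirroring how the NPS example below filters '/state/' links. The
# '/president/' substring is an assumption about millercenter.org's URL layout.
president_links = []
for link in main_soup.find_all('a', href=True):
    if '/president/' in link['href']:
        president_links.append(link['href'])
# print(president_links)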
Example #5
from bs4 import BeautifulSoup  # imports needed by the function below
import requests
from advanced_expiry_caching import Cache


def scrapeNPS():
    ##### CACHE

    FILENAME = "nps_cache.json"
    program_cache = Cache(FILENAME)

    url = "https://www.nps.gov" + "/index.htm"
    data = program_cache.get(url)

    if not data:
        data = requests.get(url).text
        program_cache.set(url, data, expire_in_days=1)

    soup = BeautifulSoup(data, "html.parser")

    ##### Get all state links
    state_lst = []
    for link in soup.find_all('a', href=True):  # href=True skips anchor tags without an href
        if '/state/' in link['href']:
            # print(link['href'])
            state_lst.append(link['href'])

    ##### Creating a new CSV called 'park_info'
    new_file = open('park_info.csv', 'w', encoding='utf8')
    new_file.write('name,type,location,description,state')
    new_file.write('\n')
    for states in state_lst:

        ##### Cache by states
        name = states.split("/")
        cache_each_state = "nps_cache_" + name[2] + ".json"
        program_cache = Cache(cache_each_state)
        url = "https://www.nps.gov" + states
        data = program_cache.get(url)

        if not data:
            data = requests.get(url).text
            program_cache.set(url, data, expire_in_days=1)
        soup = BeautifulSoup(data, "html.parser")

        ##### Scrape the state's name and all of its parks
        state = soup.find("h1", "page-title")
        state_name = state.string  # grab the text once, before the park loop
        park_list = soup.find_all('div', {'class': 'list_left'})

        for park in park_list:
            name = str(park.find('h3').string)
            park_type = str(park.find('h2').string)
            loc = str(park.find('h4').string)
            des = str(park.find('p').string)
            des = des.replace('\n', ' ')
            des = des.replace('"', "'")

            row_string = '"{}","{}","{}","{}","{}"'.format(
                name, park_type, loc, des, state_name)
            new_file.write(row_string)
            new_file.write('\n')

    new_file.close()

    ##### Save all states' info as a CSV
    new_state_file = open('states.csv', 'w', encoding='utf8')
    new_state_file.write('state,abbreviation,url')
    new_state_file.write('\n')

    for states in state_lst:

        ##### Request each state's page (no caching in this second pass)
        name = states.split("/")
        abbr = name[2].upper()
        url = "https://www.nps.gov" + states
        data = requests.get(url).text

        soup = BeautifulSoup(data, "html.parser")

        ##### Scrape the state's name
        state = soup.find("h1", "page-title")
        state_name = state.string if state else ""

        row_string = '"{}","{}","{}"'.format(state_name, abbr, url)
        new_state_file.write(row_string)
        new_state_file.write('\n')

    new_state_file.close()
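# Design note: the manual '"{}","{}",...'.format(...) quoting above works, but the
# csv module (already imported in other examples here) handles quoting and escaping
# automatically. A minimal sketch of the same row-writing step with csv.writer:
import csv

def write_rows(path, header, rows):
    # rows is an iterable of tuples, e.g. (name, park_type, loc, des, state_name)
    with open(path, 'w', encoding='utf8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(header)
        writer.writerows(rows)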
Example #6
from matplotlib.offsetbox import (
    OffsetImage,
    AnnotationBbox)  # importing a function that helps display the image
from flask import Flask, render_template, session, redirect, url_for
from flask_sqlalchemy import SQLAlchemy
import requests  # needed for the requests.get call below
from advanced_expiry_caching import Cache  # needed to create the cache below
#Part 1: scraping from rotten tomatoes
#Part 2: REST API for OMDB
#Part 3: Creating class models for database

###############################################################################################
############# Web scraping functions to get and cache all Rotten Tomatoes data ################
###############################################################################################

FILENAME = "rt_cache.json"  # saved in variable with convention of all-caps constant

program_cache = Cache(FILENAME)  # creating a cache

url = "https://www.rottentomatoes.com/franchise/marvel_cinematic_universe/"

data = program_cache.get(url)
if not data:  # use the .get function from the Cache class to see if we can get this data from the cache -- do we already have data associated with this url? if not,
    # make a request to get the data from the internet -- all the junk at that page
    data = requests.get(
        url
    ).text  # get the text attribute from the Response that requests.get returns -- and save it in a variable. This should be a bunch of html and stuff
    #print(data) # to prove it - this will print out a lot

    # set data in cache:
    program_cache.set(
        url, data, expire_in_days=7
    )  # new marvel movie is coming out so I want to have the scores updated at least every week
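# Following the pattern used by the other examples in this file: once the page is
# cached, parse it with BeautifulSoup. What to select from the franchise page is
# left open here, since the Rotten Tomatoes markup is not shown in this snippet.
from bs4 import BeautifulSoup

soup = BeautifulSoup(data, "html.parser")
# print(soup.prettify())  # useful for investigating the page structure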
Example #7
from bs4 import BeautifulSoup
import requests, json
from advanced_expiry_caching import Cache
import re

FILENAME = "park_cache.json"
program_cache = Cache(FILENAME)

states = [
    "AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DC", "DE", "FL", "GA", "HI",
    "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD", "MA", "MI", "MN",
    "MS", "MO", "MT", "NE", "NV", "NH", "NJ", "NM", "NY", "NC", "ND", "OH",
    "OK", "OR", "PA", "RI", "SC", "SD", "TN", "TX", "UT", "VT", "VA", "WA",
    "WV", "WI", "WY"
]


def parse_parks():
    '''
    1. cache the content from www.nps.gov
    '''
    for i in range(len(states)):
        eachState = states[i]
        url = "https://www.nps.gov/state/" + eachState + "/index.htm"

        # check if data from this url already exists in cache, if not request.get, then put in cache
        data = program_cache.get(url)
        if not data:
            data = requests.get(url).text
            program_cache.set(url, data, expire_in_days=10)
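# Usage sketch: calling parse_parks() fills park_cache.json; a later step could then
# pull any cached state page back out and parse it (illustrative variable names).
parse_parks()
mi_page = program_cache.get("https://www.nps.gov/state/MI/index.htm")
if mi_page:
    mi_soup = BeautifulSoup(mi_page, "html.parser")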
Example #8
# -*- coding: utf-8 -*-
from decimal import Decimal
from bs4 import BeautifulSoup
import requests, json, csv
from advanced_expiry_caching import Cache

###########################################################################################################
## CACHING DATA

FILENAME = "cfb_cache.json"
URL = "https://www.sports-reference.com/cfb/years/2018.html"
URL_into_conf = "https://www.sports-reference.com"  # THIS URL WILL BE USED TO CONCATENATE WITH THE href OF EACH CONFERENCE.
URL_overall = "https://www.sports-reference.com/cfb/years/2018-ratings.html"

PROGRAM_CACHE = Cache(
    FILENAME)  # create a cache -- stored in a file of this name


## THIS FUNCTION CHECKS FOR DATA IN THE CACHE FILE OR EXECUTES THE REQUEST TO THE WEB PAGE WHEN THE CACHE FILE IS EMPTY
def access_page_data(url):
    data = PROGRAM_CACHE.get(
        url
    )  # .get() METHOD COMES FROM advanced_expiry_caching; THE URL ACTS AS THE UNIQUE IDENTIFIER
    if not data:
        data = requests.get(url).text  # THIS .get() IS FROM THE requests LIBRARY, NOT BS4.
        # print(data)
        PROGRAM_CACHE.set(
            url, data
        )  # default here with the Cache.set tool is that it will expire in 7 days, which is probs fine, but something to explore
    return data
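# Usage sketch: fetch (or re-use from the cache) the pages defined above and parse
# them, following the same pattern as the other examples in this file.
main_soup = BeautifulSoup(access_page_data(URL), "html.parser")
overall_soup = BeautifulSoup(access_page_data(URL_overall), "html.parser")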
Example #9
from bs4 import BeautifulSoup
import requests, json, csv, re, random
from advanced_expiry_caching import Cache
from flask import Flask, render_template, session, redirect, url_for
from flask_sqlalchemy import SQLAlchemy
from PIL import Image

FILENAME = "dogs_cache.json"
program_cache = Cache(FILENAME)

url = "https://www.petwave.com/Dogs/Breeds.aspx"
data = requests.get(url).text
soup = BeautifulSoup(data, features="html.parser")
# print(soup.prettify()) # nice for investigation

all_urls = soup.findAll('div', attrs={'class': 'pw-rid-small-headline'})
for headline_div in all_urls:
    links = headline_div.findAll('a')
    for a in links:
        new_url = "https://www.petwave.com" + a['href']
        #cache all the urls into a json file
        data = program_cache.get(new_url)
        # print(new_url)
        if not data:
            data = requests.get(new_url).text
            program_cache.set(new_url, data)

try:
    cache_file = open(FILENAME, 'r')
    cache_contents = cache_file.read()
    cache_diction = json.loads(cache_contents)