示例#1
0
文件: wiki.py 项目: pshrmn/wikinfo
 def __init__(self, page_config, fetch_config=None):
     """
     build the Page and the Fetch used by this object

     page_config - required dict, passed to Page.from_json to create the
         gatherer.Page stored as self.page
     fetch_config - optional dict of keyword arguments for Fetch, whose
         result is stored as self.fetcher. possible keys are:
             sleep_time - number, default 5
             cache - a gatherer.Cache, default None
             headers - a dict for requests to send with a request, default None
         when it is None, Fetch is created without any arguments
     """
     fetch_kwargs = {} if fetch_config is None else fetch_config
     self.page = Page.from_json(page_config)
     self.fetcher = Fetch(**fetch_kwargs)
示例#2
0
import json
import argparse

from gatherer import Page, Fetch, Cache

# load the JSON definition of the schedule page
with open("pages/schedule.json") as fp:
    schedule_json = json.load(fp)

# module-level fetcher used by get_season and get_week; it sends a custom
# User-Agent header and is given the Cache created from "cache"
c = Cache("cache")
f = Fetch(headers={'User-Agent': 'gatherer agent'}, cache=c)

# Page whose gather() is applied to every fetched schedule dom
schedule = Page.from_json(schedule_json)


def get_season(year, weeks=17):
    """
    fetch and save the schedule of every regular season week of a year

    year - the season to fetch
    weeks - number of regular season weeks to fetch, default 17 (weeks are
        numbered starting from 1)

    every week is fetched and written by get_week, which saves the gathered
    data to data/<year>-<two digit week>.json
    """
    # delegate to get_week so the url and the output format are defined once
    for week in range(1, weeks + 1):
        get_week(year, week)


def get_week(year, week):
    """
    fetch the nfl.com regular season schedule page for one week of a year,
    gather its data with the schedule page and write the result, as indented
    json, to data/<year>-<two digit week>.json
    """
    url = "http://www.nfl.com/schedules/{}/REG{}".format(year, week)
    games = schedule.gather(f.get(url))
    out_path = "data/{}-{:02d}.json".format(year, week)
    with open(out_path, "w") as out:
        json.dump(games, out, indent=2)
示例#3
0
# load the JSON definition of the roster page
with open("roster.json") as fp:
    roster_rules = json.load(fp)

# load a dict with the urls for all of the FBS D1-A teams' roster urls
with open("team_pages.json") as fp:
    team_urls = json.load(fp)

# one Cache is shared by the city lookup and by the roster fetcher
cache = Cache("cache")
# the second argument is a fetch configuration dict (headers, sleep_time and
# cache keys)
wiki_city = city.City(city.city_rule_set, {
    "headers": {"User-Agent": "gatherer"},
    "sleep_time": 0,
    "cache": cache
})

# fetcher and Page used by get_roster
fetcher = Fetch(headers={"User-Agent": "gatherer"}, cache=cache)
roster_page = Page.from_json(roster_rules)
# NOTE(review): presumably remembers cities that were already looked up;
# its use is not visible in this excerpt
KNOWN_CITIES = {}


def get_roster(url):
    """
    fetch the page at url (a football team's roster on espn.com) and return
    what roster_page gathers from it: an array of dicts with hometown and
    position keys
    """
    return roster_page.gather(fetcher.get(url))


def get_coordinates(hometown):
    # if a player does not live in the US or Canada, his hometown is listed as --
    if hometown == "--":
示例#4
0
import json
import os

from gatherer import Page
from snl.fetch import fetcher
from snl.fetch.helpers import repertory_cast_member

# directory of this module and its "rules" subdirectory
LOCAL_DIR = os.path.dirname(__file__)
RULES_DIR = os.path.join(LOCAL_DIR, "rules")

# load the JSON definition of the cast page from rules/cast.json
with open(os.path.join(RULES_DIR, "cast.json")) as fp:
    cast_json = json.load(fp)

# Page built from that definition
cast_page = Page.from_json(cast_json)


def sort_actors(casts):
    """
    split the actors of every cast group into repertory and featured players

    casts - an iterable of dicts. each may have a "members" list whose items
        are dicts with "description" and "actors" keys
    returns a dict with "repertory" and "featured" keys. each value is a list
        with the actors, in the order they were encountered, of the members
        whose description does (repertory) or does not (featured) satisfy
        repertory_cast_member

    a missing or None "members" or "actors" value is treated as empty instead
    of raising a TypeError
    """
    repertory = []
    featured = []
    for cast_group in casts:
        # .get returns None for a missing key, which cannot be iterated
        for members in cast_group.get("members") or []:
            actors = members.get("actors") or []
            if repertory_cast_member(members.get("description")):
                repertory.extend(actors)
            else:
                featured.extend(actors)
    return {"repertory": repertory, "featured": featured}


def cast(season):
    """
    return a dict with the data for a season of Saturday Night Live episodes.
示例#5
0
import json
import os

from gatherer import Page
from snl.fetch import fetcher
from snl.fetch.helpers import full_month, infer_gender

# directory of this module and its "rules" subdirectory
LOCAL_DIR = os.path.dirname(__file__)
RULES_DIR = os.path.join(LOCAL_DIR, "rules")

# load the JSON definition of the actor page from rules/actor.json
with open(os.path.join(RULES_DIR, "actor.json")) as fp:
    actor_json = json.load(fp)

# Page built from that definition
actor_page = Page.from_json(actor_json)


def clean_profile(data):
    """
    convert gathered actor data to the desired profile format

    returns None when data is None. otherwise returns a dict with the name,
    hometown and roles copied from data, the birthdate passed through
    full_month and the gender inferred from the description
    """
    if data is None:
        return None
    cleaned = {}
    cleaned["name"] = data.get("name")
    cleaned["birthdate"] = full_month(data.get("birthdate"))
    cleaned["hometown"] = data.get("hometown")
    cleaned["gender"] = infer_gender(data.get("description"))
    cleaned["roles"] = data.get("roles")
    return cleaned


def profile(actor_url):
    """
    return a dict with the data for a season of Saturday Night Live episodes.
示例#6
0
import json
import os
import re

from gatherer import Page
from snl.fetch import fetcher
from snl.fetch.helpers import abbr_month

# directory of this module and its "rules" subdirectory
LOCAL_DIR = os.path.dirname(__file__)
RULES_DIR = os.path.join(LOCAL_DIR, "rules")

# load the JSON definition of the episode page from rules/episode.json
with open(os.path.join(RULES_DIR, "episode.json")) as fp:
    episode_json = json.load(fp)

# Page built from that definition
episode_page = Page.from_json(episode_json)
# NOTE(review): the string below is a bare expression that only serves as
# documentation. its "cast" list shows key/value pairs without enclosing
# braces; presumably every entry is an object with "name" and "profile"
# keys - confirm against rules/episode.json
"""
episode_page gathers data formatted as:
{
    "air_date": <string>,
    "cast": [
        "name": <string>,
        "profile": <string>
    ]
}
"""


def episode_url(season, episode):
    """
    return the rottentomatoes.com url for an episode of Saturday Night Live

    season and episode are integers; each is zero padded to at least two
    digits in the url
    """
    # format spec is {index:<fill><len><type>}
    template = "https://www.rottentomatoes.com/tv/saturday-night-live/s{0:02d}/e{1:02d}/"
    return template.format(season, episode)
示例#7
0
import json

from gatherer import Page, Fetch, Cache

# load the JSON definition of the stadiums page
with open("pages/stadiums.json") as fp:
    stadium_data = json.load(fp)

# fetcher with a custom User-Agent header and the Cache created from "cache"
c = Cache("cache")
f = Fetch(headers={'User-Agent': 'gatherer agent'}, cache=c)

# the fetcher is handed to the Page, presumably so that the Page's get() can
# be called with a url
stadiums = Page.from_json(stadium_data, f)

output = stadiums.get(
    "http://en.wikipedia.org/wiki/List_of_current_National_Football_League_stadiums"
)

# save what get() returned as indented json
with open("data/stadium_info.json", "w") as fp:
    json.dump(output, fp, indent=2)
示例#8
0
    nums = re.compile(r'(?P<degree>\d+)\u00b0(?P<minutes>\d+)')
    match = nums.search(coord)
    if match is None:
        return
    return match.groupdict()


def coordinate_decimal(coord):
    """
    convert a coordinate given as a mapping with "degree" and "minutes"
    values (anything float() accepts) to decimal degrees
    """
    degrees = float(coord["degree"])
    minutes = float(coord["minutes"])
    return degrees + minutes / 60

# fetcher with a custom User-Agent header and the Cache created from "cache"
# NOTE(review): the name c is assigned again inside the stadium loop that
# follows this setup
c = Cache("cache")
f = Fetch(headers={'User-Agent': 'gatherer agent'}, cache=c)

# Page for the list of stadiums
with open("pages/stadiums.json") as fp:
    stadium_json = json.load(fp)
    stadiums = Page.from_json(stadium_json)

# Page for the coordinates, gathered from each stadium's own url
with open("pages/coordinates.json") as fp:
    coord_json = json.load(fp)
    coords = Page.from_json(coord_json)


# get the basic stadium data
stadium_dom = f.get("http://en.wikipedia.org/wiki/List_of_current_National_Football_League_stadiums")
stadium_data = stadiums.gather(stadium_dom)

# NOTE(review): presumably filled by the stadium loop; how is not visible in
# this excerpt
stadium_coords = {}

for stadium in stadium_data['stadiums']:
    dom = f.get(stadium["url"])
    c = coords.gather(dom)
示例#9
0
import json
import os

from gatherer import Page
from snl.fetch import fetcher
from snl.fetch.helpers import Y_m_d, infer_gender

# directory of this module and its "rules" subdirectory
LOCAL_DIR = os.path.dirname(__file__)
RULES_DIR = os.path.join(LOCAL_DIR, "rules")

# load the JSON definition of the profile page from rules/profile.json
with open(os.path.join(RULES_DIR, "profile.json")) as fp:
    profile_json = json.load(fp)

# Page built from that definition
profile_page = Page.from_json(profile_json)


def clean_profile(data):
    """
    convert gathered profile data to the desired format

    returns None when data is None. otherwise returns a dict with the name
    copied from data, the hometown taken from the "birthplace" value, the
    birthdate passed through Y_m_d and the gender inferred from the
    description
    """
    if data is None:
        return None
    cleaned = {}
    cleaned["name"] = data.get("name")
    cleaned["hometown"] = data.get("birthplace")
    cleaned["birthdate"] = Y_m_d(data.get("birthdate"))
    cleaned["gender"] = infer_gender(data.get("description"))
    return cleaned


def profile(url):
    dom = fetcher.get(url)
    if dom is None:
        print("failed to get profile data")
示例#10
0
# load a dict with the urls for all of the FBS D1-A teams' roster urls
with open("team_pages.json") as fp:
    team_urls = json.load(fp)

# one Cache is shared by the city lookup and by the roster fetcher
cache = Cache("cache")
# the second argument is a fetch configuration dict (headers, sleep_time and
# cache keys)
wiki_city = city.City(city.city_rule_set, {
    "headers": {
        "User-Agent": "gatherer"
    },
    "sleep_time": 0,
    "cache": cache
})

# fetcher and Page used by get_roster; roster_rules is defined before the
# start of this excerpt
fetcher = Fetch(headers={"User-Agent": "gatherer"}, cache=cache)
roster_page = Page.from_json(roster_rules)
# NOTE(review): presumably remembers cities that were already looked up;
# its use is not visible in this excerpt
KNOWN_CITIES = {}


def get_roster(url):
    """
    fetch the page at url (a football team's roster on espn.com) and return
    what roster_page gathers from it: an array of dicts with hometown and
    position keys
    """
    return roster_page.gather(fetcher.get(url))


def get_coordinates(hometown):
    # if a player does not live in the US or Canada, his hometown is listed as --
    if hometown == "--":
示例#11
0
import json

from gatherer import Page, Fetch, Cache

# load the JSON definition of the surnames page
with open("rules/en_wikipedia_org/surnames.json") as fp:
    surname_json = json.load(fp)

c = Cache("cache")
f = Fetch(cache=c)

# the fetcher is handed to the Page, whose get() is then called with a url
surnames = Page.from_json(surname_json, f)

URL = "https://en.wikipedia.org/wiki/List_of_most_common_surnames_in_North_America"
names = surnames.get(URL)
popular_names = names["names"]

# first pass: convert every frequency to a float (in place) and total them
sigma = 0
for name in popular_names:
    count = float(name["frequency"])
    name["frequency"] = count
    sigma += count

# second pass: add each name's share of the total ("percent") and the
# cumulative share scaled to 1000 ("threshold")
running_total = 0
for name in popular_names:
    percent = name["frequency"]/sigma
    name["percent"] = percent
    running_total += 1000*percent
    name["threshold"] = running_total

# the path has no replacement fields, so it is used as is
with open("data/surnames.json", "w") as fp:
    json.dump(popular_names, fp, indent=2)
示例#12
0
import os
import json
import argparse

from gatherer import Page, Fetch

# make sure the output directory used by fetch_and_save exists
os.makedirs("data", exist_ok=True)
# load the JSON definition of the submissions page
with open("submissions.json") as fp:
    sub_json = json.load(fp)

# module-level fetcher (custom User-Agent header) and Page used by
# fetch_and_save
f = Fetch(headers={"User-Agent": "gatherer"})
p = Page.from_json(sub_json)


def fetch_and_save(filename, subreddit=None):
    """
    fetch a reddit page, gather its data and save it as json

    filename - name of the file, inside the data directory, to write to
    subreddit - name of the subreddit to fetch. when None, the reddit front
        page is fetched

    nothing is written when the fetch returns None
    """
    url = "http://www.reddit.com"
    if subreddit is not None:
        url = "http://www.reddit.com/r/{}".format(subreddit)
    dom = f.get(url)
    if dom is None:
        return
    data = p.gather(dom)
    with open("data/{}".format(filename), "w") as out:
        json.dump(data, out)

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('-subreddit', dest='subreddit',
                        help='subreddit to get data from')
    parser.add_argument('-filename', dest='filename',
示例#13
0
文件: dom.py 项目: pshrmn/foraging
import json
from gatherer import Fetch, Cache, Page

# module-level fetcher, backed by the Cache created from "cache", used by
# get_actor and get_movie
cache = Cache("cache")
fetcher = Fetch(cache=cache)

# Page for an actor's profile, used by get_actor
with open("rules/www_rottentomatoes_com/actor.json") as fp:
    actor_json = json.load(fp)
actor_page = Page.from_json(actor_json)

# Page for a movie's profile, used by get_movie
with open("rules/www_rottentomatoes_com/movie.json") as fp:
    movie_json = json.load(fp)
movie_page = Page.from_json(movie_json)


def get_actor(url):
    """
    fetch an actor's profile from url and return a dict with its data, or
    None when the fetch returns None
    """
    # NOTE(review): the meaning of the second positional argument (True) is
    # defined by Fetch.get - confirm against gatherer
    dom = fetcher.get(url, True)
    if dom is None:
        return None
    return actor_page.gather(dom)


def get_movie(url):
    """
    fetch a movie's profile from url and return a dict with its data, or
    None when the fetch returns None
    """
    dom = fetcher.get(url)
    if dom is None:
        return None
    return movie_page.gather(dom)
示例#14
0
import json

from gatherer import Page, Fetch, Cache

# load the JSON definition of the first names page
with open("rules/www_ssa_gov/firstnames.json") as fp:
    name_json = json.load(fp)

c = Cache("cache")
f = Fetch(cache=c)

# the fetcher is handed to the Page, whose get() is then called with a url
names = Page.from_json(name_json, f)

URL = "http://www.ssa.gov/oact/babynames/decades/century.html"
# the code after this setup reads the "ranks" list of the result
all_names = names.get(URL)


def strip_commas(num):
    """
    return the integer value of a string that uses commas as thousands
    separators, e.g. "1,234" -> 1234
    """
    digits = "".join(num.split(","))
    return int(digits)

male_names = []
female_names = []
male_sum, female_sum = 0, 0
for name_pair in all_names["ranks"]:
    male_name = name_pair["male_name"]
    male_count = strip_commas(name_pair["male_count"])
    male_sum += male_count

    female_name = name_pair["female_name"]
    female_count = strip_commas(name_pair["female_count"])
    female_sum += female_count
示例#15
0
import json
import os

from gatherer import Page
from snl.fetch import fetcher
from snl.fetch.helpers import day_month_year

# directory of this module and its "rules" subdirectory
LOCAL_DIR = os.path.dirname(__file__)
RULES_DIR = os.path.join(LOCAL_DIR, "rules")

# load the JSON definition of the season page from rules/season.json
with open(os.path.join(RULES_DIR, "season.json")) as fp:
    season_json = json.load(fp)

# Page built from that definition
season_page = Page.from_json(season_json)


def season_url(season_number):
    """
    build the url of the imdb episode list for a season of saturday night
    live

    as noted when this was written, there were 41 seasons and imdb answers
    any number outside of the range 1-41 with the most recent season
    """
    base = "http://www.imdb.com/title/tt0072562/episodes?season={}"
    return base.format(season_number)


def clean_episodes(data, season):
    """
    convert episodes to the desired format
    """
    if data is None: