def getAll(dt, files=None): dbobj = db.Db(config.config['db']) step_time = time.time() #logging.info("MASTER - starting ETL job - date: %s - database: %s" % (dt, db_credentials['db'])) # Default set of files/tables to populate if not files: files = [ 'boxscore_nbacom', 'boxscore_cbssports', 'playbyplay_espn', 'playbyplay_nbacom', 'shotchart_cbssports', 'shotchart_espn', 'shotchart_nbacom' ] # Choose games games = chooseGames(dt, dbobj) # MAIN ETL PROCESS #print "+++ MASTER ETL - files: %s - database: %s" % (str(files), db_credentials['db']) # Get source gamedata = source.main.go(games, files) extract.main.go(gamedata) clean.main.go(gamedata, dbobj) load.main.go(gamedata, dbobj) #afterclean2.main.go(gamedata, dbobj) tomorrow = dt + datetime.timedelta(days=1) #findgames.go(tomorrow) time_elapsed = "Total time: %.2f sec" % (time.time() - step_time) print time_elapsed logging.info(time_elapsed)
def cleanOnly(dt):
    """Re-run only the clean stage for games on *dt* against the NBA test DB.

    NOTE(review): calls chooseGames(dt) with one argument, unlike other
    chunks that pass a db object -- presumably a different module's helper.
    """
    database = db.Db(db.dbconn_nba_test)
    chosen = chooseGames(dt)
    docs = getExistingSourceDocs(chosen)
    clean.main.go(docs, database)
def scrapeDailyAuto(dt, files = None): step_time = time.time() dbobj = db.Db(config.config['db']) config_no_pw = config.config['db'].copy() del config_no_pw['passwd'] if not files: files = [ 'boxscore_nbacom', 'boxscore_cbssports', 'playbyplay_espn', 'playbyplay_nbacom', 'shotchart_cbssports', 'shotchart_espn', 'shotchart_nbacom' ] # MAIN ETL PROCESS print "+++ MASTER ETL - files: %s - database: %s" % (str(files), config_no_pw) logging.info("MASTER - starting ETL job - date: %s - database: %s" % (dt, config_no_pw)) scrape(dbobj, dt, files) tomorrow = dt + datetime.timedelta(days=1) time_elapsed = "Total time: %.2f sec" % (time.time() - step_time) logging.info(time_elapsed)
def restartFromClean(dt):
    """Redo the clean and load stages for *dt* using already-fetched source docs."""
    database = db.Db(db.dbconn_nba)
    docs = getExistingSourceDocs(chooseGames(dt))
    clean.main.go(docs, database)
    load.main.go(docs, database)
def main():
    """Ad-hoc driver: pull play-by-play data for game id 2734 via Clean."""
    database = db.Db(db.dbconn_nba)
    game = database.query_dict("SELECT * FROM game WHERE id = %s" % (2734))[0]
    cleaner = Clean('%s_playbyplay_nbacom' % (game['abbrev']), game, database)
    data = cleaner.getPlayByPlayData()
def main():
    """Ad-hoc driver: clean the stats.nba.com box score for game id 4766."""
    database = db.Db(config.dbconn_prod_nba)
    game = database.query_dict("SELECT * FROM game WHERE id = %s" % (4766))[0]
    cleaner = Clean('%s_boxscore_statsnbacom' % (game['abbrev']), game, database)
    cleaner.clean()
def test():
    """Load every cleaned game_stats dump found in LOGDIR_CLEAN."""
    database = db.Db(db.dbconn_nba)
    loader = Load(database)
    for fname in os.listdir(LOGDIR_CLEAN):
        if 'game_stats' in fname:
            loader.loadBoxScoreNbaComGameStats(fname)
def setUp(self):
    """Wire a pbp_nbacom.Clean to a fake filename and a minimal game row."""
    self.game_id = 1
    fake_game = {
        'away_team_id': 1,
        'home_team_id': 2,
        'abbrev': 'game',
        'id': self.game_id,
        'date_played': '2013-01-01',
    }
    self.pbp_nbacom = pbp_nbacom.Clean('fake_filename', fake_game,
                                       db.Db(config.dbconn_prod_nba))
def main():
    """Ad-hoc driver: resolve stats.nba.com player identities on the prod DB.

    Fix notes: removed commented-out PlayerStatsNbaCom code and the
    `gamedata`/`filename` locals it was the only consumer of (the query was
    read-only, so dropping it changes nothing observable).
    """
    dbobj = db.Db(config.dbconn_prod_nba)
    obj = Resolve(dbobj)
    obj.resolveStatsNbacom()
def main():
    """Ad-hoc driver: clean one stats.nba.com shot chart (LAL@WAS, game 4356).

    Fix notes: removed the unused `files` listing, which scanned
    ../../dump/extract for nothing -- the filename was hard-coded anyway.
    """
    dbobj = db.Db(config.dbconn_prod_nba)
    # Hard-coded extract file and its matching game row.
    f = '2013-11-26_LAL@WAS_shotchart_statsnbacom'
    gamedata = dbobj.query_dict("SELECT * FROM game WHERE id = 4356")[0]
    obj = Clean(f, gamedata, dbobj)
    obj.clean()
def main(): dbobj = db.Db(db.dbconn_prod) files = [ f for f in os.listdir('../../dump/extract') if 'shotchart_nbacom' in f ] f = '2013-06-20_SA@MIA_shotchart_nbacom' gamedata = dbobj.query_dict("SELECT * FROM game WHERE id = 4043")[0] obj = CleanWnba(f, gamedata, dbobj) shots = obj.getShots() print obj.resolveShotCoordinates(shots)
def aftercleanOnly(dt, files=None):
    """Run only the post-clean game stage for the games played on *dt*.

    *files* defaults to the full feed set when empty or None.
    """
    database = db.Db(config.config['db'])
    if not files:
        files = ['boxscore_nbacom', 'boxscore_cbssports',
                 'playbyplay_espn', 'playbyplay_nbacom',
                 'shotchart_cbssports', 'shotchart_espn',
                 'shotchart_nbacom']
    chosen = chooseGames(dt, database)
    docs = source.main.go(chosen, files)
    afterclean2.gm.main.postCleanGame(docs, database)
def main():
    """Ad-hoc driver: parse game info from the NBA.com box score of every
    game played on or before 2012-02-10.

    Fix notes: removed the unused locals `files` (a pointless directory
    scan), `f` (a hard-coded filename nothing read), the dead commented-out
    print, and the unused `result` binding.
    """
    dbobj = db.Db(db.dbconn_nba)
    gamedata = db.nba_query_dict("SELECT * FROM game where date_played <= '2012-02-10'")
    for game in gamedata:
        filename = game['abbrev'] + '_boxscore_nbacom'
        obj = CleanBoxScore(filename, game, dbobj)
        obj.getGameInfo()
def mainfunc(): dbobj = db.Db(db.dbconn_nba_test) gamedata = dbobj.query_dict("SELECT * FROM game WHERE date_played <= '2012-02-01' ORDER BY date_played ASC") for g in gamedata: fnba = g['abbrev'] + '_boxscore_nbacom' fcbs = g['abbrev'] + '_shotchart_cbssports' if fnba in os.listdir(LOGDIR_EXTRACT): print g['abbrev'] obj = player.PlayerNbaCom(LOGDIR_EXTRACT + fnba, g, dbobj) obj.resolveNewPlayers() obj = player.PlayerCbsSports(LOGDIR_EXTRACT + fcbs + '_players', g, dbobj) obj.resolveNewPlayers()
def main(): dbobj = db.Db(db.dbconn_prod) game = dbobj.query_dict(""" SELECT g.*, home_team.city home_team_city, away_team.city away_team_city FROM game g INNER JOIN team home_team on home_team.id = g.home_team_id INNER JOIN team away_team on away_team.id = g.away_team_id WHERE g.id = 1 AND g.should_fetch_data = 1 """)[0] filename = game['abbrev'] + '_playbyplay_espn' obj = Extract(open(LOGDIR_SOURCE + filename, 'r').read(), filename, game) data = obj._getData() for row in data: print row
def saveStatsNbaCom():
    """Backfill game rows with stats.nba.com ids, national TV and status
    parsed from games.json (one JSON array per line).

    Fix notes: games.json is now opened with `with` so the handle is
    closed deterministically (the old code never closed it).
    """
    dbobj = db.Db(config.dbconn_prod_nba)
    with open('games.json', 'r') as f:
        lines = f.readlines()
    for line in lines:
        data = json.loads(line.rstrip())
        if not data:
            continue
        for row in data:
            # SECURITY: values are spliced into the SQL with %s -- a quote in
            # e.g. GAME_STATUS_TEXT breaks the statement / allows injection.
            # Switch to the driver's parameterized query API when available.
            # NOTE(review): gametime is set from GAME_STATUS_TEXT -- looks
            # odd; confirm that is intended.
            sql = """
                UPDATE game
                SET national_tv = '%s',
                    statsnbacom_game_id = '%s',
                    gametime = '%s'
                WHERE nbacom_game_id = '%s'
            """ % (row['NATL_TV_BROADCASTER_ABBREVIATION'], row['GAME_ID'],
                   row['GAME_STATUS_TEXT'], row['GAMECODE'])
            dbobj.query(sql)
def getCoreData(dt, files=None):
    """Fetch, extract, clean and load the core feeds for the games on *dt*.

    Like getAll() but without the timing report at the end.
    """
    database = db.Db(config.config['db'])
    step_time = time.time()  # kept for parity; not reported in this variant
    if not files:
        files = ['boxscore_nbacom', 'boxscore_cbssports',
                 'playbyplay_espn', 'playbyplay_nbacom',
                 'shotchart_cbssports', 'shotchart_espn',
                 'shotchart_nbacom']
    chosen = chooseGames(dt, database)
    docs = source.main.go(chosen, files)
    extract.main.go(docs)
    clean.main.go(docs, database)
    load.main.go(docs, database)
def go(dt, files=None): conf = config.league['wnba'] league = 'wnba' dbobj = db.Db(conf['db']) step_time = time.time() config_no_pw = conf['db'] del config_no_pw['passwd'] logging.info("MASTER - starting ETL job - date: %s - database: %s" % (dt, config_no_pw)) # Default set of files/tables to populate if not files: files = [ 'boxscore_nbacom', 'playbyplay_espn', 'playbyplay_nbacom', 'shotchart_espn', 'shotchart_wnbacom' ] # Choose games games = _chooseGames(dt, dbobj) # MAIN ETL PROCESS print "+++ MASTER ETL - files: %s - database: %s" % (str(files), conf['db']) # Get source gamedata = source.main.go(games, files, league) extract.main.go(gamedata) clean.main.go(gamedata, dbobj) load.main.go(gamedata, dbobj) tomorrow = dt + datetime.timedelta(days=1) time_elapsed = "Total time: %.2f sec" % (time.time() - step_time) logging.info(time_elapsed)
from bs4 import BeautifulSoup
import urllib2
import re
import datetime
from libscrape.config import db
from libscrape.config import config
import time
import json

# A script that scrapes ESPN.com's scoreboard to retrieve ESPN's game_id and store into MySQL db

# Module-level connection to the production NBA database.
dbobj = db.Db(config.dbconn_prod_nba)


def getEspnScoreboardDoc(dt):
    """Fetch the raw HTML of ESPN's NBA scoreboard page for date *dt*."""
    # ESPN expects the date as YYYYMMDD, so strip the dashes from ISO format.
    url = 'http://espn.go.com/nba/scoreboard?date=%s' % dt.isoformat().replace('-', '')
    response = urllib2.urlopen(url)
    return response.read()


def getEspnData(html, season):
    """Collect the distinct ESPN game ids linked from a scoreboard page.

    NOTE(review): *season* is unused in the visible code, and game_ids is
    built but never returned -- this chunk looks truncated; confirm against
    the full file before relying on this description.
    """
    soup = BeautifulSoup(html)
    # Conversation links carry the ESPN gameId; de-duplicate repeats.
    links = soup.findAll(href=re.compile("/nba/conversation.*"))
    links = list(set(links))
    game_ids = []
    for l in links:
        match = re.search("/nba/conversation\?gameId=(?P<game_id>[0-9]+)$", l['href'])
        if match:
            found = match.groupdict()
            game_ids.append(found['game_id'])
def main():
    """Ad-hoc driver: run the ESPN play-by-play cleaner on game id 2366."""
    database = db.Db(db.dbconn_nba)
    game = database.query_dict("SELECT * FROM game WHERE id = 2366")[0]
    cleaner = Clean(game['abbrev'] + '_playbyplay_espn', game, database)
    cleaner._clean()
from collections import defaultdict from pulp import * sys.path.append("../") from libscrape.config import constants from libscrape.config import db from libscrape.config import config LOGDIR_SOURCE = constants.LOGDIR_SOURCE LOGDIR_EXTRACT = constants.LOGDIR_EXTRACT logging.basicConfig(filename='etl.log', level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') dbobj = db.Db(config.config['db']) logging.info("Starting create table with database: %s" % (dbobj)) def aggregate_possessions_from_dates(cursor, start_date=datetime.date(2010, 10, 26), end_date=datetime.date.today(), max_salary=45): # get the stats sql = """ SELECT pgp.player_id, p.full_name, sum(pgp.points_scored) / sum(pgp.offensive_possessions), sum(pgp.points_allowed) / sum(pgp.defensive_possessions),
from BeautifulSoup import BeautifulSoup
import urllib2
import re
import datetime
from libscrape.config import db
import time

# A script that scrapes ESPN.com's scoreboard to retrieve ESPN's game_id and store into MySQL db

# Module-level connection to the NBA database.
dbobj = db.Db(db.dbconn_nba)


def getScoreboardDoc(dt):
    """Fetch the raw HTML of ESPN's NBA scoreboard page for date *dt*."""
    # ESPN expects the date as YYYYMMDD, so strip the dashes from ISO format.
    url = 'http://espn.go.com/nba/scoreboard?date=%s' % dt.isoformat().replace('-', '')
    response = urllib2.urlopen(url)
    return response.read()


def getGameIdsAndTeams(html):
    """Collect the distinct ESPN game ids linked from a scoreboard page.

    NOTE(review): the visible code fills game_ids but never returns it --
    this chunk looks truncated; confirm against the full file before
    relying on this description.
    """
    soup = BeautifulSoup(html)
    # Conversation links carry the ESPN gameId; de-duplicate repeats.
    links = soup.findAll(href=re.compile("/nba/conversation.*"))
    links = list(set(links))
    game_ids = []
    for l in links:
        match = re.search("/nba/conversation\?gameId=(?P<game_id>[0-9]+)$", l['href'])
        if match:
            found = match.groupdict()
            game_ids.append(found['game_id'])
def loadOnly(dt):
    """Re-run only the load stage for games on *dt* using existing source docs."""
    database = db.Db(db.dbconn_nba)
    docs = getExistingSourceDocs(chooseGames(dt))
    load.main.go(docs, database)