def getArtistModValFiles(self, modVal, previousDays=5, force=False):
    """Return the artist pickle files for *modVal* that need (re)parsing.

    If the modval DB file does not exist yet (or force=True) every file is
    returned; otherwise only files newer than `previousDays` days or newer
    than the DB's last modification time are returned.

    Fixes: removed unused local `maxModVal`; collapsed the force/lastModified
    branching into a single guard.
    """
    artistDir = self.disc.getArtistsDir()
    artistDBDir = self.disc.getArtistsDBDir()
    dirVal = setDir(artistDir, str(modVal))
    files = findExt(dirVal, ext='.p')
    dbname = setFile(artistDBDir, "{0}-DB.p".format(modVal))
    now = datetime.now()

    # lastModified stays None when there is no DB yet or a full reparse is forced.
    lastModified = None
    if isFile(dbname) and force is not True:
        lastModified = datetime.fromtimestamp(path.getmtime(dbname))

    if lastModified is None:
        newFiles = files
        print(" ===> Parsing all {0} files for modval {1}".format(len(newFiles), modVal))
    else:
        # Union of "recently created" and "modified since the DB was written".
        numNew = [ifile for ifile in files if (now - datetime.fromtimestamp(path.getmtime(ifile))).days < previousDays]
        numRecent = [ifile for ifile in files if datetime.fromtimestamp(path.getmtime(ifile)) > lastModified]
        newFiles = list(set(numNew).union(set(numRecent)))
        print(" ===> Found new {0} files (< {1} days) to parse for modval {2}".format(len(newFiles), previousDays, modVal))
    return newFiles
def processAACTACategoryData(self, debug=False):
    """Aggregate AACTA category files into a year -> [[movie, 10], ...] map
    and save the merged result as one JSON file."""
    from collections import OrderedDict
    dataDir = self.getDataDir()
    categoryFiles = findExt(dataDir, ext="*.p")
    movies = OrderedDict()
    print(categoryFiles)
    for categoryFile in categoryFiles:
        if debug:
            print("Processing {0}".format(categoryFile))
        categoryName = getBaseFilename(categoryFile)
        parsed = self.parseAACTACategoryData(categoryFile, categoryName, debug=debug)
        if len(parsed) == 0:
            raise ValueError("No results for {0}".format(categoryFile))
        # Flatten every category's movie list into a single per-year list.
        for year, yearData in parsed.items():
            for _, categoryMovies in yearData.items():
                if movies.get(year) is None:
                    movies[year] = []
                for title in categoryMovies:
                    movies[year].append(title)
    # De-duplicate each year and attach a flat score of 10 to every title.
    for year in movies.keys():
        uniqueTitles = list(set(movies[year]))
        movies[year] = [[title, 10] for title in uniqueTitles]
    savename = setFile(self.getResultsDir(), "{0}.json".format(self.name))
    print("Saving {0} Years of AACTA Data to {1}".format(len(movies), savename))
    saveFile(savename, movies)
def findSearchTerms(self, minCnts=25):
    """Count artist-name occurrences across all modval files and return the
    names seen at least `minCnts` times that are not already known.

    Fixes: removed unused function-local imports (`sleep`, `glob`, both only
    used by commented-out code) and renamed the inner loop variable that
    shadowed the outer `artist`.
    """
    from collections import Counter
    artistsCntr = Counter()
    known = getFile(self.knownFile)
    files = getFlatList([findExt(dirval, ext='.p') for dirval in self.getModValDirs()])
    for ifile in files:
        if ifile.endswith("datPiffKnown.p"):
            continue
        tmp = getFile(ifile)
        results = [x["ArtistName"] for x in tmp]
        for artist in results:
            # A single entry may expand to several artist names.
            artists = self.mulArts.getArtistNames(artist)
            for artistName in artists.keys():
                key = artistName.title()
                if len(key) > 1 and key not in known:
                    artistsCntr[key] += 1
    searchTerms = [item[0] for item in artistsCntr.most_common() if item[1] >= minCnts]
    print("There are {0} new searches".format(len(searchTerms)))
    return searchTerms
def createRawOscarData(self, debug=True):
    """Load per-year oscar JSON files, overlay manual corrections from
    saved.yaml, and write the merged data to raw.yaml."""
    print("Checking for poorly parsed oscar data.")
    resultsDir = self.wikiData.getResultsDir()
    jsonFiles = sorted(findExt(resultsDir, ext=".json"))
    if debug:
        print("Found {0} oscar files".format(len(jsonFiles)))
    yearlyData = {}
    for jsonFile in jsonFiles:
        yearlyData[getBaseFilename(jsonFile)] = getFile(jsonFile)
    savedName = setFile(self.getCorrectionsDir(), "saved.yaml")
    savedData = getFile(savedName) if isFile(savedName) else {}
    # Apply any manual winner/nominee corrections on top of the parsed data.
    for year, titles in savedData.items():
        for title, correction in titles.items():
            savedWinner = correction.get("Winner")
            savedNominees = correction.get("Nominees")
            if savedWinner is not None:
                print("Overwritting {0} {1} winner".format(year, title))
                yearlyData[year][title]["Winner"] = savedWinner
            if savedNominees is not None:
                print("Overwritting {0} {1} nominees".format(year, title))
                yearlyData[year][title]["Nominees"] = savedNominees
    rawName = setFile(self.getCorrectionsDir(), "raw.yaml")
    saveFile(idata=yearlyData, ifile=rawName)
def parseAndDownloadTeamYearlyStandings(self):
    """For each saved season page, collect team (ID, name) pairs from the
    clubhouse links and download each team's data for that season."""
    seasonFiles = findExt(self.getSeasonDir(), ext=".p", debug=False)
    for seasonFile in seasonFiles:
        year = getBaseFilename(seasonFile)
        pageData = getHTML(getFile(seasonFile))
        teamNamesByID = {}
        for anchor in pageData.findAll("a"):
            anchorAttrs = anchor.attrs
            # Only clubhouse links carry team identifiers.
            if anchorAttrs.get("data-clubhouse-uid") is None:
                continue
            href = anchorAttrs['href']
            teamName = getBasename(href)
            teamID = getBasename(getDirname(href))
            previous = teamNamesByID.get(teamID)
            if previous is not None and previous != teamName:
                raise ValueError("Error in ID for this year!")
            teamNamesByID[teamID] = teamName
        for teamID, teamName in teamNamesByID.items():
            self.downloadTeamDataByYear(teamID, teamName, season=str(year), debug=True)
def processWikiFilmYearlyData(self, procYear=None, debug=False):
    """Parse WikiFilm yearly pickle files (all years, or just `procYear`),
    sort each year's movies by score, and save the result as one JSON file.

    Fix: `procYear == None` replaced with the idiomatic `procYear is None`.
    """
    from collections import OrderedDict
    outdir = self.getDataDir()
    if procYear is None:
        files = findExt(outdir, ext=".p")
    else:
        files = findPatternExt(outdir, pattern=str(procYear), ext=".p")
    movies = OrderedDict()
    yearlyData = {}
    for ifile in sorted(files):
        if debug:
            print("Processing {0}".format(ifile))
        year = getBaseFilename(ifile)
        movies[year] = self.parseWikiFilmYearlyData(ifile, debug=False)
        # Ascending sort by value (score), as in the original.
        yearlyData[year] = sorted(movies[year].items(), key=operator.itemgetter(1), reverse=False)
        print("---->", year, " (Top 5/{0} Movies) <----".format(len(yearlyData[year])))
        for item in yearlyData[year][:5]:
            print(item)
        print('\n')
    savename = setFile(self.getResultsDir(), "{0}.json".format(self.name))
    print("Saving {0} WikiFilm data to {1}".format(len(yearlyData), savename))
    saveFile(savename, yearlyData)
def parseRottenTomatoes(self, debug=False):
    """Merge per-file Rotten Tomatoes results into per-year dicts, sort each
    year by score (descending), and save to rottentomatoes.json."""
    dataFiles = findExt(self.getDataDir(), ext=".p")
    moviesByYear = {}
    for dataFile in dataFiles:
        parsed = self.parseRottenTomatoesFile(dataFile, debug=debug)
        for year, yearlyResult in parsed.items():
            if moviesByYear.get(year) is None:
                moviesByYear[year] = yearlyResult
            else:
                # Later files win on duplicate titles, as before.
                moviesByYear[year] = {**moviesByYear[year], **yearlyResult}
    yearlyData = {}
    for year in moviesByYear.keys():
        rankedMovies = sorted(moviesByYear[year].items(), key=operator.itemgetter(1), reverse=True)
        yearlyData[year] = rankedMovies
        print("---->", year, " (Top 5/{0} Movies) <----".format(len(rankedMovies)))
        for entry in rankedMovies[:5]:
            print(entry)
        print('\n')
    savename = setFile(self.getResultsDir(), "rottentomatoes.json")
    print("Saving", len(yearlyData), "yearly results to", savename)
    saveFile(savename, yearlyData)
def processFlopsData(self, debug=False):
    """Scrape flop movies from wikitable HTML dumps into year -> [[movie, 10]]
    and save as one JSON file.

    Fix: bare `except:` clauses (which also swallow KeyboardInterrupt/SystemExit)
    narrowed to `except Exception:`; same ValueError is still raised.
    """
    from collections import OrderedDict
    outdir = self.getDataDir()
    files = findExt(outdir, ext=".html")
    movies = OrderedDict()
    yearlyData = {}
    for ifile in files:
        htmldata = getFile(ifile)
        bsdata = getHTML(htmldata)
        tables = bsdata.findAll("table", {"class": "wikitable"})
        for table in tables:
            trs = table.findAll("tr")
            try:
                headers = [x.text.replace("\n", "") for x in trs[0].findAll("th")]
            except Exception:
                raise ValueError("Could not get headers")
            print(headers)
            # First two rows are header rows; data starts at trs[2].
            for itr, tr in enumerate(trs[2:]):
                ths = tr.findAll("th")
                try:
                    movie = ths[0].text.replace("\n", "").strip().replace("[nb 2]", "")
                except Exception:
                    raise ValueError("Could not find movie in {0}".format(ths))
                tds = tr.findAll("td")
                try:
                    year = int(tds[0].text)
                except Exception:
                    raise ValueError("Could not find year in {0}".format(tds))
                print(year, '\t', movie)
                if yearlyData.get(year) is None:
                    yearlyData[year] = []
                yearlyData[year].append(movie)
    # Attach a flat score of 10 to every title, year by year.
    for year in sorted(yearlyData.keys()):
        movies[year] = [[movie, 10] for movie in yearlyData[year]]
    savename = setFile(self.getResultsDir(), "{0}.json".format(self.name))
    print("Saving {0} Years of flops Data to {1}".format(len(movies), savename))
    saveFile(savename, movies)
def downloadKWorbSpotifyYouTubeArtists(self, update=False):
    """Download (optionally) and parse the KWorb YouTube artist archive page.

    Only the first half (table scrape into `data`) is live; everything under
    `if False:` is disabled legacy code kept for reference.
    """
    url = "https://kworb.net/youtube/archive.html"
    savename = "kworb_youtubeartists.p"
    if update is True:
        self.dutils.downloadArtistURL(url=url, savename=savename, force=True)
    # NOTE(review): getHTML is passed the local save name rather than the URL --
    # presumably it loads the previously downloaded pickle; confirm.
    bsdata = getHTML(savename)
    data = []
    artistDir = self.disc.getArtistsDir()
    saveDir = setDir(artistDir, "youtube")
    print(artistDir)
    # Each table's <th> texts become the keys for every following row.
    for table in bsdata.findAll("table"):
        ths = [th.text for th in table.findAll("th")]
        for tr in table.findAll("tr")[1:]:
            item = dict(zip(ths, tr.findAll("td")))
            data.append(item)
    print(data)
    # Dead code: the whole per-artist download pass is disabled with `if False:`.
    if False:
        bsdata = getHTML(savename)
        artistDir = self.disc.getArtistsDir()
        saveDir = setDir(artistDir, "youtube")
        for div in bsdata.findAll("div", {"class": "subcontainer"}):
            if div.find("span", {"class": "pagetitle"}) is None:
                continue
            for ref in div.findAll("a"):
                href = ref.attrs['href']
                url = "{0}/{1}".format(self.youtubeURL, href)
                savename = "{0}/{1}".format(saveDir, href.replace(".html", ".p"))
                if isFile(savename):
                    print("Y\t", savename, '\t', url)
                else:
                    print("-\t", savename, '\t', url)
                    #dbArtistsKWorb().dutils.downloadArtistURL(url=fullURL, savename=savename, force=True)
        for ifile in findExt(saveDir, ".p"):
            bsdata = getHTML(ifile)
            for table in bsdata.findAll("table"):
                trs = table.findAll("tr")
                for tr in trs[1:]:
                    ref = tr.find("a")
                    href = ref.attrs['href']
                    name = ref.text
                    url = "{0}/{1}".format(self.youtubeURL, href)
                    savename = "{0}/{1}".format(setDir(saveDir, "artist"), href.replace(".html", ".p"))
                    print(url, savename)
                    if isFile(savename) is False:
                        # NOTE(review): rebinds `data` (the scraped table list) to
                        # the downloaded payload -- harmless only because this
                        # branch is dead; verify before re-enabling.
                        data, code = downloadURL(url)
                        from ioUtils import getFile, saveFile
                        saveFile(idata=data, ifile=savename)
                        sleep(3)
                # NOTE(review): break placement is ambiguous in the original
                # (collapsed) source; here it stops after the first table.
                break
def parseArtistFiles(self, force=False, debug=False):
    """Build a per-modval artist DB from downloaded DatPiff search results and
    save each modval's DB plus its metadata files.

    NOTE(review): the `findExt(self.knownDir, ...)` result is immediately
    overwritten by a hard-coded glob over /Volumes/Biggy -- looks like leftover
    debugging; confirm which source of files is intended.
    """
    from glob import glob
    artistDir = self.disc.getArtistsDir()
    artistDBData = {}  # modval -> {artistName -> artist record}
    files = findExt(self.knownDir, ext='.p')
    files = glob("/Volumes/Biggy/Discog/artists-datpiff/*/*.p")
    print("Found {0} downloaded search terms".format(len(files)))
    for i, ifile in enumerate(files):
        if ifile.endswith("datPiffKnown.p"):
            continue
        fileresults = getFile(ifile)
        if debug:
            print(i, '/', len(files), '\t', ifile)
        for j, fileresult in enumerate(fileresults):
            if debug:
                print(" ", j, '/', len(fileresults))
            mixArtists = fileresult["ArtistName"]
            albumName = fileresult["AlbumName"]
            albumURL = fileresult["AlbumURL"]
            # One mixtape entry can name several artists; credit it to each.
            mixArtistNames = self.mulArts.getArtistNames(mixArtists)
            mixArtistNames = [x.title() for x in mixArtistNames.keys()]
            for artistName in mixArtistNames:
                artistID = str(self.dutils.getArtistID(artistName))
                # NOTE(review): album ID is produced by getArtistID(albumName) --
                # presumably the same hash works for album titles; confirm.
                albumID = str(self.dutils.getArtistID(albumName))
                modval = self.dutils.getArtistModVal(artistID)
                if artistDBData.get(modval) is None:
                    artistDBData[modval] = {}
                if artistDBData[modval].get(artistName) is None:
                    artistDBData[modval][artistName] = {"Name": artistName, "ID": artistID, "URL": None, "Profile": None, "Media": []}
                albumData = {"Artists": mixArtistNames, "Name": albumName, "URL": albumURL, "Code": albumID}
                artistDBData[modval][artistName]["Media"].append(albumData)
    maxModVal = self.disc.getMaxModVal()
    artistDBDir = self.disc.getArtistsDBDir()
    totalSaves = 0
    # Convert the raw records into parsed artist objects and save per modval.
    for modVal, modvaldata in artistDBData.items():
        dbData = {}
        for artistName, artistData in modvaldata.items():
            self.artist.setData(artistData)
            artistVal = self.artist.parse()
            dbData[artistVal.ID.ID] = artistVal
        savename = setFile(artistDBDir, "{0}-DB.p".format(modVal))
        print("Saving {0} artist IDs to {1}".format(len(dbData), savename))
        totalSaves += len(dbData)
        saveFile(idata=dbData, ifile=savename)
        self.createArtistModValMetadata(modVal=modVal, db=dbData, debug=debug)
        self.createArtistAlbumModValMetadata(modVal=modVal, db=dbData, debug=debug)
    print("Saved {0} new artist IDs".format(totalSaves))
def main(args):
    """Rename every .mp3 in the current directory by stripping `args.remove`
    from its basename."""
    for src in findExt(getcwd(), ext=".mp3"):
        cleanedName = getBasename(src).replace(args.remove, "").strip()
        dst = join(getDirname(src), cleanedName)
        # Skip files whose name is already clean.
        if src != dst:
            moveFile(src, dst, debug=True)
def parseUltimateMovieRankingsYearlyData(self, procYear=None, debug=False):
    """Parse Ultimate Movie Rankings yearly HTML pickles (all years or just
    `procYear`) into film -> rank maps, sort by rank, and save as JSON.

    Fixes: `== None` -> `is None`; bare `except:` narrowed to ValueError
    (the only plausible failure is the float() conversion); removed unused
    locals `data`, `done`, and the per-table `ths` scan.
    """
    from collections import OrderedDict
    outdir = self.getDataDir()
    if procYear is None:
        files = findExt(outdir, ext=".p")
    else:
        files = findPatternExt(outdir, pattern=str(procYear), ext=".p")
    movieData = OrderedDict()
    for ifile in sorted(files):
        htmldata = getFile(ifile)
        bsdata = getHTML(htmldata)
        year = getBaseFilename(ifile)
        movies = {}
        for table in bsdata.findAll("table"):
            for tr in table.findAll("tr"):
                tds = tr.findAll("td")
                # Only fully-populated 11-column rows are movie rows.
                if len(tds) != 11:
                    continue
                val = removeTag(tds[1], 'span')
                film = val.text.replace(" ({0})".format(year), "")
                # Rank lives in the last column; fall back to the second-to-last.
                try:
                    rank = float(tds[-1].text)
                except ValueError:
                    try:
                        rank = float(tds[-2].text)
                    except ValueError:
                        raise ValueError(tds[-1], tds[-2], tr)
                movies[film] = rank
        movieData[year] = movies
    yearlyData = {}
    for year in sorted(movieData.keys()):
        yearlyData[year] = sorted(movieData[year].items(), key=operator.itemgetter(1), reverse=True)
        print("---->", year, " (Top 5/{0} Movies) <----".format(len(yearlyData[year])))
        for item in yearlyData[year][:5]:
            print(item)
        print('\n')
    savename = setFile(self.getResultsDir(), "{0}.json".format(self.name))
    print("Saving {0} Years of Ultimate Movie Rankings data to {1}".format(len(yearlyData), savename))
    saveFile(savename, yearlyData)
def parseFilms101Data(self, debug=False):
    """Parse films101 yearly pickles into year -> [[movie, 10], ...] and save
    the combined map as one JSON file."""
    dataDir = self.getDataDir()
    resultsdir = self.getResultsDir()
    movies = {}
    for yearFile in sorted(findExt(dataDir, ext=".p")):
        year = getBaseFilename(yearFile)
        parsedTitles = self.parseFilms101YearlyData(yearFile, debug=debug)
        # Every title gets a flat score of 10.
        movies[year] = [[title, 10] for title in parsedTitles]
        print("Found {0} movies in {1}".format(len(movies[year]), year))
    savename = setFile(self.getResultsDir(), "{0}.json".format(self.name))
    print("Saving {0} Years of films101 Data to {1}".format(len(movies), savename))
    saveFile(savename, movies)
def downloadTeamStatisticsData(self, debug=False):
    """Download statistics for every team in each saved season.

    NOTE(review): the `year != 2014` guard hard-limits this to the 2014
    season -- looks like leftover debugging; confirm before removing.
    """
    resultsDir = self.getSeasonResultsDir()
    files = findExt(resultsDir, ext=".p", debug=False)
    # Brief pause before the download pass.
    sleep(3)
    for ifile in files:
        seasonData = getFile(ifile)
        year = seasonData.getYear()
        # gamesDir is unused below; kept in case getYearlyGamesDir creates
        # the directory as a side effect -- TODO confirm.
        gamesDir = self.getYearlyGamesDir(year)
        if year != 2014:
            continue
        teams = seasonData.teams
        for teamID, teamData in teams.items():
            name = teamData.teamName
            self.downloadTeamStatisticsDataByYear(teamID, name, year, debug)
def getArtistModValExtraFiles(self, modVal, previousDays=5, force=False):
    """Return the 'extra' artist pickle files for *modVal* that need parsing.

    Mirrors getArtistModValFiles: all files when the modval DB is missing or
    force=True, otherwise only files newer than `previousDays` days.

    Bug fixes: `lastModified` was referenced without ever being assigned
    (guaranteed NameError), and in the filtered branch the recent-file list
    was bound to `numFiles` while `newFiles` (printed and returned) stayed
    None. Both are fixed by computing `lastModified` from the DB file as in
    getArtistModValFiles and assigning the filtered list to `newFiles`.
    """
    artistDir = self.disc.getArtistsDir()
    artistDBDir = self.disc.getArtistsDBDir()
    dirVal = setDir(artistDir, str(modVal))
    dirVal = setDir(dirVal, "extra")
    files = findExt(dirVal, ext='.p')
    dbname = setFile(artistDBDir, "{0}-DB.p".format(modVal))
    now = datetime.now()

    # lastModified stays None when there is no DB yet or a full reparse is forced.
    lastModified = None
    if isFile(dbname) and force is not True:
        lastModified = datetime.fromtimestamp(path.getmtime(dbname))

    if lastModified is None:
        newFiles = files
        print(" ===> Parsing all {0} extra files for modval {1}".format(len(newFiles), modVal))
    else:
        newFiles = [ifile for ifile in files if (now - datetime.fromtimestamp(path.getmtime(ifile))).days < previousDays]
        print(" ===> Found new {0} extra files (< {1} days) to parse for modval {2}".format(len(newFiles), previousDays, modVal))
    return newFiles
def mergeBoxOfficeMojoResults(self, debug=False):
    """Merge per-year Box Office Mojo JSON files (numeric filenames only)
    into a single results.json.

    Fix: the bare `except:` guarding the year check is narrowed to
    ValueError, which is what int() raises for a non-numeric name.
    """
    retval = {}
    files = findExt(self.getResultsDir(), ext=".json")
    if debug:
        print("Found {0} files in the results directory".format(len(files)))
    for ifile in sorted(files):
        year = getBaseFilename(ifile)
        # Skip files whose basename is not a year (e.g. results.json itself).
        try:
            int(year)
        except ValueError:
            continue
        data = getFile(ifile)
        retval[year] = data
        if debug:
            print(" Adding {0} entries from {1}".format(len(data), ifile))
    savename = setFile(self.getResultsDir(), "results.json")
    if debug:
        print("Saving", len(retval), "years of movie data to", savename)
    saveFile(savename, retval)
def downloadGameData(self, debug=False, verydebug=False):
    """Download play-by-play data for every game in the 2013-2015 seasons.

    NOTE(review): the season whitelist [2013, 2014, 2015] is hard-coded;
    the `if False:` block is a disabled legacy path that copied previously
    downloaded HTML from an external volume.
    """
    resultsDir = self.getSeasonResultsDir()
    files = findExt(resultsDir, ext=".p", debug=False)
    gameType = "playbyplay"
    print("Sleeping for 5 seconds...")
    sleep(5)
    for ifile in files:
        seasonData = getFile(ifile)
        year = seasonData.getYear()
        if year not in [2013, 2014, 2015]:
            continue
        gamesDir = self.getYearlyGamesDir(year)
        teams = seasonData.teams
        for teamID, teamData in teams.items():
            teamGames = teamData.games
            for gameData in teamGames:
                gameResult = gameData["Result"]
                gameObject = gameData["Game"]
                gameID = gameObject.gameID
                # Dead code: legacy local-copy path, disabled with `if False:`.
                if False:
                    prevLocation = "/Volumes/Seagate/Football/Games/Plays/{0}.html".format(gameID)
                    if isFile(prevLocation):
                        savename = setFile(gamesDir, "{0}.p".format(gameID))
                        # `or True` forced the copy to always happen.
                        if not isFile(savename) or True:
                            data = open(prevLocation, "rb").read()
                            saveFile(idata=data, ifile=savename, debug=True)
                            continue
                        continue
                self.downloadGameDataByID(gameID, year, debug)
def processWikipediaYearlyData(self, procYear=None, debug=False):
    """Parse Wikipedia oscar pickles (all years, or just `procYear`) and save
    one JSON of category -> {Winner, Nominees} per year.

    Fixes: `== None` -> `is None`; removed unused locals (the local
    OrderedDict import and `movies`, plus `htmldata`/`bsdata`, which were
    computed but never used -- parseWikipediaOscarData takes the file path).
    """
    outdir = self.getDataDir()
    if procYear is None:
        files = findExt(outdir, ext=".p")
    else:
        files = findPatternExt(outdir, pattern=str(procYear), ext=".p")
    for ifile in files:
        if debug:
            print("Processing {0}".format(ifile))
        year = getBaseFilename(ifile)
        results = self.parseWikipediaOscarData(ifile, debug=False)
        # Fall back to the special-case parser for odd page layouts.
        if len(results) == 0:
            results = self.parseWikipediaOscarDataSpecial(ifile, debug=debug)
        if len(results) == 0:
            raise ValueError("No results for {0}".format(ifile))
        for k, v in results.items():
            print("====>", year, '\t', k)
            print("  Winner :", results[k]["Winner"])
            if debug:
                print("  Nominees:", results[k]["Nominees"])
            print("")
        savename = setFile(self.getResultsDir(), "{0}.json".format(year))
        print("Saving {0} wikipedia oscar data to {1}".format(year, savename))
        saveFile(savename, results)
def processRollingStoneData(self, debug=False):
    """Scrape (movie, year) pairs from Rolling Stone list-page HTML dumps into
    year -> [[movie, 10], ...] and save as one JSON file.

    Fix: the bare `except:` around the year parse is narrowed to ValueError
    (what int() raises on a malformed heading).
    """
    from collections import OrderedDict
    outdir = self.getDataDir()
    files = findExt(outdir, ext=".html")
    movies = OrderedDict()
    yearlyData = {}
    for ifile in files:
        htmldata = getFile(ifile)
        bsdata = getHTML(htmldata)
        h3s = bsdata.findAll("h3", {"class": "c-list__title t-bold"})
        h3s = [x.text.replace("\n", "").strip() for x in h3s]
        for h3 in h3s:
            # Headings end with "(YYYY)": chars [-5:-1] are the year digits.
            try:
                year = int(h3[-5:-1])
            except ValueError:
                raise ValueError("Could not get year from {0}".format(h3))
            # Title sits between a leading char and the trailing " (YYYY)".
            movie = h3[1:-8]
            print(year, '\t', movie)
            if yearlyData.get(year) is None:
                yearlyData[year] = []
            yearlyData[year].append(movie)
    for year in sorted(yearlyData.keys()):
        movies[year] = [[movie, 10] for movie in yearlyData[year]]
    savename = setFile(self.getResultsDir(), "{0}.json".format(self.name))
    print("Saving {0} Years of rollingstone Data to {1}".format(len(movies), savename))
    saveFile(savename, movies)
def collect(self, hist, test=False, debug=False):
    """Assign players (runners/passers/punters/kickers/FG kickers) to teams
    per season by majority vote over play-by-play appearances, then save the
    per-year stats unless test=True.

    NOTE(review): seasons are hard-limited to 2014-2016.
    """
    files = findExt(hist.getGamesResultsDir(), ext=".p", debug=debug)
    for ifile in files:
        print(ifile)
        try:
            year = int(getBaseFilename(ifile).split("-")[0])
        except:
            raise ValueError("Could not get year from {0}".format(ifile))
        if year not in [2014, 2015, 2016]:
            continue
        yearData = getFile(ifile)
        seasonFilename = setFile(hist.getSeasonResultsDir(), "{0}.p".format(year))
        seasonData = getFile(seasonFilename)
        statsData = {}
        # Per-position Counters keyed by player name, reset each season;
        # filled in by the self.get* helpers below.
        self.runners = {}
        self.passers = {}
        self.punters = {}
        self.kickers = {}
        self.fgkickers = {}
        for teamID, teamData in seasonData.teams.items():
            games = [x["Game"] for x in teamData.games]
            for game in games:
                gameID = game.gameID
                # Skip games with no parsed play-by-play for this year.
                try:
                    gameData = yearData[gameID]
                except:
                    continue
                teamsMetaData = gameData["Teams"]
                homeTeamMetaData = teamsMetaData["Home"]
                awayTeamMetaData = teamsMetaData["Away"]
                driveData = gameData["Plays"]
                # Bidirectional ID <-> abbreviation map for this game.
                fieldMap = {}
                fieldMap[homeTeamMetaData["ID"]] = homeTeamMetaData["Abbrev"]
                fieldMap[homeTeamMetaData["Abbrev"]] = homeTeamMetaData["ID"]
                fieldMap[awayTeamMetaData["ID"]] = awayTeamMetaData["Abbrev"]
                fieldMap[awayTeamMetaData["Abbrev"]] = awayTeamMetaData["ID"]
                fieldMap["Home"] = homeTeamMetaData["Abbrev"]
                fieldMap["Away"] = awayTeamMetaData["Abbrev"]
                # Opponent map: each team's ID -> the other team's ID.
                copMap = {}
                copMap[homeTeamMetaData["ID"]] = awayTeamMetaData["ID"]
                copMap[awayTeamMetaData["ID"]] = homeTeamMetaData["ID"]
                self.getRunners(driveData, fieldMap, debug=False)
                self.getPassers(driveData, fieldMap, debug=False)
                self.getPunters(driveData, fieldMap, debug=False)
                # Kickers use the opponent map (kickoffs credit the other side).
                self.getKickers(driveData, copMap, debug=False)
                self.getFieldGoalKickers(driveData, fieldMap, debug=False)
        ### Now Assign Player To A Team
        from math import sqrt
        mapping = {
            "Passers": self.passers,
            "Runners": self.runners,
            "Punters": self.punters,
            "Kickers": self.kickers,
            "FGKickers": self.fgkickers
        }
        for position, players in mapping.items():
            for name, passerTeams in players.items():
                # Most common team for this player, with its appearance count.
                mc = passerTeams.most_common(1)[0]
                frac = mc[1] / sum(dict(passerTeams).values())
                # Require a clear majority (>= 75%) ...
                if frac < 0.75:
                    continue
                # ... and enough total appearances (sqrt(total) >= 2).
                sig = sqrt(sum(dict(passerTeams).values()))
                if sig < 2:
                    continue
                teamID = mc[0]
                if statsData.get(teamID) is None:
                    statsData[teamID] = {}
                if statsData[teamID].get(position) is None:
                    statsData[teamID][position] = {}
                statsData[teamID][position][name] = [round(frac, 1), round(sig, 1)]
        ## Show team stats
        if debug:
            for teamID, teamStats in statsData.items():
                print(teamID)
                for pos, names in teamStats.items():
                    # NOTE(review): the bare `statsData[teamID]` expression
                    # below is a no-op -- likely leftover debugging.
                    statsData[teamID]
                    print('\t', pos, names)
        if test is False:
            augmentedStatsFilename = setFile(hist.getStatisticsResultsDir(), "{0}-stats-extra.json".format(year))
            saveFile(idata=statsData, ifile=augmentedStatsFilename, debug=True)
def getGamesResultsFiles(self):
    """Return the pickled files found in the games results directory."""
    resultsDir = self.getGamesResultsDir()
    return findExt(resultsDir, ext=".p", debug=False)
def parseTeamYearlyStandings(self, startYear=2003, endYear=2018, debug=False, verydebug=False):
    """Parse each saved ESPN team-schedule page into team/game objects and
    save one season object per year.

    NOTE(review): `game(gameID=gameID, ...)` below references a `gameID`
    name that is never assigned in this function -- that line will raise
    NameError when reached. The game id is probably meant to come from
    `gamehref`; confirm and fix.
    """
    for year in range(startYear, endYear + 1):
        seasonDir = self.getYearlySeasonDir(year)
        files = findExt(seasonDir, ext=".p", debug=False)
        seasonData = season(year)
        for ifile in files:
            nameyear = getBaseFilename(ifile)
            htmldata = getFile(ifile)
            bsdata = getHTML(htmldata)
            teamName = nameyear.replace("-{0}".format(year), "")
            metadata = bsdata.find("meta", {"property": "og:url"})
            if metadata is None:
                raise ValueError("Could not find basic team meta data for this file! {0}".format(ifile))
            # NOTE(review): `year` (the loop variable, an int) is rebound here
            # to a string from the og:url -- intentional? The earlier
            # "-{year}" strip above used the original int.
            try:
                content = metadata.attrs['content']
                year = getBasename(content)
                teamID = getBasename(getDirname(getDirname(content)))
            except:
                raise ValueError("Could not get team year and ID from meta data: {0}".format(metadata))
            if verydebug:
                print(year, '\t', teamID, '\t', ifile)
            ## Create Team Object
            teamData = team(year=year, teamName=teamName, teamMascot=None, teamID=teamID)
            tables = bsdata.findAll("table", {"class": "Table2__table"})
            if verydebug:
                print("\tFound {0} game tables".format(len(tables)))
            for it, table in enumerate(tables):
                trs = table.findAll("tr")
                # Row 1 holds the column headers; data rows start at trs[2].
                headers = trs[1]
                headers = [x.text for x in headers.findAll("td") if x is not None]
                gameRows = trs[2:]
                totalGames = len(gameRows)
                if verydebug:
                    print("\tFound {0} potential games".format(totalGames))
                for ig, tr in enumerate(gameRows):
                    tds = tr.findAll("td")
                    gameData = dict(zip(headers, tds))
                    extra = {"OT": False, "Bowl": False}
                    ## Get the Date
                    try:
                        date = gameData["Date"]
                    except:
                        print(ifile)
                        raise ValueError("No date for this game! {0}".format(gameData))
                    date = date.text
                    ## Only Keep Games With Regular Dates
                    try:
                        dateval = "{0} {1}".format(date.split(", ")[-1], year)
                        date = getDateTime(dateval)
                    except:
                        date = None
                    if date is None:
                        continue
                    ## Check for January Games (in the following year)
                    if date.month == 1:
                        date = addMonths(date, 12)
                    ## Get the Opponent
                    try:
                        opponent = gameData["Opponent"]
                    except:
                        # NOTE(review): `game` here is the game class/ctor, not
                        # a row value -- this message would print the class.
                        raise ValueError("No opponent for this game! {0}".format(game))
                    try:
                        oppolink = opponent.find("a")
                        oppohref = oppolink.attrs['href']
                        opponame = getBasename(oppohref)
                        oppoID = getBasename(getDirname(oppohref))
                    except:
                        # No link: fall back to plain text and a dummy ID.
                        opponame = opponent.text
                        oppoID = 0
                        #raise ValueError("Could not find href in link! {0}".format(opponent))
                    try:
                        gamespan = opponent.find("span", {"class": "pr2"})
                        gametype = gamespan.text
                    except:
                        raise ValueError("Could not find game type from {0}".format(opponent))
                    # "vs" = home game, "@" = away game.
                    if gametype == "vs":
                        location = teamID
                    elif gametype == "@":
                        location = oppoID
                    else:
                        raise ValueError("Location --> {0}".format(gametype))
                    if verydebug:
                        print("\t{0}/{1}\t{2}\t{3: <4}{4: <50}".format(ig, totalGames, printDateTime(date), gametype, opponame), end="\t")
                    ## Get the Result
                    try:
                        result = gameData["Result"]
                    except:
                        raise ValueError("No result for this game! {0}".format(game))
                    spans = result.findAll("span")
                    # No spans -> game not yet played; skip.
                    if len(spans) == 0:
                        continue
                    if len(spans) != 2:
                        raise ValueError("There are {0} spans in this row!: {1}".format(len(spans), result))
                    outcome = spans[0].text.strip()
                    score = spans[1].text.strip()
                    if score.endswith("OT"):
                        extra = {"OT": True}
                        score = score[:-3].strip()
                    try:
                        scores = [int(x) for x in score.split('-')]
                    except:
                        raise ValueError("Could not create integer scores from {0}".format(spans))
                    # Scores are listed winner-first; orient them to this team.
                    if outcome == 'W':
                        teamScore = scores[0]
                        oppoScore = scores[1]
                        teamResult = "W"
                        oppoResult = "L"
                    elif outcome == "L":
                        teamScore = scores[1]
                        oppoScore = scores[0]
                        teamResult = "L"
                        oppoResult = "W"
                    elif outcome == "T":
                        teamScore = scores[0]
                        oppoScore = scores[1]
                        teamResult = "T"
                        oppoResult = "T"
                    else:
                        raise ValueError("Did not recognize game outcome {0}".format(outcome))
                    ## Get the Game
                    try:
                        gamelink = result.find("a")
                        gamehref = gamelink.attrs['href']
                    except:
                        raise ValueError("Could not find href in link! {0}".format(result))
                    if verydebug:
                        print("{0} {1}".format(teamResult, "-".join(str(x) for x in [teamScore, oppoScore])))
                    ## Create game object
                    # NOTE(review): `gameID` is undefined here (see docstring).
                    gameData = game(gameID=gameID, date=date, teamA=teamID, teamB=oppoID, teamAResult=teamResult, teamBResult=oppoResult, teamAScore=teamScore, teamBScore=oppoScore, location=location)
                    ## Append game to team data
                    teamData.addGame(gameData)
            ## Show Summary
            teamData.setStatistics()
            if debug:
                teamData.summary()
            # Pages that yielded zero games are deleted so they re-download.
            if teamData.ngames == 0:
                removeFile(ifile, debug=True)
            seasonData.addTeam(teamData)
        #http://www.espn.com/college-football/team/schedule/_/id/201/season/2005"
        savename = setFile(self.getSeasonResultsDir(), "{0}.p".format(year))
        saveFile(idata=seasonData, ifile=savename, debug=True)
def parseGameData(self, startYear=2003, endYear=2018, debug=False, verydebug=False):
    """Parse downloaded play-by-play HTML pickles into per-year game dicts
    ({"Teams": ..., "Plays": [...]}) and save one "<year>-games.p" per year.

    Returns a dict year -> list of game IDs with no possession data.

    NOTE(review): `len(gameData)` is used as the drive counter in two places
    below -- but gameData is a 2-key dict, so that value is always 2;
    `len(gameData["Plays"])` was probably intended.
    """
    noData = {}
    for year in range(startYear, endYear + 1):
        yearData = {}
        gamesDir = self.getYearlyGamesDir(year)
        files = findExt(gamesDir, ext=".p", debug=False)
        noData[year] = []
        for i, ifile in enumerate(files):
            gameID = getBaseFilename(ifile)
            # Skip games already known to have no data.
            if gameID in self.noGameData:
                continue
            htmldata = getFile(ifile)
            bsdata = getHTML(htmldata)
            #print(bsdata)
            #verydebug=True
            #if gameID not in ['400603866']:
            #    continue
            # Team metadata: the two team-container divs are [away, home].
            teamData = bsdata.findAll("div", {"class": "team-container"})
            longNames = [x.find("span", {"class": "long-name"}) for x in teamData]
            longNames = [x.text for x in longNames if x is not None]
            shortNames = [x.find("span", {"class": "short-name"}) for x in teamData]
            shortNames = [x.text for x in shortNames if x is not None]
            teamAbbrevs = [x.find("span", {"class": "abbrev"}) for x in teamData]
            teamNames = [x.attrs for x in teamAbbrevs if x is not None]
            teamNames = [x['title'] for x in teamNames]
            teamAbbrevs = [x.text for x in teamAbbrevs]
            # Numeric team IDs are embedded in the logo image filenames.
            teamIDs = [x.find("img", {"class": "team-logo"}) for x in teamData]
            teamIDs = [x.attrs for x in teamIDs if x is not None]
            teamIDs = [x['src'] for x in teamIDs]
            teamIDs = [re.search(r"(\d+).png", x) for x in teamIDs]
            teamIDs = [x.groups()[0] for x in teamIDs]
            awayTeam = {"Name": longNames[0], "Mascot": shortNames[0], "Abbrev": teamAbbrevs[0], "ID": teamIDs[0]}
            homeTeam = {"Name": longNames[1], "Mascot": shortNames[1], "Abbrev": teamAbbrevs[1], "ID": teamIDs[1]}
            metadata = bsdata.find("meta", {"property": "og:title"})
            title = None
            if metadata is not None:
                title = metadata.attrs['content']
                if verydebug:
                    print("==> {0}".format(title))
            ## Possesions -- two page layouts are supported.
            posData = bsdata.find("ul", {"class": "css-accordion"})
            if posData is None:
                posData = bsdata.find("article", {"class": "play-by-play"})
            if posData is None:
                noData[year].append(gameID)
                if verydebug:
                    print("Could not find possession data! {0}".format(gameID))
                continue
                #print(bsdata)
                #1/0
                #removeFile(ifile, debug)
                #continue
            gameData = {"Teams": {"Away": awayTeam, "Home": homeTeam}, "Plays": []}
            # Progress line every 10 games.
            if i % 10 == 0:
                print("{0}/{1} with {2} no data games".format(i, len(files), len(noData[year])))
            ###################
            ## Get Full Drive Data
            ###################
            drives = posData.findAll("li", {"class": "accordion-item"})
            if verydebug:
                print("Drives {0}".format(len(drives)))
            for idr, drive in enumerate(drives):
                ## Get Drive Summary
                headlines = [x.text.strip() for x in drive.findAll("span", {"class": "headline"})]
                if verydebug:
                    print("Headlines {0}".format(len(headlines)))
                ## Get Drive Details
                details = [x.text.strip() for x in drive.findAll("span", {"class": "drive-details"})]
                if verydebug:
                    print("Details {0}".format(len(details)))
                ## Get Home Score
                homescores = drive.findAll("span", {"class": "home"})
                homescores = [x.find("span", {"class": "team-score"}) for x in homescores]
                homescores = [x.text for x in homescores if x is not None]
                if verydebug:
                    print("Home Scores {0}".format(len(homescores)))
                ## Get Away Score
                awayscores = drive.findAll("span", {"class": "away"})
                awayscores = [x.find("span", {"class": "team-score"}) for x in awayscores]
                awayscores = [x.text for x in awayscores if x is not None]
                if verydebug:
                    print("Away Scores {0}".format(len(awayscores)))
                ## Get Possession -- team ID comes from the logo image URL.
                possessions = drive.findAll("span", {"class": "home-logo"})
                possessions = [x.find("img", {"class": "team-logo"}) for x in possessions]
                possessions = [x.attrs['src'] for x in possessions if x is not None]
                possessions = [x.split('&')[0] for x in possessions]
                possessions = [getBaseFilename(x) for x in possessions]
                if verydebug:
                    print("Possessions {0}".format(len(possessions)))
                ## Check for valid headline (parsed correctly?)
                if len(headlines) == 0:
                    continue
                # Whitelists of every drive-outcome headline seen in the data.
                validFGs = ["Missed FG", "Field Goal", "FIELD GOAL", "MISSED FG", "Made FG", "Field Goal Good", "Field Goal Missed", "Blocked FG"]
                validTDs = ["Touchdown", "TOUCHDOWN", "END OF HALF Touchdown", "Downs Touchdown", "Missed FG Touchdown", "End of Half Touchdown", "End of Game Touchdown", "PUNT Touchdown", "FUMBLE Touchdown", "INTERCEPTION Touchdown", "FIELD GOAL Touchdown", "MISSED FG Touchdown", "Rushing Touchdown", "Passing Touchdown", "Kickoff Return Touchdown", "Interception Return Touch", "Turnover on Downs Touchdown", "Field Goal Missed Touchdown", "Field Goal Touchdown", "Rushing Touchdown Touchdown", "Field Goal Good Touchdown", "Passing Touchdown Touchdown", "Fumble Return Touchdown Touchdown", "Rushing TD", "Passing TD", "Blocked Punt TD", "Punt Return TD", "Fumble Ret. TD", "Interception TD", "Fumble TD", "Rushing TD Touchdown", "Blocked Punt TD Touchdown", "Blocked FG (TD)", "Punt Return TD Touchdown", "Kick Return TD", "Kickoff Return Touchdown Touchdown", "Missed FG (TD) Touchdown", "Blocked FG (TD) Touchdown", "Punt Return Touchdown Touchdown", "Interception Return Touch Touchdown"]
                validEnds = ["End of Half", "End of Game", "END OF HALF", "END OF GAME", "End of 4th Quarter"]
                validTOs = ["Fumble", "Interception", "FUMBLE", "INTERCEPTION", "Kickoff", "KICKOFF", "Blocked Punt"]
                validTOPnts = ["Interception Touchdown", "Safety", "Punt Touchdown", "Fumble Touchdown", "Punt Return Touchdown", "Fumble Return Touchdown", "SAFETY"]
                validDowns = ["Punt", "Downs", "PUNT", "Possession (For OT Drives)", "DOWNS", "Possession (For OT Drives) Touchdown", "Turnover on Downs", "Poss. on downs", "Penalty"]
                validPlay = ["Rush", "Pass", "Sack", "Timeout", "Incomplete", "Pass Complete"]
                valid2PT = ["2PT Pass failed", "Missed PAT Return"]
                # NOTE(review): validOdds is defined but never added to
                # validHeadlines below -- intentional?
                validOdds = ["on-side kick"]
                validHeadlines = validFGs + validTDs + validEnds + validTOs + validTOPnts + validDowns + validPlay + valid2PT
                isValidHeadline = sum([x in validHeadlines for x in headlines])
                if headlines[0] == '':
                    continue
                # Unknown headline on a non-final drive is a parse failure.
                if isValidHeadline == 0 and idr < len(drives) - 1:
                    print(idr, '/', len(drives))
                    print(title)
                    print(ifile)
                    #print(bsdata)
                    raise ValueError("No valid headline in {0}".format(headlines))
                    # NOTE(review): unreachable after the raise above.
                    print("No valid headline in {0}".format(headlines))
                    continue
                ## Analyze Play-by-Play
                try:
                    driveList = drive.find("ul", {"class": "drive-list"})
                    plays = driveList.findAll("li")
                except:
                    raise ValueError("Could not find drive list in drive {0}".format(drive))
                driveData = []
                for ip, play in enumerate(plays):
                    ## Check for Starting Position
                    startPos = play.find("h3")
                    if startPos is None:
                        raise ValueError("Could not find Starting Position in Play! {0}".format(play))
                    startData = startPos.text.strip()
                    ## Check for Play Text
                    span = play.find("span", {"class": "post-play"})
                    if span is None:
                        raise ValueError("Could not find post play data! {0}".format(play))
                    playData = span.text.strip()
                    driveData.append({"Play": ip, "Start": startData, "Data": playData})
                    #print(idr,'\t',ip,'\t',startData,'\t',playData)
                ## Save Drive Data
                gameData["Plays"].append({"Drive": len(gameData), "Headline": headlines, "Detail": details, "HomeScore": homescores, "AwayScore": awayscores, "Possession": possessions, "Data": driveData})
                if verydebug:
                    print(idr, '\t', headlines)
                    print(idr, '\t', details)
                    print(idr, '\t', homescores)
                    print(idr, '\t', awayscores)
                    print(idr, '\t', possessions)
                    print("")
            if verydebug:
                print("Found {0} drives for gameID {1}".format(len(gameData), gameID))
            yearData[gameID] = gameData
        print("Parsed {0}/{1} games in {2}".format(len(yearData), len(files), year))
        savename = setFile(self.getGamesResultsDir(), "{0}-games.p".format(year))
        saveFile(idata=yearData, ifile=savename, debug=True)
    return noData
def parseArtistModValCreditFiles(self, modVal, dbdata=None, debug=False, force=False):
    """
    Merge per-artist credit files for one modVal into the artist DB.

    Parameters
    ----------
    modVal : int/str
        Mod-value bucket whose credit directory is scanned.
    dbdata : dict or None
        Existing artistID -> artist-data map. If None, the on-disk
        "{modVal}-DB.p" database is loaded, updated, and saved back;
        if given, the updated dict is returned instead of being saved.
    debug : bool
        Print per-file merge details.
    force : bool
        Print progress every 500 files.

    Returns
    -------
    dict or int
        The updated dbdata when a dict was passed in; otherwise the
        count of newly merged media entries (after saving, if any).
    """
    print("\t", "="*100)
    print("\t", "Parsing Artist Credit Files For ModVal {0}".format(modVal))
    artistInfo = self.artist

    artistDir = self.disc.getArtistsDir()
    artistDBDir = self.disc.getArtistsDBDir()

    # Credit files live in <artistsDir>/<modVal>/credit/*.p
    dirVal = setDir(artistDir, str(modVal))
    dirVal = setDir(dirVal, "credit")
    files = findExt(dirVal, ext='.p')
    if len(files) == 0:
        return dbdata
    print("\t", " Found {0} credit files for ModVal {1}".format(len(files), modVal))

    dbname = setFile(artistDBDir, "{0}-DB.p".format(modVal))

    # retdbdata: caller supplied the db, so return it rather than saving.
    retdbdata = False
    if dbdata is None:
        print("\t", " Loaded ", end="")
        dbdata = getFile(dbname, version=3)
        print("\t", "{0} artist IDs.".format(len(dbdata)))
    else:
        retdbdata = True

    saveIt = 0  # number of media entries merged in (triggers a save when > 0)

    print("\t", "{0} artist IDs.".format(len(dbdata)))
    for j, ifile in enumerate(files):
        if force is True:
            if j % 500 == 0:
                print("\t", "\tProcessed {0}/{1} files.".format(j, len(files)))
        if debug:
            print("\t", "{0}/{1} -- {2}.".format(j, len(files), ifile))
        info = artistInfo.getData(ifile)
        artistID = info.ID.ID

        currentKeys = []
        if dbdata.get(artistID) is not None:
            currentKeys = list(dbdata[artistID].media.media.keys())
        else:
            # New artist: store the whole record and move on.
            dbdata[artistID] = info
            saveIt += 1
            continue

        # Media count before the merge, for the debug report below.
        # (Bug fix: this was commented out in the original, so the
        # debug print at the end raised NameError on 'currentMedia'.)
        currentMedia = sum([len(x) for x in dbdata[artistID].media.media.values()])

        # Union of media categories from the new file and the stored record.
        keys = list(set(list(info.media.media.keys()) + currentKeys))
        for k in keys:
            v = info.media.media.get(k)
            if v is None:
                continue
            # De-duplicate by media code; new entries win on collision.
            iVal = {v2.code: v2 for v2 in v}
            dVal = dbdata[artistID].media.media.get(k)
            if dVal is None:
                Tretval = iVal
                saveIt += len(iVal)
            else:
                Tretval = {v2.code: v2 for v2 in dVal}
                Tretval.update(iVal)
                saveIt += len(iVal)
            dbdata[artistID].media.media[k] = list(Tretval.values())

        if debug:
            print("\t", "File:", j, " \tArtist:", artistID, '-->', currentMedia, 'to',
                  sum([len(x) for x in dbdata[artistID].media.media.values()]))

    if retdbdata is True:
        return dbdata

    if saveIt > 0:
        savename = setFile(artistDBDir, "{0}-DB.p".format(modVal))
        print("\t", "Saving {0} artists to {1}".format(len(dbdata), savename))
        print("\t", "Saving {0} new (credit) artist media to {1}".format(saveIt, savename))
        dbNumAlbums = sum([self.getArtistNumAlbums(artistData) for artistData in dbdata.values()])
        print("\t", "Saving {0} total (credit) artist media".format(dbNumAlbums))
        saveFile(idata=dbdata, ifile=savename)
        self.createArtistModValMetadata(modVal=modVal, db=dbdata, debug=debug)
        self.createArtistAlbumModValMetadata(modVal=modVal, db=dbdata, debug=debug)
    return saveIt
def parseTeamStatisticsData(self, startYear=2014, endYear=2018, debug=False, verydebug=False):
    """
    Parse yearly team-statistics HTML files into per-player stat tables
    and save one pickle of results per year.

    Parameters
    ----------
    startYear, endYear : int
        Inclusive range of seasons to process.
    debug, verydebug : bool
        Currently unused in the body; kept for interface compatibility.

    Raises
    ------
    ValueError
        When table names/tables mismatch or a row cannot be parsed.
    """
    for year in range(startYear, endYear + 1):
        yearData = {}
        statsDir = self.getYearlyStatisticsDir(year)
        files = findExt(statsDir, ext=".p", debug=False)
        for i, ifile in enumerate(files):
            teamStatistics = {}
            print(ifile)
            htmldata = getFile(ifile)
            bsdata = getHTML(htmldata)

            divs = bsdata.findAll("div", {"class": "Table2__Title"})
            tableNames = [x.text for x in divs]
            tables = bsdata.findAll("table", {"class": "Table2__table__wrapper"})

            ## Skip the team leaders table (first title has no matching table)
            tableNames = tableNames[1:]
            if len(tables) != len(tableNames):
                for it, table in enumerate(tables):
                    ths = table.findAll("th")
                    headers = [x.text for x in ths]
                    print(it, headers)
                raise ValueError(
                    "There are {0} tables and {1} names".format(
                        len(tables), tableNames))

            tableData = dict(zip(tableNames, tables))
            for tableName, table in tableData.items():
                ths = table.findAll("th")
                headers = [x.text for x in ths]
                trs = table.findAll("tr")[2:]

                players = {}
                playerNames = []  # bug fix: avoid NameError if a data row precedes the TOTAL row
                iData = -1
                for tr in trs:
                    linedata = [x for x in tr.strings]

                    ## Row with 3 strings: "name / team / position"
                    if len(linedata) == 3:
                        try:
                            name = linedata[0]
                            position = linedata[2]
                        except:
                            raise ValueError(
                                "Could not parse line data: {0}".format(linedata))
                        key = ":".join([name, position])
                        players[key] = None
                    ## Single-cell row marks the TOTAL line; snapshot player order
                    elif len(linedata) == 1:
                        players["TOTAL:ALL"] = None
                        playerNames = list(players.keys())
                    ## Stat row: first one is the column header, the rest are data
                    elif len(linedata) == len(headers) - 1:
                        if iData == -1:
                            header = linedata
                            iData += 1
                            continue
                        try:
                            playerData = dict(zip(header, linedata))
                        except:
                            raise ValueError(
                                "Could not combine header [{0}] with data [{1}]"
                                .format(header, linedata))
                        try:
                            players[playerNames[iData]] = playerData
                        except:
                            raise ValueError(
                                "Could not set data for [{0}] with data: {1}"
                                .format(iData, playerData))
                        iData += 1

                teamStatistics[tableName] = players

            ## Bug fix: was yearData[year] = teamStatistics, which overwrote
            ## the entry on every file so only the last team survived. Key by
            ## source file so each game/team's stats are kept — matches the
            ## "Parsed N/M games" report below. (NOTE(review): downstream
            ## readers of "{year}-stats.p" should be checked for the old key.)
            yearData[ifile] = teamStatistics

        print("Parsed {0}/{1} games in {2}".format(len(yearData), len(files), year))
        savename = setFile(self.getStatisticsResultsDir(), "{0}-stats.p".format(year))
        saveFile(idata=yearData, ifile=savename, debug=True)