Exemplo n.º 1
0
    def getArtistModValFiles(self, modVal, previousDays=5, force=False):
        """Select the artist files of one mod value that need to be parsed.

        Candidates are the ``.p`` files found under ``<artists dir>/<modVal>``.
        When the ``<modVal>-DB.p`` file does not exist, or ``force`` is True,
        every candidate is returned. Otherwise a candidate is selected when it
        was modified fewer than ``previousDays`` days ago, or after the DB
        file was last modified.

        Returns:
            The selected file names. The incremental selection passes through
            a set, so its order is not meaningful.
        """
        artistDir = self.disc.getArtistsDir()
        maxModVal = self.disc.getMaxModVal()  # value is not used in this method
        artistDBDir = self.disc.getArtistsDBDir()

        modValDir = setDir(artistDir, str(modVal))
        candidates = findExt(modValDir, ext='.p')
        dbFile = setFile(artistDBDir, "{0}-DB.p".format(modVal))

        now = datetime.now()

        # Modification time of the DB file; None means "parse everything".
        dbStamp = None
        if isFile(dbFile):
            dbStamp = datetime.fromtimestamp(path.getmtime(dbFile))
            if force is True:
                dbStamp = None

        if dbStamp is None:
            print("  ===> Parsing all {0} files for modval {1}".format(len(candidates), modVal))
            return candidates

        def modifiedAt(fname):
            return datetime.fromtimestamp(path.getmtime(fname))

        youngFiles = [fname for fname in candidates if (now - modifiedAt(fname)).days < previousDays]
        postDBFiles = [fname for fname in candidates if modifiedAt(fname) > dbStamp]
        selected = list(set(youngFiles).union(set(postDBFiles)))
        print("  ===> Found new {0} files (< {1} days) to parse for modval {2}".format(len(selected), previousDays, modVal))
        return selected
Exemplo n.º 2
0
    def processAACTACategoryData(self, debug=False):
        """Merge all parsed AACTA category files into one per-year movie list.

        Every file found in the data directory is parsed with
        ``parseAACTACategoryData`` (the base file name is passed as the
        category). The movies of all categories are pooled per year,
        de-duplicated, and each one is stored as ``[movie, 10]``. The result
        is saved as ``<self.name>.json`` in the results directory.

        Raises:
            ValueError: if a file yields no results.
        """
        from collections import OrderedDict

        files = findExt(self.getDataDir(), ext="*.p")
        movies = OrderedDict()
        print(files)

        for ifile in files:
            if debug:
                print("Processing {0}".format(ifile))
            results = self.parseAACTACategoryData(ifile,
                                                  getBaseFilename(ifile),
                                                  debug=debug)
            if len(results) == 0:
                raise ValueError("No results for {0}".format(ifile))

            for year, yearData in results.items():
                # A year only gets an entry once it has at least one category.
                for categoryData in yearData.values():
                    movies.setdefault(year, []).extend(categoryData)

        # De-duplicate through a set and attach the constant weight of 10.
        for year in movies.keys():
            movies[year] = [[movie, 10] for movie in set(movies[year])]

        savename = setFile(self.getResultsDir(), "{0}.json".format(self.name))
        print("Saving {0} Years of AACTA Data to {1}".format(
            len(movies), savename))
        saveFile(savename, movies)
Exemplo n.º 3
0
    def findSearchTerms(self, minCnts=25):
        """Return artist names that occur often enough to be searched for.

        All ``.p`` files in the mod-value directories are read (files ending
        in ``datPiffKnown.p`` are skipped). Each entry's ``"ArtistName"`` is
        split with ``self.mulArts.getArtistNames``; every resulting name is
        title-cased and counted when it is longer than one character and not
        in the known-names data loaded from ``self.knownFile``.

        Returns:
            The names counted at least ``minCnts`` times, most common first.
        """
        from collections import Counter

        nameCounts = Counter()
        known = getFile(self.knownFile)

        perDirFiles = [findExt(dirval, ext='.p') for dirval in self.getModValDirs()]
        for ifile in getFlatList(perDirFiles):
            if ifile.endswith("datPiffKnown.p"):
                continue
            entries = getFile(ifile)
            rawNames = [entry["ArtistName"] for entry in entries]
            for rawName in rawNames:
                splitNames = self.mulArts.getArtistNames(rawName)
                for name in splitNames.keys():
                    key = name.title()
                    if len(key) <= 1 or key in known:
                        continue
                    nameCounts[key] += 1

        searchTerms = [name for name, count in nameCounts.most_common() if count >= minCnts]
        print("There are {0} new searches".format(len(searchTerms)))
        return searchTerms
Exemplo n.º 4
0
    def createRawOscarData(self, debug=True):
        """Build ``raw.yaml`` from the yearly results plus saved corrections.

        Every ``.json`` file in the wiki results directory is loaded and keyed
        by its base file name. If ``saved.yaml`` exists in the corrections
        directory, its ``"Winner"`` and ``"Nominees"`` entries overwrite the
        matching year/title entries. The merged data is written to
        ``raw.yaml`` in the corrections directory.
        """
        print("Checking for poorly parsed oscar data.")
        files = sorted(findExt(self.wikiData.getResultsDir(), ext=".json"))
        if debug:
            print("Found {0} oscar files".format(len(files)))

        yearlyData = {getBaseFilename(ifile): getFile(ifile) for ifile in files}

        correctionsFile = setFile(self.getCorrectionsDir(), "saved.yaml")
        savedData = getFile(correctionsFile) if isFile(correctionsFile) else {}

        for year, savedTitles in savedData.items():
            for title, savedEntry in savedTitles.items():
                savedWinner = savedEntry.get("Winner")
                savedNominees = savedEntry.get("Nominees")
                if savedWinner is not None:
                    print("Overwritting {0} {1} winner".format(year, title))
                    yearlyData[year][title]["Winner"] = savedWinner
                if savedNominees is not None:
                    print("Overwritting {0} {1} nominees".format(year, title))
                    yearlyData[year][title]["Nominees"] = savedNominees

        rawFile = setFile(self.getCorrectionsDir(), "raw.yaml")
        saveFile(idata=yearlyData, ifile=rawFile)
Exemplo n.º 5
0
    def parseAndDownloadTeamYearlyStandings(self):
        """Download per-team data for every team linked from the season pages.

        Each ``.p`` file in the season directory is parsed as HTML; its base
        file name is used as the season. Every ``<a>`` tag carrying a
        ``data-clubhouse-uid`` attribute contributes an (ID, name) pair taken
        from the last two path components of its ``href``. Each pair is then
        passed to ``downloadTeamDataByYear``.

        Raises:
            ValueError: if one ID appears with two different names in a file.
        """
        for ifile in findExt(self.getSeasonDir(), ext=".p", debug=False):
            year = getBaseFilename(ifile)
            bsdata = getHTML(getFile(ifile))

            idVals = {}
            for link in bsdata.findAll("a"):
                attrs = link.attrs
                if attrs.get("data-clubhouse-uid") is None:
                    continue

                href = attrs['href']
                name = getBasename(href)
                idval = getBasename(getDirname(href))

                known = idVals.get(idval)
                if known is not None and known != name:
                    raise ValueError("Error in ID for this year!")
                idVals[idval] = name

            for idVal, name in idVals.items():
                self.downloadTeamDataByYear(idVal,
                                            name,
                                            season=str(year),
                                            debug=True)
Exemplo n.º 6
0
    def processWikiFilmYearlyData(self, procYear=None, debug=False):
        """Parse the yearly WikiFilm files and save them as one JSON file.

        Args:
            procYear: when given, only files matching ``str(procYear)`` are
                processed; otherwise every ``.p`` file in the data directory.
            debug: print each file name as it is processed. The per-file
                parser is always called with ``debug=False``.

        For each file the parsed mapping is turned into a list of
        ``(key, value)`` pairs sorted by value in ascending order, keyed by
        the file's base name. The first five pairs are printed, and the
        complete result is saved as ``<self.name>.json`` in the results
        directory.
        """
        outdir = self.getDataDir()
        if procYear is None:
            files = findExt(outdir, ext=".p")
        else:
            files = findPatternExt(outdir, pattern=str(procYear), ext=".p")

        yearlyData = {}

        for ifile in sorted(files):
            if debug:
                print("Processing {0}".format(ifile))
            year = getBaseFilename(ifile)
            parsed = self.parseWikiFilmYearlyData(ifile, debug=False)

            ranked = sorted(parsed.items(),
                            key=operator.itemgetter(1),
                            reverse=False)
            yearlyData[year] = ranked

            print("---->", year,
                  " (Top 5/{0} Movies) <----".format(len(ranked)))
            for item in ranked[:5]:
                print(item)
            print('\n')

        savename = setFile(self.getResultsDir(), "{0}.json".format(self.name))
        print("Saving {0} WikiFilm data to {1}".format(len(yearlyData),
                                                       savename))
        saveFile(savename, yearlyData)
Exemplo n.º 7
0
    def parseRottenTomatoes(self, debug=False):
        """Combine all parsed Rotten Tomatoes files into yearly rankings.

        Each ``.p`` file in the data directory is parsed with
        ``parseRottenTomatoesFile``. Results for the same year are merged,
        with later files overriding earlier ones on equal keys. Every year is
        then turned into a list of ``(key, value)`` pairs sorted by value in
        descending order, the first five are printed, and everything is saved
        to ``rottentomatoes.json`` in the results directory.
        """
        files = findExt(self.getDataDir(), ext=".p")

        movies = {}
        for ifile in files:
            parsed = self.parseRottenTomatoesFile(ifile, debug=debug)
            for year, yearlyResult in parsed.items():
                earlier = movies.get(year)
                if earlier is None:
                    movies[year] = yearlyResult
                else:
                    movies[year] = {**earlier, **yearlyResult}

        yearlyData = {}
        for year, yearMovies in movies.items():
            ranking = sorted(yearMovies.items(),
                             key=operator.itemgetter(1),
                             reverse=True)
            yearlyData[year] = ranking
            print("---->", year,
                  " (Top 5/{0} Movies) <----".format(len(ranking)))
            for entry in ranking[:5]:
                print(entry)
            print('\n')

        savename = setFile(self.getResultsDir(), "rottentomatoes.json")
        print("Saving", len(yearlyData), "yearly results to", savename)
        saveFile(savename, yearlyData)
Exemplo n.º 8
0
    def processFlopsData(self, debug=False):
        """Extract (year, movie) pairs from the saved "flops" HTML tables.

        Every ``.html`` file in the data directory is parsed. For each
        ``<table class="wikitable">`` the header texts of the first row are
        printed, then every row from the third one onward is read: the movie
        title comes from the row's first ``<th>`` and the year from its first
        ``<td>``. Movies are grouped by year, stored as ``[movie, 10]`` in
        ascending year order, and saved as ``<self.name>.json`` in the
        results directory.

        Args:
            debug: accepted for interface compatibility; not used here.

        Raises:
            ValueError: if a table has no header row, or a row lacks a movie
                title or an integer year. The underlying error is chained.
        """
        from collections import OrderedDict

        outdir = self.getDataDir()
        files = findExt(outdir, ext=".html")

        yearlyData = {}
        for ifile in files:
            htmldata = getFile(ifile)
            bsdata = getHTML(htmldata)

            for table in bsdata.findAll("table", {"class": "wikitable"}):
                trs = table.findAll("tr")

                # Catch Exception rather than using a bare except so that
                # KeyboardInterrupt/SystemExit are not turned into ValueError.
                try:
                    headers = [th.text.replace("\n", "")
                               for th in trs[0].findAll("th")]
                except Exception as err:
                    raise ValueError("Could not get headers") from err

                print(headers)

                # The first two rows are skipped; data rows start at index 2.
                for tr in trs[2:]:
                    ths = tr.findAll("th")
                    try:
                        movie = ths[0].text
                        movie = movie.replace("\n", "").strip()
                        movie = movie.replace("[nb 2]", "")
                    except Exception as err:
                        raise ValueError(
                            "Could not find movie in {0}".format(ths)) from err

                    tds = tr.findAll("td")
                    try:
                        year = int(tds[0].text)
                    except Exception as err:
                        raise ValueError(
                            "Could not find year in {0}".format(tds)) from err

                    print(year, '\t', movie)
                    yearlyData.setdefault(year, []).append(movie)

        movies = OrderedDict()
        for year in sorted(yearlyData.keys()):
            movies[year] = [[movie, 10] for movie in yearlyData[year]]

        savename = setFile(self.getResultsDir(), "{0}.json".format(self.name))
        print("Saving {0} Years of flops Data to {1}".format(
            len(movies), savename))
        saveFile(savename, movies)
Exemplo n.º 9
0
    def downloadKWorbSpotifyYouTubeArtists(self, update=False):
        """Read the kworb YouTube archive page and print its table rows.

        Args:
            update: when True, the archive page is (re)downloaded to
                ``kworb_youtubeartists.p`` before it is read.

        Every ``<table>`` of the saved page is converted to one dict per row
        (all rows but the first), mapping the table's ``<th>`` texts to the
        row's ``<td>`` elements. The artists directory and the collected rows
        are printed; nothing is returned.

        The former ``if False:`` branch (per-artist page crawling) was
        unreachable and has been removed.
        """
        url = "https://kworb.net/youtube/archive.html"
        savename = "kworb_youtubeartists.p"
        if update is True:
            self.dutils.downloadArtistURL(url=url,
                                          savename=savename,
                                          force=True)

        bsdata = getHTML(savename)
        artistDir = self.disc.getArtistsDir()
        # NOTE(review): the result is unused; the call is kept in case setDir
        # creates the directory as a side effect - confirm and drop if not.
        setDir(artistDir, "youtube")
        print(artistDir)

        data = []
        for table in bsdata.findAll("table"):
            ths = [th.text for th in table.findAll("th")]
            for tr in table.findAll("tr")[1:]:
                data.append(dict(zip(ths, tr.findAll("td"))))

        print(data)
Exemplo n.º 10
0
    def parseArtistFiles(self, force=False, debug=False):
        """Build and save the per-mod-value artist databases from search files.

        Each search-result file (files ending in ``datPiffKnown.p`` are
        skipped) holds entries with ``"ArtistName"``, ``"AlbumName"`` and
        ``"AlbumURL"``. The artist string is split into individual names;
        each title-cased name gets an artist record, grouped by mod value, to
        which the album is appended as media. Every mod-value group is then
        parsed through ``self.artist``, saved as ``<modVal>-DB.p`` in the
        artists DB directory, and its metadata is regenerated.

        Args:
            force: accepted for interface compatibility; not used here.
            debug: print progress and forward to the metadata builders.
        """
        from glob import glob

        artistDir = self.disc.getArtistsDir()  # value is not used below

        artistDBData = {}

        files = findExt(self.knownDir, ext='.p')
        # NOTE(review): this hard-coded path replaces the list found above.
        files = glob("/Volumes/Biggy/Discog/artists-datpiff/*/*.p")
        print("Found {0} downloaded search terms".format(len(files)))
        for fileIdx, ifile in enumerate(files):
            if ifile.endswith("datPiffKnown.p"):
                continue
            fileresults = getFile(ifile)
            if debug:
                print(fileIdx, '/', len(files), '\t', ifile)
            for entryIdx, entry in enumerate(fileresults):
                if debug:
                    print("  ", entryIdx, '/', len(fileresults))
                mixArtists = entry["ArtistName"]
                albumName = entry["AlbumName"]
                albumURL = entry["AlbumURL"]

                splitNames = self.mulArts.getArtistNames(mixArtists)
                mixArtistNames = [name.title() for name in splitNames.keys()]

                for artistName in mixArtistNames:
                    artistID = str(self.dutils.getArtistID(artistName))
                    albumID = str(self.dutils.getArtistID(albumName))
                    modval = self.dutils.getArtistModVal(artistID)

                    modvalArtists = artistDBData.setdefault(modval, {})
                    if modvalArtists.get(artistName) is None:
                        modvalArtists[artistName] = {
                            "Name": artistName,
                            "ID": artistID,
                            "URL": None,
                            "Profile": None,
                            "Media": [],
                        }
                    modvalArtists[artistName]["Media"].append({
                        "Artists": mixArtistNames,
                        "Name": albumName,
                        "URL": albumURL,
                        "Code": albumID,
                    })

        maxModVal = self.disc.getMaxModVal()  # value is not used below
        artistDBDir = self.disc.getArtistsDBDir()
        totalSaves = 0
        for modVal, modvaldata in artistDBData.items():
            dbData = {}
            for artistData in modvaldata.values():
                self.artist.setData(artistData)
                artistVal = self.artist.parse()
                dbData[artistVal.ID.ID] = artistVal

            savename = setFile(artistDBDir, "{0}-DB.p".format(modVal))
            print("Saving {0} artist IDs to {1}".format(len(dbData), savename))
            totalSaves += len(dbData)
            saveFile(idata=dbData, ifile=savename)

            self.createArtistModValMetadata(modVal=modVal, db=dbData, debug=debug)
            self.createArtistAlbumModValMetadata(modVal=modVal, db=dbData, debug=debug)

        print("Saved {0} new artist IDs".format(totalSaves))
Exemplo n.º 11
0
def main(args):
    """Strip ``args.remove`` from the names of the found ``.mp3`` files.

    For every ``.mp3`` file reported by ``findExt`` for the current working
    directory, the text ``args.remove`` is removed from the base name and the
    result is stripped of surrounding whitespace. The file is moved only when
    the resulting path differs from the current one.
    """
    for src in findExt(getcwd(), ext=".mp3"):
        cleaned = getBasename(src).replace(args.remove, "").strip()
        dst = join(getDirname(src), cleaned)
        if src == dst:
            continue
        moveFile(src, dst, debug=True)
Exemplo n.º 12
0
    def parseUltimateMovieRankingsYearlyData(self, procYear=None, debug=False):
        """Parse the yearly Ultimate Movie Rankings pages into ranked lists.

        Args:
            procYear: when given, only files matching ``str(procYear)`` are
                processed; otherwise every ``.p`` file in the data directory.
            debug: accepted for interface compatibility; not used here.

        For each file (the base file name is the year) every table row with
        exactly 11 ``<td>`` cells is read: the film title is the text of the
        second cell with ``<span>`` tags removed and the `` (<year>)`` suffix
        stripped, and the rank is the last cell as a float, falling back to
        the second-to-last cell. Each year's films are sorted by rank in
        descending order, the first five are printed, and everything is saved
        as ``<self.name>.json`` in the results directory.

        Raises:
            ValueError: if neither of the last two cells of a row is numeric.
        """
        outdir = self.getDataDir()
        if procYear is None:
            files = findExt(outdir, ext=".p")
        else:
            files = findPatternExt(outdir, pattern=str(procYear), ext=".p")

        from collections import OrderedDict
        movieData = OrderedDict()
        for ifile in sorted(files):
            htmldata = getFile(ifile)
            bsdata = getHTML(htmldata)
            year = getBaseFilename(ifile)

            movies = {}
            for table in bsdata.findAll("table"):
                for tr in table.findAll("tr"):
                    tds = tr.findAll("td")
                    if len(tds) != 11:
                        continue

                    val = removeTag(tds[1], 'span')
                    film = val.text.replace(" ({0})".format(year), "")

                    # float() on text raises ValueError for non-numeric input;
                    # only that is handled so other errors are not masked.
                    try:
                        rank = float(tds[-1].text)
                    except ValueError:
                        try:
                            rank = float(tds[-2].text)
                        except ValueError:
                            raise ValueError(tds[-1], tds[-2], tr)

                    movies[film] = rank

            movieData[year] = movies

        yearlyData = {}
        for year in sorted(movieData.keys()):
            ranking = sorted(movieData[year].items(),
                             key=operator.itemgetter(1),
                             reverse=True)
            yearlyData[year] = ranking
            print("---->", year,
                  " (Top 5/{0} Movies) <----".format(len(ranking)))
            for item in ranking[:5]:
                print(item)
            print('\n')

        savename = setFile(self.getResultsDir(), "{0}.json".format(self.name))
        print("Saving {0} Years of Ultimate Movie Rankings data to {1}".format(
            len(yearlyData), savename))
        saveFile(savename, yearlyData)
Exemplo n.º 13
0
 def parseFilms101Data(self, debug=False):
     """Collect the films101 movies of every year and save them as JSON.

     Each ``.p`` file in the data directory (processed in sorted order, base
     file name used as the year) is parsed with ``parseFilms101YearlyData``.
     Every movie is stored as ``[movie, 10]`` and the result is saved as
     ``<self.name>.json`` in the results directory.
     """
     outdir = self.getDataDir()
     resultsdir = self.getResultsDir()  # value is not used below
     movies = {}

     for ifile in sorted(findExt(outdir, ext=".p")):
         year = getBaseFilename(ifile)
         results = self.parseFilms101YearlyData(ifile, debug=debug)
         movies[year] = [[movie, 10] for movie in results]
         print("Found {0} movies in {1}".format(len(movies[year]), year))

     savename = setFile(self.getResultsDir(), "{0}.json".format(self.name))
     print("Saving {0} Years of films101 Data to {1}".format(len(movies), savename))
     saveFile(savename, movies)
Exemplo n.º 14
0
    def downloadTeamStatisticsData(self, debug=False):
        """Download team statistics for every team of the 2014 season files.

        After a 3 second pause, each ``.p`` file in the season results
        directory is loaded. Files whose season year is not 2014 are skipped;
        for the others ``downloadTeamStatisticsDataByYear`` is called once per
        team with the team ID, team name, year and ``debug``.
        """
        files = findExt(self.getSeasonResultsDir(), ext=".p", debug=False)

        sleep(3)

        for ifile in files:
            seasonData = getFile(ifile)
            year = seasonData.getYear()
            gamesDir = self.getYearlyGamesDir(year)  # value is not used below

            if year == 2014:
                for teamID, teamData in seasonData.teams.items():
                    self.downloadTeamStatisticsDataByYear(
                        teamID, teamData.teamName, year, debug)
Exemplo n.º 15
0
 def getArtistModValExtraFiles(self, modVal, previousDays=5, force=False):
     """Select the "extra" artist files of one mod value that need parsing.

     Candidates are the ``.p`` files under ``<artists dir>/<modVal>/extra``.
     When the ``<modVal>-DB.p`` file does not exist, or ``force`` is True,
     every candidate is returned. Otherwise only candidates modified fewer
     than ``previousDays`` days ago are returned.

     Fixes over the previous version: ``lastModified`` was never assigned
     (the method raised ``NameError``), and the incremental branch stored its
     selection in ``numFiles`` while reporting and returning ``newFiles``,
     which was still ``None``.

     Returns:
         The list of selected file names.
     """
     artistDir = self.disc.getArtistsDir()
     maxModVal = self.disc.getMaxModVal()  # value is not used in this method
     artistDBDir = self.disc.getArtistsDBDir()

     dirVal = setDir(artistDir, str(modVal))
     dirVal = setDir(dirVal, "extra")
     files = findExt(dirVal, ext='.p')
     dbname = setFile(artistDBDir, "{0}-DB.p".format(modVal))

     now = datetime.now()

     # Modification time of the DB file; None means "parse everything".
     if force is True or not path.isfile(dbname):
         lastModified = None
     else:
         lastModified = datetime.fromtimestamp(path.getmtime(dbname))

     if lastModified is None:
         newFiles = files
         print("  ===> Parsing all {0} extra files for modval {1}".format(len(newFiles), modVal))
     else:
         newFiles = [ifile for ifile in files if (now-datetime.fromtimestamp(path.getmtime(ifile))).days < previousDays]
         print("  ===> Found new {0} extra files (< {1} days) to parse for modval {2}".format(len(newFiles), previousDays, modVal))
     return newFiles
Exemplo n.º 16
0
    def mergeBoxOfficeMojoResults(self, debug=False):
        """Merge the yearly result files into a single ``results.json``.

        Every ``.json`` file in the results directory whose base file name
        converts to an integer is loaded and stored under that name. The
        combined mapping is saved as ``results.json`` in the same directory.

        Args:
            debug: print progress information.
        """
        retval = {}
        files = findExt(self.getResultsDir(), ext=".json")
        if debug:
            print("Found {0} files in the results directory".format(
                len(files)))
        for ifile in sorted(files):
            year = getBaseFilename(ifile)
            # Skip files whose base name is not a year (presumably including
            # the results.json written below, which lives in this directory).
            try:
                int(year)
            except (TypeError, ValueError):
                continue
            data = getFile(ifile)
            retval[year] = data
            if debug:
                print("  Adding {0} entries from {1}".format(len(data), ifile))

        savename = setFile(self.getResultsDir(), "results.json")
        if debug:
            print("Saving", len(retval), "years of movie data to", savename)
        saveFile(savename, retval)
Exemplo n.º 17
0
    def downloadGameData(self, debug=False, verydebug=False):
        """Download the game data of every game in the 2013-2015 seasons.

        After a 5 second pause, each ``.p`` file in the season results
        directory is loaded. Seasons other than 2013, 2014 and 2015 are
        skipped. For every game of every team in a kept season,
        ``downloadGameDataByID`` is called with the game ID, the year and
        ``debug``.

        The former ``if False:`` branch (copying pages from a local archive)
        was unreachable and has been removed, together with the locals that
        only it used.

        Args:
            debug: forwarded to ``downloadGameDataByID``.
            verydebug: accepted for interface compatibility; not used here.
        """
        resultsDir = self.getSeasonResultsDir()
        files = findExt(resultsDir, ext=".p", debug=False)

        print("Sleeping for 5 seconds...")
        sleep(5)

        for ifile in files:
            seasonData = getFile(ifile)
            year = seasonData.getYear()
            if year not in (2013, 2014, 2015):
                continue
            # NOTE(review): the result is unused; the call is kept in case it
            # creates the yearly games directory - confirm and drop if not.
            self.getYearlyGamesDir(year)

            for teamID, teamData in seasonData.teams.items():
                for gameData in teamData.games:
                    gameID = gameData["Game"].gameID
                    self.downloadGameDataByID(gameID, year, debug)
Exemplo n.º 18
0
    def processWikipediaYearlyData(self, procYear=None, debug=False):
        """Parse the yearly Wikipedia Oscar pages and save one JSON per year.

        Args:
            procYear: when given, only files matching ``str(procYear)`` are
                processed; otherwise every ``.p`` file in the data directory.
            debug: print each file name and the nominees, and forward to the
                fallback parser.

        Each file is parsed with ``parseWikipediaOscarData``; if that yields
        nothing, ``parseWikipediaOscarDataSpecial`` is tried. The winner of
        every result entry is printed and the results are saved as
        ``<year>.json`` in the results directory, where the year is the
        file's base name.

        The previous version also loaded and HTML-parsed every file into
        locals that were never used; that redundant work has been removed.

        Raises:
            ValueError: if both parsers return no results for a file.
        """
        outdir = self.getDataDir()
        if procYear is None:
            files = findExt(outdir, ext=".p")
        else:
            files = findPatternExt(outdir, pattern=str(procYear), ext=".p")

        for ifile in files:
            if debug:
                print("Processing {0}".format(ifile))
            year = getBaseFilename(ifile)

            results = self.parseWikipediaOscarData(ifile, debug=False)
            if len(results) == 0:
                results = self.parseWikipediaOscarDataSpecial(ifile,
                                                              debug=debug)
            if len(results) == 0:
                raise ValueError("No results for {0}".format(ifile))

            for category, entry in results.items():
                print("====>", year, '\t', category)
                print("      Winner  :", entry["Winner"])
                if debug:
                    print("      Nominees:", entry["Nominees"])
                    print("")

            savename = setFile(self.getResultsDir(), "{0}.json".format(year))
            print("Saving {0} wikipedia oscar data to {1}".format(
                year, savename))
            saveFile(savename, results)
Exemplo n.º 19
0
    def processRollingStoneData(self, debug=False):
        """Extract (year, movie) pairs from the saved Rolling Stone pages.

        Every ``.html`` file in the data directory is parsed and the text of
        each ``<h3 class="c-list__title t-bold">`` heading is read. The year
        is the four characters before the heading's last character and the
        movie title is the slice ``[1:-8]``. Movies are grouped by year,
        stored as ``[movie, 10]`` in ascending year order, and saved as
        ``<self.name>.json`` in the results directory.

        Args:
            debug: accepted for interface compatibility; not used here.

        Raises:
            ValueError: if the year part of a heading is not an integer. The
                underlying error is chained.
        """
        from collections import OrderedDict

        outdir = self.getDataDir()
        files = findExt(outdir, ext=".html")

        yearlyData = {}
        for ifile in files:
            htmldata = getFile(ifile)
            bsdata = getHTML(htmldata)

            headings = bsdata.findAll("h3", {"class": "c-list__title t-bold"})
            titles = [h3.text.replace("\n", "").strip() for h3 in headings]
            for title in titles:
                # assumes headings end in "(YYYY)" - TODO confirm the format.
                # Slicing cannot fail, so only int() can raise here.
                try:
                    year = int(title[-5:-1])
                except ValueError as err:
                    raise ValueError(
                        "Could not get year from {0}".format(title)) from err

                movie = title[1:-8]
                print(year, '\t', movie)
                yearlyData.setdefault(year, []).append(movie)

        movies = OrderedDict()
        for year in sorted(yearlyData.keys()):
            movies[year] = [[movie, 10] for movie in yearlyData[year]]

        savename = setFile(self.getResultsDir(), "{0}.json".format(self.name))
        print("Saving {0} Years of rollingstone Data to {1}".format(
            len(movies), savename))
        saveFile(savename, movies)
Exemplo n.º 20
0
    def collect(self, hist, test=False, debug=False):
        """Assign players to teams per season and save the result as JSON.

        For every ``.p`` file in ``hist.getGamesResultsDir()`` whose year
        (the part of the base file name before the first ``-``) is 2014, 2015
        or 2016, the games data and the matching season file are loaded. The
        per-position player attributes on ``self`` are reset, then the play
        data of each game is passed to the ``get*`` helper methods.
        Afterwards each player is assigned to the team they are most often
        counted for, provided that team holds at least 75% of the player's
        counts and the square root of the total count is at least 2.

        Args:
            hist: object providing ``getGamesResultsDir``,
                ``getSeasonResultsDir`` and ``getStatisticsResultsDir``.
            test: when False, the stats are saved to
                ``<year>-stats-extra.json`` in the statistics results dir.
            debug: forwarded to ``findExt``; also prints the per-team stats.

        Raises:
            ValueError: if the year cannot be read from a file name.
        """
        files = findExt(hist.getGamesResultsDir(), ext=".p", debug=debug)
        for ifile in files:
            print(ifile)
            # The year is the part of the base file name before the first "-".
            try:
                year = int(getBaseFilename(ifile).split("-")[0])
            except:
                raise ValueError("Could not get year from {0}".format(ifile))

            # Only these seasons are processed; other files are skipped.
            if year not in [2014, 2015, 2016]:
                continue

            yearData = getFile(ifile)

            seasonFilename = setFile(hist.getSeasonResultsDir(),
                                     "{0}.p".format(year))
            seasonData = getFile(seasonFilename)

            # Per-season state: reset for every processed file.
            # NOTE(review): presumably filled in by the get* helpers called
            # below - confirm against their implementations.
            statsData = {}
            self.runners = {}
            self.passers = {}
            self.punters = {}
            self.kickers = {}
            self.fgkickers = {}

            for teamID, teamData in seasonData.teams.items():
                games = [x["Game"] for x in teamData.games]
                for game in games:
                    gameID = game.gameID

                    # Games without an entry in the yearly data are skipped.
                    # The bare except hides any lookup error, not only a
                    # missing key.
                    try:
                        gameData = yearData[gameID]
                    except:
                        continue

                    teamsMetaData = gameData["Teams"]
                    homeTeamMetaData = teamsMetaData["Home"]
                    awayTeamMetaData = teamsMetaData["Away"]
                    driveData = gameData["Plays"]

                    # Two-way lookup between team ID and abbreviation, plus
                    # "Home"/"Away" -> abbreviation.
                    fieldMap = {}
                    fieldMap[
                        homeTeamMetaData["ID"]] = homeTeamMetaData["Abbrev"]
                    fieldMap[
                        homeTeamMetaData["Abbrev"]] = homeTeamMetaData["ID"]
                    fieldMap[
                        awayTeamMetaData["ID"]] = awayTeamMetaData["Abbrev"]
                    fieldMap[
                        awayTeamMetaData["Abbrev"]] = awayTeamMetaData["ID"]

                    fieldMap["Home"] = homeTeamMetaData["Abbrev"]
                    fieldMap["Away"] = awayTeamMetaData["Abbrev"]

                    # Maps each team ID to the ID of its opponent in this game.
                    copMap = {}
                    copMap[homeTeamMetaData["ID"]] = awayTeamMetaData["ID"]
                    copMap[awayTeamMetaData["ID"]] = homeTeamMetaData["ID"]

                    # Note that getKickers receives copMap, not fieldMap.
                    self.getRunners(driveData, fieldMap, debug=False)
                    self.getPassers(driveData, fieldMap, debug=False)
                    self.getPunters(driveData, fieldMap, debug=False)
                    self.getKickers(driveData, copMap, debug=False)
                    self.getFieldGoalKickers(driveData, fieldMap, debug=False)

            ###
            ### Now Assign Player To A Team
            ###

            ### All positions (the loop variable is named passerTeams for
            ### every position, not only for passers)
            from math import sqrt
            mapping = {
                "Passers": self.passers,
                "Runners": self.runners,
                "Punters": self.punters,
                "Kickers": self.kickers,
                "FGKickers": self.fgkickers
            }
            for position, players in mapping.items():
                for name, passerTeams in players.items():
                    # (team, count) pair with the highest count; most_common
                    # implies a Counter-like value per player.
                    mc = passerTeams.most_common(1)[0]
                    # Share of the player's total count held by that team.
                    frac = mc[1] / sum(dict(passerTeams).values())
                    if frac < 0.75:
                        continue
                    # Square root of the total count; below 2 means fewer
                    # than 4 counted entries.
                    sig = sqrt(sum(dict(passerTeams).values()))
                    if sig < 2:
                        continue
                    teamID = mc[0]
                    if statsData.get(teamID) is None:
                        statsData[teamID] = {}
                    if statsData[teamID].get(position) is None:
                        statsData[teamID][position] = {}
                    statsData[teamID][position][name] = [
                        round(frac, 1), round(sig, 1)
                    ]

            ## Show team stats
            if debug:
                for teamID, teamStats in statsData.items():
                    print(teamID)
                    for pos, names in teamStats.items():
                        # The next line is a bare expression with no effect.
                        statsData[teamID]
                        print('\t', pos, names)

            if test is False:
                augmentedStatsFilename = setFile(
                    hist.getStatisticsResultsDir(),
                    "{0}-stats-extra.json".format(year))
                saveFile(idata=statsData,
                         ifile=augmentedStatsFilename,
                         debug=True)
Exemplo n.º 21
0
 def getGamesResultsFiles(self):
     """Return what ``findExt`` finds for ``.p`` in the games results dir."""
     return findExt(self.getGamesResultsDir(), ext=".p", debug=False)
Exemplo n.º 22
0
    def parseTeamYearlyStandings(self,
                                 startYear=2003,
                                 endYear=2018,
                                 debug=False,
                                 verydebug=False):
        """Parse saved team schedule pages and store one season object per year.

        For every year in ``startYear..endYear`` (inclusive) each ``.p`` file
        in ``self.getYearlySeasonDir(year)`` is loaded and parsed. Every
        usable row of its ``Table2__table`` tables becomes a ``game`` object
        that is added to a ``team`` object, and every team is added to a
        ``season`` object that is finally saved as ``<year>.p`` inside
        ``self.getSeasonResultsDir()``.

        Rows whose date cannot be parsed, or whose result cell has no
        ``<span>`` elements, are skipped.

        Args:
            startYear: First season to parse.
            endYear: Last season to parse (inclusive).
            debug: Print each team's summary. When set, the input file of a
                team that ends up with zero games is also removed.
            verydebug: Print per-file, per-table and per-game details.

        Raises:
            ValueError: If a page lacks the ``og:url`` meta tag, or a game row
                lacks a date, opponent, game type, result, parsable score,
                recognised outcome, game link, or game ID.
        """
        import re  # local import: only needed to pull the game ID from a link

        for year in range(startYear, endYear + 1):
            seasonDir = self.getYearlySeasonDir(year)
            files = findExt(seasonDir, ext=".p", debug=False)

            seasonData = season(year)

            for ifile in files:
                nameyear = getBaseFilename(ifile)
                htmldata = getFile(ifile)
                bsdata = getHTML(htmldata)
                teamName = nameyear.replace("-{0}".format(year), "")

                metadata = bsdata.find("meta", {"property": "og:url"})
                if metadata is None:
                    raise ValueError(
                        "Could not find basic team meta data for this file! {0}"
                        .format(ifile))

                # The page's own year is kept in its own name so the loop
                # variable (used for the team-name stripping above and for
                # the output file name below) is never overwritten.
                try:
                    content = metadata.attrs['content']
                    teamYear = getBasename(content)
                    teamID = getBasename(getDirname(getDirname(content)))
                except Exception as err:
                    raise ValueError(
                        "Could not get team year and ID from meta data: {0}".
                        format(metadata)) from err

                if verydebug:
                    print(teamYear, '\t', teamID, '\t', ifile)

                ## Create Team Object
                teamData = team(year=teamYear,
                                teamName=teamName,
                                teamMascot=None,
                                teamID=teamID)

                tables = bsdata.findAll("table", {"class": "Table2__table"})
                if verydebug:
                    print("\tFound {0} game tables".format(len(tables)))
                for table in tables:
                    trs = table.findAll("tr")

                    # Row 1 supplies the column names; game rows follow it.
                    headers = [x.text for x in trs[1].findAll("td")]

                    gameRows = trs[2:]
                    totalGames = len(gameRows)

                    if verydebug:
                        print("\tFound {0} potential games".format(totalGames))

                    for ig, tr in enumerate(gameRows):
                        tds = tr.findAll("td")
                        cells = dict(zip(headers, tds))

                        ## Get the Date
                        try:
                            date = cells["Date"]
                        except KeyError as err:
                            print(ifile)
                            raise ValueError(
                                "No date for this game! {0}".format(
                                    cells)) from err
                        date = date.text

                        ## Only Keep Games With Regular Dates
                        try:
                            dateval = "{0} {1}".format(
                                date.split(", ")[-1], teamYear)
                            date = getDateTime(dateval)
                        except Exception:
                            # Best effort: rows without a parsable date are
                            # skipped rather than treated as errors.
                            date = None

                        if date is None:
                            continue

                        ## Check for January Games (in the following year)
                        if date.month == 1:
                            date = addMonths(date, 12)

                        ## Get the Opponent
                        try:
                            opponent = cells["Opponent"]
                        except KeyError as err:
                            raise ValueError(
                                "No opponent for this game! {0}".format(
                                    cells)) from err

                        # Opponents without a usable link fall back to the
                        # cell text and an ID of 0.
                        try:
                            oppolink = opponent.find("a")
                            oppohref = oppolink.attrs['href']
                            opponame = getBasename(oppohref)
                            oppoID = getBasename(getDirname(oppohref))
                        except Exception:
                            opponame = opponent.text
                            oppoID = 0

                        gamespan = opponent.find("span", {"class": "pr2"})
                        if gamespan is None:
                            raise ValueError(
                                "Could not find game type from {0}".format(
                                    opponent))
                        gametype = gamespan.text

                        if gametype == "vs":
                            location = teamID
                        elif gametype == "@":
                            location = oppoID
                        else:
                            raise ValueError(
                                "Location --> {0}".format(gametype))

                        if verydebug:
                            print("\t{0}/{1}\t{2}\t{3: <4}{4: <50}".format(
                                ig, totalGames, printDateTime(date), gametype,
                                opponame),
                                  end="\t")

                        ## Get the Result
                        try:
                            result = cells["Result"]
                        except KeyError as err:
                            raise ValueError(
                                "No result for this game! {0}".format(
                                    cells)) from err

                        spans = result.findAll("span")
                        if len(spans) == 0:
                            continue
                        if len(spans) != 2:
                            raise ValueError(
                                "There are {0} spans in this row!: {1}".format(
                                    len(spans), result))
                        outcome = spans[0].text.strip()
                        score = spans[1].text.strip()

                        # Drop a trailing overtime marker before splitting
                        # the score into integers.
                        if score.endswith("OT"):
                            score = score[:-3].strip()

                        try:
                            scores = [int(x) for x in score.split('-')]
                        except ValueError as err:
                            raise ValueError(
                                "Could not create integer scores from {0}".
                                format(spans)) from err

                        if outcome == 'W':
                            teamScore = scores[0]
                            oppoScore = scores[1]
                            teamResult = "W"
                            oppoResult = "L"
                        elif outcome == "L":
                            teamScore = scores[1]
                            oppoScore = scores[0]
                            teamResult = "L"
                            oppoResult = "W"
                        elif outcome == "T":
                            teamScore = scores[0]
                            oppoScore = scores[1]
                            teamResult = "T"
                            oppoResult = "T"
                        else:
                            raise ValueError(
                                "Did not recognize game outcome {0}".format(
                                    outcome))

                        ## Get the Game
                        try:
                            gamelink = result.find("a")
                            gamehref = gamelink.attrs['href']
                        except (AttributeError, KeyError) as err:
                            raise ValueError(
                                "Could not find href in link! {0}".format(
                                    result)) from err

                        # The game ID is taken as the last run of digits in
                        # the game link.
                        # NOTE(review): assumes the link ends with the numeric
                        # game ID -- confirm against the saved pages.
                        idMatch = re.search(r"(\d+)\D*$", gamehref)
                        if idMatch is None:
                            raise ValueError(
                                "Could not find game ID in link! {0}".format(
                                    gamehref))
                        gameID = idMatch.group(1)

                        if verydebug:
                            print("{0}  {1}".format(
                                teamResult, "-".join(
                                    str(x) for x in [teamScore, oppoScore])))

                        ## Create game object
                        gameObj = game(gameID=gameID,
                                       date=date,
                                       teamA=teamID,
                                       teamB=oppoID,
                                       teamAResult=teamResult,
                                       teamBResult=oppoResult,
                                       teamAScore=teamScore,
                                       teamBScore=oppoScore,
                                       location=location)

                        ## Append game to team data
                        teamData.addGame(gameObj)

                ## Show Summary
                teamData.setStatistics()
                if debug:
                    teamData.summary()
                    if teamData.ngames == 0:
                        removeFile(ifile, debug=True)

                seasonData.addTeam(teamData)

            # Example source URL:
            # http://www.espn.com/college-football/team/schedule/_/id/201/season/2005

            savename = setFile(self.getSeasonResultsDir(),
                               "{0}.p".format(year))
            saveFile(idata=seasonData, ifile=savename, debug=True)
Exemplo n.º 23
0
    def parseGameData(self,
                      startYear=2003,
                      endYear=2018,
                      debug=False,
                      verydebug=False):
        """Parse saved play-by-play pages into one dictionary of games per year.

        For every year in ``startYear..endYear`` (inclusive) each ``.p`` file
        in ``self.getYearlyGamesDir(year)`` is parsed into a dictionary with
        the away/home team descriptions (``"Teams"``) and a list of drives
        (``"Plays"``). The games of a year are keyed by game ID (the base
        file name) and saved as ``<year>-games.p`` inside
        ``self.getGamesResultsDir()``.

        Game IDs listed in ``self.noGameData`` are skipped. Games without a
        possession container are skipped and reported in the return value.

        Args:
            startYear: First season to parse.
            endYear: Last season to parse (inclusive).
            debug: Accepted for interface compatibility; not used here.
            verydebug: Print detailed per-game and per-drive information.

        Returns:
            dict: Maps each year to the list of game IDs for which no
            possession data was found.

        Raises:
            ValueError: If a drive (other than the last one) has no
                recognised headline, or a drive/play lacks its drive list,
                starting position, or post-play text.
        """
        # Headlines accepted as a recognised drive result. These never change,
        # so they are built once per call instead of once per drive.
        validFGs = [
            "Missed FG", "Field Goal", "FIELD GOAL", "MISSED FG",
            "Made FG", "Field Goal Good", "Field Goal Missed",
            "Blocked FG"
        ]
        validTDs = [
            "Touchdown", "TOUCHDOWN", "END OF HALF Touchdown",
            "Downs Touchdown", "Missed FG Touchdown",
            "End of Half Touchdown", "End of Game Touchdown",
            "PUNT Touchdown", "FUMBLE Touchdown",
            "INTERCEPTION Touchdown", "FIELD GOAL Touchdown",
            "MISSED FG Touchdown", "Rushing Touchdown",
            "Passing Touchdown", "Kickoff Return Touchdown",
            "Interception Return Touch",
            "Turnover on Downs Touchdown",
            "Field Goal Missed Touchdown", "Field Goal Touchdown",
            "Rushing Touchdown Touchdown",
            "Field Goal Good Touchdown",
            "Passing Touchdown Touchdown",
            "Fumble Return Touchdown Touchdown", "Rushing TD",
            "Passing TD", "Blocked Punt TD", "Punt Return TD",
            "Fumble Ret. TD", "Interception TD", "Fumble TD",
            "Rushing TD Touchdown", "Blocked Punt TD Touchdown",
            "Blocked FG (TD)", "Punt Return TD Touchdown",
            "Kick Return TD", "Kickoff Return Touchdown Touchdown",
            "Missed FG (TD) Touchdown",
            "Blocked FG (TD) Touchdown",
            "Punt Return Touchdown Touchdown",
            "Interception Return Touch Touchdown"
        ]
        validEnds = [
            "End of Half", "End of Game", "END OF HALF",
            "END OF GAME", "End of 4th Quarter"
        ]
        validTOs = [
            "Fumble", "Interception", "FUMBLE", "INTERCEPTION",
            "Kickoff", "KICKOFF", "Blocked Punt"
        ]
        validTOPnts = [
            "Interception Touchdown", "Safety", "Punt Touchdown",
            "Fumble Touchdown", "Punt Return Touchdown",
            "Fumble Return Touchdown", "SAFETY"
        ]
        validDowns = [
            "Punt", "Downs", "PUNT", "Possession (For OT Drives)",
            "DOWNS", "Possession (For OT Drives) Touchdown",
            "Turnover on Downs", "Poss. on downs", "Penalty"
        ]
        validPlay = [
            "Rush", "Pass", "Sack", "Timeout", "Incomplete",
            "Pass Complete"
        ]
        valid2PT = ["2PT Pass failed", "Missed PAT Return"]
        # NOTE(review): validOdds is defined but is not part of
        # validHeadlines -- confirm whether that omission is intended.
        validOdds = ["on-side kick"]
        validHeadlines = set(validFGs + validTDs + validEnds + validTOs +
                             validTOPnts + validDowns + validPlay + valid2PT)

        noData = {}
        for year in range(startYear, endYear + 1):

            yearData = {}

            gamesDir = self.getYearlyGamesDir(year)
            files = findExt(gamesDir, ext=".p", debug=False)

            noData[year] = []
            for i, ifile in enumerate(files):
                gameID = getBaseFilename(ifile)

                if gameID in self.noGameData:
                    continue

                htmldata = getFile(ifile)
                bsdata = getHTML(htmldata)

                teamData = bsdata.findAll("div", {"class": "team-container"})

                longNames = [
                    x.find("span", {"class": "long-name"}) for x in teamData
                ]
                longNames = [x.text for x in longNames if x is not None]

                shortNames = [
                    x.find("span", {"class": "short-name"}) for x in teamData
                ]
                shortNames = [x.text for x in shortNames if x is not None]

                teamAbbrevs = [
                    x.find("span", {"class": "abbrev"}) for x in teamData
                ]
                teamAbbrevs = [x.text for x in teamAbbrevs if x is not None]

                teamIDs = [
                    x.find("img", {"class": "team-logo"}) for x in teamData
                ]
                teamIDs = [x.attrs for x in teamIDs if x is not None]
                teamIDs = [x['src'] for x in teamIDs]
                teamIDs = [re.search(r"(\d+).png", x) for x in teamIDs]
                teamIDs = [x.groups()[0] for x in teamIDs]

                # The first team container is treated as the away team and
                # the second as the home team.
                awayTeam = {
                    "Name": longNames[0],
                    "Mascot": shortNames[0],
                    "Abbrev": teamAbbrevs[0],
                    "ID": teamIDs[0]
                }
                homeTeam = {
                    "Name": longNames[1],
                    "Mascot": shortNames[1],
                    "Abbrev": teamAbbrevs[1],
                    "ID": teamIDs[1]
                }

                metadata = bsdata.find("meta", {"property": "og:title"})
                title = None
                if metadata is not None:
                    title = metadata.attrs['content']
                    if verydebug:
                        print("==> {0}".format(title))

                ## Possessions
                posData = bsdata.find("ul", {"class": "css-accordion"})
                if posData is None:
                    posData = bsdata.find("article", {"class": "play-by-play"})
                if posData is None:
                    noData[year].append(gameID)
                    if verydebug:
                        print("Could not find possession data! {0}".format(
                            gameID))
                    continue

                gameData = {
                    "Teams": {
                        "Away": awayTeam,
                        "Home": homeTeam
                    },
                    "Plays": []
                }

                if i % 10 == 0:
                    print("{0}/{1} with {2} no data games".format(
                        i, len(files), len(noData[year])))

                ###################
                ## Get Full Drive Data
                ###################

                drives = posData.findAll("li", {"class": "accordion-item"})
                if verydebug:
                    print("Drives {0}".format(len(drives)))

                for idr, drive in enumerate(drives):

                    ## Get Drive Summary
                    headlines = [
                        x.text.strip()
                        for x in drive.findAll("span", {"class": "headline"})
                    ]
                    if verydebug:
                        print("Headlines {0}".format(len(headlines)))

                    ## Get Drive Details
                    details = [
                        x.text.strip() for x in drive.findAll(
                            "span", {"class": "drive-details"})
                    ]
                    if verydebug:
                        print("Details {0}".format(len(details)))

                    ## Get Home Score
                    homescores = drive.findAll("span", {"class": "home"})
                    homescores = [
                        x.find("span", {"class": "team-score"})
                        for x in homescores
                    ]
                    homescores = [x.text for x in homescores if x is not None]
                    if verydebug:
                        print("Home Scores {0}".format(len(homescores)))

                    ## Get Away Score
                    awayscores = drive.findAll("span", {"class": "away"})
                    awayscores = [
                        x.find("span", {"class": "team-score"})
                        for x in awayscores
                    ]
                    awayscores = [x.text for x in awayscores if x is not None]
                    if verydebug:
                        print("Away Scores {0}".format(len(awayscores)))

                    ## Get Possession
                    possessions = drive.findAll("span", {"class": "home-logo"})
                    possessions = [
                        x.find("img", {"class": "team-logo"})
                        for x in possessions
                    ]
                    possessions = [
                        x.attrs['src'] for x in possessions if x is not None
                    ]
                    possessions = [x.split('&')[0] for x in possessions]
                    possessions = [getBaseFilename(x) for x in possessions]
                    if verydebug:
                        print("Possessions {0}".format(len(possessions)))

                    ## Check for valid headline (parsed correctly?)
                    if len(headlines) == 0:
                        continue
                    if headlines[0] == '':
                        continue

                    # An unrecognised headline is tolerated only on the last
                    # drive of the game.
                    hasValidHeadline = any(x in validHeadlines
                                           for x in headlines)
                    if not hasValidHeadline and idr < len(drives) - 1:
                        print(idr, '/', len(drives))
                        print(title)
                        print(ifile)
                        raise ValueError(
                            "No valid headline in {0}".format(headlines))

                    ## Analyze Play-by-Play
                    driveList = drive.find("ul", {"class": "drive-list"})
                    if driveList is None:
                        raise ValueError(
                            "Could not find drive list in drive {0}".format(
                                drive))
                    plays = driveList.findAll("li")

                    driveData = []
                    for ip, play in enumerate(plays):

                        ## Check for Starting Position
                        startPos = play.find("h3")
                        if startPos is None:
                            raise ValueError(
                                "Could not find Starting Position in Play! {0}"
                                .format(play))
                        startData = startPos.text.strip()

                        ## Check for Play Text
                        span = play.find("span", {"class": "post-play"})
                        if span is None:
                            raise ValueError(
                                "Could not find post play data! {0}".format(
                                    play))
                        playData = span.text.strip()

                        driveData.append({
                            "Play": ip,
                            "Start": startData,
                            "Data": playData
                        })

                    ## Save Drive Data
                    # "Drive" is the position of this drive among the drives
                    # stored so far for the game.
                    gameData["Plays"].append({
                        "Drive": len(gameData["Plays"]),
                        "Headline": headlines,
                        "Detail": details,
                        "HomeScore": homescores,
                        "AwayScore": awayscores,
                        "Possession": possessions,
                        "Data": driveData
                    })

                    if verydebug:
                        print(idr, '\t', headlines)
                        print(idr, '\t', details)
                        print(idr, '\t', homescores)
                        print(idr, '\t', awayscores)
                        print(idr, '\t', possessions)
                        print("")

                if verydebug:
                    print("Found {0} drives for gameID {1}".format(
                        len(gameData["Plays"]), gameID))
                yearData[gameID] = gameData

            print("Parsed {0}/{1} games in {2}".format(len(yearData),
                                                       len(files), year))
            savename = setFile(self.getGamesResultsDir(),
                               "{0}-games.p".format(year))
            saveFile(idata=yearData, ifile=savename, debug=True)

        return noData
Exemplo n.º 24
0
    def parseArtistModValCreditFiles(self, modVal, dbdata=None, debug=False, force=False):
        """Merge the "credit" artist files of one mod value into its artist DB.

        Every ``.p`` file under ``<artists dir>/<modVal>/credit`` is read with
        ``self.artist.getData``. Artists not yet in the DB are added as-is;
        for known artists each media list from the file is merged into the
        DB entry, keyed by each item's ``code`` (file entries replace DB
        entries with the same code).

        Args:
            modVal: Mod value selecting the sub-directory and the
                ``<modVal>-DB.p`` database file.
            dbdata: Already-loaded DB to update in place. When ``None`` the
                DB is loaded from disk and saved back if anything was merged.
            debug: Print per-file progress and media counts.
            force: Print a progress line every 500 files.

        Returns:
            ``dbdata`` (possibly ``None``) when no credit files exist;
            the updated ``dbdata`` when one was passed in; otherwise the
            number of merged items (one per new artist plus one per media
            item read for known artists).
        """
        print("\t","="*100)
        print("\t","Parsing Artist Credit Files For ModVal {0}".format(modVal))
        artistInfo = self.artist

        artistDir = self.disc.getArtistsDir()
        artistDBDir = self.disc.getArtistsDBDir()

        dirVal = setDir(artistDir, str(modVal))
        dirVal = setDir(dirVal, "credit")
        files  = findExt(dirVal, ext='.p')

        if len(files) == 0:
            return dbdata
        print("\t","  Found {0} credit files for ModVal {1}".format(len(files), modVal))

        dbname = setFile(artistDBDir, "{0}-DB.p".format(modVal))

        # A caller-supplied DB is handed back instead of being saved here.
        retdbdata = dbdata is not None
        if dbdata is None:
            print("\t","  Loaded ", end="")
            dbdata = getFile(dbname, version=3)
            print("\t","{0} artist IDs.".format(len(dbdata)))

        saveIt = 0

        print("\t","{0} artist IDs.".format(len(dbdata)))

        for j,ifile in enumerate(files):
            if force is True and j % 500 == 0:
                print("\t","\tProcessed {0}/{1} files.".format(j,len(files)))
            if debug:
                print("\t","{0}/{1} -- {2}.".format(j,len(files),ifile))

            info     = artistInfo.getData(ifile)
            artistID = info.ID.ID

            dbArtist = dbdata.get(artistID)
            if dbArtist is None:
                # Unknown artist: take the credit entry wholesale.
                dbdata[artistID] = info
                saveIt += 1
                continue

            dbMedia = dbArtist.media.media

            # Media count before merging; only needed for the debug line.
            currentMedia = None
            if debug:
                currentMedia = sum([len(x) for x in dbMedia.values()])

            # Snapshot the items so the loop is unaffected by writes to
            # dbMedia.
            for k, v in list(info.media.media.items()):
                if v is None:
                    continue
                iVal = {v2.code: v2 for v2 in v}
                dVal = dbMedia.get(k)
                if dVal is None:
                    merged = iVal
                else:
                    merged = {v2.code: v2 for v2 in dVal}
                    merged.update(iVal)
                saveIt += len(iVal)
                dbMedia[k] = list(merged.values())

            if debug:
                print("\t","File:",j," \tArtist:",artistID,'-->',currentMedia,'to',sum([len(x) for x in dbMedia.values()]))

        if retdbdata is True:
            return dbdata

        if saveIt > 0:
            savename = setFile(artistDBDir, "{0}-DB.p".format(modVal))
            print("\t","Saving {0} artists to {1}".format(len(dbdata), savename))
            print("\t","Saving {0} new (credit) artist media to {1}".format(saveIt, savename))
            dbNumAlbums = sum([self.getArtistNumAlbums(artistData) for artistData in dbdata.values()])
            print("\t","Saving {0} total (credit) artist media".format(dbNumAlbums))
            saveFile(idata=dbdata, ifile=savename)

            self.createArtistModValMetadata(modVal=modVal, db=dbdata, debug=debug)
            self.createArtistAlbumModValMetadata(modVal=modVal, db=dbdata, debug=debug)

        return saveIt
Exemplo n.º 25
0
    def parseTeamStatisticsData(self,
                                startYear=2014,
                                endYear=2018,
                                debug=False,
                                verydebug=False):
        """Parse saved team statistics pages and store the result per year.

        For every year in ``startYear..endYear`` (inclusive) each ``.p`` file
        in ``self.getYearlyStatisticsDir(year)`` is parsed. Each statistics
        table becomes a dictionary mapping ``"<name>:<position>"`` (plus
        ``"TOTAL:ALL"``) to that row's values keyed by column header. The
        result is saved as ``<year>-stats.p`` inside
        ``self.getStatisticsResultsDir()``.

        Args:
            startYear: First season to parse.
            endYear: Last season to parse (inclusive).
            debug: Accepted for interface compatibility; not used here.
            verydebug: Accepted for interface compatibility; not used here.

        Raises:
            ValueError: If the number of tables does not match the number of
                table titles (minus the first), or a data row cannot be
                matched to a player.
        """
        for year in range(startYear, endYear + 1):

            yearData = {}

            statsDir = self.getYearlyStatisticsDir(year)
            files = findExt(statsDir, ext=".p", debug=False)

            # Reset per year so a year without files stores an empty result
            # instead of failing or reusing the previous year's data.
            teamStatistics = {}

            for ifile in files:
                teamStatistics = {}

                print(ifile)
                htmldata = getFile(ifile)
                bsdata = getHTML(htmldata)

                divs = bsdata.findAll("div", {"class": "Table2__Title"})
                ## Skip the team leaders table
                tableNames = [x.text for x in divs][1:]

                tables = bsdata.findAll("table",
                                        {"class": "Table2__table__wrapper"})

                if len(tables) != len(tableNames):
                    for it, table in enumerate(tables):
                        headers = [x.text for x in table.findAll("th")]
                        print(it, headers)

                    raise ValueError(
                        "There are {0} tables and {1} names".format(
                            len(tables), len(tableNames)))

                tableData = dict(zip(tableNames, tables))
                for tableName, table in tableData.items():
                    headers = [x.text for x in table.findAll("th")]

                    trs = table.findAll("tr")[2:]

                    players = {}
                    # Filled once the single-cell row is seen; reset per
                    # table so names from an earlier table are never reused.
                    playerNames = []
                    # -1 until the first full-width row (used as the column
                    # header) is seen, then the index of the next player.
                    iData = -1
                    for tr in trs:
                        linedata = [x for x in tr.strings]

                        ## Get player first
                        if len(linedata) == 3:
                            name = linedata[0]
                            position = linedata[2]
                            key = ":".join([name, position])
                            players[key] = None
                        elif len(linedata) == 1:
                            players["TOTAL:ALL"] = None
                            playerNames = list(players.keys())
                        elif len(linedata) == len(headers) - 1:
                            if iData == -1:
                                header = linedata
                                iData += 1
                                continue

                            playerData = dict(zip(header, linedata))

                            try:
                                players[playerNames[iData]] = playerData
                            except IndexError as err:
                                raise ValueError(
                                    "Could not set data for [{0}] with data: {1}"
                                    .format(iData, playerData)) from err
                            iData += 1

                    teamStatistics[tableName] = players

            # NOTE(review): only the statistics of the last file parsed are
            # kept for the year -- confirm whether each file should get its
            # own entry in yearData instead.
            yearData[year] = teamStatistics

            print("Parsed {0}/{1} games in {2}".format(len(yearData),
                                                       len(files), year))
            savename = setFile(self.getStatisticsResultsDir(),
                               "{0}-stats.p".format(year))
            saveFile(idata=yearData, ifile=savename, debug=True)