def saveCorrections(self, debug=True):
    """Merge the pending corrections in ``corr.yaml`` into ``saved.yaml``.

    Both files live in ``self.getDataDir()``.  Movies that are not yet in the
    saved data are added as-is.  For movies already present, the saved and
    new correction lists are unioned (duplicates dropped) and stored when
    the union's length differs from the saved list's length.

    Args:
        debug: Print each newly added movie; also forwarded to ``saveFile``.

    Raises:
        ValueError: If the saved data cannot be loaded or written.  The
            underlying exception is attached as ``__cause__``.
    """
    corrData = getFile(setFile(self.getDataDir(), "corr.yaml"))

    try:
        savename = setFile(self.getDataDir(), "saved.yaml")
        savedData = getFile(savename)
    except Exception as err:
        raise ValueError("Could not access saved data!") from err

    if corrData is None:
        print("There is no corrections data.")
    else:
        print("Found {0} old corrections".format(len(savedData)))
        print("Found {0} new corrections".format(len(corrData)))

        for movie, corrs in corrData.items():
            known = savedData.get(movie)
            if known is None:
                if debug:
                    print("Adding {0}".format(movie))
                savedData[movie] = corrs
                continue

            # set() drops duplicates; the resulting order is not guaranteed.
            merged = list(set(known + corrs))
            if len(merged) != len(known):
                print("Adding new corrections to {0}".format(movie))
                savedData[movie] = merged

    try:
        saveFile(idata=savedData, ifile=savename, debug=debug)
        print("There are {0} total corrections".format(len(savedData)))
    except Exception as err:
        raise ValueError("There was an error saving the saved corrctions yaml file!") from err
def createRawOscarData(self, debug=True):
    """Combine the yearly Oscar result files and apply saved corrections.

    Every ``.json`` file in ``self.wikiData.getResultsDir()`` is loaded and
    keyed by its base filename.  If ``saved.yaml`` exists in the corrections
    directory, its "Winner" / "Nominees" entries overwrite the loaded ones.
    The combined data is written to ``raw.yaml`` in the corrections directory.

    Args:
        debug: Print how many result files were found.
    """
    print("Checking for poorly parsed oscar data.")
    resultFiles = sorted(findExt(self.wikiData.getResultsDir(), ext=".json"))
    if debug:
        print("Found {0} oscar files".format(len(resultFiles)))

    yearlyData = {}
    for resultFile in resultFiles:
        yearlyData[getBaseFilename(resultFile)] = getFile(resultFile)

    correctionsName = setFile(self.getCorrectionsDir(), "saved.yaml")
    savedData = getFile(correctionsName) if isFile(correctionsName) else {}

    for year, savedTitles in savedData.items():
        for title, saved in savedTitles.items():
            savedWinner = saved.get("Winner")
            savedNominees = saved.get("Nominees")
            if savedWinner is not None:
                print("Overwritting {0} {1} winner".format(year, title))
                yearlyData[year][title]["Winner"] = savedWinner
            if savedNominees is not None:
                print("Overwritting {0} {1} nominees".format(year, title))
                yearlyData[year][title]["Nominees"] = savedNominees

    rawName = setFile(self.getCorrectionsDir(), "raw.yaml")
    saveFile(idata=yearlyData, ifile=rawName)
def mergeArtistAlbumIDMap(self):
    """Fold the per-DB artist IDs into their merged IDs in the album maps.

    For each of the ``IDToAlbumNames`` and ``IDToAlbumRefs`` maps, the
    ``Artist<name>PreMerge.p`` file is loaded and converted to a dict.  For
    every entry of the merger data, the media of all source IDs is combined
    under the new ID and the source IDs are removed.  The result is saved as
    ``Artist<name>.p``.
    """
    print("="*50)
    print("")
    ts = timestat("Merging ArtistAlbumID DBs for ==> {0} <==".format(self.db))
    print("")
    print("="*50)

    mergerData = self.mam.getMergerDataByDB(self.db)

    # These counts depend only on the merger data, so compute them once.
    fromIDs = mergerData.apply(lambda x: len(x["MergeData"])).sum()
    toIDs = len(mergerData)

    for basename in ["IDToAlbumNames", "IDToAlbumRefs"]:
        savename = setFile(self.disc.getDiscogDBDir(), "Artist{0}PreMerge.p".format(basename))
        savedata = getFile(savename).to_dict()
        print("Found {0} entries.".format(len(savedata)))

        print("")
        print("================================================")
        print(" Merger From [{0}] DB IDs To [{1}] New IDs".format(fromIDs, toIDs))
        print(" Pre Merge [{0}]".format(len(savedata)))

        # .items() replaces .iteritems(), which pandas 2.0 removed.
        for _, artistData in mergerData.items():
            newID = artistData["ID"]
            dbIDs = artistData["MergeData"].keys()

            savedata[newID] = {}
            for artistID in dbIDs:
                oldMedia = savedata.get(artistID)
                if oldMedia is None:
                    continue
                for mediaName, mediaData in oldMedia.items():
                    if savedata[newID].get(mediaName) is not None:
                        savedata[newID][mediaName].update(mediaData)
                    else:
                        savedata[newID][mediaName] = mediaData

            for artistID in dbIDs:
                try:
                    del savedata[artistID]
                except KeyError:
                    print("Could not delete merged ID {0}".format(artistID))

        print(" Post Merge [{0}]".format(len(savedata)))
        print("================================================")
        print("")

        savename = setFile(self.disc.getDiscogDBDir(), "Artist{0}.p".format(basename))
        print("Saving {0} entries to {1}\n".format(len(savedata), savename))
        saveFile(ifile=savename, idata=Series(savedata), debug=True)
        sleep(0.5)

    ts.stop()
def main(args):
    """Group the files found under ``./`` by album and/or disc number.

    Each file returned by ``findPattern`` is opened with ``mp3ID``.  Files
    that cannot be read are reported and skipped.  Files without an album
    (or disc) value are not added to the album (or disc) grouping.

    Args:
        args: Object with boolean ``album`` and ``disc`` attributes.  When
            ``args.album`` is True, files are moved into a directory named
            after their album.  When ``args.disc`` is True, they are moved
            into ``Disc <n>``.  Both directories are created under the
            current working directory.
    """
    cwd = getcwd()
    albumSegments = {}
    discSegments = {}

    for ifile in findPattern("./", pattern="."):
        # The reader is only constructed inside the try so that a bad file
        # is reported and skipped instead of aborting the whole run.
        try:
            mid = mp3ID(ifile)
        except Exception:
            print("Error reading file {0}".format(ifile))
            continue

        album = mid.getAlbum()
        if album is not None:
            albumSegments.setdefault(album[0], []).append(ifile)

        disc = mid.getDiscNumber()
        if disc is not None:
            discSegments.setdefault(disc[0], []).append(ifile)

    if args.album is True:
        print("Album Segments: {0}".format(albumSegments.keys()))
        for album, albumFiles in albumSegments.items():
            albumDir = setDir(cwd, album)
            mkDir(albumDir)
            for src in albumFiles:
                dst = setFile(albumDir, getBasename(src))
                print("Moving [{0}] to [{1}]".format(src, dst))
                moveFile(src, dst, debug=True)

    if args.disc is True:
        print("Disc Segments: {0}".format(discSegments.keys()))
        for disc, discFiles in discSegments.items():
            discDir = setDir(cwd, "Disc {0}".format(disc))
            mkDir(discDir)
            for src in discFiles:
                dst = setFile(discDir, getBasename(src))
                moveFile(src, dst, debug=True)
def processWikiFilmYearlyData(self, procYear=None, debug=False):
    """Parse the yearly WikiFilm files and save the sorted results as JSON.

    Args:
        procYear: When given, only ``.p`` files in the data directory that
            match ``str(procYear)`` are processed; otherwise all ``.p``
            files are.
        debug: Print each file name as it is processed.
    """
    outdir = self.getDataDir()
    if procYear is None:
        files = findExt(outdir, ext=".p")
    else:
        files = findPatternExt(outdir, pattern=str(procYear), ext=".p")

    yearlyData = {}
    for ifile in sorted(files):
        if debug:
            print("Processing {0}".format(ifile))
        year = getBaseFilename(ifile)
        parsed = self.parseWikiFilmYearlyData(ifile, debug=False)
        # Ascending by the value stored for each movie.
        yearlyData[year] = sorted(parsed.items(), key=operator.itemgetter(1), reverse=False)
        print("---->", year, " (Top 5/{0} Movies) <----".format(len(yearlyData[year])))
        for item in yearlyData[year][:5]:
            print(item)
        print('\n')

    savename = setFile(self.getResultsDir(), "{0}.json".format(self.name))
    print("Saving {0} WikiFilm data to {1}".format(len(yearlyData), savename))
    saveFile(savename, yearlyData)
def createArtistMetadataMap(self):
    """Build the artist ID -> genre / style / collaborations maps and save them.

    Reads every ``*-ArtistMetadata*.p`` file in the albums metadata DB
    directory, collects the 'Genre', 'Artists' and 'Style' entries per
    artist ID, and writes each map as a Series to
    ``Artist<IDToGenre|IDToStyle|IDToCollaborations>.p``.
    """
    ts = timestat("Creating Artist DBs")

    genreByID = {}
    styleByID = {}
    collabsByID = {}

    metadataDir = self.disc.getAlbumsMetadataDBDir()
    for ifile in findPatternExt(metadataDir, pattern="-ArtistMetadata", ext='.p'):
        print(ifile,'\t',end="")
        for artistID, artistData in getFile(ifile).items():
            genreByID[artistID] = artistData['Genre']
            collabsByID[artistID] = artistData['Artists']
            styleByID[artistID] = artistData['Style']
        # Running total of artists seen so far.
        print(len(genreByID))
    print("\n\n==============================================\n")

    outputs = {"IDToGenre": genreByID,
               "IDToStyle": styleByID,
               "IDToCollaborations": collabsByID}
    for basename, savedata in outputs.items():
        savename = setFile(self.disc.getDiscogDBDir(), "Artist{0}.p".format(basename))
        print("Saving {0} entries to {1}\n".format(len(savedata), savename))
        saveFile(ifile=savename, idata=Series(savedata), debug=True)

    ts.stop()
def parseRottenTomatoes(self, debug=False):
    """Parse all downloaded Rotten Tomatoes files and save per-year rankings.

    Results from every ``.p`` file in the data directory are merged by year.
    When a year appears in several files, later entries override earlier
    ones for the same movie.  Each year is then sorted by value, highest
    first, and the result is written to ``rottentomatoes.json``.

    Args:
        debug: Forwarded to ``parseRottenTomatoesFile``.
    """
    movies = {}
    for ifile in findExt(self.getDataDir(), ext=".p"):
        result = self.parseRottenTomatoesFile(ifile, debug=debug)
        for year, yearlyResult in result.items():
            existing = movies.get(year)
            if existing is None:
                movies[year] = yearlyResult
            else:
                movies[year] = {**existing, **yearlyResult}

    yearlyData = {}
    for year, yearMovies in movies.items():
        ranked = sorted(yearMovies.items(), key=operator.itemgetter(1), reverse=True)
        yearlyData[year] = ranked
        print("---->", year, " (Top 5/{0} Movies) <----".format(len(ranked)))
        for item in ranked[:5]:
            print(item)
        print('\n')

    savename = setFile(self.getResultsDir(), "rottentomatoes.json")
    print("Saving", len(yearlyData), "yearly results to", savename)
    saveFile(savename, yearlyData)
def __init__(self, debug=False):
    """Set up the DatPiff helpers and verify the expected directories exist.

    Args:
        debug: Stored on the instance and forwarded to the base initializer.

    Raises:
        ValueError: If the artists directory, its ``known`` sub-directory,
            or the ``datPiffKnown.p`` file inside it is missing.
    """
    self.db = "DatPiff"
    self.disc = dbBase(self.db.lower())
    self.artist = artistDP(self.disc)
    self.dutils = datpiffUtils()
    self.dutils.setDiscogs(self.disc)
    self.debug = debug

    ## MultiArtist
    self.mulArts = multiartist()

    # The artists directory must already exist.
    print("DatPiff ArtistsDir: {0}".format(self.disc.getArtistsDir()))
    if not isDir(self.disc.getArtistsDir()):
        raise ValueError("Could not find artist dir for DatPiff")

    # ... and so must its "known" sub-directory ...
    self.knownDir = setDir(self.disc.getArtistsDir(), "known")
    if not isDir(self.knownDir):
        print("Make sure that Piggy is loaded!!!")
        raise ValueError("Could not find known [{0}] dir for DatPiff".format(self.knownDir))

    # ... and the known-file stored inside it.
    self.knownFile = setFile(self.knownDir, "datPiffKnown.p")
    if not isFile(self.knownFile):
        raise ValueError("Known File [{0}] does not exist".format(self.knownFile))

    self.baseURL = "https://www.datpiff.com/"
    self.searchURL = "https://www.datpiff.com/mixtapes-search?"

    super().__init__(self.db, self.disc, self.artist, self.dutils, debug=debug)
def processAACTACategoryData(self, debug=False):
    """Collect the AACTA movies of every category file and save them by year.

    Each category file in the data directory is parsed and its movies are
    pooled per year across all categories.  Duplicates are removed and each
    movie is stored as ``[movie, 10]``.

    Args:
        debug: Print each file name and forward the flag to the parser.

    Raises:
        ValueError: If a category file yields no results.
    """
    from collections import OrderedDict

    # NOTE(review): sibling methods pass ext=".p" to findExt; confirm that
    # "*.p" is what findExt expects here.
    files = findExt(self.getDataDir(), ext="*.p")

    movies = OrderedDict()
    print(files)
    for ifile in files:
        if debug:
            print("Processing {0}".format(ifile))
        fileCategory = getBaseFilename(ifile)
        results = self.parseAACTACategoryData(ifile, fileCategory, debug=debug)
        if len(results) == 0:
            raise ValueError("No results for {0}".format(ifile))

        for year, yearData in results.items():
            for categoryData in yearData.values():
                if movies.get(year) is None:
                    movies[year] = []
                movies[year].extend(categoryData)

    # De-duplicate each year and attach the fixed value of 10 to every movie.
    for year in movies.keys():
        uniqueMovies = list(set(movies[year]))
        movies[year] = [[movie, 10] for movie in uniqueMovies]

    savename = setFile(self.getResultsDir(), "{0}.json".format(self.name))
    print("Saving {0} Years of AACTA Data to {1}".format(
        len(movies), savename))
    saveFile(savename, movies)
def rmIDFromDB(self, artistID, modValue=None):
    """Delete one or more artist IDs from their mod-value DB file.

    Args:
        artistID: A single ID string or a list of IDs.
        modValue: Which ``<modValue>-DB.p`` file to edit.  When None it is
            derived from ``artistID`` via ``getDiscIDHashMod``.

    Raises:
        ValueError: If ``artistID`` is neither a string nor a list.
    """
    print("Trying to remove data from ArtistID {0}".format(artistID))
    if modValue is None:
        modValue = self.dutils.getDiscIDHashMod(discID=artistID, modval=self.disc.getMaxModVal())

    dbname = setFile(self.disc.getArtistsDBDir(), "{0}-DB.p".format(modValue))
    print("Loading {0}".format(dbname))
    dbdata = getFile(dbname)

    # Normalise the argument to a list of IDs.
    if isinstance(artistID, str):
        artistID = [artistID]
    elif not isinstance(artistID, list):
        raise ValueError("Not sure what to do with {0}".format(artistID))

    removedAny = False
    for ID in artistID:
        try:
            del dbdata[ID]
            print("Deleted {0}".format(ID))
            removedAny = True
        except:
            print("Not there...")
        # rmIDFiles is called whether or not the ID was in the DB.
        self.rmIDFiles(ID)

    if not removedAny:
        print("No reason to save {0}".format(dbname))
    else:
        print("Saving {0}".format(dbname))
        saveFile(idata=dbdata, ifile=dbname)
def downloadUltimateMovieRankingsYearlyData(self, year, outdir, debug=False):
    """Download the Ultimate Movie Rankings page for ``year`` into ``outdir``.

    Nothing is downloaded when ``<year>.p`` already exists.  Download
    failures are tolerated (best effort): the method just pauses briefly
    and returns.

    Args:
        year: Year to download; used in the URL and the file name.
        outdir: Directory in which ``<year>.p`` is stored.
        debug: Print the target file name and any download failure.
    """
    # The original assigned eight URL patterns in a row, so only this last
    # one ever took effect.  The seven dead assignments were dropped:
    #   {0}-top-box-office-movies/, top-grossing-movies-of-{0}/,
    #   {0}-movies/, {0}-top-grossing-movies/,
    #   biggest-box-office-hits-of-{0}/, top-grossing-{0}-movies/,
    #   ranking-{0}-movies/
    url = "https://www.ultimatemovierankings.com/best-worst-movies-{0}/".format(
        year)

    savename = setFile(outdir, str(year) + ".p")
    if isFile(savename):
        return

    if debug:
        print("Downloading/Saving {0}".format(savename))
    try:
        getWebData(base=url, savename=savename, useSafari=False)
        sleep(2)
    except Exception as err:
        if debug:
            print("Could not download {0}: {1}".format(url, err))
        sleep(0.2)
def getArtistModValFiles(self, modVal, previousDays=5, force=False):
    """Return the artist files of ``modVal`` that still need to be parsed.

    All ``.p`` files of the mod-value directory are returned when the
    ``<modVal>-DB.p`` file does not exist or ``force`` is True.  Otherwise
    only files modified within ``previousDays`` days, or modified after the
    DB file was last written, are returned.

    Args:
        modVal: Mod value selecting the artist sub-directory and DB file.
        previousDays: Age limit, in days, for a file to count as new.
        force: When True, ignore the DB timestamp and return every file.
    """
    artistDir = self.disc.getArtistsDir()
    maxModVal = self.disc.getMaxModVal()
    artistDBDir = self.disc.getArtistsDBDir()

    files = findExt(setDir(artistDir, str(modVal)), ext='.p')
    dbname = setFile(artistDBDir, "{0}-DB.p".format(modVal))

    now = datetime.now()
    lastModified = None
    if isFile(dbname) and force is not True:
        lastModified = datetime.fromtimestamp(path.getmtime(dbname))

    if lastModified is None:
        print("  ===> Parsing all {0} files for modval {1}".format(len(files), modVal))
        return files

    def modifiedAt(filename):
        return datetime.fromtimestamp(path.getmtime(filename))

    withinWindow = {ifile for ifile in files if (now - modifiedAt(ifile)).days < previousDays}
    newerThanDB = {ifile for ifile in files if modifiedAt(ifile) > lastModified}
    newFiles = list(withinWindow | newerThanDB)
    print("  ===> Found new {0} files (< {1} days) to parse for modval {2}".format(len(newFiles), previousDays, modVal))
    return newFiles
def parseArtistFiles(self, force=False, debug=False):
    """Build the per-mod-value artist DBs from the downloaded search results.

    Every result file (except ``datPiffKnown.p``) is read.  Each result is
    attributed to every artist name extracted from its "ArtistName" field,
    and the artists are grouped by mod value.  Each group is parsed, saved
    to ``<modVal>-DB.p``, and followed by the two metadata-creation steps.

    Args:
        force: Not used by this method.
        debug: Print progress and forward the flag to the metadata steps.
    """
    from glob import glob

    artistDir = self.disc.getArtistsDir()
    files = findExt(self.knownDir, ext='.p')
    # NOTE(review): the list above is immediately replaced by this
    # hard-coded, machine-specific glob - confirm whether it should be
    # derived from the artists directory instead.
    files = glob("/Volumes/Biggy/Discog/artists-datpiff/*/*.p")
    print("Found {0} downloaded search terms".format(len(files)))

    artistDBData = {}
    for i, ifile in enumerate(files):
        if ifile.endswith("datPiffKnown.p"):
            continue
        fileresults = getFile(ifile)
        if debug:
            print(i,'/',len(files),'\t',ifile)

        for j, fileresult in enumerate(fileresults):
            if debug:
                print(" ",j,'/',len(fileresults))
            mixArtists = fileresult["ArtistName"]
            albumName = fileresult["AlbumName"]
            albumURL = fileresult["AlbumURL"]

            nameMap = self.mulArts.getArtistNames(mixArtists)
            mixArtistNames = [name.title() for name in nameMap.keys()]

            for artistName in mixArtistNames:
                artistID = str(self.dutils.getArtistID(artistName))
                # The album code is produced by the same ID helper.
                albumID = str(self.dutils.getArtistID(albumName))
                modval = self.dutils.getArtistModVal(artistID)

                if artistDBData.get(modval) is None:
                    artistDBData[modval] = {}
                modvalArtists = artistDBData[modval]
                if modvalArtists.get(artistName) is None:
                    modvalArtists[artistName] = {"Name": artistName, "ID": artistID, "URL": None, "Profile": None, "Media": []}

                albumData = {"Artists": mixArtistNames, "Name": albumName, "URL": albumURL, "Code": albumID}
                modvalArtists[artistName]["Media"].append(albumData)

    maxModVal = self.disc.getMaxModVal()
    artistDBDir = self.disc.getArtistsDBDir()
    totalSaves = 0
    for modVal, modvaldata in artistDBData.items():
        dbData = {}
        for artistData in modvaldata.values():
            self.artist.setData(artistData)
            artistVal = self.artist.parse()
            dbData[artistVal.ID.ID] = artistVal

        savename = setFile(artistDBDir, "{0}-DB.p".format(modVal))
        print("Saving {0} artist IDs to {1}".format(len(dbData), savename))
        totalSaves += len(dbData)
        saveFile(idata=dbData, ifile=savename)

        self.createArtistModValMetadata(modVal=modVal, db=dbData, debug=debug)
        self.createArtistAlbumModValMetadata(modVal=modVal, db=dbData, debug=debug)

    print("Saved {0} new artist IDs".format(totalSaves))
def processFlopsData(self, debug=False):
    """Extract movie titles per year from the flops HTML tables and save them.

    Every ``.html`` file in the data directory is parsed.  Each
    ``wikitable`` is read from its third row onwards: the first ``th`` cell
    gives the title and the first ``td`` cell the year.  Each movie is
    stored as ``[movie, 10]`` under its year.

    Args:
        debug: Not used by this method.

    Raises:
        ValueError: If a table's header row, a movie title, or a year cannot
            be extracted.  The underlying exception is attached as
            ``__cause__``.
    """
    from collections import OrderedDict

    yearlyData = {}
    for ifile in findExt(self.getDataDir(), ext=".html"):
        bsdata = getHTML(getFile(ifile))
        for table in bsdata.findAll("table", {"class": "wikitable"}):
            trs = table.findAll("tr")
            try:
                headers = [th.text.replace("\n", "") for th in trs[0].findAll("th")]
            except Exception as err:
                raise ValueError("Could not get headers") from err
            print(headers)

            # Data rows start at the third row of the table.
            for tr in trs[2:]:
                ths = tr.findAll("th")
                try:
                    movie = ths[0].text
                    movie = movie.replace("\n", "").strip()
                    movie = movie.replace("[nb 2]", "")
                except Exception as err:
                    raise ValueError(
                        "Could not find movie in {0}".format(ths)) from err

                tds = tr.findAll("td")
                try:
                    year = int(tds[0].text)
                except Exception as err:
                    raise ValueError(
                        "Could not find year in {0}".format(tds)) from err

                print(year, '\t', movie)
                yearlyData.setdefault(year, []).append(movie)

    movies = OrderedDict()
    for year in sorted(yearlyData.keys()):
        movies[year] = [[movie, 10] for movie in yearlyData[year]]

    savename = setFile(self.getResultsDir(), "{0}.json".format(self.name))
    print("Saving {0} Years of flops Data to {1}".format(
        len(movies), savename))
    saveFile(savename, movies)
def downloadRottenTomatoesYearlyData(self, year, outdir, debug=False):
    """Download the Rotten Tomatoes best-of page for ``year`` into ``outdir``.

    Does nothing when ``<year>.p`` already exists in ``outdir``.

    Args:
        year: Year appended to the URL and used as the file name.
        outdir: Directory in which the page is stored.
        debug: Print the target file name before downloading.
    """
    pageURL = "https://www.rottentomatoes.com/top/bestofrt/?year=" + str(year)
    target = setFile(outdir, "{0}.p".format(year))

    if not isFile(target):
        if debug:
            print("Downloading/Saving {0}".format(target))
        getWebData(base=pageURL, savename=target, useSafari=False)
def getMyMovies(self, debug=False):
    """Load and return the contents of ``mymovies.json`` from the data dir.

    Args:
        debug: Print how many entries were loaded.

    Raises:
        ValueError: If the file does not exist.
    """
    moviesFile = setFile(self.getDataDir(), "mymovies.json")
    if isFile(moviesFile):
        mine = getFile(moviesFile)
        if debug:
            print("Found {0} my movies".format(len(mine)))
        return mine
    raise ValueError("Cannot access {0}".format(moviesFile))
def getCombinedMovies(self, debug=False):
    """Load and return ``movies.json`` from the combine results directory.

    Args:
        debug: Print how many entries were loaded.

    Raises:
        ValueError: If the file does not exist.
    """
    savename = setFile(self.combine.getResultsDir(), "movies.json")
    if not isFile(savename):
        # Was misspelled "ValueErrro", which raised NameError instead.
        raise ValueError("Cannot access {0}".format(savename))
    combinedMovies = getFile(savename)
    if debug:
        print("Found {0} combined movies".format(len(combinedMovies)))
    return combinedMovies
def __init__(self, path, chart, debug=False):
    """Load the chart data files for one chart, or for all known charts.

    Args:
        path: Directory that holds the ``current<chart>...Data.p`` files.
        chart: Chart name (lower-cased to build the file names).  When None,
            the data of every chart in the built-in list is merged, with
            later charts overriding earlier ones on equal keys.
        debug: Stored on the instance.
    """
    self.debug = debug
    self.chart = chart
    self.path = path

    fullChartPattern = "current{0}FullChartArtistAlbumData.p"
    artistAlbumPattern = "current{0}ArtistAlbumData.p"

    if chart is not None:
        key = chart.lower()
        self.fullChartData = getFile(setFile(path, fullChartPattern.format(key)))
        print("There are {0} artists in the full chart data".format(
            len(self.fullChartData)))
        self.artistAlbumData = getFile(setFile(path, artistAlbumPattern.format(key)))
        print("There are {0} artists in the artist album data".format(
            len(self.artistAlbumData)))
    else:
        allCharts = [
            "MusicVF", "Billboard", "BillboardYE", "RateYourMusic",
            "RateYourMusicSong", "RateYourMusicList", "RateYourMusicList2"
        ]
        fullChartData = {}
        artistAlbumData = {}
        for chartName in allCharts:
            print(chartName)
            key = chartName.lower()
            fullChartData.update(getFile(setFile(path, fullChartPattern.format(key))))
            print("There are {0} artists in the full chart data".format(
                len(fullChartData)))
            artistAlbumData.update(getFile(setFile(path, artistAlbumPattern.format(key))))
            print("There are {0} artists in the artist album data".format(
                len(artistAlbumData)))
        self.fullChartData = fullChartData
        self.artistAlbumData = artistAlbumData

    self.artistData = {}
    self.artistKeyToNameMap = {}
def parseArtistMetadataFiles(self, debug=False):
    """Rebuild the artist and artist-album metadata for every mod value.

    For each mod value in ``range(self.disc.getMaxModVal())``, the
    ``<modVal>-DB.p`` file is loaded and passed to both metadata steps.

    Args:
        debug: Forwarded to the metadata-creation methods.
    """
    artistDBDir = self.disc.getArtistsDBDir()
    for modVal in range(self.disc.getMaxModVal()):
        dbdata = getFile(setFile(artistDBDir, "{0}-DB.p".format(modVal)))
        self.createArtistModValMetadata(modVal=modVal, db=dbdata, debug=debug)
        self.createArtistAlbumModValMetadata(modVal=modVal, db=dbdata, debug=debug)
def downloadWikiFilmYearlyData(self, year, outdir, debug=False):
    """Download the Wikipedia "<year> in film" page into ``outdir``.

    Does nothing when ``<year>.p`` already exists.  Pauses one second after
    a download.

    Args:
        year: Year used in the URL and the file name.
        outdir: Directory in which the page is stored.
        debug: Print the URL before downloading.
    """
    pageURL = "https://en.wikipedia.org/wiki/{0}_in_film".format(year)
    target = setFile(outdir, str(year) + ".p")

    if not isFile(target):
        if debug:
            print("Downloading {0}".format(pageURL))
        getWebData(base=pageURL, savename=target, useSafari=False)
        sleep(1)
def assertDBModValExtraData(self, modVal, minPages=1, maxPages=None, allowMulti=False, test=True, clean=True):
    """Download the extra result pages of multi-page artists of one mod value.

    Artists in ``<modVal>-DB.p`` whose ``pages.more`` flag is True are
    visited page by page, from page 1 up to ``pages.pages`` (capped by
    ``maxPages``).

    NOTE(review): with the defaults (``test=True``, ``clean=True``) existing
    page files are removed but nothing is downloaded - confirm that this is
    intended.

    Args:
        modVal: Mod value selecting the DB file.
        minPages: Artists with fewer pages than this are skipped.
        maxPages: Optional upper bound on the pages visited per artist.
        allowMulti: Not used by this method.
        test: When True, only report what would be downloaded.
        clean: When True, remove an existing page file before the test /
            download step.
    """
    print("assertDBModValExtraData(",modVal,")")
    dbname = setFile(self.disc.getArtistsDBDir(), "{0}-DB.p".format(modVal))
    dbdata = getFile(dbname)

    for artistID, artistData in dbdata.items():
        headerPending = True
        pages = artistData.pages
        if pages.more is not True:
            continue

        npages = pages.pages
        if npages < minPages:
            continue
        if maxPages is not None:
            npages = min([npages, maxPages])

        artistRef = artistData.url.url
        for p in range(1, npages+1):
            # Page 1 uses the plain URL and file name; later pages add the
            # page number.
            if p == 1:
                url = self.getArtistURL(artistRef)
                savename = self.dutils.getArtistSavename(artistID)
            else:
                url = self.getArtistURL(artistRef, p)
                savename = self.dutils.getArtistSavename(artistID, p)
            print("\t---> {0} / {1} {2}".format(p, pages.pages, url))

            if clean is True and isFile(savename):
                print("Removing {0}".format(savename))
                removeFile(savename)

            if test is True:
                print("\t\tWill download: {0}".format(url))
                print("\t\tJust testing... Will not download anything.")
                continue

            if isFile(savename):
                continue

            # Print the artist summary line once, before its first download.
            if headerPending:
                print("{0: <20}{1: <10}{2}".format(artistID,pages.tot,artistData.artist.name))
                headerPending = False
            print("{0: <20}{1: <10}{2}".format(artistID, "{0}/{1}".format(p,pages.pages), url))
            self.dutils.downloadArtistURL(url=url, savename=savename, force=True)
            sleep(3)
def getArtistSavename(self, discID):
    """Return the file name under which the artist ``discID`` is stored.

    The file is ``<discID>.p`` inside the mod-value sub-directory of the
    artists directory; that sub-directory is obtained via ``mkSubDir``.

    Args:
        discID: Artist ID string.

    Returns:
        The file name, or None when no mod value could be computed.
    """
    artistDir = self.disc.getArtistsDir()
    modValue = self.discogsUtils.getDiscIDHashMod(
        discID=discID, modval=self.disc.getMaxModVal())
    if modValue is None:
        return None

    modDir = mkSubDir(artistDir, str(modValue))
    return setFile(modDir, discID + ".p")
def downloadSAGCategoryData(self, category, outdir, debug=False):
    """Download the Wikipedia page of one Screen Actors Guild Award category.

    Does nothing when ``<category>.p`` already exists in ``outdir``.  Pauses
    one second after a download.

    Args:
        category: Category suffix of the Wikipedia page name; also the file
            name.
        outdir: Directory in which the page is stored.
        debug: Print the URL before downloading.
    """
    pageURL = "https://en.wikipedia.org/wiki/Screen_Actors_Guild_Award_for_{0}".format(category)
    target = setFile(outdir, category+".p")

    if not isFile(target):
        if debug:
            print("Downloading {0}".format(pageURL))
        getWebData(base=pageURL, savename=target, useSafari=False)
        sleep(1)
def getArtistSavename(self, discID, page=1, credit=False, unofficial=False):
    """Return the file name used to store one kind of artist page.

    The base location is the mod-value sub-directory of the artists
    directory.  The first matching rule decides the final location:

    * ``page`` is an int greater than 1 -> ``extra/<discID>-<page>.p``
    * ``credit`` is True                -> ``credit/<discID>.p``
    * ``unofficial`` is True            -> ``unofficial/<discID>.p``
    * otherwise                         -> ``<discID>.p``

    Directories are obtained via ``mkSubDir``.

    Returns:
        The file name, or None when no mod value could be computed.
    """
    artistDir = self.disc.getArtistsDir()
    modValue = self.dutils.getDiscIDHashMod(discID=discID, modval=self.disc.getMaxModVal())
    if modValue is None:
        return None

    outdir = mkSubDir(artistDir, str(modValue))

    if isinstance(page, int) and page > 1:
        return setFile(mkSubDir(outdir, "extra"), discID+"-{0}.p".format(page))
    if credit is True:
        return setFile(mkSubDir(outdir, "credit"), discID+".p")
    if unofficial is True:
        return setFile(mkSubDir(outdir, "unofficial"), discID+".p")
    return setFile(outdir, discID+".p")
def parseUltimateMovieRankingsYearlyData(self, procYear=None, debug=False):
    """Parse the Ultimate Movie Rankings pages and save per-year rankings.

    In every table of every page, rows with exactly 11 ``td`` cells are
    treated as movie rows.  The title comes from the second cell with
    ``span`` tags removed and the " (<year>)" suffix stripped.  The rank is
    the last cell as a float, or the second-to-last cell when the last one
    is not numeric.

    Args:
        procYear: When given, only files matching ``str(procYear)`` are
            parsed; otherwise every ``.p`` file in the data directory is.
        debug: Not used by this method.

    Raises:
        ValueError: If neither of the last two cells of a movie row holds a
            number.
    """
    from collections import OrderedDict

    outdir = self.getDataDir()
    if procYear is None:
        files = findExt(outdir, ext=".p")
    else:
        files = findPatternExt(outdir, pattern=str(procYear), ext=".p")

    movieData = OrderedDict()
    for ifile in sorted(files):
        bsdata = getHTML(getFile(ifile))
        year = getBaseFilename(ifile)

        movies = {}
        for table in bsdata.findAll("table"):
            for tr in table.findAll("tr"):
                tds = tr.findAll("td")
                if len(tds) != 11:
                    continue

                film = removeTag(tds[1], 'span').text
                film = film.replace(" ({0})".format(year), "")

                try:
                    rank = float(tds[-1].text)
                except (TypeError, ValueError):
                    # Fall back to the second-to-last cell.
                    try:
                        rank = float(tds[-2].text)
                    except (TypeError, ValueError) as err:
                        raise ValueError(tds[-1], tds[-2], tr) from err
                movies[film] = rank

        movieData[year] = movies

    yearlyData = {}
    for year in sorted(movieData.keys()):
        yearlyData[year] = sorted(movieData[year].items(),
                                  key=operator.itemgetter(1),
                                  reverse=True)
        print("---->", year, " (Top 5/{0} Movies) <----".format(len(yearlyData[year])))
        for item in yearlyData[year][:5]:
            print(item)
        print('\n')

    savename = setFile(self.getResultsDir(), "{0}.json".format(self.name))
    print("Saving {0} Years of Ultimate Movie Rankings data to {1}".format(
        len(yearlyData), savename))
    saveFile(savename, yearlyData)
def getFilename(self, fast, local):
    """Set the ManualMultiArtists file-name attributes and return one of them.

    Sets ``localpfname`` / ``localyfname`` (bare ``.p`` / ``.yaml`` names)
    and ``pfname`` / ``yfname`` (the same names inside
    ``self.multiArtistDir``).

    Args:
        fast: When exactly True, a ``.p`` name is returned; otherwise a
            ``.yaml`` name.
        local: When exactly True, the bare name is returned; otherwise the
            name inside ``self.multiArtistDir``.

    Returns:
        The selected file name.
    """
    basename = "ManualMultiArtists"
    self.localpfname = "{0}.p".format(basename)
    self.localyfname = "{0}.yaml".format(basename)
    self.pfname = setFile(self.multiArtistDir, self.localpfname)
    self.yfname = setFile(self.multiArtistDir, self.localyfname)

    # Every branch returns, so the original trailing
    # `raise ValueError("Somehow didn't get a filename!")` was unreachable
    # and has been removed.
    if fast is True:
        return self.localpfname if local is True else self.pfname
    return self.localyfname if local is True else self.yfname
def downloadRottenTomatoesTop100Data(self, genre, outdir, debug=False):
    """Download the Rotten Tomatoes top-100 page of one genre.

    The page is stored as ``<genre>.p``.  Nothing is downloaded when that
    file already exists.  Pauses two seconds after a download.

    NOTE(review): the ``outdir`` argument is overwritten with
    ``setDir(self.getDataDir())`` before use, so the caller's value is
    ignored - confirm whether that is intended.

    Args:
        genre: Genre name inserted into the URL; also the file name.
        outdir: Ignored (see note above).
        debug: Print the target file name before downloading.
    """
    outdir = setDir(self.getDataDir())
    if not isDir(outdir):
        mkDir(outdir)

    pageURL = "https://www.rottentomatoes.com" + "/top/bestofrt/top_100_" + genre + "_movies/"
    target = setFile(outdir, genre + ".p")

    if not isFile(target):
        if debug:
            print("Downloading/Saving {0}".format(target))
        getWebData(base=pageURL, savename=target, useSafari=False, dtime=10)
        sleep(2)
def searchBoxOfficeMojo(self, movie, debug=False):
    """Print, per year, the saved entries whose title closely matches ``movie``.

    The saved ``<self.name>.json`` results are searched with
    ``findNearest`` (``num=1``, ``cutoff=0.9``).

    Args:
        movie: Title to look for.
        debug: Not used by this method.
    """
    resultsFile = setFile(self.getResultsDir(), "{0}.json".format(self.name))
    data = getFile(resultsFile)

    print("Nearest matches for {0}".format(movie))
    for year, yearlyMovies in data.items():
        titles = [entry[0] for entry in yearlyMovies]
        result = findNearest(movie, titles, num=1, cutoff=0.9)
        if len(result) == 0:
            continue
        values = [(name, value) for name, value in yearlyMovies if name in result]
        print("{0: <6}{1}".format(year, values))
def downloadFilms101YearlyData(self, year, outdir, debug=False):
    """Download the films101.com page for ``year`` into ``outdir``.

    Nothing is downloaded when ``<year>.p`` already exists.  Download
    failures are tolerated (best effort): the method simply returns.  Pauses
    two seconds after a successful download.

    Args:
        year: Year used in the URL and the file name.
        outdir: Directory in which the page is stored.
        debug: Print the target file name and any download failure.
    """
    url = "http://www.films101.com/y{0}r.htm".format(year)
    savename = setFile(outdir, "{0}.p".format(year))
    if isFile(savename):
        return

    try:
        if debug:
            print("Downloading/Saving {0}".format(savename))
        getWebData(base=url, savename=savename, useSafari=False)
    except Exception as err:
        # Still best effort, but the failure is no longer invisible and
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        if debug:
            print("Could not download {0}: {1}".format(url, err))
        return
    sleep(2)
def processBoxOfficeMojo(self, debug=False):
    """Reduce the raw Box Office Mojo results to one value per movie and year.

    ``results.json`` is read from the results directory.  For every movie
    row, a parenthesised number such as "(2010)" is stripped from the title
    (element 2).  The larger of the two currency values in elements 9 and 4
    is kept, and the maximum per title is stored.  Each year is sorted by
    that value, highest first, and everything is written to
    ``<self.name>.json``.

    Args:
        debug: Not used by this method.
    """
    outdir = self.getResultsDir()
    data = getFile(setFile(outdir, "results.json"))

    # Raw string: "\(" and "\d" are invalid escapes in a normal literal.
    yearTag = re.compile(r"\((\d+)\)")

    movies = {}
    yearlyData = {}
    for year in sorted(data.keys()):
        yearMovies = movies[year] = {}
        for wdata in data[year]:
            for mdata in wdata:
                movie = mdata[2]
                retval = yearTag.search(movie)
                if retval:
                    movie = movie.replace(retval.group(), "").strip()

                gross = convertCurrency(mdata[9])
                weekly = convertCurrency(mdata[4])
                money = max(gross, weekly)

                previous = yearMovies.get(movie)
                if previous is None:
                    yearMovies[movie] = money
                else:
                    yearMovies[movie] = max(money, previous)

        yearlyData[year] = sorted(yearMovies.items(),
                                  key=operator.itemgetter(1),
                                  reverse=True)
        print("---->", year, " (Top 25/{0} Movies) <----".format(len(yearlyData[year])))
        for item in yearlyData[year][:25]:
            print(item)
        print('\n')

    savename = setFile(outdir, "{0}.json".format(self.name))
    print("Saving", len(yearlyData), "yearly results to", savename)
    saveFile(savename, yearlyData)