def rmIDFromDB(self, artistID, modValue=None):
    """Remove one or more artist IDs from an on-disk artist DB shard.

    Args:
        artistID: a single ID (str) or a list of IDs to remove.
        modValue: shard number; derived from artistID when None.

    Raises:
        ValueError: if artistID is neither a str nor a list.

    Fix: the bare ``except:`` around the deletion is narrowed to KeyError —
    a missing key is the only expected failure there.
    """
    print("Trying to remove data from ArtistID {0}".format(artistID))
    if modValue is None:
        modValue = self.dutils.getDiscIDHashMod(discID=artistID, modval=self.disc.getMaxModVal())
    artistDBDir = self.disc.getArtistsDBDir()
    dbname = setFile(artistDBDir, "{0}-DB.p".format(modValue))
    print("Loading {0}".format(dbname))
    dbdata = getFile(dbname)
    saveVal = False
    # Normalize to a list so single IDs and batches share one code path.
    if isinstance(artistID, str):
        artistID = [artistID]
    elif not isinstance(artistID, list):
        raise ValueError("Not sure what to do with {0}".format(artistID))
    for ID in artistID:
        try:
            del dbdata[ID]
            print("Deleted {0}".format(ID))
            saveVal = True
        except KeyError:
            print("Not there...")
        self.rmIDFiles(ID)
    # Only rewrite the shard if at least one ID was actually removed.
    if saveVal:
        print("Saving {0}".format(dbname))
        saveFile(idata=dbdata, ifile=dbname)
    else:
        print("No reason to save {0}".format(dbname))
def saveCorrections(self, debug=True):
    """Merge new corrections from corr.yaml into saved.yaml, deduplicating per movie.

    Fixes: removed the unreachable ``savedData = {}`` that followed a raise;
    narrowed the bare excepts and chained the causes; corrected the
    "corrctions" typo in the final error message.
    """
    corrsavename = setFile(self.getDataDir(), "corr.yaml")
    corrData = getFile(corrsavename)
    try:
        savename = setFile(self.getDataDir(), "saved.yaml")
        savedData = getFile(savename)
    except Exception as e:
        raise ValueError("Could not access saved data!") from e
    if corrData is None:
        print("There is no corrections data.")
    else:
        print("Found {0} old corrections".format(len(savedData)))
        print("Found {0} new corrections".format(len(corrData)))
        for movie, corrs in corrData.items():
            if savedData.get(movie) is None:
                if debug:
                    print("Adding {0}".format(movie))
                savedData[movie] = corrs
            else:
                # Union of old and new corrections; only rewrite on change.
                newSaved = list(set(savedData[movie] + corrs))
                if len(newSaved) != len(savedData[movie]):
                    print("Adding new corrections to {0}".format(movie))
                    savedData[movie] = newSaved
    try:
        savename = setFile(self.getDataDir(), "saved.yaml")
        saveFile(idata=savedData, ifile=savename, debug=debug)
        print("There are {0} total corrections".format(len(savedData)))
    except Exception as e:
        raise ValueError("There was an error saving the saved corrections yaml file!") from e
def processWikiFilmYearlyData(self, procYear=None, debug=False):
    """Parse yearly WikiFilm pickle files, rank each year's movies, and save JSON.

    Args:
        procYear: restrict processing to a single year; all years when None.
        debug: print per-file progress.

    Fix: ``procYear == None`` replaced with ``procYear is None``.
    """
    outdir = self.getDataDir()
    if procYear is None:
        files = findExt(outdir, ext=".p")
    else:
        files = findPatternExt(outdir, pattern=str(procYear), ext=".p")
    from collections import OrderedDict
    movies = OrderedDict()
    yearlyData = {}
    for ifile in sorted(files):
        if debug:
            print("Processing {0}".format(ifile))
        year = getBaseFilename(ifile)
        movies[year] = self.parseWikiFilmYearlyData(ifile, debug=False)
        # NOTE(review): reverse=False here (ascending) while sibling methods
        # sort descending — confirm intended ordering.
        yearlyData[year] = sorted(movies[year].items(), key=operator.itemgetter(1), reverse=False)
        print("---->", year, " (Top 5/{0} Movies) <----".format(len(yearlyData[year])))
        for item in yearlyData[year][:5]:
            print(item)
        print('\n')
    savename = setFile(self.getResultsDir(), "{0}.json".format(self.name))
    print("Saving {0} WikiFilm data to {1}".format(len(yearlyData), savename))
    saveFile(savename, yearlyData)
def parse(self, expr, force=False, debug=False, quiet=False):
    """Parse raw HTML artist files matching expr and save each under its artist ID.

    Fix: the debug progress print used the 0-based ``i`` while the timestat
    update used ``i+1``; both now report 1-based counts consistently.
    """
    ts = timestat("Parsing Raw HTML Files")
    tsFiles = timestat("Finding Files To Parse")
    newFiles = self.getArtistRawHTMLFiles(expr, force)
    tsFiles.stop()
    if debug:
        print("Parsing {0} Raw HTML Files From Expr[{1}]".format(len(newFiles), expr))
    N = len(newFiles)
    # Coarser progress cadence for large batches.
    modValue = 250 if N >= 500 else 50
    tsParse = timestat("Parsing {0} Raw HTML Files".format(N))
    for i, ifile in enumerate(newFiles):
        if (i + 1) % modValue == 0 or (i + 1) == N or debug:
            tsParse.update(n=i + 1, N=N)
        if debug:
            print("{0}/{1}\tParsing {2}".format(i + 1, N, ifile))
        htmldata = getFile(ifile)
        retval = self.artist.getData(ifile)
        artistID = retval.ID.ID
        if debug:
            print(" ---> ID={0}".format(artistID))
        savename = self.dutils.getArtistSavename(artistID)
        saveFile(idata=htmldata, ifile=savename, debug=False)
    tsParse.stop()
    ts.stop()
def createRawOscarData(self, debug=True):
    """Rebuild raw.yaml oscar data, overlaying manual fixes from saved.yaml.

    Fixes: iterate nested dicts with ``.items()`` instead of keys+indexing;
    corrected the "Overwritting" typo in the progress messages.
    """
    print("Checking for poorly parsed oscar data.")
    indir = self.wikiData.getResultsDir()
    files = sorted(findExt(indir, ext=".json"))
    if debug:
        print("Found {0} oscar files".format(len(files)))
    yearlyData = {}
    for ifile in files:
        year = getBaseFilename(ifile)
        yearlyData[year] = getFile(ifile)
    savename = setFile(self.getCorrectionsDir(), "saved.yaml")
    savedData = getFile(savename) if isFile(savename) else {}
    # Apply saved (manual) winner/nominee overrides on top of the parsed data.
    for year, titles in savedData.items():
        for title, corrections in titles.items():
            savedWinner = corrections.get("Winner")
            savedNominees = corrections.get("Nominees")
            if savedWinner is not None:
                print("Overwriting {0} {1} winner".format(year, title))
                yearlyData[year][title]["Winner"] = savedWinner
            if savedNominees is not None:
                print("Overwriting {0} {1} nominees".format(year, title))
                yearlyData[year][title]["Nominees"] = savedNominees
    savename = setFile(self.getCorrectionsDir(), "raw.yaml")
    saveFile(idata=yearlyData, ifile=savename)
def parse(self, expr, force=False, debug=False, quiet=False):
    """Parse newly downloaded raw files and store each under its artist-ID savename."""
    totalTimer = timestat("Parsing Raw Files")
    findTimer = timestat("Finding Files To Parse")
    pending = self.getArtistRawFiles(datatype=self.datatype, expr=expr, force=force)
    findTimer.stop()
    nFiles = len(pending)
    parseTimer = timestat("Parsing {0} New Raw Files".format(nFiles))
    created = 0
    # Coarser progress cadence for big batches.
    step = 250 if nFiles >= 500 else 50
    for idx, rawFile in enumerate(pending, start=1):
        if idx % step == 0 or idx == nFiles:
            parseTimer.update(n=idx, N=nFiles)
        contents = getFile(rawFile)
        parsed = self.artist.getData(rawFile)
        aid = parsed.ID.ID
        if aid is None:
            continue
        target = self.dutils.getArtistSavename(aid)
        if target is None:
            continue
        saveFile(idata=contents, ifile=target, debug=False)
        created += 1
    print("Created {0}/{1} New Artist Files".format(created, nFiles))
    parseTimer.stop()
def processAACTACategoryData(self, debug=False):
    """Aggregate AACTA per-category files into yearly movie lists and save JSON.

    Fixes: ``findExt(outdir, ext="*.p")`` changed to ``".p"`` for consistency
    with every sibling method; the inner loop variable no longer shadows the
    file-level ``category``.
    """
    outdir = self.getDataDir()
    files = findExt(outdir, ext=".p")
    from collections import OrderedDict
    movies = OrderedDict()
    print(files)
    for ifile in files:
        if debug:
            print("Processing {0}".format(ifile))
        category = getBaseFilename(ifile)
        results = self.parseAACTACategoryData(ifile, category, debug=debug)
        if len(results) == 0:
            raise ValueError("No results for {0}".format(ifile))
        for year, yearData in results.items():
            for _, categoryData in yearData.items():
                if movies.get(year) is None:
                    movies[year] = []
                for movie in categoryData:
                    movies[year].append(movie)
    for year in movies.keys():
        # Deduplicate, then assign a flat score of 10 to every movie.
        uniqueMovies = list(set(movies[year]))
        movies[year] = [[movie, 10] for movie in uniqueMovies]
    savename = setFile(self.getResultsDir(), "{0}.json".format(self.name))
    print("Saving {0} Years of AACTA Data to {1}".format(len(movies), savename))
    saveFile(savename, movies)
def parseRottenTomatoes(self, debug=False):
    """Combine per-file Rotten Tomatoes results into ranked yearly lists and save JSON."""
    dataFiles = findExt(self.getDataDir(), ext=".p")
    byYear = {}
    for dataFile in dataFiles:
        parsed = self.parseRottenTomatoesFile(dataFile, debug=debug)
        for yr, yrMovies in parsed.items():
            if byYear.get(yr) is None:
                byYear[yr] = yrMovies
            else:
                # Merge; later files win on duplicate titles.
                byYear[yr] = {**byYear[yr], **yrMovies}
    ranked = {}
    for yr in byYear.keys():
        ranked[yr] = sorted(byYear[yr].items(), key=operator.itemgetter(1), reverse=True)
        print("---->", yr, " (Top 5/{0} Movies) <----".format(len(ranked[yr])))
        for entry in ranked[yr][:5]:
            print(entry)
        print('\n')
    savename = setFile(self.getResultsDir(), "rottentomatoes.json")
    print("Saving", len(ranked), "yearly results to", savename)
    saveFile(savename, ranked)
def downloadKWorbSpotifyYouTubeArtists(self, update=False):
    """Download (optionally) and parse the KWorb YouTube artist archive table.

    Args:
        update: re-download the archive page before parsing when True.

    Fix: removed a large ``if False:`` block — dead code containing a legacy
    per-artist crawl (including an unconditional ``break``) that could never
    execute.
    """
    url = "https://kworb.net/youtube/archive.html"
    savename = "kworb_youtubeartists.p"
    if update is True:
        self.dutils.downloadArtistURL(url=url, savename=savename, force=True)
    bsdata = getHTML(savename)
    data = []
    artistDir = self.disc.getArtistsDir()
    saveDir = setDir(artistDir, "youtube")  # NOTE(review): unused in the live path; kept in case setDir creates the directory
    print(artistDir)
    # Each table row becomes a {header: td} dict.
    for table in bsdata.findAll("table"):
        ths = [th.text for th in table.findAll("th")]
        for tr in table.findAll("tr")[1:]:
            item = dict(zip(ths, tr.findAll("td")))
            data.append(item)
    print(data)
def parseArtistFiles(self, force=False, debug=False):
    """Parse downloaded DatPiff search-result files into per-modval artist DBs.

    Fix: the final print's string literal was broken across a physical line
    ("Saved {0} new artist \\nIDs"), a syntax error; it is now one literal.
    """
    from glob import glob
    artistDir = self.disc.getArtistsDir()
    artistDBData = {}
    files = findExt(self.knownDir, ext='.p')
    # NOTE(review): this hard-coded glob overrides the findExt result above —
    # looks like a debug leftover; confirm which file source is intended.
    files = glob("/Volumes/Biggy/Discog/artists-datpiff/*/*.p")
    print("Found {0} downloaded search terms".format(len(files)))
    for i, ifile in enumerate(files):
        if ifile.endswith("datPiffKnown.p"):
            continue
        fileresults = getFile(ifile)
        if debug:
            print(i, '/', len(files), '\t', ifile)
        for j, fileresult in enumerate(fileresults):
            if debug:
                print("  ", j, '/', len(fileresults))
            mixArtists = fileresult["ArtistName"]
            albumName = fileresult["AlbumName"]
            albumURL = fileresult["AlbumURL"]
            # A mixtape may credit several artists; index it under each one.
            mixArtistNames = self.mulArts.getArtistNames(mixArtists)
            mixArtistNames = [x.title() for x in mixArtistNames.keys()]
            for artistName in mixArtistNames:
                artistID = str(self.dutils.getArtistID(artistName))
                albumID = str(self.dutils.getArtistID(albumName))
                modval = self.dutils.getArtistModVal(artistID)
                if artistDBData.get(modval) is None:
                    artistDBData[modval] = {}
                if artistDBData[modval].get(artistName) is None:
                    artistDBData[modval][artistName] = {"Name": artistName, "ID": artistID, "URL": None, "Profile": None, "Media": []}
                albumData = {"Artists": mixArtistNames, "Name": albumName, "URL": albumURL, "Code": albumID}
                artistDBData[modval][artistName]["Media"].append(albumData)
    maxModVal = self.disc.getMaxModVal()
    artistDBDir = self.disc.getArtistsDBDir()
    totalSaves = 0
    # Persist each modval shard and its derived metadata.
    for modVal, modvaldata in artistDBData.items():
        dbData = {}
        for artistName, artistData in modvaldata.items():
            self.artist.setData(artistData)
            artistVal = self.artist.parse()
            dbData[artistVal.ID.ID] = artistVal
        savename = setFile(artistDBDir, "{0}-DB.p".format(modVal))
        print("Saving {0} artist IDs to {1}".format(len(dbData), savename))
        totalSaves += len(dbData)
        saveFile(idata=dbData, ifile=savename)
        self.createArtistModValMetadata(modVal=modVal, db=dbData, debug=debug)
        self.createArtistAlbumModValMetadata(modVal=modVal, db=dbData, debug=debug)
    print("Saved {0} new artist IDs".format(totalSaves))
def processFlopsData(self, debug=False):
    """Scrape flop movies from wikitable HTML files into yearly lists and save JSON.

    Fixes: bare ``except:`` clauses replaced with the specific exceptions the
    indexing/parsing can raise, with the cause chained into the ValueError.
    """
    outdir = self.getDataDir()
    files = findExt(outdir, ext=".html")
    from collections import OrderedDict
    movies = OrderedDict()
    yearlyData = {}
    for ifile in files:
        htmldata = getFile(ifile)
        bsdata = getHTML(htmldata)
        tables = bsdata.findAll("table", {"class": "wikitable"})
        for table in tables:
            trs = table.findAll("tr")
            try:
                ths = trs[0].findAll("th")
                ths = [x.text for x in ths]
                ths = [x.replace("\n", "") for x in ths]
            except (IndexError, AttributeError) as e:
                raise ValueError("Could not get headers") from e
            print(ths)
            # First two rows are headers; data starts at trs[2].
            for itr, tr in enumerate(trs[2:]):
                rowThs = tr.findAll("th")
                try:
                    movie = rowThs[0].text
                    movie = movie.replace("\n", "").strip()
                    movie = movie.replace("[nb 2]", "")
                except IndexError as e:
                    raise ValueError("Could not find movie in {0}".format(rowThs)) from e
                tds = tr.findAll("td")
                try:
                    year = int(tds[0].text)
                except (IndexError, ValueError) as e:
                    raise ValueError("Could not find year in {0}".format(tds)) from e
                print(year, '\t', movie)
                if yearlyData.get(year) is None:
                    yearlyData[year] = []
                yearlyData[year].append(movie)
    for year in sorted(yearlyData.keys()):
        movies[year] = [[movie, 10] for movie in yearlyData[year]]
    savename = setFile(self.getResultsDir(), "{0}.json".format(self.name))
    print("Saving {0} Years of flops Data to {1}".format(len(movies), savename))
    saveFile(savename, movies)
def parseUltimateMovieRankingsYearlyData(self, procYear=None, debug=False):
    """Parse Ultimate Movie Rankings yearly tables into ranked lists and save JSON.

    Fixes: ``procYear == None`` -> ``is None``; removed unused locals
    (``data``, ``done``, ``ths``) and a commented-out hard-coded path; the
    nested bare excepts around the rank parse narrowed to ValueError with
    chaining.
    """
    outdir = self.getDataDir()
    if procYear is None:
        files = findExt(outdir, ext=".p")
    else:
        files = findPatternExt(outdir, pattern=str(procYear), ext=".p")
    from collections import OrderedDict
    movieData = OrderedDict()
    for ifile in sorted(files):
        htmldata = getFile(ifile)
        bsdata = getHTML(htmldata)
        year = getBaseFilename(ifile)
        movies = {}
        for it, table in enumerate(bsdata.findAll("table")):
            for itr, tr in enumerate(table.findAll("tr")):
                tds = tr.findAll("td")
                # Data rows in these tables have exactly 11 cells.
                if len(tds) != 11:
                    continue
                val = removeTag(tds[1], 'span')
                film = val.text
                film = film.replace(" ({0})".format(year), "")
                # Rank is usually the last cell; fall back to the one before it.
                try:
                    rank = float(tds[-1].text)
                except ValueError:
                    try:
                        rank = float(tds[-2].text)
                    except ValueError as e:
                        raise ValueError(tds[-1], tds[-2], tr) from e
                movies[film] = rank
        movieData[year] = movies
    yearlyData = {}
    for year in sorted(movieData.keys()):
        yearlyData[year] = sorted(movieData[year].items(), key=operator.itemgetter(1), reverse=True)
        print("---->", year, " (Top 5/{0} Movies) <----".format(len(yearlyData[year])))
        for item in yearlyData[year][:5]:
            print(item)
        print('\n')
    savename = setFile(self.getResultsDir(), "{0}.json".format(self.name))
    print("Saving {0} Years of Ultimate Movie Rankings data to {1}".format(len(yearlyData), savename))
    saveFile(savename, yearlyData)
def parseDownloadedFiles(self):
    """Re-parse downloaded 'Discography and Albums' pages and save each under its artist ID."""
    dataDir = setDir(self.disc.getArtistsDir(), "data")
    pages = findPatternExt(dataDir, pattern="Discography and Albums", ext=".htm")
    for page in pages:
        rawHTML = getFile(page)
        parsed = self.getData(page)
        target = self.getArtistSavename(parsed.ID.ID)
        saveFile(idata=rawHTML, ifile=target, debug=True)
def parseDownloadedFiles(self, previousDays=None, force=False):
    """Trigger discovery of raw artist HTML files.

    Fixes: the original ignored its own arguments (hard-coded
    ``previousDays=None, force=False`` in the call) and contained a legacy
    parsing loop after an unconditional ``return`` — that dead code is removed.
    Returns None, as before.
    """
    artistDir = self.disc.getArtistsDir()  # NOTE(review): retained from original; result unused here
    files = self.getArtistRawHTMLFiles(previousDays=previousDays, force=force)
    return
def downloadUnknownArtistCompositions(self):
    """Download composition ("songs") pages for artists with no local file,
    recording artists whose pages have no composer table in the master
    ignore list.

    Side effects: network downloads with a 3s sleep between requests; writes
    composition files to disk; updates the master composition ignore list.
    """
    newIgnores = []
    for modVal, modValMetadata in self.metadata.items():
        N = len(modValMetadata)
        ts = timestat("Downloading {0} Unknown Composition Files For ModVal={1}".format(N, modVal))
        for i, (artistID, artistIDData) in enumerate(modValMetadata.items()):
            savename = self.dutils.getArtistSavename(artistID, song=True)
            href = artistIDData["URL"]
            artist = artistIDData["Name"]
            # Already downloaded -> skip.
            if isFile(savename):
                continue
            ## Replace /credits with /songs
            # Rebuild the path as .../songs/all (drops the last path segment).
            href = "/".join(href.split('/')[:-1] + ["songs", "all"])
            ## Create Full URL
            url = urllib.parse.urljoin(self.dbArtists.baseURL, href)
            print("\n")
            print("=" * 100)
            print("{0}/{1}: [{2}] / [{3}]".format(i, N, artist, url))
            data, response = self.dutils.downloadURL(url)
            if response == 200:
                bsdata = getHTML(data)
                # Only keep pages that actually contain a composer table.
                if len(bsdata.findAll("th", {"class": "title-composer"})) > 0:
                    print(" ---> Saving Data To {0}".format(savename))
                    saveFile(idata=data, ifile=savename)
                    sleep(3)
                    continue
            sleep(3)
            # No composer table (or failed download): mark artist to be ignored.
            newIgnores.append(artistID)
            # NOTE(review): hard stop after 21 artists per modval — looks like a
            # debug/rate limit leftover; confirm whether it should remain.
            if i == 20:
                break
        ts.stop()
    print("New IDs To Ignore")
    print(newIgnores)
    tsUpdate = timestat("Adding {0} ArtistIDs To Master Composition Ignore List".format(len(newIgnores)))
    self.updateMasterIgnoreCompositionData(newIgnores)
    tsUpdate.stop()
def parseFilms101Data(self, debug=False):
    """Parse films101 yearly pickle files into [movie, 10] lists and save JSON.

    Fix: removed the unused ``resultsdir`` local.
    """
    outdir = self.getDataDir()
    files = findExt(outdir, ext=".p")
    movies = {}
    for ifile in sorted(files):
        year = getBaseFilename(ifile)
        results = self.parseFilms101YearlyData(ifile, debug=debug)
        # Every movie gets a flat score of 10.
        movies[year] = [[movie, 10] for movie in results]
        print("Found {0} movies in {1}".format(len(movies[year]), year))
    savename = setFile(self.getResultsDir(), "{0}.json".format(self.name))
    print("Saving {0} Years of films101 Data to {1}".format(len(movies), savename))
    saveFile(savename, movies)
def createArtistAlbumModValMetadata(self, modVal, db=None, debug=False):
    """Build per-artist {mediaName: [albumNames, albumURLs]} metadata for a
    modval shard and save it to the metadata directory."""
    if db is None:
        db = self.disc.getArtistsDBModValData(modVal)
    metadata = {}
    for aid, adata in db.items():
        perMedia = {}
        for mediaName, mediaData in adata.media.media.items():
            names = {mv.code: mv.album for mv in mediaData}
            urls = {mv.code: mv.url for mv in mediaData}
            perMedia[mediaName] = [names, urls]
        metadata[aid] = perMedia
    savename = setSubFile(self.disc.getArtistsDBDir(), "metadata", "{0}-MediaMetadata.p".format(modVal))
    print("Saving {0} new artist IDs media data to {1}".format(len(metadata), savename))
    saveFile(idata=metadata, ifile=savename)
def createArtistModValMetadata(self, modVal, db=None, debug=False):
    """Build per-artist [name, url, nameVariations] metadata for a modval
    shard and save it to the metadata directory."""
    if db is None:
        db = self.disc.getArtistsDBModValData(modVal)
    metadata = {}
    for aid, adata in db.items():
        entry = [adata.artist.name, adata.url.url]
        variations = adata.profile.variations
        if variations is not None:
            entry.append([var.name for var in variations])
        else:
            # No listed variations: fall back to the primary name only.
            entry.append([adata.artist.name])
        metadata[aid] = entry
    savename = setSubFile(self.disc.getArtistsDBDir(), "metadata", "{0}-Metadata.p".format(modVal))
    print("Saving {0} new artist IDs name data to {1}".format(len(metadata), savename))
    saveFile(idata=metadata, ifile=savename)
def searchForArtist(self, artist):
    """Search the remote site for an artist, record the search term, and parse results."""
    print("\n\n===================== Searching For {0} =====================".format(artist))
    url = self.getSearchArtistURL(artist)
    if url is None:
        raise ValueError("URL is None!")
    # Fetch the search results page.
    data, response = self.downloadURL(url)
    if response != 200:
        print("Error downloading {0}".format(url))
        return False
    # Record this term so we don't search it again.
    previouslySearched = getFile(self.knownFile)
    print(" Found {0} previously searched for terms.".format(len(previouslySearched)))
    previouslySearched.append(artist)
    saveFile(idata=previouslySearched, ifile=self.knownFile)
    self.parseSearchArtist(artist, data)
def parseBoxOfficeMojoResults(self, startYear=1982, endYear=2017, debug=False):
    """Parse per-year Box Office Mojo pickle files into yearly JSON results.

    Fix: ``endYear == None`` replaced with ``endYear is None``.
    """
    outdir = self.getDataDir()
    resultsdir = self.getResultsDir()
    if endYear is None:
        endYear = startYear
    years = range(int(startYear), int(endYear) + 1)
    for year in years:
        retval = []
        files = findPatternExt(outdir, pattern=str(year), ext=".p")
        for ifile in files:
            result = self.parseBoxOfficeMojo(ifile, debug=debug)
            retval.append(result)
        savename = setFile(resultsdir, str(year) + ".json")
        print("Saving", len(retval), "weekends of movie data to", savename)
        saveFile(savename, retval)
def mergeBoxOfficeMojoResults(self, debug=False):
    """Merge per-year JSON result files into a single results.json.

    Fix: the bare ``except:`` narrowed to ValueError — skipping files whose
    base name is not an integer year is the intended behavior.
    """
    retval = {}
    files = findExt(self.getResultsDir(), ext=".json")
    if debug:
        print("Found {0} files in the results directory".format(len(files)))
    for ifile in sorted(files):
        year = getBaseFilename(ifile)
        try:
            int(year)
        except ValueError:
            continue
        data = getFile(ifile)
        retval[year] = data
        if debug:
            print(" Adding {0} entries from {1}".format(len(data), ifile))
    savename = setFile(self.getResultsDir(), "results.json")
    if debug:
        print("Saving", len(retval), "years of movie data to", savename)
    saveFile(savename, retval)
def processBoxOfficeMojo(self, debug=False):
    """Collapse weekly Box Office Mojo results into ranked yearly gross lists.

    Fixes: regex written as a raw string; ``== None`` -> ``is None``.
    """
    outdir = self.getResultsDir()
    savename = setFile(outdir, "results.json")
    data = getFile(savename)
    movies = {}
    yearlyData = {}
    for i, year in enumerate(sorted(data.keys())):
        movies[year] = {}
        ydata = data[year]
        for wdata in ydata:
            for mdata in wdata:
                movie = mdata[2]
                # Strip a trailing "(YYYY)" suffix from the title, if present.
                retval = re.search(r"\((\d+)\)", movie)
                if retval:
                    stryear = retval.group()
                    movie = movie.replace(stryear, "").strip()
                gross = convertCurrency(mdata[9])
                weekly = convertCurrency(mdata[4])
                # Keep the larger of cumulative gross and weekly take.
                money = max(gross, weekly)
                if movies[year].get(movie) is None:
                    movies[year][movie] = money
                else:
                    movies[year][movie] = max(money, movies[year][movie])
        yearlyData[year] = sorted(movies[year].items(), key=operator.itemgetter(1), reverse=True)
        print("---->", year, " (Top 25/{0} Movies) <----".format(len(yearlyData[year])))
        for item in yearlyData[year][:25]:
            print(item)
        print('\n')
    savename = setFile(outdir, "{0}.json".format(self.name))
    print("Saving", len(yearlyData), "yearly results to", savename)
    saveFile(savename, yearlyData)
def downloadGameData(self, debug=False, verydebug=False):
    """Download play-by-play data for every game in the 2013-2015 seasons.

    Fixes: removed an ``if False:`` dead block (a legacy copy-from-disk path
    that could never execute) and the unused ``gameResult`` local.
    """
    resultsDir = self.getSeasonResultsDir()
    files = findExt(resultsDir, ext=".p", debug=False)
    gameType = "playbyplay"  # NOTE(review): retained from original; currently unused
    print("Sleeping for 5 seconds...")
    sleep(5)
    for ifile in files:
        seasonData = getFile(ifile)
        year = seasonData.getYear()
        # Hard-coded season filter, kept from the original.
        if year not in [2013, 2014, 2015]:
            continue
        # NOTE(review): kept in case getYearlyGamesDir creates the directory.
        gamesDir = self.getYearlyGamesDir(year)
        teams = seasonData.teams
        for teamID, teamData in teams.items():
            for gameData in teamData.games:
                gameID = gameData["Game"].gameID
                self.downloadGameDataByID(gameID, year, debug)
def downloadArtistURL(self, url, savename, force=False, sleeptime=2):
    """Download url to savename, skipping existing files unless force is set.

    Returns True if the file exists on disk after the attempt, else False.
    """
    if isFile(savename):
        if self.debug:
            print("{0} exists.".format(savename))
        if force is False:
            return False
        print("Downloading again.")
    # Fetch the page; any non-200 response aborts.
    data, response = self.downloadURL(url)
    if response != 200:
        print("Error downloading {0}".format(url))
        return False
    print("Saving {0} (force={1})".format(savename, force))
    saveFile(idata=data, ifile=savename)
    print("Done. Sleeping for {0} seconds".format(sleeptime))
    sleep(sleeptime)
    return True if isFile(savename) else False
def processWikipediaYearlyData(self, procYear=None, debug=False):
    """Parse per-year Wikipedia oscar pickles and save each year's JSON results.

    Fixes: ``procYear == None`` -> ``is None``; removed unused locals — an
    OrderedDict that was never populated and a getFile/getHTML pair whose
    result was never used (the parsers below take the filename directly).
    """
    outdir = self.getDataDir()
    if procYear is None:
        files = findExt(outdir, ext=".p")
    else:
        files = findPatternExt(outdir, pattern=str(procYear), ext=".p")
    for ifile in files:
        if debug:
            print("Processing {0}".format(ifile))
        year = getBaseFilename(ifile)
        results = self.parseWikipediaOscarData(ifile, debug=False)
        # Fall back to the special-case parser for oddly formatted years.
        if len(results) == 0:
            results = self.parseWikipediaOscarDataSpecial(ifile, debug=debug)
        if len(results) == 0:
            raise ValueError("No results for {0}".format(ifile))
        for k, v in results.items():
            print("====>", year, '\t', k)
            print(" Winner :", results[k]["Winner"])
            if debug:
                print(" Nominees:", results[k]["Nominees"])
            print("")
        savename = setFile(self.getResultsDir(), "{0}.json".format(year))
        print("Saving {0} wikipedia oscar data to {1}".format(year, savename))
        saveFile(savename, results)
def matchMyMusicAlbums(self, db, albumType=1, ratioCut=0.95, maxCut=0.1):
    """Match local album names against a database per artist and save the match map."""
    self.matchedAlbums = {}
    start, cmt = clock("Checking for Albums Matches Against {0} DB".format(db))
    rowFmt = "{0: <40}{1: <15}{2: <45} --> {3}"
    print(rowFmt.format("Artist", "Database", "Album Name", "Matched Album"))
    # Walk every known artist and collect album matches above the cuts.
    for name in self.mmb.getArtists():
        matches = self.matchMyMusicAlbumsByArtist(db, name, albumType, ratioCut, maxCut)
        if len(matches) > 0:
            if self.matchedAlbums.get(db) is None:
                self.matchedAlbums[db] = {}
            self.matchedAlbums[db][name] = matches
            for albumName, best in matches.items():
                print(rowFmt.format(name, db, albumName, best["Album"]))
    elapsed(start, cmt)
    saveFile(ifile=self.mmn.moveFilename, idata=self.matchedAlbums, debug=True)
    print("Found {0} music <-> discogs albums maps".format(len(self.matchedAlbums)))
def parseSearchArtist(self, artist, data):
    """Parse a search-results page into artist/album entries and save them as
    the next free page file for this artist.

    Fix: the bare ``except:`` around the href lookup narrowed to the
    exceptions a missing anchor or attribute can raise.
    """
    if data is None:
        return None
    bsdata = getHTML(data)
    artistDB = []
    contentdivs = bsdata.findAll("div", {"class": "contentItem"})
    for i, contentdiv in enumerate(contentdivs):
        artistDiv = contentdiv.find("div", {"class": "artist"})
        if artistDiv is None:
            continue
        artistName = artistDiv.text
        albumDiv = contentdiv.find("div", {"class": "title"})
        if albumDiv is None:
            continue
        albumName = albumDiv.text
        try:
            albumURL = albumDiv.find("a").attrs['href']
        except (AttributeError, KeyError):
            albumURL = None
        artistDB.append({"ArtistName": artistName, "AlbumName": albumName, "AlbumURL": albumURL})
    artistID = self.dutils.getArtistID(artist)
    # Find the first unused page number for this artist.
    page = 1
    savename = self.getArtistSavename(artistID, page)
    while isFile(savename):
        page += 1
        savename = self.getArtistSavename(artistID, page)
    print("Saving {0} new artist media to {1}".format(len(artistDB), savename))
    saveFile(idata=artistDB, ifile=savename)
def processRollingStoneData(self, debug=False):
    """Scrape movie/year pairs from Rolling Stone list pages and save JSON.

    Fix: the bare ``except:`` around the year parse narrowed to ValueError
    with the cause chained into the raised error.
    """
    outdir = self.getDataDir()
    files = findExt(outdir, ext=".html")
    from collections import OrderedDict
    movies = OrderedDict()
    yearlyData = {}
    for ifile in files:
        htmldata = getFile(ifile)
        bsdata = getHTML(htmldata)
        h3s = bsdata.findAll("h3", {"class": "c-list__title t-bold"})
        h3s = [x.text for x in h3s]
        h3s = [x.replace("\n", "").strip() for x in h3s]
        for h3 in h3s:
            # Titles look like "'Movie Name' (YYYY)"; year is the last 4 chars
            # before the closing paren, movie is the quoted middle span.
            try:
                year = int(h3[-5:-1])
            except ValueError as e:
                raise ValueError("Could not get year from {0}".format(h3)) from e
            movie = h3[1:-8]
            print(year, '\t', movie)
            if yearlyData.get(year) is None:
                yearlyData[year] = []
            yearlyData[year].append(movie)
    for year in sorted(yearlyData.keys()):
        movies[year] = [[movie, 10] for movie in yearlyData[year]]
    savename = setFile(self.getResultsDir(), "{0}.json".format(self.name))
    print("Saving {0} Years of rollingstone Data to {1}".format(len(movies), savename))
    saveFile(savename, movies)
def findMyMovies(self, debug=False):
    """Index movie files found on mounted volumes and save the name->path map as JSON."""
    foundPaths = glob("/Volumes/*/Movies/*.*")
    byName = dict(zip([getBaseFilename(path) for path in foundPaths], foundPaths))
    print("Found {0} movies on my disks".format(len(foundPaths)))
    target = setFile(self.getDataDir(), "mymovies.json")
    saveFile(idata=byName, ifile=target, debug=True)
def saveDiagnosticAlbumIDs(self, albumIDs):
    """Persist the known album IDs to the diagnostics directory."""
    outfile = setFile(self.getDiagnosticDir(), "albumKnownIDs.p")
    saveFile(idata=albumIDs, ifile=outfile)