def downloadMissingArtistExtras(self, maxPages=None):
    """Download the extra (paginated) artist files that are not on disk yet.

    Walks ``self.metadata`` (modVal -> {artistID: {"Name", "URL", "Pages"}})
    and, for each page index of each artist, downloads the page unless its
    save file already exists.

    Args:
        maxPages: Optional cap on the page index. Page indices greater than
            ``maxPages`` are skipped (indices ``0..maxPages`` inclusive are
            still processed, exactly as before).
    """
    ts = timestat("Downloading Missing Artist Extra Files")
    for modVal, modValData in self.metadata.items():
        N = len(modValData)
        tsMod = timestat("Downloading {0} Missing Artist Extra Files For ModVal={1}".format(N, modVal))
        for i, (artistID, artistPageData) in enumerate(modValData.items()):
            artistName = artistPageData["Name"]
            artistURL = artistPageData["URL"]
            pages = artistPageData["Pages"]
            print("="*100)
            print("{0}/{1}: [{2}] / [{3}]".format(i, N, artistName, artistURL))
            for page in range(pages):
                # Every later page index is larger too, so stop instead of
                # spinning through the remaining indices.
                if maxPages is not None and page > maxPages:
                    break
                url = self.dbArtists.getArtistURL(artistURL, page=page)
                savename = self.dutils.getArtistSavename(artistID, page=page)
                if isFile(savename):
                    continue
                print("{0}/{1}: [{2}] / [{3}] / [{4}-{5}]".format(i, N, artistName, artistURL, page, pages))
                try:
                    self.dutils.downloadArtistURL(url, savename)
                except Exception as err:
                    # Best effort: report the failure (with its cause) and keep
                    # going, but let KeyboardInterrupt/SystemExit propagate.
                    print("Error downloading {0}: {1}".format(url, err))
        tsMod.stop()
    ts.stop()
def createDBData(self, db=None, fromMetadata=True, fromMaps=False, fromMerge=True):
    """Build the artist and artist-album data for one DB or for every known DB.

    When ``db`` is None every key of ``self.dbdata`` is processed; otherwise
    only the given DB. The three ``from*`` flags are forwarded unchanged to
    ``createDBArtistData`` and ``createDBArtistAlbumData``.
    """
    if db is None:
        targets = self.dbdata.keys()
    else:
        targets = [db]

    title = "create DB Data (fromMetadata={0}, fromMaps={1}, fromMerge={2}) For [{3}] DBs"
    tsFull = timestat(title.format(fromMetadata, fromMaps, fromMerge, ", ".join(targets)))

    banner = "=" * 200
    for target in targets:
        print(banner)

        tsStep = timestat("Setting DB Artists For {0}".format(target))
        self.createDBArtistData(target, fromMetadata, fromMaps, fromMerge)
        tsStep.stop()

        tsStep = timestat("Setting DB Albums For {0}".format(target))
        self.createDBArtistAlbumData(target, fromMetadata, fromMaps, fromMerge)
        tsStep.stop()

        print(banner)
        print("")

    tsFull.stop()
def downloadMissingArtistUnofficial(self):
    """Download the "unofficial" artist file for every artist in ``self.metadata``.

    ``self.metadata`` maps modVal -> {artistID: {"Name", "URL"}}. An artist is
    skipped when its unofficial save file already exists.
    """
    ts = timestat("Downloading Missing Artist Unofficial Files")
    for modVal, modValData in self.metadata.items():
        N = len(modValData)
        tsMod = timestat(
            "Downloading {0} Missing Artist Unofficial Files For ModVal={1}".format(N, modVal))
        for i, (artistID, artistPageData) in enumerate(modValData.items()):
            artistName = artistPageData["Name"]
            artistURL = artistPageData["URL"]
            print("=" * 100)
            print("{0}/{1}: [{2}] / [{3}]".format(i, N, artistName, artistURL))

            url = self.dbArtists.getArtistURL(artistURL, unofficial=True)
            savename = self.dutils.getArtistSavename(artistID, unofficial=True)
            if isFile(savename):
                continue

            try:
                self.dutils.downloadArtistURL(url, savename)
            except Exception as err:
                # Best effort: report the failure (with its cause) and move on,
                # but do not swallow KeyboardInterrupt/SystemExit.
                print("Error downloading {0}: {1}".format(url, err))
        tsMod.stop()
    ts.stop()
def parse(self, expr, force=False, debug=False, quiet=False):
    """Copy each new raw artist file to its artist-ID based save name.

    Files come from ``self.getArtistRawFiles``; each one is parsed with
    ``self.artist.getData`` only to obtain the artist ID. Files with no ID or
    no save name are skipped.

    Args:
        expr: Selection expression forwarded to ``getArtistRawFiles``.
        force: Forwarded to ``getArtistRawFiles``.
        debug: Accepted for interface compatibility; not used here.
        quiet: Accepted for interface compatibility; not used here.
    """
    ts = timestat("Parsing Raw Files")

    tsFiles = timestat("Finding Files To Parse")
    newFiles = self.getArtistRawFiles(datatype=self.datatype, expr=expr, force=force)
    tsFiles.stop()

    N = len(newFiles)
    tsParse = timestat("Parsing {0} New Raw Files".format(N))

    newData = 0
    modValue = 250 if N >= 500 else 50
    for i, ifile in enumerate(newFiles, start=1):
        if i % modValue == 0 or i == N:
            tsParse.update(n=i, N=N)

        htmldata = getFile(ifile)
        retval = self.artist.getData(ifile)
        artistID = retval.ID.ID
        if artistID is None:
            continue
        savename = self.dutils.getArtistSavename(artistID)
        if savename is None:
            continue
        saveFile(idata=htmldata, ifile=savename, debug=False)
        newData += 1

    print("Created {0}/{1} New Artist Files".format(newData, N))
    tsParse.stop()
    # The overall timer was previously left running.
    ts.stop()
def parse(self, expr, force=False, debug=False, quiet=False):
    """Re-save every raw HTML file under its artist-ID based save name.

    The file list comes from ``self.getArtistRawHTMLFiles(expr, force)``. With
    ``debug`` set, progress is reported for every file together with the
    parsed artist ID. ``quiet`` is accepted but not used.
    """
    ts = timestat("Parsing Raw HTML Files")

    tsFiles = timestat("Finding Files To Parse")
    newFiles = self.getArtistRawHTMLFiles(expr, force)
    tsFiles.stop()

    if debug:
        print("Parsing {0} Raw HTML Files From Expr[{1}]".format(len(newFiles), expr))

    total = len(newFiles)
    step = 50 if total < 500 else 250

    tsParse = timestat("Parsing {0} Raw HTML Files".format(total))
    for idx, ifile in enumerate(newFiles):
        done = idx + 1
        if debug or done == total or done % step == 0:
            tsParse.update(n=done, N=total)

        if debug:
            print("{0}/{1}\tParsing {2}".format(idx, total, ifile))

        htmldata = getFile(ifile)
        parsed = self.artist.getData(ifile)
        artistID = parsed.ID.ID
        if debug:
            print(" ---> ID={0}".format(artistID))

        target = self.dutils.getArtistSavename(artistID)
        saveFile(idata=htmldata, ifile=target, debug=False)

    tsParse.stop()
    ts.stop()
def parse(self, expr, force=False, debug=False, quiet=False):
    """Parse raw pickled HTML files and save the parsed artist objects.

    Each file returned by ``self.getArtistRawFiles(datatype="data", ...)`` is
    parsed with ``self.artist.getData``. The parsed object is written to the
    artist's save name when that name is a string and either ``force`` is set
    or the target does not exist yet.

    Args:
        expr: Selection expression forwarded to ``getArtistRawFiles``.
        force: Forwarded to ``getArtistRawFiles``; also overwrites existing
            save files.
        debug: Report progress for every file and explain skipped files.
        quiet: Only echoed in the timer label.
    """
    ts = timestat("Parsing Raw Pickled HTML Files(expr=\'{0}\', force={1}, debug={2}, quiet={3})".format(expr, force, debug, quiet))

    io = fileIO()
    newFiles = self.getArtistRawFiles(datatype="data", expr=expr, force=force)

    N = len(newFiles)
    modValue = 250 if N >= 500 else 50
    nSave = 0
    tsParse = timestat("Parsing {0} Raw Picked HTML Files".format(N))
    for i, ifile in enumerate(newFiles, start=1):
        if i % modValue == 0 or i == N or debug:
            tsParse.update(n=i, N=N)

        retval = self.artist.getData(ifile)
        if retval is None:
            if debug:
                print("Could not find data for {0}".format(ifile))
            continue

        artistID = retval.ID.ID
        if artistID is None:
            if debug:
                print("Could not find artistID for {0}".format(ifile))
            continue

        savename = self.dutils.getArtistSavename(artistID)
        if isinstance(savename, str) and (force or not fileUtil(savename).exists):
            io.save(idata=retval, ifile=savename)
            nSave += 1

    # The per-file timer was previously never stopped.
    tsParse.stop()
    ts.stop()
    print("Saved {0} New Files".format(nSave))
def parseSearch(self, modVal, expr=None, force=False, debug=False, quiet=False):
    """Add artists found via the Discogs API search to the ModVal DB.

    Loads (or creates) the DB series for ``modVal``, loads the search result
    table named ``artistData-<modVal>`` and, for every artist ID not yet in
    the DB, stores ``self.artist.getData({"Artist": row, "Albums": ...})``.
    The DB is saved only when at least one new artist was added.

    Args:
        modVal: ModVal bucket to process.
        expr: Selection expression forwarded to ``getArtistRawFiles``.
        force, debug, quiet: Only echoed in the timer label.

    Raises:
        ValueError: If exactly one matching search-data file is not found.
    """
    # Series.append() was removed in pandas 2.0; concat works on all versions.
    from pandas import concat

    ts = timestat("Parsing Discogs Search ModVal={0} Files(expr=\'{1}\', force={2}, debug={3}, quiet={4})".format(modVal, expr, force, debug, quiet))

    io = fileIO()

    ########################################################################################
    # Previous DB Data
    ########################################################################################
    if not fileUtil(self.disc.getDBModValFilename(modVal)).exists:
        tsDB = timestat("Creating New DB For ModVal={0}".format(modVal))
        dbdata = Series({})
    else:
        tsDB = timestat("Loading ModVal={0} DB Data".format(modVal))
        dbdata = self.disc.getDBModValData(modVal)
    # Stop the DB timer in both branches (the new-DB branch used to stop the
    # overall timer instead).
    tsDB.stop()

    ########################################################################################
    # Previous Media Data
    ########################################################################################
    previousMetadata = self.disc.getMetadataAlbumData(modVal)

    ########################################################################################
    # Artist Search Data (No Media)
    ########################################################################################
    tsDB = timestat("Loading Artist Search Data For ModVal={0}".format(modVal))
    artistSearchFilenames = self.getArtistRawFiles(datatype="search", expr=expr, force=True)
    wanted = "artistData-{0}".format(modVal)
    artistSearchFilename = [x for x in artistSearchFilenames if fileUtil(x).basename == wanted]
    if len(artistSearchFilename) != 1:
        raise ValueError("Could not find Discogs API Artist Search Data")
    artistSearchData = io.get(artistSearchFilename[0])
    tsDB.stop()

    N = artistSearchData.shape[0]
    modValue = 5000 if N >= 50000 else 1000
    tsParse = timestat("Parsing {0} Searched For Discogs API Artists".format(N))

    # Collect new entries and concatenate once: appending to the Series inside
    # the loop copies the whole DB for every new artist.
    newEntries = {}
    for i, (artistID, artistData) in enumerate(artistSearchData.iterrows(), start=1):
        if i % modValue == 0 or i == N:
            tsParse.update(n=i, N=N)

        if dbdata.get(artistID) is not None or newEntries.get(artistID) is not None:
            continue

        artistAPIData = {"Artist": artistData, "Albums": previousMetadata.get(artistID, {})}
        newEntries[artistID] = self.artist.getData(artistAPIData)
    tsParse.stop()

    Nnew = len(newEntries)
    if Nnew > 0:
        dbdata = concat([dbdata, Series(newEntries)])
        print("Saving [{0}/{1}] {2} Entries To {3}".format(Nnew, len(dbdata), "ID Data", self.disc.getDBModValFilename(modVal)))
        self.disc.saveDBModValData(modVal=modVal, idata=dbdata)
    else:
        print("Not saving any of the new data")

    ts.stop()
def findMyMusic(self, primeDir=None, artistName=None):
    """Collect album path data for all artists, one prime directory, or one artist.

    * no arguments: scan every prime directory in ``self.pdDirs``;
    * ``primeDir`` given: scan only that prime directory;
    * only ``artistName`` given: look up that single artist.

    The result is stored on ``self.artistAlbums`` and returned.
    """
    def scanPrimeDirs(primeDirNames):
        # Returns (artist -> path data, prime dir -> artist directories).
        candidates = {name: setDir(self.musicDir, name) for name in primeDirNames}
        existing = {name: path for name, path in candidates.items() if dirUtil(path).isDir()}
        artistDirsByPrime = {name: findDirs(path) for name, path in existing.items()}
        entries = [dirUtil(ap) for ap in getFlatList(artistDirsByPrime.values())]
        pathByArtist = {entry.name: entry.path for entry in entries}
        albums = {}
        for name, path in pathByArtist.items():
            albums[name] = self.getArtistPathData(name, path)
        return albums, artistDirsByPrime

    artistAlbums = {}
    if primeDir is None and artistName is None:
        ts = timestat("Find PrimeDir Artist Paths")
        artistAlbums, scanned = scanPrimeDirs(self.pdDirs)
        print(" Found {0} Artists From {1} Prime Directories".format(
            len(artistAlbums), len(scanned)))
        ts.stop()
    elif primeDir is not None:
        ts = timestat(
            "Finding All Artist Albums From [{0}] Prime Directory".format(primeDir))
        artistAlbums, _ = scanPrimeDirs([primeDir])
        print(" Found {0} Artists From [{1}] Prime Directory".format(
            len(artistAlbums), primeDir))
        ts.stop()
    else:
        # Only reachable with primeDir None and artistName given.
        ts = timestat("Finding [{0}] Artist Albums".format(artistName))
        artistAlbums = self.getArtistPathData(artistName)
        ts.stop()

    self.artistAlbums = artistAlbums
    return artistAlbums
def parse(self, modVal, expr, force=False, debug=False):
    """Merge newly downloaded "unofficial" artist files into the ModVal DB.

    For each new file the artist ID is the file's base name. Unknown artists
    are stored as parsed; for known artists every media list in the parsed
    data is merged into the stored one, keyed by each entry's ``code`` (new
    entries win on equal codes). The DB is saved when at least one file was
    processed.

    Args:
        modVal: ModVal bucket to process.
        expr: Selection expression forwarded to ``getArtistUnofficialFiles``.
        force: Forwarded to ``getArtistUnofficialFiles`` and ``getDBData``.
        debug: Accepted for interface compatibility; not used here.
    """
    ts = timestat("Parsing ModVal={0} Unofficial Files".format(modVal))

    tsFiles = timestat("Finding Files To Parse")
    newFiles = self.getArtistUnofficialFiles(modVal, expr, force)
    tsFiles.stop()

    N = len(newFiles)
    modValue = 50 if N >= 100 else 10

    # The DB is only needed (and only loaded) when there is something to parse.
    dbdata = None
    if N > 0:
        tsDB = timestat("Loading ModVal={0} DB Data".format(modVal))
        dbdata = self.getDBData(modVal, force)
        tsDB.stop()

    newData = 0
    tsParse = timestat(
        "Parsing {0} New Unofficial Files For ModVal={1}".format(N, modVal))
    for i, ifile in enumerate(newFiles, start=1):
        if i % modValue == 0 or i == N:
            print("{0: <15}Parsing {1}".format("{0}/{1}".format(i, N), ifile))

        artistID = getBaseFilename(ifile)
        info = self.artist.getData(ifile)

        if dbdata.get(artistID) is None:
            dbdata[artistID] = info
            newData += 1
            continue

        # Only media types present in the new file can change anything, so
        # iterate those directly instead of the union with the stored keys.
        storedMedia = dbdata[artistID].media.media
        for k, v in info.media.media.items():
            if v is None:
                continue
            iVal = {v2.code: v2 for v2 in v}
            dVal = storedMedia.get(k)
            if dVal is None:
                Tretval = iVal
            else:
                Tretval = {v2.code: v2 for v2 in dVal}
                Tretval.update(iVal)
            storedMedia[k] = list(Tretval.values())
        # NOTE(review): counted once per merged file (not per media type);
        # the collapsed source is ambiguous here - confirm.
        newData += 1
    tsParse.stop()

    print("Found {0} Unofficial Artist Records For ModVal={1}".format(
        newData, modVal))
    if newData > 0:
        self.saveDBData(modVal, dbdata, newData)

    # The overall timer was previously left running.
    ts.stop()
def parse(self, modVal, expr, force=False, debug=False, quiet=False):
    """Parse new primary artist files into the ModVal DB and save it.

    The artist ID is taken from each file's base name; a file whose parsed ID
    differs from that name is skipped (and aborts the run in debug mode).

    Args:
        modVal: ModVal bucket to process.
        expr: Selection expression forwarded to ``getArtistPrimaryFiles``.
        force: Forwarded to ``getArtistPrimaryFiles``; when True the DB is
            rebuilt from scratch instead of being loaded.
        debug: Print per-file details and abort on an ID mismatch.
        quiet: Only echoed in the timer label.

    Returns:
        None when there were no files to parse, otherwise True if at least
        one artist was stored and False if not.
    """
    ts = timestat("Parsing Primary ModVal={0} Files(expr=\'{1}\', force={2}, debug={3}, quiet={4})".format(modVal, expr, force, debug, quiet))

    tsFiles = timestat("Finding Files To Parse")
    newFiles = self.getArtistPrimaryFiles(modVal, expr, force)
    tsFiles.stop()

    N = len(newFiles)
    if N == 0:
        ts.stop()
        return

    # Progress interval: about a tenth of the files, rounded to a multiple of
    # 250 and never below 250.
    modValue = max([250 * round((N/10)/250), 250])

    if force is True or not fileUtil(self.disc.getDBModValFilename(modVal)).exists:
        tsDB = timestat("Creating New DB For ModVal={0}".format(modVal))
        dbdata = {}
    else:
        tsDB = timestat("Loading ModVal={0} DB Data".format(modVal))
        dbdata = self.disc.getDBModValData(modVal)
    # Stop the DB timer in both branches (the new-DB branch used to stop the
    # overall timer instead, which was then stopped a second time below).
    tsDB.stop()

    newData = 0
    tsParse = timestat("Parsing {0} New Files For ModVal={1}".format(N, modVal))
    for i, ifile in enumerate(newFiles, start=1):
        if i % modValue == 0 or i == N:
            tsParse.update(n=i, N=N)

        artistID = getBaseFilename(ifile)
        info = self.artist.getData(ifile)
        if debug:
            print("\t", ifile, ' ==> ', info.ID.ID, ' <-> ', artistID)

        if info.ID.ID != artistID:
            if debug is True:
                print("Error for {0}  ID={1}  FileID={2}".format(info.meta.title, info.ID.ID, artistID))
                # Deliberate hard stop in debug mode (raises ZeroDivisionError),
                # kept as-is so the exception type callers may see is unchanged.
                1/0
            continue

        dbdata[artistID] = info
        newData += 1
    tsParse.stop()

    if newData > 0:
        dbdata = Series(dbdata)
        print("Saving [{0}/{1}] {2} Entries To {3}".format(newData, len(dbdata), "ID Data", self.disc.getDBModValFilename(modVal)))
        self.disc.saveDBModValData(modVal=modVal, idata=dbdata)

    ts.stop()

    return newData > 0
def downloadUnknownArtistCompositions(self):
    """Download the "songs/all" page for artists without a composition file.

    For each artist in ``self.metadata`` whose composition save file is
    missing, the last segment of the artist URL is replaced by ``songs/all``
    and the page is fetched. A page is saved only when the response is 200 and
    it contains at least one ``<th class="title-composer">`` cell; otherwise
    the artist ID is added to the ignore list, which is handed to
    ``updateMasterIgnoreCompositionData`` at the end.
    """
    ignoredIDs = []
    divider = "=" * 100
    for modVal, modValMetadata in self.metadata.items():
        total = len(modValMetadata)
        ts = timestat(
            "Downloading {0} Unknown Composition Files For ModVal={1}".format(total, modVal))
        for pos, (artistID, entry) in enumerate(modValMetadata.items()):
            savename = self.dutils.getArtistSavename(artistID, song=True)

            href = entry["URL"]
            artist = entry["Name"]
            if isFile(savename):
                continue

            # Swap the trailing URL segment for /songs/all and build the full URL.
            segments = href.split('/')[:-1]
            segments.extend(["songs", "all"])
            url = urllib.parse.urljoin(self.dbArtists.baseURL, "/".join(segments))

            print("\n")
            print(divider)
            print("{0}/{1}:  [{2}] / [{3}]".format(pos, total, artist, url))

            data, response = self.dutils.downloadURL(url)
            saved = False
            if response == 200:
                composerCells = getHTML(data).findAll("th", {"class": "title-composer"})
                if len(composerCells) > 0:
                    print("  ---> Saving Data To {0}".format(savename))
                    saveFile(idata=data, ifile=savename)
                    saved = True

            # Pause after every request, successful or not.
            sleep(3)
            if saved:
                continue

            ignoredIDs.append(artistID)

            # Only evaluated for artists that ended up on the ignore list.
            if pos == 20:
                break
        ts.stop()

    print("New IDs To Ignore")
    print(ignoredIDs)
    tsUpdate = timestat(
        "Adding {0} ArtistIDs To Master Composition Ignore List".format(len(ignoredIDs)))
    self.updateMasterIgnoreCompositionData(ignoredIDs)
    tsUpdate.stop()
def parse(self, modVal, expr, force=False, debug=False, quiet=False):
    """Parse raw pickled Spotify API files into the ModVal DB and save it.

    Each new primary file holds the album data of one artist (key
    ``'artistID'``). The matching row of the artist search table is combined
    with it and passed to ``self.artist.getData``; the result is appended to
    the DB series. Artists missing from the search table are reported and
    skipped.

    Args:
        modVal: ModVal bucket to process.
        expr: Selection expression forwarded to the file finders.
        force: Forwarded to ``getArtistPrimaryFiles``; when True the DB is
            rebuilt from scratch instead of being loaded.
        debug, quiet: Only echoed in the timer label.

    Raises:
        ValueError: If exactly one artist search-data file is not found.
    """
    # Series.append() was removed in pandas 2.0; concat works on all versions.
    from pandas import concat

    ts = timestat("Parsing Raw Pickled Spotify API Primary ModVal={0} Files(expr=\'{1}\', force={2}, debug={3}, quiet={4})".format(modVal, expr, force, debug, quiet))

    io = fileIO()
    newFiles = self.getArtistPrimaryFiles(modVal, expr, force)
    print("Found {0} New Files".format(len(newFiles)))
    if len(newFiles) == 0:
        # Do not leave the overall timer running on the early exit.
        ts.stop()
        return

    artistSearchFilename = self.getArtistRawFiles(datatype="search", expr=expr, force=True)
    if len(artistSearchFilename) != 1:
        raise ValueError("Could not find Spotify API Artist Search Data")
    artistSearchData = io.get(artistSearchFilename[0])

    if force is True or not fileUtil(self.disc.getDBModValFilename(modVal)).exists:
        tsDB = timestat("Creating New DB For ModVal={0}".format(modVal))
        dbdata = Series({})
    else:
        tsDB = timestat("Loading ModVal={0} DB Data".format(modVal))
        dbdata = self.disc.getDBModValData(modVal)
    # Stop the DB timer in both branches (the new-DB branch used to stop the
    # overall timer instead).
    tsDB.stop()

    N = len(newFiles)
    tsParse = timestat("Parsing {0} Raw Picked API Files".format(N))

    # Collect one-entry series and concatenate once at the end; this yields
    # the same result as repeated appends without copying the DB per file.
    newPieces = []
    for ifile in newFiles:
        dData = io.get(ifile)
        artistID = dData['artistID']
        try:
            artistData = artistSearchData.loc[artistID]
        except KeyError:
            print("Could not find Spotify ID [{0}]".format(artistID))
            continue

        artistAPIData = {"Artist": artistData, "Albums": dData}
        newPieces.append(Series({artistID: self.artist.getData(artistAPIData)}))
    tsParse.stop()

    nSave = len(newPieces)
    if nSave > 0:
        dbdata = concat([dbdata] + newPieces)
        print("Saving [{0}/{1}] {2} Entries To {3}".format(nSave, len(dbdata), "ID Data", self.disc.getDBModValFilename(modVal)))
        self.disc.saveDBModValData(modVal=modVal, idata=dbdata)
    else:
        print("Not saving any of the new data")

    ts.stop()
def createArtistIDMap(self):
    """Build and save the artist ID => name and ID => ref series.

    Reads the artist metadata of ModVals 0..99. Each metadata value is indexed
    as ``x[0]`` (passed through ``self.manc.getArtistName``) and ``x[1]``
    (stored as the ref).
    """
    # Series.append() was removed in pandas 2.0; concat works on all versions.
    from pandas import concat

    print("="*125)
    ts = timestat("Creating Artist DBs for ==> {0} <==".format(self.db))
    print("="*125)

    # Collect the per-ModVal pieces and concatenate once at the end instead of
    # copying the growing series 100 times.
    namePieces = []
    refPieces = []
    nEntries = 0
    modValue = 10 if self.debug is False else 1
    for modVal in range(100):
        metadata = self.disc.getMetadataArtistData(modVal)
        names = metadata.apply(lambda x: self.manc.getArtistName(x[0]))
        namePieces.append(names)
        refPieces.append(metadata.apply(lambda x: x[1]))
        nEntries += len(names)
        if (modVal+1) % modValue == 0:
            print("{0: <15}{1: >9}".format("ModVal={0}".format(modVal+1), nEntries))

    artistIDToName = concat(namePieces)
    artistIDToRef = concat(refPieces)
    print("\n\n==============================================\n")

    print("Saving [{0}] {1} Entries To {2}".format(len(artistIDToName), "ID => Name", self.disc.getArtistIDToNameFilename()))
    self.disc.saveArtistIDToNameData(idata=artistIDToName)

    # Report the length of the series actually being saved.
    print("Saving [{0}] {1} Entries To {2}".format(len(artistIDToRef), "ID => Ref", self.disc.getArtistIDToRefFilename()))
    self.disc.saveArtistIDToRefData(idata=artistIDToRef)

    ts.stop()
def createArtistIDAlbumsMap(self):
    """Build and save the artist ID => album count and ID => album names series.

    Reads the album metadata of ModVals 0..99. For every artist the per-media
    mappings are copied into a fresh dict and the album count is the total
    number of entries over all media types.
    """
    print("="*125)
    ts = timestat("Creating Album DBs for ==> {0} <==".format(self.db))
    print("="*125)

    artistIDToNumAlbums = {}
    artistIDToAlbumNames = {}
    nAllAlbums = 0
    modValue = 10 if self.debug is False else 1
    for modVal in range(100):
        metadata = self.disc.getMetadataAlbumData(modVal)
        # .items() instead of .iteritems(), which was removed in pandas 2.0.
        for artistID, artistData in metadata.items():
            albumNames = dict(artistData.items())
            numAlbums = sum(len(mediaData) for mediaData in albumNames.values())
            artistIDToAlbumNames[artistID] = albumNames
            artistIDToNumAlbums[artistID] = numAlbums
            nAllAlbums += numAlbums

        if (modVal+1) % modValue == 0:
            print("{0: <15}{1: >9}{2: >9}".format("ModVal={0}".format(modVal+1), len(artistIDToNumAlbums), nAllAlbums))
    print("\n\n==============================================\n")

    artistIDToNumAlbums = Series(artistIDToNumAlbums)
    print("Saving [{0}] {1} Entries To {2}".format(len(artistIDToNumAlbums), "ID => NumAlbums", self.disc.getArtistIDToNumAlbumsFilename()))
    self.disc.saveArtistIDToNumAlbumsData(idata=artistIDToNumAlbums)

    artistIDToAlbumNames = Series(artistIDToAlbumNames)
    print("Saving [{0}] {1} Entries To {2}".format(len(artistIDToAlbumNames), "ID => AlbumNames", self.disc.getArtistIDToAlbumNamesFilename()))
    self.disc.saveArtistIDToAlbumNamesData(idata=artistIDToAlbumNames)

    ts.stop()
def createArtistMetadataMap(self):
    """Collect genre, style and collaboration data per artist and save each map.

    Every ``*-ArtistMetadata*.p`` file in the albums metadata directory is
    read; its ``'Genre'``, ``'Artists'`` and ``'Style'`` values are stored per
    artist ID. The three maps are saved as series named ``Artist<basename>.p``.
    """
    ts = timestat("Creating Artist DBs")

    genreByID = {}
    styleByID = {}
    collaborationsByID = {}

    metadataDir = self.disc.getAlbumsMetadataDBDir()
    for ifile in findPatternExt(metadataDir, pattern="-ArtistMetadata", ext='.p'):
        print(ifile,'\t',end="")
        for artistID, artistData in getFile(ifile).items():
            genreByID[artistID] = artistData['Genre']
            collaborationsByID[artistID] = artistData['Artists']
            styleByID[artistID] = artistData['Style']
        # Running total of artists seen so far, printed on the same line.
        print(len(genreByID))
    print("\n\n==============================================\n")

    outputs = (
        ("IDToGenre", genreByID),
        ("IDToStyle", styleByID),
        ("IDToCollaborations", collaborationsByID),
    )
    for basename, savedata in outputs:
        savename = setFile(self.disc.getDiscogDBDir(), "Artist{0}.p".format(basename))
        print("Saving {0} entries to {1}\n".format(len(savedata), savename))
        saveFile(ifile=savename, idata=Series(savedata), debug=True)

    ts.stop()
def createAlbumMetadata(self):
    """Build and save the artist ID => {media name: {code: album}} metadata.

    Artists without a name are skipped. A media list that cannot be converted
    is recorded in an error map (artist ID => artist name), which is printed
    at the end.
    """
    ts = timestat("Creating Artist Album Metadata For ModVal={0}".format(self.modVal))

    artistIDMetadata = {}
    errs = {}
    for artistID, artistData in self.dbData.items():
        artistID = str(artistID)
        if artistData.artist.name is None:
            continue
        mediaMetadata = artistIDMetadata[artistID] = {}
        for mediaName, mediaData in artistData.media.media.items():
            try:
                albumNames = {mediaValues.code: mediaValues.album for mediaValues in mediaData}
            except Exception:
                # Malformed media entries are recorded rather than aborting
                # the run; KeyboardInterrupt/SystemExit still propagate.
                errs[artistID] = artistData.artist.name
            else:
                mediaMetadata[mediaName] = albumNames

    artistIDMetadata = Series(artistIDMetadata)
    print("Saving [{0}] {1} Entries To {2}".format(len(artistIDMetadata), "ID => AlbumNames", self.disc.getMetadataAlbumFilename(self.modVal)))
    self.disc.saveMetadataAlbumData(idata=artistIDMetadata, modVal=self.modVal)

    ts.stop()
    print(errs)
def createCompositionMetadata(self, modVal=None):
    """Fill ``self.metadata`` with the artists eligible for a composition download.

    For each ModVal (the given one, or 0..99) an artist is kept when its URL
    ends with ``/credits`` and its ID is not in ``self.songIgnores``. The
    result, artistID -> {"Name", "URL"}, is stored in ``self.metadata[modVal]``.
    """
    modVals = [modVal] if modVal is not None else range(100)
    ts = timestat("Creating AllMusic Composition Metadata")
    for modVal in modVals:
        tsDBData = timestat(
            "Finding Known Credit Artists For ModVal={0}".format(modVal))
        dbData = self.getDBData(modVal)
        dbArtistURLs = {
            artistID: {
                "Name": artistData.artist.name,
                "URL": artistData.url.url
            }
            for artistID, artistData in dbData.items()
        }
        tsDBData.stop()

        tsCredit = timestat(
            "Finding Known Credit Artists From {0} Artists For ModVal={1}".format(
                len(dbArtistURLs), modVal))
        creditArtistIDs = {
            artistID: artistData
            for artistID, artistData in dbArtistURLs.items()
            if artistData["URL"] is not None
            and artistData["URL"].endswith("/credits")
        }
        tsCredit.stop()

        # The second placeholder used to be {0}, which printed the artist
        # count in place of the ModVal.
        tsIgnore = timestat(
            "Removing IDs To Ignore From {0} Primary Files For ModVal={1}".format(
                len(creditArtistIDs), modVal))
        availableArtistIDs = {
            artistID: artistData
            for artistID, artistData in creditArtistIDs.items()
            if artistID not in self.songIgnores
        }
        tsIgnore.stop()

        tsMeta = timestat(
            "Finding Metadata For {0}/{1}/{2} Missing ArtistIDs for ModVal={3}".format(
                len(availableArtistIDs), len(creditArtistIDs),
                len(dbArtistURLs), modVal))
        self.metadata[modVal] = availableArtistIDs
        tsMeta.stop()

    ts.stop()
def createArtistMetadata(self):
    """Build and save the artist ID => [name, URL] metadata for ``self.modVal``.

    Artists whose name is None are left out; IDs are stored as strings.
    """
    ts = timestat("Creating Artist Name Metadata For ModVal={0}".format(self.modVal))

    nameAndURLByID = {}
    for artistID, artistData in self.dbData.items():
        if artistData.artist.name is None:
            continue
        nameAndURLByID[str(artistID)] = [artistData.artist.name, artistData.url.url]

    artistIDMetadata = Series(nameAndURLByID)
    target = self.disc.getMetadataArtistFilename(self.modVal)
    print("Saving [{0}] {1} Entries To {2}".format(len(artistIDMetadata), "ID => Name/URL", target))
    self.disc.saveMetadataArtistData(idata=artistIDMetadata, modVal=self.modVal)

    ts.stop()
def parseSearch(self, modVal, expr=None, force=False, debug=False, quiet=False):
    """Add artists found via the Spotify API search to the ModVal DB.

    Loads (or creates) the DB series for ``modVal``, selects the search rows
    whose ``'sid'`` maps to this ModVal and, for every artist ID not yet in
    the DB, stores ``self.artist.getData({"Artist": row, "Albums": {}})``.
    The DB is saved only when at least one new artist was added.

    Args:
        modVal: ModVal bucket to process.
        expr: Selection expression forwarded to ``getArtistRawFiles``.
        force, debug, quiet: Only echoed in the timer label.

    Raises:
        ValueError: If exactly one artist search-data file is not found.
    """
    # Series.append() was removed in pandas 2.0; concat works on all versions.
    from pandas import concat

    ts = timestat("Parsing Spotify Search ModVal={0} Files(expr=\'{1}\', force={2}, debug={3}, quiet={4})".format(modVal, expr, force, debug, quiet))

    if not fileUtil(self.disc.getDBModValFilename(modVal)).exists:
        tsDB = timestat("Creating New DB For ModVal={0}".format(modVal))
        dbdata = Series({})
    else:
        tsDB = timestat("Loading ModVal={0} DB Data".format(modVal))
        dbdata = self.disc.getDBModValData(modVal)
    # Stop the DB timer in both branches (the new-DB branch used to stop the
    # overall timer instead).
    tsDB.stop()

    io = fileIO()
    artistSearchFilename = self.getArtistRawFiles(datatype="search", expr=expr, force=True)
    if len(artistSearchFilename) != 1:
        raise ValueError("Could not find Spotify API Artist Search Data")
    artistSearchData = io.get(artistSearchFilename[0])

    # Boolean mask of the rows belonging to this ModVal, re-labelled with the
    # table's own index so it can be used to select rows.
    amv = artistModValue()
    idx = artistSearchData.reset_index()['sid'].apply(amv.getModVal) == modVal
    idx.index = artistSearchData.index
    artists = artistSearchData[idx]

    N = artists.shape[0]
    tsParse = timestat("Parsing {0} Searched For Spotify API Artists".format(N))

    # Collect new entries and concatenate once: appending to the Series inside
    # the loop copies the whole DB for every new artist.
    newEntries = {}
    for artistID, artistData in artists.iterrows():
        if dbdata.get(artistID) is not None or newEntries.get(artistID) is not None:
            continue
        artistAPIData = {"Artist": artistData, "Albums": {}}
        newEntries[artistID] = self.artist.getData(artistAPIData)
    tsParse.stop()

    Nnew = len(newEntries)
    if Nnew > 0:
        dbdata = concat([dbdata, Series(newEntries)])
        print("Saving [{0}/{1}] {2} Entries To {3}".format(Nnew, len(dbdata), "ID Data", self.disc.getDBModValFilename(modVal)))
        self.disc.saveDBModValData(modVal=modVal, idata=dbdata)
    else:
        print("Not saving any of the new data")

    ts.stop()
def mergeArtistAlbumIDMap(self):
    """Fold the album maps of merged artist IDs into their new merged ID.

    For both ``IDToAlbumNames`` and ``IDToAlbumRefs`` the pre-merge file
    ``Artist<basename>PreMerge.p`` is loaded, the media entries of every
    source ID listed in the merger data are combined under the new ID, the
    source IDs are removed, and the result is saved as ``Artist<basename>.p``.
    """
    print("="*50)
    print("")
    ts = timestat("Merging ArtistAlbumID DBs for ==> {0} <==".format(self.db))
    print("")
    print("="*50)

    mergerData = self.mam.getMergerDataByDB(self.db)
    # These counts depend only on the merger data, so compute them once.
    fromIDs = mergerData.apply(lambda x: len(x["MergeData"])).sum()
    toIDs = len(mergerData)

    for basename in ("IDToAlbumNames", "IDToAlbumRefs"):
        savename = setFile(self.disc.getDiscogDBDir(), "Artist{0}PreMerge.p".format(basename))
        savedata = getFile(savename).to_dict()
        print("Found {0} entries.".format(len(savedata)))

        print("")
        print("================================================")
        print(" Merger From [{0}] DB IDs To [{1}] New IDs".format(fromIDs, toIDs))
        print(" Pre Merge [{0}]".format(len(savedata)))

        # .items() instead of .iteritems(), which was removed in pandas 2.0.
        for artistName, artistData in mergerData.items():
            newID = artistData["ID"]
            dbIDs = artistData["MergeData"].keys()

            merged = savedata[newID] = {}
            for artistID in dbIDs:
                sourceMedia = savedata.get(artistID)
                if sourceMedia is None:
                    continue
                for mediaName, mediaData in sourceMedia.items():
                    if merged.get(mediaName) is not None:
                        merged[mediaName].update(mediaData)
                    else:
                        merged[mediaName] = mediaData

            for artistID in dbIDs:
                try:
                    del savedata[artistID]
                except KeyError:
                    print("Could not delete merged ID {0}".format(artistID))

        print(" Post Merge [{0}]".format(len(savedata)))
        print("================================================")
        print("")

        savename = setFile(self.disc.getDiscogDBDir(), "Artist{0}.p".format(basename))
        print("Saving {0} entries to {1}\n".format(len(savedata), savename))
        saveFile(ifile=savename, idata=Series(savedata), debug=True)
        sleep(0.5)

    ts.stop()
def getData(self, fast=True, local=False):
    """Load and return the manual renames data.

    ``fast`` and ``local`` select the file through ``self.getFilename`` and
    also determine the Pickle/YAML and Local/Main labels used for the timer.
    """
    locationLabel = {True: "Local", False: "Main"}[local]
    formatLabel = {True: "Pickle", False: "YAML"}[fast]
    ts = timestat("Getting Manual Renames Data From {0} {1} File".format(
        locationLabel, formatLabel))

    manualRenames = self.io.get(self.getFilename(fast, local))

    ts.stop()
    return manualRenames
def createExtraMetadata(self, modVal=None):
    """Fill ``self.metadata`` with the artists that have extra pages to download.

    For each ModVal (the given one, or 0..99) an artist is kept when it has
    more than one page and its name is not in ``self.extraIgnores``. The
    result, artistID -> {"Name", "URL", "Pages"}, is stored in
    ``self.metadata[modVal]``.
    """
    if modVal is None:
        modVals = range(100)
    else:
        modVals = [modVal]

    ts = timestat("Creating Extra Files Metadata")
    for mv in modVals:
        tsDBData = timestat("Finding Pages/URL Data For ModVal={0}".format(mv))
        dbData = self.getDBData(mv)
        allArtists = {}
        for artistID, artistData in dbData.items():
            allArtists[artistID] = {
                "Name": artistData.artist.name,
                "URL": artistData.url.url,
                "Pages": self.getNumPages(artistData.pages),
            }
        tsDBData.stop()

        tsPages = timestat("Finding Artists With More Pages From {0} Artists For ModVal={1}".format(len(allArtists), mv))
        multiPage = {}
        for artistID, entry in allArtists.items():
            if entry["Pages"] > 1:
                multiPage[artistID] = entry
        tsPages.stop()

        tsIgnore = timestat("Removing Ignored Artists From {0} Artists For ModVal={1}".format(len(multiPage), mv))
        kept = {}
        for artistID, entry in multiPage.items():
            if entry["Name"] not in self.extraIgnores:
                kept[artistID] = entry
        tsIgnore.stop()

        tsMeta = timestat("Saving Metadata From {0}/{1}/{2} For ModVal={3}".format(len(kept), len(multiPage), len(allArtists), mv))
        self.metadata[mv] = kept
        tsMeta.stop()

    ts.stop()
def createMasterDBArtistAlbumsDataFrame(self):
    """Save the flattened artist ID => albums list and artist ID => album count.

    Each artist's per-media mappings are flattened into one list of their
    values and written to ``MasterArtistIDToAlbums.p``; the list lengths are
    written to ``MasterArtistIDToNumAlbums.p`` as a series named "NumAlbums".

    The former DataFrame export that followed an unconditional ``return`` was
    unreachable and has been removed; behaviour is unchanged.
    """
    ts = timestat("=================================== Creating Artist Album DB ===================================")

    print("Loading ArtistID Data")
    artistIDtoAlbumNames = self.disc.getArtistIDToAlbumNamesData()

    print("Creating Flattened List for {0} Artists".format(artistIDtoAlbumNames.shape[0]))
    artistIDToAlbumNamesData = artistIDtoAlbumNames.apply(
        lambda val: getFlatList([mediaData.values() for mediaData in val.values()]))
    savename = setFile(self.disc.getDiscogDBDir(), "MasterArtistIDToAlbums.p")
    print("Saving {0}/{1} artists/albums to {2}".format(len(artistIDToAlbumNamesData), artistIDToAlbumNamesData.apply(len).sum(), savename))
    saveFile(ifile=savename, idata=artistIDToAlbumNamesData)

    artistIDToNumAlbumsData = artistIDToAlbumNamesData.apply(len)
    artistIDToNumAlbumsData.name = "NumAlbums"
    savename = setFile(self.disc.getDiscogDBDir(), "MasterArtistIDToNumAlbums.p")
    print("Saving {0}/{1} artists/albums to {2}".format(len(artistIDToNumAlbumsData), artistIDToNumAlbumsData.sum(), savename))
    saveFile(ifile=savename, idata=artistIDToNumAlbumsData)

    ts.stop()
    return
def downloadUnknownArtistCredits(self):
    """Search for credit pages of artists that have no credit file yet.

    The artist name is derived from the stored page title by removing the
    site boilerplate and one surrounding double quote on each side. Artists
    for which the search downloads nothing are added to the ignore list,
    which is handed to ``updateMasterIgnoreCreditData`` at the end.
    """
    ignoredIDs = []
    for modVal, modValMetadata in self.metadata.items():
        total = len(modValMetadata)
        ts = timestat(
            "Downloading {0} Unknown Credit Files For ModVal={1}".format(total, modVal))
        for pos, (artistID, entry) in enumerate(modValMetadata.items()):
            if isFile(self.dutils.getArtistSavename(artistID, credit=True)):
                continue

            artist = entry["title"]
            for boilerplate in ("Artist Search for ", " | AllMusic"):
                artist = artist.replace(boilerplate, "")
            artist = artist.replace("Songs, Albums, Reviews, Bio & More", "").strip()
            if artist.startswith('"'):
                artist = artist[1:]
            if artist.endswith('"'):
                artist = artist[:-1]

            print("{0}/{1}:  [{2}]".format(pos, total, artist))
            if len(artist) < 1:
                continue

            numDownload = self.dbArtists.searchForArtistCredit(
                artist=artist, artistID=artistID)
            if numDownload == 0:
                ignoredIDs.append(artistID)
        ts.stop()

    print("New IDs To Ignore")
    print(ignoredIDs)
    tsUpdate = timestat(
        "Adding {0} ArtistIDs To Master Credit Ignore List".format(len(ignoredIDs)))
    self.updateMasterIgnoreCreditData(ignoredIDs)
    tsUpdate.stop()
def parse(self, expr, force=False, debug=False, quiet=False):
    """Parse raw HTML files and save the parsed artist objects.

    Each file returned by ``self.getArtistRawHTMLFiles`` is parsed with
    ``self.artist.getData``. The parsed object is written to the artist's
    save name when that name is a string and either ``force`` is set or the
    target does not exist yet.

    Args:
        expr: Selection expression forwarded to ``getArtistRawHTMLFiles``.
        force: Forwarded to ``getArtistRawHTMLFiles``; also overwrites
            existing save files.
        debug, quiet: Only echoed in the timer label.
    """
    ts = timestat("Parsing Raw HTML Files(expr=\'{0}\', force={1}, debug={2}, quiet={3})".format(expr, force, debug, quiet))

    io = fileIO()
    newFiles = self.getArtistRawHTMLFiles(expr, force=force)

    N = len(newFiles)
    if N >= 2000:
        modValue = 500
    elif N >= 500:
        modValue = 250
    else:
        modValue = 50

    nSave = 0
    tsParse = timestat("Parsing {0} Raw HTML Files".format(N))
    for i, ifile in enumerate(newFiles, start=1):
        if i % modValue == 0 or i == N:
            tsParse.update(n=i, N=N)

        # The raw file used to be loaded here into an unused variable;
        # getData() is given the path, so that extra read is dropped.
        retval = self.artist.getData(ifile)
        artistID = retval.ID.ID
        savename = self.dutils.getArtistSavename(artistID)
        if isinstance(savename, str) and (force or not fileUtil(savename).exists):
            io.save(idata=retval, ifile=savename)
            nSave += 1

    # The per-file timer was previously never stopped.
    tsParse.stop()
    ts.stop()
    print("Saved {0} New Files".format(nSave))
def poolParse(dbObj, modVals, expr="< 0 Days", force=False, numProcs=2):
    """Run ``parseDB`` over every mod value through ``tqdmMap``.

    ``parseDB`` is called once per entry of ``modVals`` with the fixed keyword
    arguments ``expr``, ``class`` (the DB object), ``force`` and
    ``debug=False``. The mapped results are not returned.
    """
    fixedKwargs = {"expr": expr, "class": dbObj, "force": force, "debug": False}
    worker = partial(parseDB, **fixedKwargs)

    ts = timestat("Running imap multiprocessing for {0} mod values ...".format(len(modVals)))
    tqdmMap(func=worker, argument_list=modVals, num_processes=numProcs)
    ts.stop()
def createArtistDataForMatch(self):
    """Save the slimmed "search" versions of the artist maps.

    Album counts, names and album names are each restricted to artists with
    at least two albums whose ID is not in the ignore list. Names are also
    passed through ``self.transDB.renamed`` before saving.
    """
    banner = "="*125
    print(banner)
    ts = timestat("Creating Slimmed >=2 Albums + Translation DBs for ==> {0} <==".format(self.db))
    print(banner)

    ignoreIDs = self.miid.getIgnoreDBIDs(self.db)
    print(" Removing These IDs: {0}".format(ignoreIDs))

    # Artist ID => Num Albums (w/ >=2 Albums)
    artistIDToNumAlbums = self.disc.getArtistIDToNumAlbumsData()
    enoughAlbums = artistIDToNumAlbums >= 2

    def slim(series):
        # Keep artists with >=2 albums, then drop the ignored IDs.
        subset = series[enoughAlbums]
        return subset[~subset.index.isin(ignoreIDs)]

    searchNumAlbums = slim(artistIDToNumAlbums)
    print("Saving [{0}/{1}] {2} Entries To {3}".format(len(searchNumAlbums), len(artistIDToNumAlbums), "ID => NumAlbums", self.disc.getArtistIDToSearchNumAlbumsFilename()))
    self.disc.saveArtistIDToSearchNumAlbumsData(idata=searchNumAlbums)

    # Artist ID => Name (w/ >=2 Albums)
    artistIDToName = self.disc.getArtistIDToNameData()
    searchName = slim(artistIDToName)
    searchTransName = searchName.apply(self.transDB.renamed)
    numTranslated = (searchTransName != searchName).sum()
    print("Saving [{0}/{1}] (NumTrans={2}) {3} Entries To {4}".format(len(searchTransName), len(artistIDToName), numTranslated, "ID => Name", self.disc.getArtistIDToSearchNameFilename()))
    self.disc.saveArtistIDToSearchNameData(idata=searchTransName)

    # Artist ID => Albums (w/ >=2 Albums)
    artistIDToAlbumNames = self.disc.getArtistIDToAlbumNamesData()
    searchAlbumNames = slim(artistIDToAlbumNames)
    print("Saving [{0}/{1}] {2} Entries To {3}".format(len(searchAlbumNames), len(artistIDToAlbumNames), "ID => AlbumNames", self.disc.getArtistIDToSearchAlbumNamesFilename()))
    self.disc.saveArtistIDToSearchAlbumNamesData(idata=searchAlbumNames)

    ts.stop()
def saveData(self, manualMultiArtists, fast=True, local=False):
    """Save the given data to the file selected by ``fast`` and ``local``.

    In fast mode a list is wrapped in a Series and the data is sorted by
    value before saving; otherwise a Series is converted to a plain list.
    Any other input is passed on as it is.
    """
    locationLabel = {True: "Local", False: "Main"}[local]
    formatLabel = {True: "Pickle", False: "YAML"}[fast]
    ts = timestat("Saving Manual Renames Data To {0} {1} File".format(
        locationLabel, formatLabel))

    fname = self.getFilename(fast, local)
    if fast:
        if isinstance(manualMultiArtists, list):
            toSave = Series(manualMultiArtists)
        else:
            toSave = manualMultiArtists
        toSave = toSave.sort_values()
    elif isinstance(manualMultiArtists, Series):
        toSave = manualMultiArtists.to_list()
    else:
        toSave = manualMultiArtists

    self.io.save(idata=toSave, ifile=fname)
    ts.stop()
def loadAlbums(self, idxReq=None):
    """Load the artist ID => albums data of every DB into ``self.mdbData``.

    The albums are first restricted to IDs present in the index of the
    already loaded "IDToName" data (if any), then to the IDs returned by
    ``self.getIdxReqs`` (if any).

    Args:
        idxReq: Requirements forwarded to ``getIdxReqs``; defaults to an
            empty dict.
    """
    # Avoid a shared mutable default argument.
    idxReq = {} if idxReq is None else idxReq

    ts = timestat("Loading DB Albums Data For {0} DBs".format(len(self.discs)))
    for db, _disc in self.discs.items():
        dbStore = self.mdbData[db]

        names = dbStore.get("IDToName")
        # NOTE(review): only the index of this notna() mask is used below, so
        # IDs with a NaN name are NOT filtered out - confirm this is intended.
        idxs = names.notna() if names is not None else None

        dbStore["IDToAlbums"] = self.getArtistAlbumsData(db)
        if idxs is not None:
            albums = dbStore["IDToAlbums"]
            dbStore["IDToAlbums"] = albums[albums.index.isin(idxs.index)]

        idxs = self.getIdxReqs(db, idxReq=idxReq)
        if idxs is not None:
            albums = dbStore["IDToAlbums"]
            dbStore["IDToAlbums"] = albums[albums.index.isin(idxs.index)]

        dbStore["IDToAlbums"].name = "Albums"
    ts.stop()
def loadArtists(self, numAlbumsReq=None):
    """Load the artist name and album-count data of every DB into ``self.mdbData``.

    Both series are restricted with ``.loc[idxs]`` when ``self.getIdxReqs``
    returns a selection, and left complete when it returns None.

    Args:
        numAlbumsReq: Requirements forwarded to ``getIdxReqs``; defaults to
            an empty dict.
    """
    # Avoid a shared mutable default argument.
    numAlbumsReq = {} if numAlbumsReq is None else numAlbumsReq

    ts = timestat("Loading DB Artist Data For {0} DBs".format(len(self.discs)))
    for db, _disc in self.discs.items():
        idxs = self.getIdxReqs(db, numAlbumsReq=numAlbumsReq)
        dbStore = self.mdbData[db]

        names = self.getArtistNameData(db)
        if idxs is not None:
            names = names.loc[idxs]
        dbStore["IDToName"] = names
        dbStore["IDToName"].name = "ArtistName"

        # The selection used to be applied unconditionally here (failing when
        # idxs is None) and then a second time; apply it once, guarded.
        numAlbums = self.getNumAlbumsData(db)
        if idxs is not None:
            numAlbums = numAlbums.loc[idxs]
        dbStore["IDToNumAlbums"] = numAlbums
        dbStore["IDToNumAlbums"].name = "NumAlbums"
    ts.stop()