def downloadKWorbSpotifyYouTubeArtists(self, update=False):
    """Parse the cached KWorb YouTube archive page and print its table rows.

    Args:
        update: When True, the archive page is first re-downloaded into the
            local cache file with ``force=True``.

    Returns:
        None. The parsed rows are only printed.
    """
    url = "https://kworb.net/youtube/archive.html"
    savename = "kworb_youtubeartists.p"
    if update is True:
        self.dutils.downloadArtistURL(url=url, savename=savename, force=True)

    bsdata = getHTML(savename)
    data = []
    artistDir = self.disc.getArtistsDir()
    saveDir = setDir(artistDir, "youtube")
    print(artistDir)
    # One dict per table row, keyed by the header cell text. The values are
    # the <td> elements themselves, not their text.
    for table in bsdata.findAll("table"):
        ths = [th.text for th in table.findAll("th")]
        # The first row is skipped (presumably the header row).
        for tr in table.findAll("tr")[1:]:
            item = dict(zip(ths, tr.findAll("td")))
            data.append(item)
    print(data)

    # NOTE(review): disabled code. It is assumed that BOTH loops below belong
    # to this `if False:` branch and that the trailing `break` sits inside the
    # innermost `if` -- confirm the nesting before re-enabling anything.
    if False:
        bsdata = getHTML(savename)
        artistDir = self.disc.getArtistsDir()
        saveDir = setDir(artistDir, "youtube")
        # Step 1: list the links found in each titled "subcontainer" div and
        # report whether the matching local file already exists.
        for div in bsdata.findAll("div", {"class": "subcontainer"}):
            if div.find("span", {"class": "pagetitle"}) is None:
                continue
            for ref in div.findAll("a"):
                href = ref.attrs['href']
                url = "{0}/{1}".format(self.youtubeURL, href)
                savename = "{0}/{1}".format(saveDir, href.replace(".html", ".p"))
                if isFile(savename):
                    print("Y\t", savename, '\t', url)
                else:
                    print("-\t", savename, '\t', url)
                    #dbArtistsKWorb().dutils.downloadArtistURL(url=fullURL, savename=savename, force=True)

        # Step 2: walk every saved ".p" page and fetch the linked pages that
        # are not on disk yet, pausing 3 seconds after each download.
        for ifile in findExt(saveDir, ".p"):
            bsdata = getHTML(ifile)
            for table in bsdata.findAll("table"):
                trs = table.findAll("tr")
                for tr in trs[1:]:
                    ref = tr.find("a")
                    href = ref.attrs['href']
                    name = ref.text
                    url = "{0}/{1}".format(self.youtubeURL, href)
                    savename = "{0}/{1}".format(
                        setDir(saveDir, "artist"), href.replace(".html", ".p"))
                    print(url, savename)
                    if isFile(savename) is False:
                        # Rebinds `data`, shadowing the row list built above.
                        data, code = downloadURL(url)
                        from ioUtils import getFile, saveFile
                        saveFile(idata=data, ifile=savename)
                        sleep(3)
                        break
def assertDBModValExtraData(self, modVal, minPages=1, maxPages=None, allowMulti=False, test=True, clean=True):
    """Walk the extra pages of every multi-page artist in one modVal DB.

    Args:
        modVal: Selects the ``<modVal>-DB.p`` file in the artists DB dir.
        minPages: Artists with fewer pages than this are skipped.
        maxPages: Optional cap on the number of pages visited per artist.
        allowMulti: Unused by this method.
        test: When True, only report what would be downloaded.
        clean: When True, remove an existing saved page before the
            test/download decision is made.

    NOTE(review): the removal driven by ``clean`` runs before the ``test``
    check, so the default ``test=True, clean=True`` combination deletes saved
    pages without downloading them again -- confirm this is intended.
    """
    print("assertDBModValExtraData(",modVal,")")
    dbDir = self.disc.getArtistsDBDir()
    dbFile = setFile(dbDir, "{0}-DB.p".format(modVal))
    artists = getFile(dbFile)

    for artistID, artistData in artists.items():
        headerPending = True
        pageInfo = artistData.pages
        if pageInfo.more is not True:
            continue

        lastPage = pageInfo.pages
        if lastPage < minPages:
            continue
        if maxPages is not None:
            lastPage = min(lastPage, maxPages)

        artistRef = artistData.url.url
        for p in range(1, lastPage + 1):
            # Page 1 is addressed without an explicit page argument.
            pageArgs = () if p == 1 else (p,)
            url = self.getArtistURL(artistRef, *pageArgs)
            savename = self.dutils.getArtistSavename(artistID, *pageArgs)
            print("\t---> {0} / {1} {2}".format(p, pageInfo.pages, url))

            if clean is True and isFile(savename):
                print("Removing {0}".format(savename))
                removeFile(savename)

            if test is True:
                print("\t\tWill download: {0}".format(url))
                print("\t\tJust testing... Will not download anything.")
                continue

            if isFile(savename):
                continue

            # Print the artist header once, before its first download.
            if headerPending:
                print("{0: <20}{1: <10}{2}".format(artistID, pageInfo.tot, artistData.artist.name))
                headerPending = False
            progress = "{0}/{1}".format(p, pageInfo.pages)
            print("{0: <20}{1: <10}{2}".format(artistID, progress, url))
            self.dutils.downloadArtistURL(url=url, savename=savename, force=True)
            sleep(3)
def __init__(self, mp3=None, debug=False, allowMissing=True, test=False):
    """Set up the ID3 frame tables and optionally bind an mp3 file.

    Args:
        mp3: Optional path to an mp3 file. When given it must exist and carry
            one of the extensions in ``self.mp3exts``.
        debug: Stored on the instance as ``self.debug``.
        allowMissing: Stored on the instance as ``self.allowMissing``.
        test: Stored on the instance as ``self.test``.

    Raises:
        ValueError: If ``mp3`` is given but is not an accessible file, or its
            extension is not an accepted mp3 extension.
    """
    self.mp3exts = [".mp3", ".MP3", ".Mp3"]

    # Validate before any other state is stored on the instance.
    if mp3 is not None:
        if not isFile(mp3):
            raise ValueError("Could not access {0}".format(mp3))
        if getExt(mp3) not in self.mp3exts:
            raise ValueError("This is not an mp3")

    self.mp3 = mp3
    self.debug = debug
    self.allowMissing = allowMissing
    self.test = test

    # ID3 frame id -> readable tag name.
    self.tags = {
        'TALB': 'Album',
        'TBPM': 'BPM',
        'TCMP': 'Compilation',
        'TCOM': 'Composer',
        'TCOP': 'Copyright',
        'TENC': 'EncodedBy',
        'TEXT': 'Lyricist',
        'TIT2': 'Title',
        'TIT3': 'Version',
        'TLEN': 'Length',
        'TMED': 'Media',
        'TMOO': 'Mood',
        'TOLY': 'Author',
        'TPE1': 'Artist',
        'TPE2': 'Performer',
        'TPE3': 'Conductor',
        'TPE4': 'Arranger',
        'TPOS': 'DiscNumber',
        'TPUB': 'Organization',
        'TRCK': 'TrackNumber',
        'TSO2': 'AlbumArtist',
        'TSOA': 'Album',
        'TSOC': 'Composer',
        'TSOP': 'Artist',
        'TSOT': 'Title',
        'TSRC': 'Isrc',
        'TSST': 'DiscSubtitle',
    }

    # Reverse lookup (tag name -> frame id). Several frames share a name
    # ('Album', 'Artist', 'Title', 'Composer'); the frame listed last in
    # self.tags is the one that ends up in this map.
    self.id3Map = {}
    for frame, label in self.tags.items():
        self.id3Map[label] = frame

    self.tagsEasyID3 = {}
    self.tagsID3 = {}

    if isFile(self.mp3):
        self.setMP3(self.mp3)
def downloadUltimateMovieRankingsYearlyData(self, year, outdir, debug=False):
    """Download the Ultimate Movie Rankings page for one year, if missing.

    The page is saved as ``<outdir>/<year>.p``. Nothing happens when that
    file already exists. The download is best-effort: a failure is reported
    (in debug mode) and otherwise ignored.

    Args:
        year: Year used to build both the URL and the output filename.
        outdir: Directory that receives the saved page.
        debug: When True, print progress and failure messages.
    """
    savename = setFile(outdir, str(year) + ".p")
    if isFile(savename):
        return

    url = "https://www.ultimatemovierankings.com/best-worst-movies-{0}/".format(year)

    if debug:
        print("Downloading/Saving {0}".format(savename))
    try:
        getWebData(base=url, savename=savename, useSafari=False)
    except Exception as err:
        # `Exception` rather than a bare except, so that Ctrl-C and
        # SystemExit still stop a long batch run.
        if debug:
            print("Error downloading {0}: {1}".format(url, err))
        sleep(0.2)
    else:
        sleep(2)
def getArtistModValFiles(self, modVal, previousDays=5, force=False):
    """Return the artist files of one modVal that need to be (re)parsed.

    All ``.p`` files of the modVal directory are returned when the modVal DB
    file does not exist or ``force`` is True. Otherwise the result is the
    union of the files modified less than ``previousDays`` days ago and the
    files modified after the DB file was last written.

    Args:
        modVal: Names both the artist sub-directory and the DB file.
        previousDays: Age threshold, in whole days.
        force: When True, ignore the DB timestamp and return every file.

    Returns:
        A list of file paths. In the union case the order is unspecified.
    """
    artistDir = self.disc.getArtistsDir()
    self.disc.getMaxModVal()  # result unused; the call itself is kept
    artistDBDir = self.disc.getArtistsDBDir()

    candidates = findExt(setDir(artistDir, str(modVal)), ext='.p')
    dbFile = setFile(artistDBDir, "{0}-DB.p".format(modVal))
    now = datetime.now()

    def modifiedAt(filename):
        return datetime.fromtimestamp(path.getmtime(filename))

    dbStamp = None
    if isFile(dbFile):
        dbStamp = modifiedAt(dbFile)
        if force is True:
            dbStamp = None

    if dbStamp is None:
        print("  ===> Parsing all {0} files for modval {1}".format(len(candidates), modVal))
        return candidates

    youngEnough = [f for f in candidates if (now - modifiedAt(f)).days < previousDays]
    newerThanDB = [f for f in candidates if modifiedAt(f) > dbStamp]
    selected = list(set(youngEnough) | set(newerThanDB))
    print("  ===> Found new {0} files (< {1} days) to parse for modval {2}".format(len(selected), previousDays, modVal))
    return selected
def createRawOscarData(self, debug=True):
    """Merge the parsed yearly Oscar files with saved manual corrections.

    Every ``.json`` file in the wiki results directory is loaded under its
    base filename. If ``saved.yaml`` exists in the corrections directory, its
    "Winner" and "Nominees" entries replace the parsed ones. The merged data
    is written to ``raw.yaml`` in the corrections directory.

    Args:
        debug: When True, print how many yearly files were found.
    """
    print("Checking for poorly parsed oscar data.")
    yearFiles = sorted(findExt(self.wikiData.getResultsDir(), ext=".json"))
    if debug:
        print("Found {0} oscar files".format(len(yearFiles)))

    yearlyData = {}
    for yearFile in yearFiles:
        yearlyData[getBaseFilename(yearFile)] = getFile(yearFile)

    correctionsFile = setFile(self.getCorrectionsDir(), "saved.yaml")
    corrections = getFile(correctionsFile) if isFile(correctionsFile) else {}

    for year, categories in corrections.items():
        for title, fixes in categories.items():
            winner = fixes.get("Winner")
            nominees = fixes.get("Nominees")
            if winner is not None:
                print("Overwritting {0} {1} winner".format(year, title))
                yearlyData[year][title]["Winner"] = winner
            if nominees is not None:
                print("Overwritting {0} {1} nominees".format(year, title))
                yearlyData[year][title]["Nominees"] = nominees

    rawFile = setFile(self.getCorrectionsDir(), "raw.yaml")
    saveFile(idata=yearlyData, ifile=rawFile)
def downloadMissingArtistUnofficial(self):
    """Download the unofficial page of every artist in ``self.metadata``.

    ``self.metadata`` is iterated as ``{modVal: {artistID: pageData}}`` where
    each ``pageData`` provides "Name" and "URL". Artists whose unofficial
    savename already exists are skipped. A failed download is reported and
    the loop moves on to the next artist.
    """
    ts = timestat("Downloading Missing Artist Unofficial Files")
    for modVal, modValData in self.metadata.items():
        N = len(modValData)
        tsMod = timestat(
            "Downloading {0} Missing Artist Unofficial Files For ModVal={1}"
            .format(N, modVal))
        for i, (artistID, artistPageData) in enumerate(modValData.items()):
            artistName = artistPageData["Name"]
            artistURL = artistPageData["URL"]
            print("=" * 100)
            print("{0}/{1}: [{2}] / [{3}]".format(i, N, artistName, artistURL))

            url = self.dbArtists.getArtistURL(artistURL, unofficial=True)
            savename = self.dutils.getArtistSavename(artistID, unofficial=True)
            if isFile(savename):
                continue

            try:
                self.dutils.downloadArtistURL(url, savename)
            except Exception as err:
                # `Exception` rather than a bare except, so that Ctrl-C and
                # SystemExit still stop the batch; the cause is reported too.
                print("Error downloading {0}: {1}".format(url, err))
        tsMod.stop()
    ts.stop()
def __init__(self, debug=False):
    """Wire up the DatPiff database objects and verify its directories.

    Args:
        debug: Stored as ``self.debug`` and forwarded to the base class.

    Raises:
        ValueError: If the artists directory, its "known" sub-directory, or
            the ``datPiffKnown.p`` file inside it cannot be found.
    """
    self.db = "DatPiff"
    self.disc = dbBase(self.db.lower())
    self.artist = artistDP(self.disc)
    self.dutils = datpiffUtils()
    self.dutils.setDiscogs(self.disc)
    self.debug = debug

    ## MultiArtist
    self.mulArts = multiartist()

    # The three checks below run in order; the first failure aborts.
    artistsDir = self.disc.getArtistsDir()
    print("DatPiff ArtistsDir: {0}".format(artistsDir))
    if not isDir(artistsDir):
        raise ValueError("Could not find artist dir for DatPiff")

    self.knownDir = setDir(artistsDir, "known")
    if not isDir(self.knownDir):
        print("Make sure that Piggy is loaded!!!")
        raise ValueError("Could not find known [{0}] dir for DatPiff".format(self.knownDir))

    self.knownFile = setFile(self.knownDir, "datPiffKnown.p")
    if not isFile(self.knownFile):
        raise ValueError("Known File [{0}] does not exist".format(self.knownFile))

    self.baseURL = "https://www.datpiff.com/"
    self.searchURL = "https://www.datpiff.com/mixtapes-search?"

    super().__init__(self.db, self.disc, self.artist, self.dutils, debug=debug)
def downloadMissingArtistExtras(self, maxPages=None):
    """Download the missing extra pages of every artist in ``self.metadata``.

    ``self.metadata`` is iterated as ``{modVal: {artistID: pageData}}`` where
    each ``pageData`` provides "Name", "URL" and "Pages". Page indices run
    over ``range(Pages)``; pages whose savename already exists are skipped.
    A failed download is reported and the loop moves on.

    Args:
        maxPages: Optional cap. Page indices greater than ``maxPages`` are
            not visited, so indices ``0..maxPages`` inclusive are considered.
    """
    ts = timestat("Downloading Missing Artist Extra Files")
    for modVal, modValData in self.metadata.items():
        N = len(modValData)
        tsMod = timestat("Downloading {0} Missing Artist Extra Files For ModVal={1}".format(N, modVal))
        for i, (artistID, artistPageData) in enumerate(modValData.items()):
            artistName = artistPageData["Name"]
            artistURL = artistPageData["URL"]
            pages = artistPageData["Pages"]
            print("=" * 100)
            print("{0}/{1}: [{2}] / [{3}]".format(i, N, artistName, artistURL))

            for page in range(pages):
                # Indices only grow, so nothing after the cap can qualify.
                if maxPages is not None and page > maxPages:
                    break

                url = self.dbArtists.getArtistURL(artistURL, page=page)
                savename = self.dutils.getArtistSavename(artistID, page=page)
                if isFile(savename):
                    continue

                print("{0}/{1}: [{2}] / [{3}] / [{4}-{5}]".format(i, N, artistName, artistURL, page, pages))
                try:
                    self.dutils.downloadArtistURL(url, savename)
                except Exception as err:
                    # `Exception` rather than a bare except, so that Ctrl-C
                    # and SystemExit still stop the batch.
                    print("Error downloading {0}: {1}".format(url, err))
        tsMod.stop()
    ts.stop()
def getDBData(self, modVal, force=False, debug=False):
    """Load the stored artist DB for one modVal, or start from scratch.

    The ``force`` argument is ignored whenever any of ``self.credit``,
    ``self.extra``, ``self.song`` or ``self.composition`` is True. A missing
    DB file always forces a fresh start.

    Args:
        modVal: Selects the DB file via the ``self.disc`` filename helper.
        force: Request an empty result instead of the stored data.
        debug: When True, print what is being loaded and how much was found.

    Returns:
        The stored data (a pandas Series is converted to a dict), or an empty
        dict when a reload is forced.
    """
    dbname = self.disc.getArtistsDBModValFilename(modVal)

    keepStored = (self.credit is True or self.extra is True
                  or self.song is True or self.composition is True)
    startFresh = False if keepStored else force
    if isFile(dbname) is False:
        startFresh = True

    if startFresh is not False:
        print("  ===> Forcing Reloads of ModVal={0}".format(modVal))
        return {}

    if debug:
        print("Loading {0}".format(dbname))
    stored = fileIO().get(dbname)
    if isinstance(stored, Series):
        stored = stored.to_dict()
    if debug:
        print("  ===> Found {0} previous data for ModVal={1}".format(
            len(stored), modVal))
    return stored
def downloadWikiFilmYearlyData(self, year, outdir, debug=False):
    """Download Wikipedia's "<year> in film" page unless already saved.

    Args:
        year: Year used in both the URL and the ``<year>.p`` output name.
        outdir: Directory that receives the saved page.
        debug: When True, print the URL before downloading.
    """
    target = setFile(outdir, "{0}.p".format(year))
    if isFile(target):
        return

    pageURL = "https://en.wikipedia.org/wiki/{0}_in_film".format(year)
    if debug:
        print("Downloading {0}".format(pageURL))
    getWebData(base=pageURL, savename=target, useSafari=False)
    sleep(1)
def downloadRottenTomatoesYearlyData(self, year, outdir, debug=False):
    """Download the Rotten Tomatoes best-of page for one year unless saved.

    Args:
        year: Year used in both the query string and the output filename.
        outdir: Directory that receives ``<year>.p``.
        debug: When True, print the output filename before downloading.
    """
    target = setFile(outdir, str(year) + ".p")
    if isFile(target):
        return

    pageURL = "https://www.rottentomatoes.com/top/bestofrt/?year={0}".format(year)
    if debug:
        print("Downloading/Saving {0}".format(target))
    getWebData(base=pageURL, savename=target, useSafari=False)
def setMP3(self, mp3):
    """Bind ``mp3`` to this instance and read its tags.

    Args:
        mp3: Path of the file to bind.

    Raises:
        ValueError: If ``mp3`` is not an accessible file.
    """
    if not isFile(mp3):
        raise ValueError("Could not access {0}".format(mp3))

    self.mp3 = mp3
    self.findEasyTags()
    self.findID3Tags()
def installData(self):
    """Create the music data dir and seed its data file if either is missing.

    The non-local data file is written from the contents of the local one,
    and only when the non-local file does not exist yet.
    """
    dataDir = self.musicDataDir
    if not isDir(dataDir):
        print("Install: Making Prefix Dir [{0}]".format(dataDir))
        mkDir(dataDir)

    installed = self.getFilename(local=False)
    if isFile(installed):
        return

    print("Install: Creating Prefix Data From Local Data")
    localData = fileIO().get(self.getFilename(local=True))
    fileIO().save(idata=localData, ifile=installed)
def installData(self):
    """Create the multi-artist dir and seed its data file if either is missing.

    When the fast, non-local data file does not exist it is produced by
    ``self.writeToMainPickleFromLocalYAML()``.
    """
    prefixDir = self.multiArtistDir
    if not isDir(prefixDir):
        print("Install: Making Prefix Dir [{0}]".format(prefixDir))
        mkDir(prefixDir)

    if isFile(self.getFilename(fast=True, local=False)):
        return

    print("Install: Creating Prefix Data From Local Data")
    self.writeToMainPickleFromLocalYAML()
def getCombinedMovies(self, debug=False):
    """Load ``movies.json`` from the combine results directory.

    Args:
        debug: When True, print how many combined movies were loaded.

    Returns:
        Whatever ``getFile`` yields for the combined movies file.

    Raises:
        ValueError: If the combined movies file does not exist.
    """
    savename = setFile(self.combine.getResultsDir(), "movies.json")
    if not isFile(savename):
        # The exception class was misspelled before, which raised NameError
        # here instead of the intended ValueError.
        raise ValueError("Cannot access {0}".format(savename))

    combinedMovies = getFile(savename)
    if debug:
        print("Found {0} combined movies".format(len(combinedMovies)))
    return combinedMovies
def getMyMovies(self, debug=False):
    """Load ``mymovies.json`` from the data directory.

    Args:
        debug: When True, print how many entries were loaded.

    Returns:
        Whatever ``getFile`` yields for the file.

    Raises:
        ValueError: If the file does not exist.
    """
    moviesFile = setFile(self.getDataDir(), "mymovies.json")
    if isFile(moviesFile):
        myMovies = getFile(moviesFile)
        if debug:
            print("Found {0} my movies".format(len(myMovies)))
        return myMovies
    raise ValueError("Cannot access {0}".format(moviesFile))
def setMusic(self, file):
    """Bind a music file, detect its format by extension and read its tags.

    Exactly one format flag (``isFLAC``, ``isMP3``, ...) or ``self.skip`` is
    set to True according to the file extension; flags are never reset here.
    Afterwards the tag reader of every flag that is True is invoked.

    Args:
        file: Path of the music file.

    Raises:
        ValueError: If ``file`` is not an accessible file, or its extension
            matches none of the known formats or skip rules.
    """
    if not isFile(file):
        # The undefined name `ifile` was formatted here before, turning this
        # into a NameError instead of the intended ValueError.
        raise ValueError("Could not access {0}".format(file))

    self.file = file
    ext = getExt(file)  # computed once; every branch needs it

    if ext in self.flacExts:
        self.isFLAC = True
        if self.debug is True:
            print("  File is FLAC")
    elif ext in self.mp3Exts:
        self.isMP3 = True
        if self.debug:
            print("  File is MP3")
    elif ext in self.m4aExts:
        self.isM4A = True
        if self.debug:
            print("  File is M4A")
    elif ext in self.asfExts:
        self.isASF = True
        if self.debug:
            print("  File is ASF (WMA)")
    elif ext in self.oggExts:
        self.isOGG = True
        if self.debug:
            print("  File is OGG")
    elif ext in self.aiffExts:
        self.isAIFF = True
        if self.debug:
            print("  File is AIFF")
    elif ext in self.wavExts:
        self.isWAV = True
        if self.debug:
            print("  File is WAV")
    elif ext in self.skips:
        self.skip = True
    elif ".DS_Store" in file:
        self.skip = True
    else:
        raise ValueError(
            "Could not determine format for [{0}] with extention [{1}]"
            .format(file, ext))

    if self.isMP3 is True:
        #self.findID3Tags()
        self.findEasyTags()
    if self.isFLAC is True:
        self.findFlacTags()
    if self.isM4A is True:
        self.findM4ATags()
    if self.isASF is True:
        self.findASFTags()
    if self.isOGG is True:
        self.findOGGTags()
    if self.isAIFF is True:
        self.findAIFFTags()
    if self.isWAV is True:
        self.findWAVTags()
def downloadSAGCategoryData(self, category, outdir, debug=False):
    """Download the Wikipedia page of one SAG award category unless saved.

    Args:
        category: Category slug appended to the Wikipedia article name; also
            the base name of the ``<category>.p`` output file.
        outdir: Directory that receives the saved page.
        debug: When True, print the URL before downloading.
    """
    wikiPrefix = "https://en.wikipedia.org/wiki/Screen_Actors_Guild_Award_for_"
    pageURL = "{0}{1}".format(wikiPrefix, category)
    target = setFile(outdir, category + ".p")
    if isFile(target):
        return

    if debug:
        print("Downloading {0}".format(pageURL))
    getWebData(base=pageURL, savename=target, useSafari=False)
    sleep(1)
def downloadMainArtists(self, force=False, debug=False, sleeptime=2):
    """Download every artist linked from the saved main page's category list.

    The saved main page is parsed and each ``<li>`` below the categories
    widget is inspected. Entries whose URL slug is purely numeric or contains
    "parent-category-ii", and entries whose class suffix is not an integer
    ID, are skipped.

    Args:
        force: When False, artists whose savename already exists are skipped.
        debug: Unused by this method.
        sleeptime: Unused by this method.

    Raises:
        ValueError: If the categories widget is missing, or a list item lacks
            its second class entry, its link, or the link's href.
    """
    bsdata = getHTML(self.getMainSavename())

    ## Find and Download Artists
    categories = bsdata.find("div", {"class": "sidebar-widget widget_categories"})
    if categories is None:
        raise ValueError("Cannot find categories!")

    for ul in categories.findAll("ul"):
        for i, li in enumerate(ul.findAll("li")):
            try:
                catitem = li.attrs["class"][1]
            except (KeyError, IndexError):
                raise ValueError(
                    "Cannot find list class item: {0}".format(li))

            ref = li.find("a")
            if ref is None:
                raise ValueError("Cannot find list link!")
            try:
                href = ref.attrs['href']
            except KeyError:
                raise ValueError("Cannot find list href!")

            # check for artist: numeric slugs and the parent category are not
            # artist pages
            artistName = href.split('/')[-2]
            try:
                int(artistName)
            except ValueError:
                pass
            else:
                continue
            if artistName.find("parent-category-ii") != -1:
                continue

            # get artist ID: last dash-separated piece of the class entry
            artistID = catitem.split('-')[-1]
            try:
                int(artistID)
            except ValueError:
                continue

            # Resolve this artist's own savename BEFORE the existence check.
            # The check used to look at the previously assigned name (at
            # first the main page file), which skipped every artist whenever
            # force was False.
            artistSavename = self.getArtistSavename(artistID)
            if force is False and isFile(artistSavename):
                print("{0} exists.".format(artistSavename))
                continue

            print(i, '\t', artistID, '\t', artistName, '\t', artistSavename)
            self.downloadArtistURL(url=href, savename=artistSavename, parse=False)
def parseSearchArtist(self, artist, data, maxArtists=99, force=False, debug=False):
    """Parse a search-results page and download the artists it lists.

    Args:
        artist: Search term; stored as "Name" for every link that is found.
        data: Search-results page content, or None.
        maxArtists: Upper bound on the number of distinct links processed.
        force: When False, links whose savename already exists are skipped;
            also forwarded to the download call.
        debug: Unused; the debug print is driven by ``self.debug``.

    Returns:
        None in every case (immediately when ``data`` is None).
    """
    if data is None:
        return None

    ## Parse data
    bsdata = getHTML(data)
    artistDB = {}

    for ul in bsdata.findAll("ul", {"class": "search-results"}):
        for li in ul.findAll("li", {"class": "artist"}):
            for div in li.findAll("div", {"class": "name"}):
                anchor = div.find("a")
                href = anchor.attrs['href']
                rawTooltip = anchor.attrs['data-tooltip']
                # The id carried by the tooltip JSON is extracted but not
                # used further; any failure leaves it as None.
                try:
                    from json import loads
                    artistID = loads(rawTooltip)['id']
                except:
                    artistID = None

                entry = artistDB.setdefault(href, {"N": 0, "Name": artist})
                entry["N"] += 1

    if self.debug:
        print("Found {0} artists".format(len(artistDB)))

    for iArtist, href in enumerate(artistDB, start=1):
        if iArtist > maxArtists:
            break

        discID = self.dutils.getArtistID(href)
        url = self.getArtistURL(href)
        savename = self.dutils.getArtistSavename(discID)

        print(iArtist, '/', len(artistDB), '\t:', discID, '\t', url)
        if isFile(savename) and force is False:
            continue

        self.dutils.downloadArtistURL(url, savename, force=force)
def downloadArtistUnofficialURL(self, artistData, debug=False, force=False):
    """Download an artist's unofficial page unless it is already saved.

    Args:
        artistData: Object exposing ``url.url`` and ``ID.ID``.
        debug: Unused by this method.
        force: When True, download even if the savename already exists.

    Returns:
        The return value of ``self.downloadArtistURL`` when a download is
        attempted, otherwise False.
    """
    artistID = artistData.ID.ID
    print("Downloading credit URL for ArtistID {0}".format(artistID))

    url = self.getArtistURL(artistData.url.url, unofficial=True)
    savename = self.getArtistSavename(artistID, unofficial=True)

    # Already on disk and no forced refresh requested: nothing to do.
    if isFile(savename) and force is not True:
        return False

    return self.downloadArtistURL(url=url, savename=savename, force=force)
def downloadRottenTomatoesTop100Data(self, genre, outdir, debug=False):
    """Download the Rotten Tomatoes top-100 page of one genre unless saved.

    Args:
        genre: Genre slug used in both the URL and the ``<genre>.p`` name.
        outdir: See the note below; the passed value is not used.
        debug: When True, print the output filename before downloading.

    NOTE(review): ``outdir`` is overwritten with a directory derived from
    ``self.getDataDir()`` before it is ever read, so the caller's value has
    no effect -- confirm whether that is intended.
    """
    outdir = setDir(self.getDataDir())
    if not isDir(outdir):
        mkDir(outdir)

    pageURL = "".join(
        ("https://www.rottentomatoes.com", "/top/bestofrt/top_100_", genre, "_movies/"))
    target = setFile(outdir, genre + ".p")
    if isFile(target):
        return

    if debug:
        print("Downloading/Saving {0}".format(target))
    getWebData(base=pageURL, savename=target, useSafari=False, dtime=10)
    sleep(2)
def downloadFilms101YearlyData(self, year, outdir, debug=False):
    """Download the films101.com page for one year unless already saved.

    The download is best-effort: on failure the method returns without
    sleeping. After a successful download it pauses for two seconds.

    Args:
        year: Year used in both the URL and the ``<year>.p`` output name.
        outdir: Directory that receives the saved page.
        debug: When True, print progress and failure messages.
    """
    url = "http://www.films101.com/y{0}r.htm".format(year)
    savename = setFile(outdir, "{0}.p".format(year))
    if isFile(savename):
        return

    if debug:
        print("Downloading/Saving {0}".format(savename))
    try:
        getWebData(base=url, savename=savename, useSafari=False)
    except Exception as err:
        # `Exception` rather than a bare except, so that Ctrl-C and
        # SystemExit still stop a long batch run.
        if debug:
            print("Error downloading {0}: {1}".format(url, err))
        return
    sleep(2)
def downloadTeamStandingsByYear(self, year, debug=False):
    """Download the college-football standings page of one season.

    Nothing happens when ``<year>.p`` already exists in the season
    directory. After a download the method sleeps for 10 to 12 seconds.

    Args:
        year: Season year; last URL component and output base name.
        debug: When True, print the URL before downloading.
    """
    seasonLabel = str(year)
    pageURL = join(self.getBase(), "college-football/standings/_/season", seasonLabel)
    target = setFile(self.getSeasonDir(), seasonLabel + ".p")
    if isFile(target):
        return

    if debug:
        print("Downloading {0}".format(pageURL))
    getWebData(base=pageURL, savename=target, useSafari=False)
    # Randomised pause between requests.
    sleep(10 + 2 * random())
def parseSearchArtist(self, artist, data, maxArtists=99, force=False, debug=False):
    """Disabled search-result parser: always returns None.

    The unconditional ``return`` on the first line makes everything below it
    unreachable. The remaining code is kept as written; when active it would
    collect the "/artist/" links of each "section" div and download every
    artist page that is not saved yet (or all of them when ``force`` is set).
    """
    return
    if data is None:
        return None

    ## Parse data
    bsdata = getHTML(data)
    artistDB = {}

    for div in bsdata.findAll("div", {"class": "section"}):
        refs = div.findAll("a")
        for ref in refs:
            # Links that contain an <img> are skipped.
            if ref.find("img") is not None:
                continue
            href = ref.attrs['href']
            # Rebinds the `artist` parameter to the link text.
            artist = ref.text
            if href.startswith("/artist/") is False:
                continue
            #print(artist,"\t",href)
            # Count how often each href occurs.
            if artistDB.get(href) is None:
                artistDB[href] = {"N": 0, "Name": artist}
            artistDB[href]["N"] += 1

    if self.debug:
        print("Found {0} artists".format(len(artistDB)))

    iArtist = 0
    for href, hrefData in artistDB.items():
        iArtist += 1
        # Stop once maxArtists distinct links have been processed.
        if iArtist > maxArtists:
            break

        discID = self.dutils.getArtistID(href)
        url = self.getArtistURL(href)
        savename = self.dutils.getArtistSavename(discID)

        print(iArtist, '/', len(artistDB), '\t:', discID, '\t', url)
        #continue
        if isFile(savename):
            if force is False:
                continue

        self.dutils.downloadArtistURL(url, savename, force=force)
def downloadGameData(self, debug=False, verydebug=False, years=(2013, 2014, 2015)):
    """Download the game data of every game in the selected seasons.

    Each ``.p`` file in the season results directory is loaded; seasons whose
    year is not in ``years`` are skipped. For every game of every team in a
    selected season, ``self.downloadGameDataByID`` is called.

    Args:
        debug: Forwarded to ``self.downloadGameDataByID``.
        verydebug: Unused by this method.
        years: Season years to process. The default reproduces the season
            list that used to be hard-coded here.
    """
    resultsDir = self.getSeasonResultsDir()
    files = findExt(resultsDir, ext=".p", debug=False)

    print("Sleeping for 5 seconds...")
    sleep(5)

    for ifile in files:
        seasonData = getFile(ifile)
        year = seasonData.getYear()
        if year not in years:
            continue

        # Result unused; the call is kept in case it prepares the per-year
        # games directory (TODO confirm against getYearlyGamesDir).
        self.getYearlyGamesDir(year)

        for teamData in seasonData.teams.values():
            for gameData in teamData.games:
                gameID = gameData["Game"].gameID
                self.downloadGameDataByID(gameID, year, debug)
def downloadTeamStatisticsDataByYear(self, idval, name, year, debug=False):
    """Download one team's season statistics page unless already saved.

    The page is stored as ``<name>-<year>.p`` in the yearly statistics
    directory. After a download the method sleeps for 15 to 17 seconds.

    Args:
        idval: Team id placed in the URL.
        name: Team name used in the output filename.
        year: Season year; last URL component and part of the filename.
        debug: When True, print the URL and the output filename.
    """
    teamPath = "college-football/team/stats/_/id/{0}/season".format(idval)
    pageURL = join(self.getBase(), teamPath, str(year))

    statsDir = self.getYearlyStatisticsDir(year)
    target = setFile(statsDir, "{0}-{1}.p".format(name, year))
    if isFile(target):
        return

    if debug:
        print("Downloading {0} to {1}".format(pageURL, target))
    getWebData(base=pageURL, savename=target, useSafari=False)
    # Randomised pause between requests.
    sleep(15 + 2 * random())
def getDBData(self, dbname, prefix, returnName=False, debug=False):
    """Return the stored ``<prefix><dbname>.p`` data, or just its filename.

    Args:
        dbname: Base name of the database.
        prefix: String placed directly in front of ``dbname``.
        returnName: When True, return the filename without touching the file.
        debug: Forwarded to ``getFile``; the prints follow ``self.debug``.

    Returns:
        The filename when ``returnName`` is True, otherwise the loaded data.

    Raises:
        ValueError: If the file is needed but does not exist.
    """
    dbFile = setFile(self.getDiscogDBDir(), "{0}{1}.p".format(prefix, dbname))
    if self.debug is True:
        print("Data stored in {0}".format(dbFile))

    if returnName is True:
        return dbFile

    if isFile(dbFile):
        if self.debug:
            print("Returning data from {0}".format(dbFile))
        return getFile(dbFile, debug=debug)

    raise ValueError("Could not find {0}".format(dbFile))
def downloadUnknownArtistCompositions(self):
    """Fetch the songs page of artists without saved composition data.

    For every artist in ``self.metadata`` whose song savename is missing, the
    last segment of the artist URL is replaced by "songs/all" and the page is
    downloaded. The page is saved only when the response code is 200 and it
    contains a ``th`` with class "title-composer"; every other artist ID is
    collected and handed to ``self.updateMasterIgnoreCompositionData``.

    NOTE(review): the inner loop stops once the item with index 20 ends up on
    the ignore path; this looks like a leftover debugging cap -- confirm.
    """
    ignoredIDs = []
    for modVal, modValMetadata in self.metadata.items():
        total = len(modValMetadata)
        ts = timestat(
            "Downloading {0} Unknown Composition Files For ModVal={1}".format(total, modVal))

        for idx, (artistID, artistIDData) in enumerate(modValMetadata.items()):
            savename = self.dutils.getArtistSavename(artistID, song=True)
            artistRef = artistIDData["URL"]
            artistName = artistIDData["Name"]
            if isFile(savename):
                continue

            ## Replace /credits with /songs
            songsRef = "/".join(artistRef.split('/')[:-1] + ["songs", "all"])

            ## Create Full URL
            url = urllib.parse.urljoin(self.dbArtists.baseURL, songsRef)

            print("\n")
            print("=" * 100)
            print("{0}/{1}: [{2}] / [{3}]".format(idx, total, artistName, url))

            data, response = self.dutils.downloadURL(url)
            hasComposerColumn = False
            if response == 200:
                bsdata = getHTML(data)
                hasComposerColumn = len(bsdata.findAll("th", {"class": "title-composer"})) > 0

            if hasComposerColumn:
                print("  ---> Saving Data To {0}".format(savename))
                saveFile(idata=data, ifile=savename)
                sleep(3)
                continue

            # No usable composition table: pause, then remember the ID.
            sleep(3)
            ignoredIDs.append(artistID)

            if idx == 20:
                break
        ts.stop()

    print("New IDs To Ignore")
    print(ignoredIDs)
    tsUpdate = timestat(
        "Adding {0} ArtistIDs To Master Composition Ignore List".format(len(ignoredIDs)))
    self.updateMasterIgnoreCompositionData(ignoredIDs)
    tsUpdate.stop()