def setupSpark(debug=False):
    start, cmt = clock("Setting up Spark/PySpark")

    import os
    ## os.environ.get() never raises, so check for None explicitly
    sparkhome = os.environ.get("SPARK_HOME")
    if sparkhome is None:
        raise ValueError("The SPARK_HOME environment variable is not set")
    print("PySpark home is {0}".format(sparkhome))

    try:
        import findspark
        findspark.init(sparkhome)
    except ImportError:
        raise ValueError("Could not import findspark")

    try:
        import pyspark
    except ImportError:
        raise ValueError("Could not import pyspark")

    hivejar = '/opt/cloudera/parcels/CDH/jars/hive-hcatalog-core-1.1.0-cdh5.13.3.jar'
    from os.path import exists
    if not exists(hivejar):
        raise ValueError("Hive JAR {0} does not exist".format(hivejar))

    conf = (pyspark.SparkConf()
            .setAppName('Daily Feature Generator')
            .setMaster('yarn')
            .set('spark.driver.memory', '20g')
            .set('spark.shuffle.service.enabled', True)
            .set('spark.dynamicAllocation.enabled', True)
            # .set('spark.executor.heartbeatInterval', '3600s')
            .set('spark.executor.memory', '5g')
            .set('spark.yarn.executor.memoryOverhead', '4000m')
            .set('spark.dynamicAllocation.maxExecutors', 250)
            .set('spark.dynamicAllocation.minExecutors', 10)
            .set('spark.kryoserializer.buffer.max', '1g')
            .set('spark.speculation', True)
            .set('spark.jars', hivejar)
            .set('spark.port.maxRetries', 100)
            .set('spark.driver.maxResultSize', '6g')
            .set('spark.sql.broadcastTimeout', 600))

    sc = pyspark.SparkContext(conf=conf)
    sc.setLogLevel('ERROR')

    elapsed(start, cmt)
    return sc
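## A minimal usage sketch for setupSpark (assuming the clock/elapsed helpers
## are in scope and this runs on a YARN edge node with SPARK_HOME exported).
## It wraps the returned context in a SparkSession for SQL work against the
## Hive metastore, then stops the context cleanly.
if __name__ == "__main__":
    import pyspark
    sc = setupSpark(debug=True)
    spark = pyspark.sql.SparkSession(sc)          # SQL entry point over the existing context
    print(spark.sql("SHOW DATABASES").count())    # sanity check against the metastore
    sc.stop()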
def getArtistStatus(self):
    start, cmt = clock("Matching All Music Artists")

    ######################################################################
    #### Loop Over My Artists and Paths
    ######################################################################
    for primeDir in self.mmb.getPrimeDirectories():
        for artistName, artistPrimeDirs in self.mmb.getArtistPrimeDirMap(primeDir).items():
            if self.debug:
                print("{0: <50}{1}".format(artistName, artistPrimeDirs))

            ######################################################################
            #### Get Database IDs
            ######################################################################
            isKnown = self.mdb.isKnownByName(artistName)
            if isKnown is False:
                self.unknownArtists[artistName] = artistPrimeDirs
                if self.debug:
                    print("\tUnknown (All) --> {0}".format(artistName))

    elapsed(start, cmt)
    print("Found {0} unknown artists".format(len(self.unknownArtists)))
    print("Found {0} total artists".format(len(self.artistAlbums)))
def matchMyMusicAlbums(self, db, albumType=1, ratioCut=0.95, maxCut=0.1):
    self.matchedAlbums = {}
    start, cmt = clock("Checking for Album Matches Against {0} DB".format(db))
    print("{0: <40}{1: <15}{2: <45} --> {3}".format("Artist", "Database", "Album Name", "Matched Album"))

    ######################################################################
    #### Get Map of Artists and Unmatched Albums
    ######################################################################
    artistNames = self.mmb.getArtists()
    #artistAlbums = self.mmb.getArtistAlbums()

    ######################################################################
    #### Loop Over Artist Name <-> Prime Map Items
    ######################################################################
    for artistName in artistNames:
        matchedAlbums = self.matchMyMusicAlbumsByArtist(db, artistName, albumType, ratioCut, maxCut)
        if len(matchedAlbums) > 0:
            if self.matchedAlbums.get(db) is None:
                self.matchedAlbums[db] = {}
            self.matchedAlbums[db][artistName] = matchedAlbums
            for myAlbumName, bestMatchVal in matchedAlbums.items():
                print("{0: <40}{1: <15}{2: <45} --> {3}".format(artistName, db, myAlbumName, bestMatchVal["Album"]))

    elapsed(start, cmt)
    saveFile(ifile=self.mmn.moveFilename, idata=self.matchedAlbums, debug=True)
    print("Found {0} music <-> discogs album maps".format(len(self.matchedAlbums)))
def analyzePartiallyUnknownArtists(matchedResults):
    start, cmt = clock("Finding Possible New Matches")
    num = 2
    cutoff = 0.50
    discogMediaNames = ['Albums', 'Singles & EPs', 'Compilations', 'Videos',
                        'Miscellaneous', 'Visual', 'DJ Mixes']
    allmusicMediaNames = ['Album']
    myMediaNames = ['Random', 'Todo', 'Match', 'Title', 'Singles']
    additions = {}

    print("{0: <40}{1}".format("Artist", "# of Albums"))
    ## dbKeys, artistNameToID, artists, and artistAlbumsDB are module-level lookups
    for i, (artistName, unknownVals) in enumerate(matchedResults["PartiallyUnknown"].items()):
        print("{0: <40}".format(artistName))
        for dbKey in dbKeys:
            key = dbKey['Key']
            if key != "AceBootlegs":
                continue
            if unknownVals.get(key) is not None:
                dirvals = unknownVals[key]
                print("{0: <40}{1}".format(artistName, key))

                myMusicAlbums = []
                for dirval in dirvals:
                    myMusicAlbums += (getMyMusicAlbums(dirval, returnNames=True) +
                                      getMyMatchedMusicAlbums(dirval) +
                                      getMyUnknownMusicAlbums(dirval))
                if len(myMusicAlbums) == 0:
                    continue
                print("{0: <40}There are {1} of my albums".format(artistName, len(myMusicAlbums)))

                ## Find Possible IDs
                possibleIDs = findPossibleArtistIDs(artistName, artistNameToID[key], artists[key], num, cutoff)
                print(" Possible IDs ===>", len(possibleIDs))

                maxRat = None
                for possibleID in possibleIDs:
                    print("\t{0: <15}".format(possibleID), end="")
                    artistAlbums = getRowData(artistAlbumsDB[key], rownames=possibleID)['Albums']
                    artistAlbums = getFlattenedArtistAlbums(artistAlbums)
                    print("\t{0: <10}".format(len(artistAlbums)), end="")

                    ## Find overlapping albums
                    retval = getBestAlbumsMatch(artistAlbums, myMusicAlbums, cutoff=cutoff, debug=False)
                    print("\t", round(retval, 2), end="")
                    if retval > cutoff:
                        if maxRat is None:
                            maxRat = retval
                        ## Keep only the best-scoring candidate seen so far
                        if retval < maxRat:
                            print("")
                            continue
                        maxRat = retval
                        if additions.get(artistName) is None:
                            additions[artistName] = {}
                        additions[artistName][key] = {"Score": retval,
                                                      "Value": {'ID': possibleID, 'Name': None}}
                        print("\t{0: <15} is a match!".format(possibleID))
                    else:
                        print("")
        print("")

    print("Found {0} new matches".format(len(additions)))
    elapsed(start, cmt)
    return additions
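## getBestAlbumsMatch is called above but not defined in this file. A minimal
## sketch of one plausible implementation, assuming it returns the fraction of
## my albums whose best difflib similarity against the candidate artist's
## albums clears the cutoff. The name and signature match the call site; the
## internals here are an assumption, not the original code.
from difflib import SequenceMatcher

def getBestAlbumsMatch(artistAlbums, myMusicAlbums, cutoff=0.5, debug=False):
    matched = 0
    for myAlbum in myMusicAlbums:
        ## Best string-similarity ratio of this album against the DB albums
        best = max((SequenceMatcher(None, myAlbum, dbAlbum).ratio()
                    for dbAlbum in artistAlbums), default=0.0)
        if debug:
            print("{0: <45} --> {1}".format(myAlbum, round(best, 2)))
        if best >= cutoff:
            matched += 1
    return matched / len(myMusicAlbums) if myMusicAlbums else 0.0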
def appendSparkData(spdf, dbname, tablename):
    start, cmt = clock("Appending spark dataframe to {0}.{1}".format(dbname, tablename))
    spdf.write.mode('append').format('parquet').saveAsTable("{0}.{1}".format(dbname, tablename))
    elapsed(start, cmt)
def saveSparkData(spdf, dbname, tablename):
    start, cmt = clock("Saving spark dataframe to {0}.{1}".format(dbname, tablename))
    spdf.write.mode('overwrite').format('parquet').saveAsTable("{0}.{1}".format(dbname, tablename))
    elapsed(start, cmt)
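## A short usage sketch for the two writers above (the database and table
## names are placeholders, not from the original code): overwrite replaces
## the table wholesale on a full rebuild, while append adds new rows to the
## existing parquet table on incremental runs.
def writeDailyFeatures(featuresDF, newRowsDF):
    saveSparkData(featuresDF, "analytics", "daily_features")    # full rebuild
    appendSparkData(newRowsDF, "analytics", "daily_features")   # incremental add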
def getPandasDataFrame(spdf):
    ## Match the clock/elapsed calling convention used everywhere else in this file
    start, cmt = clock("Creating Pandas DataFrame from Spark DataFrame")
    pddf = spdf.toPandas()
    elapsed(start, cmt)
    return pddf
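## toPandas() collects the entire result to the driver, which is why the
## driver above is configured with 20g of memory and a 6g maxResultSize.
## A minimal guard sketch for interactive use (the helper name and row cap
## are assumptions, not from the original code):
def getPandasDataFrameSample(spdf, maxRows=1000000):
    if spdf.count() > maxRows:
        spdf = spdf.limit(maxRows)    # cap rows before collecting to the driver
    return getPandasDataFrame(spdf)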
        try:
            movie = mdata['TITLE']
        except:
            raise ValueError("Could not get movie name from TITLE key! {0}".format(mdata))
        movies.append(movie)

    if debug:
        print("Found {0}/{1} movies".format(len(movies), expect))
    return movies


def parseFilms101Data(self, debug=False):
    outdir = self.getDataDir()
    resultsdir = self.getResultsDir()
    files = findExt(outdir, ext=".p")
    movies = {}
    for ifile in sorted(files):
        year = getBaseFilename(ifile)
        results = self.parseFilms101YearlyData(ifile, debug=debug)
        movies[year] = []
        for movie in results:
            movies[year].append([movie, 10])
        print("Found {0} movies in {1}".format(len(movies[year]), year))

    savename = setFile(resultsdir, "{0}.json".format(self.name))
    print("Saving {0} Years of films101 Data to {1}".format(len(movies), savename))
    saveFile(savename, movies)


_, _ = clock("Last Run")