Пример #1
0
def setupSpark(debug=False):
    """Create and return a SparkContext for the 'Daily Feature Generator' app.

    Reads SPARK_HOME, initialises findspark with it, imports pyspark, checks
    that the Hive HCatalog JAR is present, builds the SparkConf and starts a
    SparkContext whose log level is set to ERROR.

    Args:
        debug: Accepted for interface compatibility; not used in this function.

    Returns:
        The newly created ``pyspark.SparkContext``.

    Raises:
        ValueError: If findspark or pyspark cannot be imported, if findspark
            fails to initialise, or if the Hive JAR file does not exist.
    """
    start, cmt = clock("Setting up Spark/PySpark")

    import os

    # os.environ.get never raises: a missing variable simply yields None,
    # which is then handed to findspark.init below.
    # NOTE(review): findspark presumably tries to locate Spark on its own
    # when given None -- confirm before turning a missing SPARK_HOME into
    # a hard error.
    sparkhome = os.environ.get("SPARK_HOME")
    print("PySpark home is {0}".format(sparkhome))

    try:
        import findspark
    except ImportError as err:
        raise ValueError("Could not import findspark") from err

    # Initialisation failures are reported separately from import failures so
    # the message points at the real cause; ValueError is kept for callers.
    try:
        findspark.init(sparkhome)
    except Exception as err:
        raise ValueError(
            "Could not initialize findspark with SPARK_HOME={0}".format(
                sparkhome)) from err

    try:
        import pyspark
    except ImportError as err:
        raise ValueError("Could not import pyspark") from err

    hivejar = '/opt/cloudera/parcels/CDH/jars/hive-hcatalog-core-1.1.0-cdh5.13.3.jar'
    if not os.path.exists(hivejar):
        raise ValueError("Hive JAR {0} does not exist".format(hivejar))

    # One setting per line, in the same order as before.
    conf = (
        pyspark.SparkConf()
        .setAppName('Daily Feature Generator')
        .setMaster('yarn')
        .set('spark.driver.memory', '20g')
        .set('spark.shuffle.service.enabled', True)
        .set('spark.dynamicAllocation.enabled', True)
        # .set('spark.executor.heartbeatInterval', '3600s')
        .set('spark.executor.memory', '5g')
        .set('spark.yarn.executor.memoryOverhead', '4000m')
        .set('spark.dynamicAllocation.maxExecutors', 250)
        .set('spark.dynamicAllocation.minExecutors', 10)
        .set('spark.kryoserializer.buffer.max', '1g')
        .set('spark.speculation', True)
        .set('spark.jars', hivejar)
        .set('spark.port.maxRetries', 100)
        .set('spark.driver.maxResultSize', '6g')
        .set('spark.sql.broadcastTimeout', 600)
    )

    sc = pyspark.SparkContext(conf=conf)
    sc.setLogLevel('ERROR')

    elapsed(start, cmt)

    return sc
Пример #2
0
    def getArtistStatus(self):
        """Collect the artists that the music database does not know by name.

        Walks every prime directory reported by ``self.mmb`` and, for each
        artist found there, asks ``self.mdb.isKnownByName``. Artists for which
        that call returns exactly ``False`` are stored in
        ``self.unknownArtists`` (artist name -> prime dirs). Prints timing and
        two summary counts; ``self.artistAlbums`` is only read for its length.
        """
        timerStart, timerComment = clock("Matching All Music Artists")

        # Walk each prime directory and the artists found beneath it.
        for primeDir in self.mmb.getPrimeDirectories():
            artistDirMap = self.mmb.getArtistPrimeDirMap(primeDir)
            for artistName, artistPrimeDirs in artistDirMap.items():
                if self.debug:
                    print("{0: <50}{1}".format(artistName, artistPrimeDirs))

                # Only an explicit False marks the artist as unknown; any
                # other answer from the database lookup is left alone.
                if self.mdb.isKnownByName(artistName) is not False:
                    continue

                self.unknownArtists[artistName] = artistPrimeDirs
                if self.debug:
                    print("\tUnknown (All)     --> {0}".format(artistName))

        elapsed(timerStart, timerComment)
        print("Found {0} unknown artists".format(len(self.unknownArtists)))
        print("Found {0} total artists".format(len(self.artistAlbums)))
Пример #3
0
    def matchMyMusicAlbums(self, db, albumType=1, ratioCut=0.95, maxCut=0.1):
        """Match my albums against one database, artist by artist, and save.

        Resets ``self.matchedAlbums``, calls ``matchMyMusicAlbumsByArtist`` for
        every artist returned by ``self.mmb.getArtists()``, stores non-empty
        results under ``self.matchedAlbums[db][artistName]``, prints one table
        row per matched album, and finally writes ``self.matchedAlbums`` with
        ``saveFile``.

        Args:
            db: Database name; used as the top-level key of the result and
                passed through to the per-artist matcher.
            albumType: Passed through to ``matchMyMusicAlbumsByArtist``.
            ratioCut: Passed through to ``matchMyMusicAlbumsByArtist``.
            maxCut: Passed through to ``matchMyMusicAlbumsByArtist``.
        """
        self.matchedAlbums = {}

        timerStart, timerComment = clock(
            "Checking for Albums Matches Against {0} DB".format(db))

        # Table header for the rows printed inside the loop.
        print("{0: <40}{1: <15}{2: <45} --> {3}".format(
            "Artist", "Database", "Album Name", "Matched Album"))

        for artistName in self.mmb.getArtists():
            artistMatches = self.matchMyMusicAlbumsByArtist(
                db, artistName, albumType, ratioCut, maxCut)
            if len(artistMatches) == 0:
                continue

            # Create the per-database bucket the first time it is needed.
            dbMatches = self.matchedAlbums.get(db)
            if dbMatches is None:
                dbMatches = {}
                self.matchedAlbums[db] = dbMatches
            dbMatches[artistName] = artistMatches

            for myAlbumName, bestMatchVal in artistMatches.items():
                print("{0: <40}{1: <15}{2: <45} --> {3}".format(
                    artistName, db, myAlbumName, bestMatchVal["Album"]))

        elapsed(timerStart, timerComment)

        # NOTE(review): the rest of this method uses self.mmb; confirm that
        # self.mmn is a distinct, intended attribute and not a typo.
        saveFile(ifile=self.mmn.moveFilename,
                 idata=self.matchedAlbums,
                 debug=True)
        # NOTE(review): this length counts top-level database keys, not
        # matched albums -- confirm that is the number meant to be reported.
        print("Found {0} music <-> discogs albums maps".format(
            len(self.matchedAlbums)))
Пример #4
0
def analyzePartiallyUnknownArtists(matchedResults):
    """Propose database IDs for artists that are only partially matched.

    For every artist in ``matchedResults["PartiallyUnknown"]`` the albums
    found in the artist's directories are compared with the albums of each
    candidate ID returned by ``findPossibleArtistIDs``. The best-scoring
    candidate above the cutoff is recorded.

    Relies on names defined outside this function: ``dbKeys``,
    ``artistNameToID``, ``artists`` and ``artistAlbumsDB``. Only the
    ``"AceBootlegs"`` key is processed; every other key is skipped.

    Args:
        matchedResults: Mapping with a ``"PartiallyUnknown"`` entry that maps
            artist name -> {database key -> iterable of directory values}.

    Returns:
        dict mapping artist name -> {database key ->
        {"Score": score, "Value": {"ID": candidate id, "Name": None}}}.
    """
    start, cmt = clock("Finding Possible New Matches")

    num = 2
    cutoff = 0.50

    additions = {}

    print("{0: <40}{1}".format("Artist", "# of Albums"))
    for artistName, unknownVals in matchedResults["PartiallyUnknown"].items():
        print("{0: <40}".format(artistName))
        for dbKey in dbKeys:
            key = dbKey['Key']
            if key != "AceBootlegs":
                continue
            dirvals = unknownVals.get(key)
            if dirvals is None:
                continue
            print("{0: <40}{1}".format(artistName, key))

            # Gather every album name known for this artist's directories.
            myMusicAlbums = []
            for dirval in dirvals:
                myMusicAlbums += (getMyMusicAlbums(dirval, returnNames=True)
                                  + getMyMatchedMusicAlbums(dirval)
                                  + getMyUnknownMusicAlbums(dirval))
            if len(myMusicAlbums) == 0:
                continue
            print("{0: <40}There are {1} my albums".format(artistName, len(myMusicAlbums)))

            ## Find Possible IDs
            possibleIDs = findPossibleArtistIDs(artistName, artistNameToID[key], artists[key], num, cutoff)
            print("     Possible IDs ===>", len(possibleIDs))
            maxRat = None
            for possibleID in possibleIDs:
                print("\t{0: <15}".format(possibleID), end="")
                artistAlbums = getRowData(artistAlbumsDB[key], rownames=possibleID)['Albums']
                artistAlbums = getFlattenedArtistAlbums(artistAlbums)
                print("\t{0: <10}".format(len(artistAlbums)), end="")

                ## Find overlapping albums
                retval = getBestAlbumsMatch(artistAlbums, myMusicAlbums, cutoff=cutoff, debug=False)
                print("\t", round(retval, 2), end="")
                if retval <= cutoff:
                    print("")
                    continue

                # Keep only the best candidate seen so far; a tie replaces
                # the earlier candidate.
                if maxRat is not None and retval < maxRat:
                    print("")
                    continue
                maxRat = retval
                if additions.get(artistName) is None:
                    additions[artistName] = {}
                additions[artistName][key] = {"Score": retval, "Value": {'ID': possibleID, 'Name': None}}

                print("\t{0: <15} is a match!".format(possibleID))

    print("")
    print("Found {0} new matches".format(len(additions)))
    elapsed(start, cmt)

    return additions
Пример #5
0
def appendSparkData(spdf, dbname, tablename):
    """Append ``spdf`` to the table ``dbname.tablename`` in parquet format.

    The write is wrapped in the clock/elapsed timing helpers.
    """
    timerStart, timerComment = clock(
        "Appending spark dataframe to {0}.{1}".format(dbname, tablename))

    # Mode 'append' is passed to the writer before saving the table.
    writer = spdf.write.mode('append').format('parquet')
    target = "{0}.{1}".format(dbname, tablename)
    writer.saveAsTable(target)

    elapsed(timerStart, timerComment)
Пример #6
0
def saveSparkData(spdf, dbname, tablename):
    """Save ``spdf`` as the table ``dbname.tablename`` in parquet format.

    The write is wrapped in the clock/elapsed timing helpers.
    """
    timerStart, timerComment = clock(
        "Saving spark dataframe to {0}.{1}".format(dbname, tablename))

    # Mode 'overwrite' is passed to the writer before saving the table.
    writer = spdf.write.mode('overwrite').format('parquet')
    target = "{0}.{1}".format(dbname, tablename)
    writer.saveAsTable(target)

    elapsed(timerStart, timerComment)
Пример #7
0
def getPandasDataFrame(spdf):
    """Convert a Spark DataFrame to pandas via ``toPandas`` and return it.

    The conversion is wrapped in the clock/elapsed timing helpers.
    """
    # NOTE(review): other call sites in this file unpack two values from
    # clock() and pass both to elapsed(); here a single value and keyword
    # ``comment`` arguments are used -- confirm against the helper signatures.
    timer = clock(comment="Creating Pandas DataFrame from Spark DataFrame")
    pandasFrame = spdf.toPandas()
    elapsed(timer, comment="Create Pandas DataFrame")
    return pandasFrame
Пример #8
0
                    movie = mdata['TITLE']
                except:
                    raise ValueError("Could not get movie name from TITLE key! {0}".format(mdata))

                movies.append(movie)
            
        if debug:
            print("Found {0}/{1} movies".format(len(movies), expect))
            
        return movies
                    


    def parseFilms101Data(self, debug=False):
        """Parse every yearly films101 file and save the combined result.

        Finds the ``.p`` files in the data directory, parses each one with
        ``parseFilms101YearlyData`` and stores the result keyed by the file's
        base name (used as the year). The mapping is written to
        ``<results dir>/<self.name>.json`` with ``saveFile``.

        Args:
            debug: Passed through to ``parseFilms101YearlyData``.
        """
        outdir = self.getDataDir()
        resultsdir = self.getResultsDir()
        movies = {}

        for ifile in sorted(findExt(outdir, ext=".p")):
            year = getBaseFilename(ifile)
            results = self.parseFilms101YearlyData(ifile, debug=debug)
            # Each title is stored together with the constant 10.
            # NOTE(review): the meaning of 10 is not evident from this
            # method -- confirm with the consumer of the JSON file.
            movies[year] = [[movie, 10] for movie in results]
            print("Found {0} movies in {1}".format(len(movies[year]), year))

        # Reuse the results directory fetched above instead of asking again.
        savename = setFile(resultsdir, "{0}.json".format(self.name))
        print("Saving {0} Years of films101 Data to {1}".format(len(movies), savename))
        saveFile(savename, movies)
# Call clock with the label "Last Run" at module level and discard both
# returned values.
_, _ = clock("Last Run")