def createDataPoints():
    """Make the data points of user locations for the map generation.

    Reads the stored users and beers and builds one data point per
    (user, rated beer) pair for every user whose location has a 'lat'
    entry and who has at least one rating.  Each point carries the
    user's coordinates and (optional) country plus the beer's abv and
    style and the rating given.  The collected points are written to
    '../data/dataPoints.json'.
    """
    usersList = files.readUsers()
    beersList = files.readBeers()
    points = []
    i = 1
    for hashId, user in usersList.iteritems():
        if 'lat' in user.location and user.ratings:
            # The country is optional and identical for all of this
            # user's points, so resolve it once per user.
            country = None
            if 'country' in user.location:
                country = user.location['country']
            for bid, rating in user.ratings.iteritems():
                # Beers are looked up by the stringified hash of their id.
                beer = beersList[str(hash(bid))]
                pointAttribs = {
                    'lat': user.location['lat'],
                    'lng': user.location['lng'],
                    'country': country,
                    'abv': beer.abv,
                    'rating': rating,
                    'style': beer.style,
                }
                # Collect the point; without this the output stays empty.
                points.append(dp.dataPoint(pointAttribs))
                if i % 1000 == 0:
                    print("Points added: " + str(i))
                i += 1
    data = dp.dataPoints(points)
    writeJSONFile('../data/dataPoints.json', data)
def beerKeywords():
    """Extract keywords from beer descriptions and rate them.

    For every stored beer the keywords extracted from its description
    are saved on ``beer.keywords``.  In parallel a dictionary is built
    that maps each keyword to ``[sum of beer ratings, occurrences]``.
    The updated beers are written to '../data/beers.json' and the
    keyword dictionary to '../data/keywords.json'.
    """
    beersList = files.readBeers()
    print('beers.json loaded...')

    # keyword -> [accumulated rating, number of beers using it]
    keywordsList = {}
    position = 0
    beerCount = len(beersList)

    for hashId, beer in beersList.iteritems():
        beer.keywords = extract.extractKeywords(beer.description)
        for keyword in beer.keywords:
            if keyword in keywordsList:
                keywordsList[keyword][0] += beer.rating
                keywordsList[keyword][1] += 1
            else:
                # First occurrence: start the running totals.
                keywordsList[keyword] = [beer.rating, 1]
        position += 1
        if position % 100 == 0:
            print('Processed ' + str(position) + '/' + str(beerCount) +
                  ' beers.')

    writeJSONFile('../data/beers.json', beersList)
    writeJSONFile('../data/keywords.json', keywordsList)
def processLabels():
    """Prediction of how dominant label color affects the beer rating.

    Download beer bottle labels, extract n dominant colors,
    make the color palette, flag each color and calculate
    average rating of that color.

    Images whose beer already has a stored color entry with the
    expected number of palette flags are skipped.  The per-beer colors
    are written to '../data/beerColors.json' and the rated palette to
    '../data/colorPalette.json'.
    """
    beersList = files.readBeers()
    beerColorsDict = files.readBeerColors()

    # Path for saving the images
    path = "../data/labels/"

    # Only keep files that look like images.
    imageExtensions = ('jpeg', 'jpg', 'png')
    fileList = [
        item for item in os.listdir(path)
        if item.split(".")[-1] in imageExtensions
    ]

    # Download and save images
    # NOTE(review): fileList is built before the download step; unless
    # labels.download extends it in place, freshly downloaded images are
    # only picked up on the next run - confirm.
    labels.download(beersList, path, fileList)

    # Number of label colors to cluster
    nColors = 5

    # Loop over images in the folder
    for index, fileName in enumerate(fileList):
        # The file name without its extension is the beer id.
        bid = unicode(fileName.split('.')[0])
        if (bid in beerColorsDict
                and len(beerColorsDict[bid].colorPaletteFlags) == nColors):
            # Already clustered with the current number of colors.
            continue

        print("Processing image " + fileName + " [" + str(index) + "/" +
              str(len(fileList)) + "]")
        beerLabel = labels.Image(path + fileName)

        beerColor = beerLabel.clusterize(nColors)
        beerColorsDict[bid] = beerColor

    # Generate the color palette with ratings - Classification
    colorPalette = labels.ColorPalette()
    colorPalette.build(beerColorsDict, beersList)

    # Write the colorsFile - dict{ 'bid': beerColor{RGB,intensity}}
    writeJSONFile('../data/beerColors.json', beerColorsDict)
    writeJSONFile('../data/colorPalette.json', colorPalette.palette)

    print('Color palette saved.')
def processLabels():
    """Prediction of how dominant label color affects the beer rating.

    Download beer bottle labels, extract n dominant colors,
    make the color palette, flag each color and calculate
    average rating of that color.

    Images whose beer already has a stored color entry with the
    expected number of palette flags are skipped.  The per-beer colors
    are written to '../data/beerColors.json' and the rated palette to
    '../data/colorPalette.json'.
    """
    beersList = files.readBeers()
    beerColorsDict = files.readBeerColors()

    # Path for saving the images
    path = "../data/labels/"

    # Only keep files that look like images.
    imageExtensions = ('jpeg', 'jpg', 'png')
    fileList = [item for item in os.listdir(path)
                if item.split(".")[-1] in imageExtensions]

    # Download and save images
    # NOTE(review): fileList is built before the download step; unless
    # labels.download extends it in place, freshly downloaded images are
    # only picked up on the next run - confirm.
    labels.download(beersList, path, fileList)

    # Number of label colors to cluster
    nColors = 5

    # Loop over images in the folder
    for index, fileName in enumerate(fileList):
        # The file name without its extension is the beer id.
        bid = unicode(fileName.split('.')[0])
        if (bid in beerColorsDict and
                len(beerColorsDict[bid].colorPaletteFlags) == nColors):
            # Already clustered with the current number of colors.
            continue

        print("Processing image " + fileName +
              " [" + str(index) + "/" + str(len(fileList)) + "]")
        beerLabel = labels.Image(path + fileName)

        beerColor = beerLabel.clusterize(nColors)
        beerColorsDict[bid] = beerColor

    # Generate the color palette with ratings - Classification
    colorPalette = labels.ColorPalette()
    colorPalette.build(beerColorsDict, beersList)

    # Write the colorsFile - dict{ 'bid': beerColor{RGB,intensity}}
    writeJSONFile('../data/beerColors.json', beerColorsDict)
    writeJSONFile('../data/colorPalette.json', colorPalette.palette)

    print('Color palette saved.')
def createCommonStyles():
    """Generate common beer styles and save them to a csv file.

    Sums ``numRatings`` per style over all stored beers (a beer without
    that attribute counts as 0), keeps the 20 styles with the highest
    totals and writes them, in ascending order of total, to
    '../data/styles.csv' with the columns id, style and numRatings.
    """
    beersList = files.readBeers()
    allStyles = {}
    for hashId, beer in beersList.iteritems():
        numRatings = getattr(beer, 'numRatings', 0)
        for style in beer.style:
            if style in allStyles:
                allStyles[style] += numRatings
            else:
                # First beer of this style: start its running total.
                allStyles[style] = numRatings

    sorted_styles = sorted(allStyles.items(), key=operator.itemgetter(1))[-20:]
    # Binary mode is what the Python 2 csv module expects.
    with open('../data/styles.csv', 'wb') as stylesCSV:
        csvwriter = csv.writer(stylesCSV, delimiter=',', quotechar='"')
        csvwriter.writerow(["id", "style", "numRatings"])
        for i, style in enumerate(sorted_styles, 1):
            csvwriter.writerow(
                [i, unicode(style[0]).encode("utf-8"), style[1]])
def createDataPoints():
    """Make the data points of user locations for the map generation.

    Reads the stored users and beers and builds one data point per
    (user, rated beer) pair for every user whose location has a 'lat'
    entry and who has at least one rating.  Each point carries the
    user's coordinates and (optional) country plus the beer's abv and
    style and the rating given.  The collected points are written to
    '../data/dataPoints.json'.
    """
    usersList = files.readUsers()
    beersList = files.readBeers()
    points = []
    i = 1
    for hashId, user in usersList.iteritems():
        if 'lat' in user.location and user.ratings:
            # The country is optional and identical for all of this
            # user's points, so resolve it once per user.
            country = None
            if 'country' in user.location:
                country = user.location['country']
            for bid, rating in user.ratings.iteritems():
                # Beers are looked up by the stringified hash of their id.
                beer = beersList[str(hash(bid))]
                pointAttribs = {'lat': user.location['lat'],
                                'lng': user.location['lng'],
                                'country': country,
                                'abv': beer.abv,
                                'rating': rating,
                                'style': beer.style}
                # Collect the point; without this the output stays empty.
                points.append(dp.dataPoint(pointAttribs))
                if i % 1000 == 0:
                    print("Points added: " + str(i))
                i += 1
    data = dp.dataPoints(points)
    writeJSONFile('../data/dataPoints.json', data)
def createCommonStyles():
    """Generate common beer styles and save them to a csv file.

    Sums ``numRatings`` per style over all stored beers (a beer without
    that attribute counts as 0), keeps the 20 styles with the highest
    totals and writes them, in ascending order of total, to
    '../data/styles.csv' with the columns id, style and numRatings.
    """
    beersList = files.readBeers()
    allStyles = {}
    for hashId, beer in beersList.iteritems():
        numRatings = getattr(beer, 'numRatings', 0)
        for style in beer.style:
            if style in allStyles:
                allStyles[style] += numRatings
            else:
                # First beer of this style: start its running total.
                allStyles[style] = numRatings

    sorted_styles = sorted(allStyles.items(), key=operator.itemgetter(1))[-20:]
    # Binary mode is what the Python 2 csv module expects.
    with open('../data/styles.csv', 'wb') as stylesCSV:
        csvwriter = csv.writer(stylesCSV, delimiter=',', quotechar='"')
        csvwriter.writerow(["id", "style", "numRatings"])
        for i, style in enumerate(sorted_styles, 1):
            csvwriter.writerow(
                [i, unicode(style[0]).encode("utf-8"), style[1]])
def beerKeywords():
    """Extract keywords from beer descriptions and rate them.

    For every stored beer the keywords extracted from its description
    are saved on ``beer.keywords``.  In parallel a dictionary is built
    that maps each keyword to ``[sum of beer ratings, occurrences]``.
    The updated beers are written to '../data/beers.json' and the
    keyword dictionary to '../data/keywords.json'.
    """
    beersList = files.readBeers()
    print('beers.json loaded...')

    # keyword -> [accumulated rating, number of beers using it]
    keywordsList = {}
    position = 0
    beerCount = len(beersList)

    for hashId, beer in beersList.iteritems():
        beer.keywords = extract.extractKeywords(beer.description)
        for keyword in beer.keywords:
            if keyword in keywordsList:
                keywordsList[keyword][0] += beer.rating
                keywordsList[keyword][1] += 1
            else:
                # First occurrence: start the running totals.
                keywordsList[keyword] = [beer.rating, 1]
        position += 1
        if position % 100 == 0:
            print('Processed ' + str(position) + '/' + str(beerCount) +
                  ' beers.')

    writeJSONFile('../data/beers.json', beersList)
    writeJSONFile('../data/keywords.json', keywordsList)
"""
Single-purpose script for easy monitoring of data quantity.

Load each JSON data file, find its size and generate
a plot for presentation.
"""

import fileReader as files
import matplotlib.pyplot as plt
import os
import numpy as np

# Read every stored data set, announcing each step on stdout
print("Loading beers...")
beersList = files.readBeers()
print("Loading users...")
usersList = files.readUsers()
print("Loading breweries...")
breweriesList = files.readBreweries()

# Directory that holds the saved label images
path = "../data/labels/"
fileList = os.listdir(path)

# One category per data set; quantities holds the matching counts in
# the same order (reviews = sum of every user's ratings)
labels = ('Beers', 'Reviews', 'Users', 'Breweries', 'Labels')
index = np.arange(len(labels))
quantities = (
    len(beersList),
    sum([len(x.ratings) for x in usersList.values()]),
    len(usersList),
    len(breweriesList),
    len(fileList),
)

# Plot the quantities
def userReviews():
    """Parse through user reviews /user/beers/{username}.

    Retrieves at most 50 reviews per user, retains review, beer, and
    brewery information. After querying the api, remove username to
    lessen privacy concerns with untappd data.

    Users whose username has already been cleared are not queried
    again; their stored ratings only count towards the running review
    total.  The users, beers, breweries and brewery-to-beers files are
    rewritten after every processed user.
    """
    usersList = files.readUsers()
    beersList = files.readBeers()
    breweryList = files.readBreweries()
    breweryToBeers = files.readBreweryToBeers()

    total = 0
    totalUsersComplete = 0
    for userHash, user in usersList.iteritems():
        totalUsersComplete += 1
        # if the data has been normalized, old data will not
        # have usernames. Ignore older users which may have
        # already gotten reviews
        if user.username:
            userId = user.uid
            username = user.username
            user.username = None
            userReviewCount = 0
            offsetTotal = 0
            ratings = {}

            print('Processing ' + str(userId) + ': ' + username)
            # each response returns at most 25 reviews. To get more user
            # reviews, call again with an offset get at most 50 reviews
            # from the same user
            while userReviewCount < 2:
                print(username + ': ' + str(userReviewCount + 1))
                data = untappd.getUserReviewData(username, offsetTotal)
                offset = data['response']['beers']['count']
                offsetTotal += offset
                reviews = data['response']['beers']['items']
                for review in reviews:
                    userRating = review['rating_score']
                    if userRating > 0:
                        beerInfo = review['beer']
                        breweryInfo = review['brewery']
                        bid = str(beerInfo['bid'])
                        breweryId = str(breweryInfo['brewery_id'])
                        # NOTE(review): the dictionaries are keyed by
                        # hash(<id string>) here, while createDataPoints
                        # looks beers up by str(hash(bid)) - confirm the
                        # key type that files.read*() returns.
                        beerKey = hash(bid)
                        breweryKey = hash(breweryId)

                        # fill in beer information
                        if beerKey not in beersList:
                            style = unicode(
                                beerInfo['beer_style']).encode("utf-8")
                            # "a / b" style names become ['A', 'B']
                            stylesList = [
                                part.strip()
                                for part in style.lower().title().split('/')
                            ]
                            beerAttribs = {
                                'bid': bid,
                                'name': unicode(
                                    beerInfo['beer_name']).encode("utf-8"),
                                'label': beerInfo['beer_label'],
                                'abv': beerInfo['beer_abv'],
                                'ibu': beerInfo['beer_ibu'],
                                'style': stylesList,
                                'description': unicode(
                                    beerInfo['beer_description']
                                ).encode("utf-8"),
                                'rating': beerInfo['rating_score'],
                                'numRatings': 1,
                                'brewery': breweryId,
                            }
                            beer = UT.UntappdBeer(beerAttribs)
                            beersList[hash(beer.bid)] = beer
                        else:
                            # known beer: just count the extra rating
                            beersList[beerKey].numRatings += 1

                        # fill in brewery information
                        if breweryKey not in breweryList:
                            breweryAttribs = {
                                'breweryId': breweryId,
                                'name': unicode(
                                    breweryInfo['brewery_name']
                                ).encode("utf-8"),
                                'label': breweryInfo['brewery_label'],
                                'country': unicode(
                                    breweryInfo['country_name']
                                ).encode("utf-8"),
                                'location': unicode(
                                    breweryInfo['location']).encode("utf-8"),
                            }
                            brewery = UT.UntappdBrewery(breweryAttribs)
                            breweryList[hash(brewery.breweryId)] = brewery

                        # map brewery_id to a list of beers produced there
                        if breweryKey not in breweryToBeers:
                            # store the current beer in a list of beers of
                            # the brewery
                            breweryToBeers[breweryKey] = {breweryId: [bid]}
                        else:
                            # add current beer to brewery's list of beers,
                            # once per beer
                            breweryBeers = breweryToBeers[
                                breweryKey].setdefault(breweryId, [])
                            if bid not in breweryBeers:
                                breweryBeers.append(bid)

                        # add list of beer ratings to user
                        ratings[bid] = userRating
                userReviewCount += 1
                user.ratings = ratings

                # if the offset is less than 25, then there are no more
                # reviews to retrieve
                if offset < 25:
                    break

            # store the dictionaries after each user so that killing the
            # process loses at most the user currently being fetched
            writeJSONFile('../data/users.json', usersList)
            writeJSONFile('../data/beers.json', beersList)
            writeJSONFile('../data/breweries.json', breweryList)
            writeJSONFile('../data/breweryToBeers.json', breweryToBeers)

            total += len(ratings)
            print(str(userId) + ': ' + username + ', Processed: ' + str(
                len(ratings)) + ' reviews')
            print('Total Reviews: ' + str(total))
            print('Total Users Completed: ' + str(totalUsersComplete))
            # pause proportionally to the number of requests just made
            # (presumably to respect the API rate limit - confirm)
            sleep(37 * (userReviewCount))
        else:
            # already processed earlier: only count the stored ratings
            if user.ratings:
                total += len(user.ratings)
def userReviews():
    """Parse through user reviews /user/beers/{username}.

    Retrieves at most 50 reviews per user, retains review, beer, and
    brewery information. After querying the api, remove username to
    lessen privacy concerns with untappd data.

    Users whose username has already been cleared are not queried
    again; their stored ratings only count towards the running review
    total.  The users, beers, breweries and brewery-to-beers files are
    rewritten after every processed user.
    """
    usersList = files.readUsers()
    beersList = files.readBeers()
    breweryList = files.readBreweries()
    breweryToBeers = files.readBreweryToBeers()

    total = 0
    totalUsersComplete = 0
    for userHash, user in usersList.iteritems():
        totalUsersComplete += 1
        # if the data has been normalized, old data will not
        # have usernames. Ignore older users which may have
        # already gotten reviews
        if user.username:
            userId = user.uid
            username = user.username
            user.username = None
            userReviewCount = 0
            offsetTotal = 0
            ratings = {}

            print('Processing ' + str(userId) + ': ' + username)
            # each response returns at most 25 reviews. To get more user
            # reviews, call again with an offset get at most 50 reviews
            # from the same user
            while userReviewCount < 2:
                print(username + ': ' + str(userReviewCount + 1))
                data = untappd.getUserReviewData(username, offsetTotal)
                offset = data['response']['beers']['count']
                offsetTotal += offset
                reviews = data['response']['beers']['items']
                for review in reviews:
                    userRating = review['rating_score']
                    if userRating > 0:
                        beerInfo = review['beer']
                        breweryInfo = review['brewery']
                        bid = str(beerInfo['bid'])
                        breweryId = str(breweryInfo['brewery_id'])
                        # NOTE(review): the dictionaries are keyed by
                        # hash(<id string>) here, while createDataPoints
                        # looks beers up by str(hash(bid)) - confirm the
                        # key type that files.read*() returns.
                        beerKey = hash(bid)
                        breweryKey = hash(breweryId)

                        # fill in beer information
                        if beerKey not in beersList:
                            style = unicode(
                                beerInfo['beer_style']).encode("utf-8")
                            # "a / b" style names become ['A', 'B']
                            stylesList = [
                                part.strip()
                                for part in style.lower().title().split('/')
                            ]
                            beerAttribs = {
                                'bid': bid,
                                'name': unicode(
                                    beerInfo['beer_name']).encode("utf-8"),
                                'label': beerInfo['beer_label'],
                                'abv': beerInfo['beer_abv'],
                                'ibu': beerInfo['beer_ibu'],
                                'style': stylesList,
                                'description': unicode(
                                    beerInfo['beer_description']
                                ).encode("utf-8"),
                                'rating': beerInfo['rating_score'],
                                'numRatings': 1,
                                'brewery': breweryId,
                            }
                            beer = UT.UntappdBeer(beerAttribs)
                            beersList[hash(beer.bid)] = beer
                        else:
                            # known beer: just count the extra rating
                            beersList[beerKey].numRatings += 1

                        # fill in brewery information
                        if breweryKey not in breweryList:
                            breweryAttribs = {
                                'breweryId': breweryId,
                                'name': unicode(
                                    breweryInfo['brewery_name']
                                ).encode("utf-8"),
                                'label': breweryInfo['brewery_label'],
                                'country': unicode(
                                    breweryInfo['country_name']
                                ).encode("utf-8"),
                                'location': unicode(
                                    breweryInfo['location']).encode("utf-8"),
                            }
                            brewery = UT.UntappdBrewery(breweryAttribs)
                            breweryList[hash(brewery.breweryId)] = brewery

                        # map brewery_id to a list of beers produced there
                        if breweryKey not in breweryToBeers:
                            # store the current beer in a list of beers of
                            # the brewery
                            breweryToBeers[breweryKey] = {breweryId: [bid]}
                        else:
                            # add current beer to brewery's list of beers,
                            # once per beer
                            breweryBeers = breweryToBeers[
                                breweryKey].setdefault(breweryId, [])
                            if bid not in breweryBeers:
                                breweryBeers.append(bid)

                        # add list of beer ratings to user
                        ratings[bid] = userRating
                userReviewCount += 1
                user.ratings = ratings

                # if the offset is less than 25, then there are no more
                # reviews to retrieve
                if offset < 25:
                    break

            # store the dictionaries after each user so that killing the
            # process loses at most the user currently being fetched
            writeJSONFile('../data/users.json', usersList)
            writeJSONFile('../data/beers.json', beersList)
            writeJSONFile('../data/breweries.json', breweryList)
            writeJSONFile('../data/breweryToBeers.json', breweryToBeers)

            total += len(ratings)
            print(str(userId) + ': ' + username + ', Processed: ' +
                  str(len(ratings)) + ' reviews')
            print('Total Reviews: ' + str(total))
            print('Total Users Completed: ' + str(totalUsersComplete))
            # pause proportionally to the number of requests just made
            # (presumably to respect the API rate limit - confirm)
            sleep(37 * (userReviewCount))
        else:
            # already processed earlier: only count the stored ratings
            if user.ratings:
                total += len(user.ratings)