示例#1
0
def enrichAllRedditPlaces():
    with open(utils.getFullPathFromDataFileName('places.json')) as data_file:
        places = json.load(data_file)
        placeCoordinateDictionary = {}
        for place in places:
            cityName = place["name"]
            cityProperName = place["properName"]
            coordinates = place["coordinates"]
            placeCoordinateDictionary[cityName] = str(coordinates)

        print ''
        print ''
        print 'About to enrich all reddit data'

        fileName = 'reddit/allRedditComments'

        # Enrich with weather data
        print 'Enriching with Weather Data'
        augmentWeather.enrichWithWeather(fileName, placeCoordinateDictionary)

        # Enrich with sentiment
        print 'Enriching with Sentiment'
        sentiment.enrichWithSentiment(fileName)

        # Clean data
        print 'Cleaning Data'
        clean.clean(fileName)

        # Group Data
        print 'Grouping Data'
        groupDataByHour(fileName)
示例#2
0
def enrichAllPlaces():
    with open(utils.getFullPathFromDataFileName('places.json')) as data_file:
        places = json.load(data_file)

        for place in places:
            cityName = place["name"]
            cityProperName = place["properName"]
            coordinates = place["coordinates"]

            print ''
            print ''
            print 'About to enrich: ', cityProperName
            if cityDataExists(cityName):
                print 'City Data found for: ', cityProperName

                # # Enrich with weather data
                # print 'Enriching with Weather Data'
                # augmentWeather.enrichWithWeather(cityName, coordinates)

                # # Enrich with sentiment
                # print 'Enriching with Sentiment'
                # sentiment.enrichWithSentiment(cityName)
                #
                # # Clean data
                # print 'Cleaning Data'
                # clean.clean(cityName)

                # Group Data
                print 'Grouping Data'
                groupDataByHour(cityName)

            else:
                print 'No data file found for: ', cityProperName
def retrieveJsonData(jsonFileNames):
    """Load every named JSON data file and concatenate the contents.

    Each name is resolved through ``utils.getFullPathFromDataFileName``; the
    parsed content of each file is appended, element by element, to a single
    list which is returned.
    """
    combined = []
    for name in jsonFileNames:
        with open(utils.getFullPathFromDataFileName(name)) as handle:
            combined += json.load(handle)
    return combined
示例#4
0
def enrichWithSentiment(cityName):
    inputFilePath = utils.getFullPathFromDataFileName(cityName +
                                                      '_weather.json')
    outputFilePath = utils.getFullPathFromDataFileName(
        cityName + '_weather_sentiment.json')

    if 'Reddit' in cityName and sys.getdefaultencoding() != 'utf-8':
        reload(sys)
        sys.setdefaultencoding('utf-8')

    count = 0
    with open(inputFilePath) as data_file:
        data = json.load(data_file)
        print 'Adding sentiments to tweet list of length: ', len(data)
        for tweet in data:
            if count % 100000 == 0:
                print "Adding sentiment data: ", count
            count = count + 1
            tweet_body = tweet['body']
            if 'Reddit' in cityName:
                if type(tweet_body) == unicode:
                    tweet_body = tweet_body.encode('utf-8')
                    tweet_body = tweet_body.strip()
                else:
                    tweet_body = str(tweet_body)
            sent_score = 0
            try:
                sent_score = getSentScore(tweet_body)
#                if tweet_body:
#                    tweet_word = tweet_body.lower().split()
#                    # print tweet_word
#                    for word in tweet_word:
#                        word = word.rstrip('?:!.,;"!@')
#                        word = word.replace("\n", "")
#                        if word in sentScores:
#                            # print word
#                            sent_score = sent_score + float(sentScores[word])
            except Exception, (e):
                print str(e)
            tweet['sentiment'] = sent_score
        print 'Saving file to ', outputFilePath
        with open(outputFilePath, 'w') as outfile:
            json.dump(data, outfile)
        print 'File saved to ', outputFilePath
示例#5
0
def getPresavedWeatherData(gps):
    """Return the saved weather data for ``gps``, or an empty dict.

    The data is looked up in ``weather/weatherData_<gps>.json``; when that
    file does not exist an empty dict is returned instead.
    """
    cachedFileName = 'weather/weatherData_' + gps + '.json'
    cachedPath = utils.getFullPathFromDataFileName(cachedFileName)
    if not os.path.isfile(cachedPath):
        return {}
    with open(cachedPath) as cachedFile:
        return json.load(cachedFile)
示例#6
0
def clean(cityName):
    inputPath = utils.getFullPathFromDataFileName(cityName + '_weather_sentiment.json')
    outputPath = utils.getFullPathFromDataFileName(cityName + '_weather_sentiment_clean.json')
    with open(inputPath) as data_file:
        dataEntries = json.load(data_file)

        count = 0
        cleanData = []
        for dataEntry in dataEntries:
            if count % 100000 == 0:
                print "Cleaning data -- count: ", count
            count = count + 1

            if hasNonNeutralSentiment(dataEntry) and hasWeatherData(dataEntry):
                addCreatedField(dataEntry)
                fixWeatherData(dataEntry)
                addSentimentLabel(dataEntry)
                cleanData.append(dataEntry)

    print 'Saving file: ', outputPath
    with open(outputPath, 'w') as outfile:
        json.dump(cleanData, outfile)
    print 'Saved file: ', outputPath
示例#7
0
def countGroupData(cityName):
    inputPath = utils.getFullPathFromDataFileName(
        cityName + '_weather_sentiment_clean_grouped.json')
    with open(inputPath) as data_file:
        dataEntries = json.load(data_file)
        for data in dataEntries:
            if data['sentiment_average'] < -1 or data['sentiment_average'] > 1:
                print 'average is bad'
                print data
            if data['sentiment_percent_positive'] < 0 or data[
                    'sentiment_percent_positive'] > 1:
                print 'percent is bad:'
                print data

        return len(dataEntries)
示例#8
0
def printClassVariables():
    """Merge the grouped data of the listed cities into ``csv/full_data.csv``.

    The grouped JSON files are loaded through ``retrieveJsonData``, turned
    into a pandas DataFrame and written out as CSV.
    """
    groupedSuffix = '_weather_sentiment_clean_grouped.json'
    cities = (
        'chicago',
        'denver',
        'detroit',
        'houston',
        'manhattan',
        'phoenix',
        'sanFrancisco',
        'seattle',
    )
    jsonFileNames = [city + groupedSuffix for city in cities]

    frame = pd.DataFrame(retrieveJsonData(jsonFileNames))
    csvPath = utils.getFullPathFromDataFileName('csv/full_data.csv')
    frame.to_csv(csvPath)
示例#9
0
def countAllPlaces():
    with open(utils.getFullPathFromDataFileName('places.json')) as data_file:
        places = json.load(data_file)
        sum = 0
        placeCounts = []
        for place in places:
            cityName = place["name"]
            cityProperName = place["properName"]

            if cityDataExists(cityName):
                count = countGroupData(cityName)
                placeCounts.append((cityProperName, count))
                sum = sum + count
            else:
                print 'No data file found for: ', cityProperName
        placeCountsSorted = sorted(((v, k) for k, v in placeCounts),
                                   reverse=True)
        for key, value in placeCountsSorted:
            print value + ': ' + str(key)
        print sum
示例#10
0
def countRedditData():
    # inputPath = utils.getFullPathFromDataFileName('reddit/allRedditComments.json')
    # inputPath = utils.getFullPathFromDataFileName('reddit/allRedditComments_weather.json')
    # inputPath = utils.getFullPathFromDataFileName('reddit/allRedditComments_weather_sentiment.json')
    # inputPath = utils.getFullPathFromDataFileName('reddit/allRedditComments_weather_sentiment_clean.json')
    inputPath = utils.getFullPathFromDataFileName(
        'reddit/allRedditComments_weather_sentiment_clean_grouped.json')
    dataEntries = json.load(open(inputPath))
    counts = {}
    # cityKey = 'city'
    cityKey = 'location'
    for dataEntry in dataEntries:
        city = dataEntry[cityKey]
        if city not in counts:
            counts[city] = {}
            counts[city]['count'] = 1
            counts[city]['weatherCount'] = 1
        else:
            counts[city]['count'] = counts[city]['count'] + 1
            if 'temperature' in dataEntry:
                counts[city]['weatherCount'] = counts[city]['weatherCount'] + 1
    print counts
    print len(dataEntries)
    return len(dataEntries)
示例#11
0
def cityDataExists(cityName):
    """Return True when ``<cityName>.json`` exists as a file in the data directory."""
    return os.path.isfile(
        utils.getFullPathFromDataFileName(cityName + '.json'))
示例#12
0
def countData(cityName):
    """Return the number of top-level entries stored in ``<cityName>.json``."""
    dataPath = utils.getFullPathFromDataFileName(cityName + '.json')
    with open(dataPath) as data_file:
        return len(json.load(data_file))
示例#13
0
def groupDataByHour(cityName):
    inputPath = utils.getFullPathFromDataFileName(
        cityName + '_weather_sentiment_clean.json')
    outputPath = utils.getFullPathFromDataFileName(
        cityName + '_weather_sentiment_clean_grouped.json')
    print 'Opening data from: ', inputPath
    with open(inputPath) as data_file:
        dataEntries = json.load(data_file)
        print 'Data of length: ', len(dataEntries)

        count = 0
        groupedData = {}
        for dataEntry in dataEntries:
            if count % 100000 == 0:
                print "Grouping data -- count: ", count
            count = count + 1

            sentiment = dataEntry['sentiment']
            sentimentScore = float(dataEntry['sentimentScore'])
            if 'location' in dataEntry:
                location = dataEntry['location']
            elif 'city' in dataEntry:
                location = dataEntry['city']
            timeHour = time.strftime('%Y-%m-%d %H',
                                     time.localtime(dataEntry['created']))
            groupKey = makeUniqueKey(timeHour, location)
            if groupKey in groupedData:
                if groupedData[groupKey]['temperature'] != dataEntry[
                        'temperature']:
                    print ''
                    print 'This data entry is wrong'
                    print dataEntry

                oldCount = groupedData[groupKey]['num_data']
                newCount = oldCount + 1.0

                oldSentimentAverageScore = groupedData[groupKey][
                    'sentiment_percent_positive']
                newSentimentAverageScore = (
                    (oldSentimentAverageScore * oldCount) +
                    sentimentScore) / newCount
                groupedData[groupKey][
                    'sentiment_percent_positive'] = newSentimentAverageScore

                oldSentimentAverage = groupedData[groupKey][
                    'sentiment_average']
                newSentimentAverage = (
                    (oldSentimentAverage * oldCount) + sentiment) / newCount
                groupedData[groupKey][
                    'sentiment_average'] = newSentimentAverage

                groupedData[groupKey]['num_data'] = newCount

            else:
                weatherColumnNames = [
                    'cloudCover',
                    'temperature',
                    'dewPoint',
                    'visibility',
                    'apparentTemperature',
                    'pressure',
                    'precipIntensity',
                    'precipTypeNone',
                    'precipTypeRain',
                    'precipTypeSnow',
                    'humidity',
                    'windSpeed',
                    'precipProbability',
                    'precipType',
                    'icon',
                ]
                newDataEntry = {
                    'timeHour': timeHour,
                    'created': dataEntry['created'],
                    'time': dataEntry['time'],
                    'sentiment_average': sentiment,
                    'sentiment_percent_positive': sentimentScore,
                    'num_data': 1.0
                }
                if 'location' in dataEntry:
                    newDataEntry['location'] = location
                elif 'city' in dataEntry:
                    newDataEntry['location'] = location
                for weatherColumn in weatherColumnNames:
                    newDataEntry[weatherColumn] = dataEntry[weatherColumn]
                groupedData[groupKey] = newDataEntry

        print 'Saving file: ', outputPath
        print '# values: ', str(len(groupedData))
        with open(outputPath, 'w') as outfile:
            groupedDataValues = groupedData.values()
            json.dump(groupedDataValues, outfile)

        print 'Saved file: ', outputPath
示例#14
0
def enrichWithWeather(location_name, coordinates):
    actualCityNameMap = {
        'chicago': 'chicago',
        'asburypark': 'asburyPark',
        'denver': 'denver',
        'detroit': 'detroit',
        'houston': 'houston',
        'nyc': 'manhattan',
        'phoenix': 'phoenix',
        'sanfrancisco': 'sanFrancisco',
        'san francisco': 'sanFrancisco',
        'seattle': 'seattle',
        'manhattan': 'manhattan'
    }
    locationWeatherDictionary = {}
    print 'Getting weather data'
    if type(coordinates) == unicode:
        # weather = None
        weather = getNewWeather.getWeatherForCoordinates(coordinates)
        locationWeatherDictionary[location_name] = weather
    else:
        for place, coordinate in coordinates.iteritems():
            weather = getNewWeather.getWeatherForCoordinates(coordinate)
            # weather = None
            locationWeatherDictionary[place] = weather

    dataFilePath = utils.getFullPathFromDataFileName(location_name + '.json')
    with open(dataFilePath) as data_file:
        jsonData = json.load(data_file)
        print 'Adding weather to data of length = ' + str(len(jsonData))

        count = 0
        for dataObject in jsonData:
            if count % 100000 == 0:
                print "Adding weather data: ", count
            count = count + 1
            if 'created' in dataObject:
                datetime = dataObject['created']
                city_ = str(dataObject['city'].lower().strip())
            else:
                datetime = dataObject['created_at']['$date']
                if 'location' not in dataObject:
                    dataObject['location'] = location_name
                city_ = str(dataObject['location'].lower().strip())
            # try:
            place = actualCityNameMap[city_]
            weather = locationWeatherDictionary[place]
            tweetWeather = getWeatherAtDatetime(datetime, locationWeatherDictionary[place])

            dataObject.update(tweetWeather)
            # except:
            #     print city_

        outputPath = utils.getFullPathFromDataFileName(location_name + '_weather.json')
        print 'Saving file: ', outputPath
        with open(outputPath, 'w') as outfile:
            json.dump(jsonData, outfile)

    print 'Saved file: ', outputPath

    return outputPath
示例#15
0
def savePresavedWeatherData(gps, json_data):
    """Write ``json_data`` as JSON to ``weather/weatherData_<gps>.json``."""
    cachedFileName = 'weather/weatherData_' + gps + '.json'
    cachedPath = utils.getFullPathFromDataFileName(cachedFileName)
    with open(cachedPath, 'w') as cachedFile:
        json.dump(json_data, cachedFile)