Example #1
    def execute(trial=False):
        startTime = datetime.datetime.now()

        client = dml.pymongo.MongoClient()
        repo = client.repo
        repo.authenticate('angelay_maulikjs', 'angelay_maulikjs')

        repo.dropPermanent('angelay_maulikjs')
        repo.createPermanent('angelay_maulikjs')

        data = repo.angelay_maulikjs.clean2012.find()
        D = []
        for document in data:
            d = dict(document)
            D.append([
                d['CarbonIntensity'], d['CO2Emissions'], d['EnergyIntensity'],
                d['EnergyUse'], d['GDPperCapita'], d['HDI'], d['Population']
            ])
        df = pd.DataFrame(D,
                          columns=[
                              'CarbonIntensity', 'CO2Emissions',
                              'EnergyIntensity', 'EnergyUse', 'GDPperCapita',
                              'HDI', 'Population'
                          ])

        axes = pd.plotting.scatter_matrix(df, alpha=1, figsize=(10, 10))
        plt.tight_layout()
        plt.savefig('angelay_maulikjs/statsmodel_correlations')
        print('\nCorrelation coefficients:\n')
        print(df.corr())
        print()
        print(df.describe())
        print()

        Independents = df[[
            'CarbonIntensity', 'EnergyIntensity', 'EnergyUse', 'GDPperCapita',
            'HDI', 'Population'
        ]]
        Dependent = df.CO2Emissions
        model = sm.OLS(Dependent, Independents)
        results = model.fit()
        print(results.summary())
        print()
        print(
            '\nThis is our linear least-squares model. It does not yield a good R-squared value. We are going to build our model by adding one variable at a time, starting with the variable that yields the highest R-squared value when fitted to a linear model against CO2 emissions. After doing some research, we found a theory called the Kaya Identity, which states that CO2 emissions are roughly equal to population * GDP per capita * energy intensity * carbon intensity. We will incorporate that into our model and see if adding energy use and HDI makes it better.\n'
        )
        ind = [
            'CarbonIntensity', 'EnergyIntensity', 'EnergyUse', 'GDPperCapita',
            'HDI', 'Population'
        ]
        for i in range(len(ind)):
            model = sm.OLS(Dependent, df[ind[i]])
            results = model.fit()
            print('\nCO2Emissions vs ' + ind[i] + '\n')
            print(results.summary())
        print(
            '\nLooks like population yields the highest R-squared value, and energy use comes next. Let\'s build a model with population and energy use and see if the R-squared value goes up.\n'
        )
        model = smf.ols(formula='CO2Emissions ~ Population * EnergyUse',
                        data=df)
        results = model.fit()
        print('\nCO2Emissions vs Population * EnergyUse\n')
        print(results.summary())
        print(
            '\nR-squared value went up to 0.954. Now adding Carbon Intensity.\n'
        )
        model = smf.ols(
            formula='CO2Emissions ~ Population * EnergyUse * CarbonIntensity',
            data=df)
        results = model.fit()
        print('\nCO2Emissions vs Population * EnergyUse * CarbonIntensity\n')
        print(results.summary())
        print(
            '\nR-squared value went up to 0.982. Now adding GDP per capita.\n')
        model = smf.ols(
            formula=
            'CO2Emissions ~ Population * EnergyUse * CarbonIntensity * GDPperCapita',
            data=df)
        results = model.fit()
        print(
            '\nCO2Emissions vs Population * EnergyUse * CarbonIntensity * GDPperCapita\n'
        )
        print(results.summary())
        print('\nR-squared value went up to 0.992. Now adding HDI.\n')
        model = smf.ols(
            formula=
            'CO2Emissions ~ Population * EnergyUse * CarbonIntensity * GDPperCapita * HDI',
            data=df)
        results = model.fit()
        print(
            '\nCO2Emissions vs Population * EnergyUse * CarbonIntensity * GDPperCapita * HDI\n'
        )
        print(results.summary())
        print(
            '\nR-squared value went up to 0.995. Now adding Energy Intensity.\n'
        )
        model = smf.ols(
            formula=
            'CO2Emissions ~ Population * EnergyUse * CarbonIntensity * GDPperCapita * HDI * EnergyIntensity',
            data=df)
        results = model.fit()
        print(
            '\nCO2Emissions vs Population * EnergyUse * CarbonIntensity * GDPperCapita * HDI * EnergyIntensity\n'
        )
        print(results.summary())
        #print(results.params)
        print(
            '\nR-squared value went up to 0.998. This is a really high R-squared value. We might be at risk of overfitting the data. Let\'s test our model on the 2013 data and see how we do.\n'
        )

        # getting the 2013 data
        data2 = repo.angelay_maulikjs.clean2013.find()
        D2 = []
        for document in data2:
            d = dict(document)
            D2.append([
                d['CarbonIntensity'], d['CO2Emissions'], d['EnergyIntensity'],
                d['EnergyUse'], d['GDPperCapita'], d['HDI'], d['Population']
            ])
        df2 = pd.DataFrame(D2,
                           columns=[
                               'CarbonIntensity', 'CO2Emissions',
                               'EnergyIntensity', 'EnergyUse', 'GDPperCapita',
                               'HDI', 'Population'
                           ])
        # Testing the model on 2013 data
        model = smf.ols(
            formula=
            'CO2Emissions ~ Population * EnergyUse * CarbonIntensity * GDPperCapita * HDI * EnergyIntensity',
            data=df)
        pred = model.fit().predict(df2)
        R2 = r2_score(df2.CO2Emissions, pred)
        print('\nResults on 2013 Data\n')
        fig, ax = plt.subplots()
        x2 = range(len(df2.index))
        ax.plot(x2, df2.CO2Emissions, 'o', label="Data")
        ax.plot(x2, pred, 'r', label="OLS prediction")
        ax.legend(loc="best")
        plt.savefig('angelay_maulikjs/statsmodel_results_without_outliers')

        print(
            '\nThe R-squared value is %f. This is a pretty good value, and it means that our model does well at predicting future values. Now let\'s train the model on all 2012 data with outliers and test it on all 2013 data with outliers.\n'
            % R2)

        # getting all 2012 data with outliers
        data3 = repo.angelay_maulikjs.all2012.find()
        D3 = []
        for document in data3:
            d = dict(document)
            D3.append([
                d['CarbonIntensity'], d['CO2Emissions'], d['EnergyIntensity'],
                d['EnergyUse'], d['GDPperCapita'], d['HDI'], d['Population']
            ])
        df3 = pd.DataFrame(D3,
                           columns=[
                               'CarbonIntensity', 'CO2Emissions',
                               'EnergyIntensity', 'EnergyUse', 'GDPperCapita',
                               'HDI', 'Population'
                           ])
        # Training the model on all 2012 data
        model = smf.ols(
            formula=
            'CO2Emissions ~ Population * EnergyUse * CarbonIntensity * GDPperCapita * HDI * EnergyIntensity',
            data=df3)
        results = model.fit()
        print(
            '\nCO2Emissions vs Population * EnergyUse * CarbonIntensity * GDPperCapita * HDI * EnergyIntensity\n'
        )
        print(results.summary())
        #print(results.params)

        print(
            '\nWe got an R-squared value of 1, but we might be overfitting the data. Let\'s test the model on all 2013 data to see how we do.\n'
        )

        # getting all 2013 data with outliers
        data4 = repo.angelay_maulikjs.all2013.find()
        D4 = []
        for document in data4:
            d = dict(document)
            D4.append([
                d['CarbonIntensity'], d['CO2Emissions'], d['EnergyIntensity'],
                d['EnergyUse'], d['GDPperCapita'], d['HDI'], d['Population']
            ])
        df4 = pd.DataFrame(D4,
                           columns=[
                               'CarbonIntensity', 'CO2Emissions',
                               'EnergyIntensity', 'EnergyUse', 'GDPperCapita',
                               'HDI', 'Population'
                           ])
        # Testing the model on all 2013 data
        pred = model.fit().predict(df4)
        R2 = r2_score(df4.CO2Emissions, pred)
        print('\nResults on All 2013 Data\n')
        fig, ax = plt.subplots()
        x1 = range(len(df4.index))
        ax.plot(x1, df4.CO2Emissions, 'o', label="Data")
        ax.plot(x1, pred, 'r', label="OLS prediction")
        ax.legend(loc="best")
        plt.savefig('angelay_maulikjs/statsmodel_results_with_outliers')

        print(
            '\nThe R-squared value is %f, even higher than the one we got from clean data without outliers. We can be pretty confident about our model being able to predict future values now.\n'
            % R2)
        endTime = datetime.datetime.now()
        return {"Start ": startTime, "End ": endTime}
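
The example above builds its model incrementally with statsmodels formula syntax (the `*` operator adds both main effects and the interaction term) and then scores the fitted model on the following year with `r2_score`. Below is a minimal, self-contained sketch of that fit-then-score pattern; the data, coefficients, and column values are synthetic stand-ins, not the project's MongoDB collections.

    import numpy as np
    import pandas as pd
    import statsmodels.formula.api as smf
    from sklearn.metrics import r2_score

    rng = np.random.default_rng(0)
    # Synthetic "2012" frame with made-up population and energy-use values.
    train = pd.DataFrame({'Population': rng.uniform(1e6, 1e8, 50),
                          'EnergyUse': rng.uniform(10, 500, 50)})
    train['CO2Emissions'] = 3e-7 * train.Population * train.EnergyUse + rng.normal(0, 5, 50)
    test = train.sample(frac=0.5, random_state=1)   # stand-in for the "next year" frame

    # 'Population * EnergyUse' expands to Population + EnergyUse + Population:EnergyUse.
    results = smf.ols('CO2Emissions ~ Population * EnergyUse', data=train).fit()
    pred = results.predict(test)                    # reuses the formula's design information
    print(results.rsquared, r2_score(test.CO2Emissions, pred))
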
Example #2
    def execute(trial=False):
        startTime = datetime.datetime.now()
        client = dml.pymongo.MongoClient()
        repo = client.repo
        repo.authenticate('nathansw_rooday_sbajwa_shreyap',
                          'nathansw_rooday_sbajwa_shreyap')

        # Read data from mongo
        mbta_db = repo['nathansw_rooday_sbajwa_shreyap.OTP_by_line']
        stops_db = repo['nathansw_rooday_sbajwa_shreyap.stops']
        stopsVsLines_db = repo['nathansw_rooday_sbajwa_shreyap.stopsVsLines']

        # Read data into pandas
        print("Loading OTP Data")
        otpData = mbta_db.find_one()
        del otpData['_id']
        otp_by_line = pd.DataFrame.from_dict(otpData)
        otp_by_line = otp_by_line.transpose()
        otp_by_line[otp_by_line['Peak Service'] == ''] = np.nan
        otp_by_line[otp_by_line['Off-Peak Service'] == ''] = np.nan

        print("Loading Stops Data")
        stopsData = stops_db.find_one()
        del stopsData['_id']
        stops = pd.DataFrame.from_dict(stopsData)

        print("Loading Stops By Line Data")
        stop_by_line_data = stopsVsLines_db.find_one()
        del stop_by_line_data['_id']
        stop_by_line = pd.DataFrame([(key, x)
                                     for key, val in stop_by_line_data.items()
                                     for x in val],
                                    columns=['Name', 'Values'])
        stop_by_line.columns = ['Route', 'Stop']
        stop_by_line = stop_by_line.set_index('Stop')

        print("Joining Stops By Line with Stops")
        stop_route_neighborhood = stop_by_line.join(stops.set_index('stop_id'),
                                                    how='left')
        stop_route_neighborhood = stop_route_neighborhood[
            stop_route_neighborhood['neighborhood'].notnull()]
        stop_route_neighborhood['stop_id'] = stop_route_neighborhood.index
        merged = pd.merge(otp_by_line,
                          stop_route_neighborhood,
                          left_index=True,
                          right_on='Route',
                          how='right')
        merged_stop = merged[merged['Peak Service'].notnull()]

        print("Creating dummy data")
        stop_dummy_city = pd.get_dummies(merged_stop['city'])
        stop_dummy_neighborhood = pd.get_dummies(merged_stop['neighborhood'])
        merged_dummy_city = merged_stop.join(stop_dummy_city)
        merged_dummy_city_final = merged_dummy_city.groupby('Route').max()
        x_cols = merged_dummy_city_final.columns[8:]
        y_cols = 'Off-Peak Service'

        print("Creating regression model")
        model = sm.GLM(merged_dummy_city_final[y_cols],
                       merged_dummy_city_final[x_cols],
                       family=sm.families.Gaussian())
        results = model.fit()
        resultsKeys = results.params.keys()

        coefficients = {}
        for key in resultsKeys:
            coefficients[key] = results.params[key]

        pp = pprint.PrettyPrinter(indent=4)
        pp.pprint(coefficients)

        print("Saving coefficients")

        repo.dropCollection('regressionAnalysis')
        repo.createCollection('regressionAnalysis')
        repo['nathansw_rooday_sbajwa_shreyap.regressionAnalysis'].insert_one(
            coefficients)

        print("Done!")
        repo.logout()
        endTime = datetime.datetime.now()
        return {"start": startTime, "end": endTime}
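
The regression step above one-hot encodes a categorical column with `pd.get_dummies` and regresses an on-time-performance measure on the resulting indicator columns with a Gaussian GLM. The following sketch shows just that step on a synthetic frame; the city names and values are made up and do not come from the MBTA collections used above.

    import numpy as np
    import pandas as pd
    import statsmodels.api as sm

    rng = np.random.default_rng(0)
    # Synthetic example frame; cities and service values are invented.
    df = pd.DataFrame({'city': rng.choice(['Boston', 'Cambridge', 'Newton'], 100),
                       'Off-Peak Service': rng.uniform(0.5, 1.0, 100)})
    dummies = pd.get_dummies(df['city']).astype(float)   # one indicator column per city
    model = sm.GLM(df['Off-Peak Service'], dummies, family=sm.families.Gaussian())
    print(model.fit().params)                            # one coefficient per city
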
Example #3
def doKmeans(X, nclust):
    model = KMeans(nclust)
    model.fit(X)
    clust_labels = model.predict(X)
    cent = model.cluster_centers_
    return (clust_labels, cent)
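
`doKmeans` appears to assume `KMeans` from scikit-learn and a numeric feature matrix `X`; neither is shown in the snippet, so both are assumptions here. A short usage sketch under those assumptions, with synthetic 2-D points:

    import numpy as np
    from sklearn.cluster import KMeans   # assumed source of KMeans in doKmeans

    X = np.random.RandomState(0).rand(200, 2)   # 200 synthetic 2-D points
    labels, centers = doKmeans(X, nclust=3)
    print(labels[:10])                           # a cluster index per point
    print(centers.shape)                         # (3, 2): one center per cluster
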
Example #4
    def execute(trial=False):
        startTime = datetime.datetime.now()

        client = dml.pymongo.MongoClient()
        repo = client.repo
        repo.authenticate('aditid_benli95_teayoon_tyao', 'aditid_benli95_teayoon_tyao')

        all_crime = repo.aditid_benli95_teayoon_tyao.crimesPerNumberOfEstablishment.find()
        x = []
        y = []
        print(all_crime)

        for crime in all_crime:
            crimeDict = dict(crime)
            x.append(crimeDict["_id"])
            y.append(crimeDict["value"]["crimes"])
        
        sumy = sum(y)
        
        all = []
        for i in range(0,len(x)):
            for j in range(0, int(y[i]/21.5)):          # 253075 / 11800 = 21.5 for normalization
                all.append(x[i])
        
        
        drug_crime = repo.aditid_benli95_teayoon_tyao.drugCrimesPerNumberOfEstablishment.find()
        a = []
        b = []
        for crime in drug_crime:
            crimeDict = dict(crime)
            a.append(crimeDict["_id"])
            b.append(crimeDict["value"]["crimes"])

        drug = []
        for i in range(0,len(a)):
            for j in range(0,int(b[i])):
                drug.append(a[i])

        bins = []
        for k in range(0,250,5):
            bins.append(k)


        pyplot.hist(all, bins, alpha=.5, label='All Crimes')
        pyplot.hist(drug, bins, alpha=.5, label='Drug Crimes')


        # plt.hist(all, bins, alpha=.7, color='blue')
        # plt.hist(drug, bins, alpha=.7, color='red')

        pyplot.xlabel("Establishments")
        pyplot.ylabel("Crimes")
        pyplot.legend(loc='upper left')
        pyplot.show()

        import statsmodels.api as sm
        model = sm.OLS(y, x)
        results2 = model.fit()
        print(results2.summary())
        print("Confidence Intervals:", results2.conf_int())
        print("Parameters:", results2.params)

        model = sm.OLS(b, a)
        results2 = model.fit()
        print(results2.summary())
        print("Confidence Intervals:", results2.conf_int())
        print("Parameters:", results2.params)

        endTime = datetime.datetime.now()
        return {"Start ":startTime, "End ":endTime}
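
Note that `sm.OLS(y, x)` as used above fits a regression through the origin: statsmodels does not add an intercept unless the design matrix contains a constant column. A minimal sketch of adding one with `sm.add_constant`, using synthetic `x` and `y` rather than the crime counts above:

    import numpy as np
    import statsmodels.api as sm

    # Synthetic data with a known intercept of 3 and slope of 2.
    x = np.arange(1, 51, dtype=float)
    y = 3.0 + 2.0 * x + np.random.RandomState(0).normal(0, 1, 50)

    results = sm.OLS(y, sm.add_constant(x)).fit()   # design matrix: [1, x]
    print(results.params)                           # approximately [3, 2]
    print(results.conf_int())                       # confidence interval per parameter
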
Example #5
    def execute(trial=False):
        '''Retrieve some data sets (not using the API here for the sake of simplicity).'''
        startTime = datetime.datetime.now()

        # Set up the database connection.
        client = dml.pymongo.MongoClient()
        repo = client.repo
        repo.authenticate('liweixi_mogujzhu', 'liweixi_mogujzhu')
        repo.dropCollection("prediction_weather_incident")
        repo.createCollection("prediction_weather_incident")
        # Create the training data and target
        data_name = 'liweixi_mogujzhu.weather_fire_incident_transformation'
        data = pd.DataFrame(list(repo[data_name].find()))
        print(data.shape)
        # If trial mode, use half of the data for training
        if trial:
            data = data[:data.shape[0] // 2]
        data['LSCORE'] = data['NINCIDENT']
        data['TDIFF'] = data["TMAX"] - data["TMIN"]
        X = data[["TAVG", "TDIFF", "PRCP", "SNOW", "AWND"]]
        y = data["LSCORE"].astype(float)
        # Scale the data to range [0,1]
        min_max_scaler = MinMaxScaler()
        x_scaled = numpy.array(min_max_scaler.fit_transform(X.values))
        y_scaled_value = numpy.array(
            min_max_scaler.fit_transform(y.values.reshape(-1, 1)))
        kbd = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='quantile')
        y_scaled = kbd.fit_transform(y_scaled_value)
        # Shuffle the data and create the training set and testing set
        X_shuffled, y_shuffled, y_shuffled_scaled_value = shuffle(
            x_scaled, y_scaled, y_scaled_value)
        X_train = X_shuffled[:int(X.shape[0] * 0.8)]
        y_train = y_shuffled[:int(X.shape[0] * 0.8)].ravel()
        y_train_value = y_shuffled_scaled_value[:int(X.shape[0] * 0.8)].ravel()
        X_test = X_shuffled[int(X.shape[0] * 0.8):]
        y_test = y_shuffled[int(X.shape[0] * 0.8):].ravel()
        y_test_value = y_shuffled_scaled_value[int(X.shape[0] * 0.8):].ravel()
        # Set up the classifiers. We use 7 different classifiers in this case.
        classifiers = [
            linear_model.SGDClassifier(),
            linear_model.LogisticRegression(),
            svm.SVC(),
            ensemble.AdaBoostClassifier(),
            ensemble.BaggingClassifier(),
            ensemble.RandomForestClassifier(),
            ensemble.GradientBoostingClassifier()
        ]
        for item in classifiers:
            print(item)
            clf = item
            clf.fit(X_train, y_train)
            print("Training accuracy:", clf.score(X_train, y_train),
                  "Base: 0.33")
            print("Testing accuracy:", clf.score(X_test, y_test), "Base: 0.33")

        insert_data = pd.DataFrame()
        model = svm.SVC(probability=True)
        model.fit(X_train, y_train)
        print("Final Classifer", model)
        pred = model.predict_proba(X_test)
        pred_label = model.predict(X_test)
        print("Accuracy", model.score(X_test, y_test))
        insert_data["LOW_PROB"] = pred[:, 0]
        insert_data["MID_PROB"] = pred[:, 1]
        insert_data["HIGH_PROB"] = pred[:, 2]
        insert_data["PRED_LABEL"] = pd.DataFrame(pred_label).replace(
            0.0, "LOW").replace(1.0, "MID").replace(2.0, "HIGH")
        insert_data["TRUE_LABEL"] = pd.DataFrame(y_test).replace(
            0.0, "LOW").replace(1.0, "MID").replace(2.0, "HIGH")
        insert_data["TRUE_VALUE"] = y_test_value
        print(insert_data)
        repo['liweixi_mogujzhu.prediction_weather_incident'].insert_many(
            insert_data.to_dict('records'))
        repo['liweixi_mogujzhu.prediction_weather_incident'].metadata(
            {'complete': True})
        print(repo['liweixi_mogujzhu.prediction_weather_incident'].metadata())
        repo.logout()
        endTime = datetime.datetime.now()
        return {"start": startTime, "end": endTime}
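
The target preparation above scales the incident counts to [0, 1] and then cuts them into three quantile bins, which is what turns a continuous score into the ordinal classes (later labeled LOW/MID/HIGH) that the classifiers are trained on. A minimal sketch of that step with synthetic values; the distribution and scale are made up:

    import numpy as np
    from sklearn.preprocessing import MinMaxScaler, KBinsDiscretizer

    y = np.random.RandomState(0).exponential(10, 100).reshape(-1, 1)   # synthetic incident-like counts
    y_scaled = MinMaxScaler().fit_transform(y)                         # values in [0, 1]
    kbd = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='quantile')
    labels = kbd.fit_transform(y_scaled).ravel()                       # 0.0, 1.0, or 2.0
    print(np.bincount(labels.astype(int)))                             # roughly equal-sized bins
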
Example #6
    def execute(trial=False):
        '''Retrieves our data sets from Boston Open Data using specific URLs.
        Creates the necessary pymongo collections within our repo database.'''

        startTime = datetime.datetime.now()

        # Set up the database connection.
        client = dml.pymongo.MongoClient()
        repo = client.repo
        repo.authenticate('esaracin', 'esaracin')

        # Support for Trial mode:
        if trial:
            # Skip all but 1 percent of each collection
            dataset = repo['esaracin.crime_incidents'].find().skip(
                repo['esaracin.crime_incidents'].count() - 2393)
            datasetFIO = repo['esaracin.fio_data'].find().skip(
                repo['esaracin.fio_data'].count() - 1522)
        else:

            dataset = repo['esaracin.crime_incidents'].find()
            datasetFIO = repo['esaracin.fio_data'].find()

        df_crime = pd.DataFrame(list(dataset))
        df_fios = pd.DataFrame(list(datasetFIO))

        url = 'http://datamechanics.io/data/district_racial_composition.csv'
        df_race = pd.read_csv(url)

        # Now we need to find the number of crimes/fios for each Policing
        # District.
        crime_by_district = {dist: 0 for dist in df_crime['DISTRICT'].unique()}
        del crime_by_district['A15']
        del crime_by_district['A1']
        crime_by_district['A1/A15'] = 0

        for index, row in df_crime.iterrows():
            if row['DISTRICT'] == 'A1' or row['DISTRICT'] == 'A15':
                crime_by_district['A1/A15'] += 1
            else:
                crime_by_district[row['DISTRICT']] += 1

        # Replace former crime DataFrame with this new, filtered, data.
        # Use it to join df_race on the District field.
        df_crime = pd.DataFrame.from_dict(crime_by_district, orient='index')
        df_race = df_race.join(df_crime, on='dist')
        new_columns = df_race.columns.values
        new_columns[-1] = 'Crime Count'
        df_race.columns = new_columns

        # Similarly compute the number of FIOs in each District.
        # Join this with our growing df_race table as well.
        fios_by_district = {dist: 0 for dist in df_race['dist']}
        for index, row in df_fios.iterrows():
            if (row['DIST'] in fios_by_district):
                fios_by_district[row['DIST']] += 1
            elif (row['DIST'] == 'A1' or row['DIST'] == 'A15'):
                fios_by_district['A1/A15'] += 1

        df_fios = pd.DataFrame.from_dict(fios_by_district, orient='index')
        df_race = df_race.join(df_fios, on='dist')
        new_columns = df_race.columns.values
        new_columns[-1] = 'FIO Count'
        df_race.columns = new_columns

        # Normalize crime count and FIO count by population of district.
        for index, row in df_race.iterrows():
            df_race.loc[index, 'Crime Count'] /= row['population']
            df_race.loc[index, 'FIO Count'] /= row['population']

        # Now drop the categorical data before the regression
        to_insert = df_race.to_json(orient='records')  # Save to insert later

        districts = df_race['dist']
        df_race = df_race.drop('dist', axis=1).drop('dist_name', axis=1)
        df_race = df_race.drop('population', axis=1)

        # Run regression with the number of crimes in each area as the output
        # attribute.
        y_train = df_race['FIO Count']
        X_train = df_race.drop('FIO Count', axis=1)

        model = sm.OLS(y_train, X_train)
        results = model.fit()

        outfile = open('Linear_Reg_Results.txt', 'w')
        print(results.summary(), file=outfile)
        outfile.close()

        # Insert our race dataset.
        r = json.loads(to_insert)

        repo.dropCollection("race_data")
        repo.createCollection("race_data")
        repo['esaracin.race_data'].insert_many(r)
        repo['esaracin.race_data'].metadata({'complete': True})
        print(repo['esaracin.race_data'].metadata())

        repo.logout()

        endTime = datetime.datetime.now()

        return {"start": startTime, "end": endTime}
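
The per-district aggregation above is done twice: a `{district: count}` dict is turned into a one-column DataFrame with `from_dict(orient='index')`, joined onto the racial-composition table by its `dist` column, and the appended column is renamed and normalized by population. A minimal sketch of that pattern with made-up districts, counts, and populations:

    import pandas as pd

    counts = {'A1/A15': 120, 'B2': 300, 'C6': 85}                  # hypothetical counts
    df_race = pd.DataFrame({'dist': ['A1/A15', 'B2', 'C6'],
                            'population': [54000, 78000, 61000]})  # hypothetical populations

    df_counts = pd.DataFrame.from_dict(counts, orient='index')     # index = district, column = 0
    df_race = df_race.join(df_counts, on='dist')
    df_race = df_race.rename(columns={0: 'Crime Count'})
    df_race['Crime Count'] /= df_race['population']                 # normalize by district size
    print(df_race)
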