def main(): if len(sys.argv) < 3: try: start_date = datetime(2013,5,26,16) #load current model file_name = "nn_current_model" fileObject = open(file_name,'rb') except: raise Exception("Please look at the variables defined in the script. The date format in '%Y-%m-%d %H:%M:%S', or the right name for the saved model are incorrect") else: try: start_date = time.strptime(sys.argv[2], "%Y-%m-%d %H:%M:%S") #load current model file_name = datetime(sys.argv[2]) fileObject = open(file_name,'rb') except: raise Exception("Please enter the date format in '%Y-%m-%d %H:%M:%S', or the right name for the saved model") print("Using {}, for date: {}".format(file_name, start_date)) pipeline = pickle.load(fileObject) #get data for model input sql_string = 'select date, location_name, if(WEEKDAY(date)<5, true, false) AS weekdays, WEEKDAY(date) AS dayoftheweek, co from Samples where user_id=2 and date="{0}" and (location_name="Prospect" or location_name="Rozelle" or location_name="Liverpool" or location_name="Chullora") order by location_name;'.format(start_date) fixed_samples_data = data_from_db(sql_string, exit_on_zero=False, verbose=False) try: #assert that more than 4 stations need to be returned #sometimes 8 rows are returned (duplicate records..) assert fixed_samples_data is not None and len(fixed_samples_data) >= 4 except AssertionError: #print("Assertion on number of rows returned failed") raise Exception("No rows on {0}\n".format(start_date)); try: specific_hour = start_date.hour if use_hour_simplification_feature: hour_feature = classify_hour(specific_hour) else: hour_feature = specific_hour FIXED_LOCATIONS = ['Chullora', 'Liverpool', 'Prospect', 'Rozelle'] mean_fixed = np.nanmean([fixed_samples_data[fixed_samples_data.location_name==location]['co'].iloc[0] for location in FIXED_LOCATIONS]) co_chullora = fixed_samples_data[fixed_samples_data.location_name=='Chullora']['co'].iloc[0] if not np.isnan(fixed_samples_data[fixed_samples_data.location_name=='Chullora']['co'].iloc[0]) else mean_fixed co_liverpool = fixed_samples_data[fixed_samples_data.location_name=='Liverpool']['co'].iloc[0] if not np.isnan(fixed_samples_data[fixed_samples_data.location_name=='Liverpool']['co'].iloc[0]) else mean_fixed co_prospect = fixed_samples_data[fixed_samples_data.location_name=='Prospect']['co'].iloc[0] if not np.isnan(fixed_samples_data[fixed_samples_data.location_name=='Prospect']['co'].iloc[0]) else mean_fixed co_rozelle = fixed_samples_data[fixed_samples_data.location_name=='Rozelle']['co'].iloc[0] if not np.isnan(fixed_samples_data[fixed_samples_data.location_name=='Rozelle']['co'].iloc[0]) else mean_fixed #prepare data to be inserted into svm_estimates table data = [fixed_samples_data['weekdays'].iloc[0], hour_feature, get_season(start_date), 0, 0, co_liverpool, co_prospect, co_chullora, co_rozelle] except Exception, ex: raise Exception("Error on {}; SQL Statement is {}; Error is str({})\n".format(start_date, sql_string, str(ex)));
def main(): # Open database connection db = MySQLdb.connect("localhost","pollution","pollution","pollution_monitoring" ) # prepare a cursor object using cursor() method cursor = db.cursor() #start_date = datetime(2015,5,1) start_date = datetime(2015,5,17) end_date = datetime(2016,5,2) #log file log_file_name = "populate_SVM_estimates_{0}_log.txt".format(datetime.now().strftime("%Y-%m-%d %H")) logObject = open(log_file_name,'w') logObject.write("Start date: {0}, End date: {1}\n".format(start_date, end_date)); #load current model file_name = "svm_current_model" fileObject = open(file_name,'rb') pipeline = pickle.load(fileObject) #variable to store the number of rows committed to the db row_count = 0 #TODO: Make this set difference work #precheck to make the query go faster sql_string = """select distinct datetime from {0}; """.format(svm_estimates_table) cursor.execute(sql_string) inserted_datetimes = cursor.fetchall() total_hours = (end_date - start_date).days*24 total_datetimes = {start_date + timedelta(seconds=i*3600) for i in xrange(total_hours)} remaining_datetimes = sorted(list(total_datetimes - set(inserted_datetimes))) #iterate through all the hours for start_date in remaining_datetimes: #precheck for date and hour in svm_estimates tables if start_date in inserted_datetimes or start_date.hour < 8: continue #get data for model input sql_string = 'select date, location_name, if(WEEKDAY(date)<5, true, false) AS weekdays, WEEKDAY(date) AS dayoftheweek, co from Samples where user_id=2 and date="{0}" and (location_name="Prospect" or location_name="Rozelle" or location_name="Liverpool" or location_name="Chullora") order by location_name;'.format(start_date) fixed_samples_data = data_from_db(sql_string, exit_on_zero=False) try: #assert that more than 4 stations need to be returned #sometimes 8 rows are returned (duplicate records..) assert fixed_samples_data is not None and len(fixed_samples_data) >= 4 except AssertionError as aex: #print("Assertion on number of rows returned failed") logObject.write("No rows on {0}\n".format(start_date)); continue #pdb.set_trace() try: specific_hour = start_date.hour if use_hour_simplification_feature: hour_feature = classify_hour(specific_hour) else: hour_feature = specific_hour FIXED_LOCATIONS = ['Chullora', 'Liverpool', 'Prospect', 'Rozelle'] mean_fixed = np.nanmean([fixed_samples_data[fixed_samples_data.location_name==location]['co'].iloc[0] for location in FIXED_LOCATIONS]) co_chullora = fixed_samples_data[fixed_samples_data.location_name=='Chullora']['co'].iloc[0] if not np.isnan(fixed_samples_data[fixed_samples_data.location_name=='Chullora']['co'].iloc[0]) else mean_fixed co_liverpool = fixed_samples_data[fixed_samples_data.location_name=='Liverpool']['co'].iloc[0] if not np.isnan(fixed_samples_data[fixed_samples_data.location_name=='Liverpool']['co'].iloc[0]) else mean_fixed co_prospect = fixed_samples_data[fixed_samples_data.location_name=='Prospect']['co'].iloc[0] if not np.isnan(fixed_samples_data[fixed_samples_data.location_name=='Prospect']['co'].iloc[0]) else mean_fixed co_rozelle = fixed_samples_data[fixed_samples_data.location_name=='Rozelle']['co'].iloc[0] if not np.isnan(fixed_samples_data[fixed_samples_data.location_name=='Rozelle']['co'].iloc[0]) else mean_fixed #prepare data to be inserted into svm_estimates table data = [fixed_samples_data['weekdays'].iloc[0], hour_feature, get_season(start_date), 0, 0, co_liverpool, co_prospect, co_chullora, co_rozelle] except Exception, ex: logObject.write("Error on {}; SQL Statement is {}; Error is str({})\n".format(start_date, sql_string, str(ex))); continue #print X_train X = np.float64([data]) #go through 100x100 grid pixels for i in xrange(100): for j in xrange(100): X[0][3] = i X[0][4] = j y_val = pipeline.predict(X)[0] insert_data = ['"{0}"'.format(start_date), '"{0}"'.format(start_date.date()), start_date.hour, fixed_samples_data['weekdays'].iloc[0], fixed_samples_data['dayoftheweek'].iloc[0], get_season(start_date), i, j, co_liverpool, co_prospect, co_chullora, co_rozelle, 5.7464*y_val+3.48652] insert_str = """insert ignore into {0} (datetime, date, time, weekdays, dayoftheweek, season, grid_location_row, grid_location_col, co_chullora, co_liverpool, co_prospect, co_rozelle, co_original) values ({1}); """.format(svm_estimates_table, ','.join([ str(x) for x in insert_data])) #print insert_str try: cursor.execute(insert_str) except: print insert_str pdb.set_trace() #commit the db every 10000 rows db.commit() row_count += 10000 print("Committed date: {0} with row count: {1}".format(start_date, row_count))
def zero_mean_analysis(): """ Function to generate data for zero mean analysis. The result return the mean, std and non zero grid count for each hour. When using this, pipe the output out to a file. e.g. python train_model.py > file.txt """ #start data and end data from populate estimates script start_date = datetime(2013,3,1) end_date = datetime(2015,11,1) # Open database connection db = MySQLdb.connect("localhost","pollution","pollution","pollution_monitoring" ) # prepare a cursor object using cursor() method cursor = db.cursor() sql_str = """ INSERT IGNORE INTO CV_values (datetime, date, time, weekdays, dayoftheweek, mean_fixed) SELECT date as datetime, DATE_FORMAT(date,"%Y-%m-%d") AS date, DATE_FORMAT(date,"%H") as time, if(WEEKDAY(date)<5, true, false) AS weekdays, WEEKDAY(date) AS dayoftheweek, avg(co) as mean_fixed FROM Samples WHERE user_id = 2 and date between "{0}" AND "{1}" AND co IS NOT NULL AND latitude is not null and longitude is not null AND (latitude <= {2} AND latitude >= {3}) AND (longitude >= {4} AND longitude <= {5}) GROUP BY date ORDER BY date asc """.format(start_date, end_date, NW_BOUND[0], SW_BOUND[0], NW_BOUND[1], NE_BOUND[1]) cursor.execute(sql_str) db.close() #retrieve and group the data by datetime (hour) sql_string = """select datetime, date, time, dayoftheweek, grid_location, co from {0} order by datetime asc, grid_location asc; """.format(data_table) #print(sql_string) df_mysql = data_from_db(sql_string) grouped = df_mysql.groupby(['datetime']) X = [] y = [] stats = [] #iterate through and group by datetime for name, group in grouped: X.append([group['dayoftheweek'].iloc[0], group['time'].iloc[0]]) assert(len(group['co'].values.flatten()) == 10000) y.append(group['co'].values.flatten()) date_vals = group[['datetime', 'date', 'time', 'dayoftheweek']].values[0] query_datetime = date_vals[0] #get data for an hour select_str = """SELECT date as datetime, DATE_FORMAT(date,"%Y-%m-%d") AS date, DATE_FORMAT(date,"%H") as time, if(WEEKDAY(date)<5, true, false) AS weekdays, WEEKDAY(date) AS dayoftheweek, latitude, longitude, user_id, co FROM Samples WHERE user_id != 2 and date between "{0}" and date_add("{0}", interval 1 hour) and co is not null and latitude is not null and longitude is not null AND (latitude <= {1} AND latitude >= {2}) AND (longitude >= {3} AND longitude <= {4}) AND co > 0 AND co < 60 ORDER BY date asc """.format(query_datetime, NW_BOUND[0], SW_BOUND[0], NW_BOUND[1], NE_BOUND[1]) df_mysql = data_from_db(select_str, verbose=False, exit_on_zero=False) #check the number of bins populated _, non_zero_grid_count = create_mean_value_grid(df_mysql) #update status array # use degrees of freedom = 0, i.e. without Bessel's correction stats.append(np.append(date_vals, [group['co'].mean(), group['co'].std(ddof=0), non_zero_grid_count])) #write all the rows of stats for row in stats: print(';'.join(["%s" % i for i in row])) sys.exit()
def main(): """ Implement the training of the model. This decides if nn or svn is run, or we choose to instead run the zero mean analysis. These variables are set below the imports, globally. """ if run_zero_mean_analysis: zero_mean_analysis() if run_existing_model: use_existing_model() ################################## ###### Train Model here ################################## #name of the image file name = "time_vs_co_averages" name = images_base_dir + name #retrieve sql data for the period required sql_string = """select * from {0} order by datetime asc, grid_location_row, grid_location_col asc; """.format(data_table) df_mysql = data_from_db(sql_string) X = [] y = [] #if use_nn: # #output predicted here is 10000 co values # grouped = df_mysql.groupby(['datetime']) # #iterate through and group by datetime # for name, group in grouped: # X.append([group['dayoftheweek'].iloc[0], group['time'].iloc[0], group['season'].iloc[0]]) # assert(len(group['co'].values.flatten()) == 10000) # y.append(group['co'].values.flatten()) #else: #output predicted here is one co value for _, row in df_mysql.iterrows(): if use_hour_simplification_feature: hour_feature = classify_hour(row['time']) else: hour_feature = row['time'] X.append([row['weekdays'], hour_feature, row['season'], row['grid_location_row'], row['grid_location_col'], row['co_liverpool'], row['co_prospect'], row['co_chullora'], row['co_rozelle']]) y.append(row['co']) #print X_train Z = np.float64(X) y = np.float64(y) X_train, X_test, y_train, y_test = train_test_split(Z, y, test_size =0.0 , random_state=0) if use_nn: pipeline = Pipeline([ ('min/max scaler', preprocessing.MinMaxScaler(feature_range=(0.0, 1.0))), ('nn', Regressor( layers=[ Layer("Rectifier", units=150), #Layer("Rectifier", units=100), Layer("Linear") ], learning_rate=0.001, #regularize='L2', #weight_decay=0.0000005, n_iter=70, valid_size=.33, verbose=True)) ]) param_grid = { 'nn__learning_rate': [0.001],#np.arange(0.001, 0.040, 0.010), 'nn__hidden0__units': [150],#np.arange(500,5000,1000), } else: pipeline = Pipeline([ ('svm', SVR( C=1.0, epsilon=0.2, gamma='auto', kernel='rbf', verbose=True, cache_size=3000 )) ]) param_grid = { 'svm__C': [1.00],#np.arange(0.001, 0.040, 0.010), 'svm__epsilon': [0.2],#np.arange(500,5000,1000), } #run if optimisation is needed if do_optimisation: optimise(pipeline, X_train, y_train) #run the below for regular model training # grid search is done on set of parameters (not actually a grid search) # the below is done for setting the scoring parameter and setting cross validation gs = GridSearchCV(pipeline, param_grid = param_grid, scoring="mean_squared_error", cv=10) gs.fit(X_train, y_train) print(gs.scorer_) import pdb; pdb.set_trace() print("Mean squared score for 10 fold cross validation is: ", -gs.best_score_ ) pipeline = gs #save the model that's trained if pickle_model: import pdb; pdb.set_trace() if use_nn: file_name = "nn_current_model" else: file_name = "svm_current_model" fileObject = open(file_name,'wb') pickle.dump(pipeline, fileObject) fileObject.close() print("Pickled the model.")
def use_existing_model(): """ Function is used if flag is set to use existing model. The model is set based on the current svm or nn model available in a pickle file in the same directory as this script. """ #no need to predict the full grid if you're using nn #if not use_nn: predict_full_grid = True #else: # predict_full_grid = False #Assume there exists existing model of name "current_model" pickle file if use_nn: file_name = "nn_current_model" else: file_name = "svm_current_model" fileObject = open(file_name,'rb') pipeline = pickle.load(fileObject) #use the datetime specified and get the rows of data #weekend date #specified_date = "2013-05-26 12:" #weekday date specified_date = "2015-09-03 12:" print("date is ", specified_date) sql_string = """select * from {1} where datetime like "{0}%" order by datetime asc, grid_location_row, grid_location_col asc; """.format(specified_date, data_table) print(sql_string) df_mysql = data_from_db(sql_string) y_true = [] y_pred = [] for _, row in df_mysql.iterrows(): if use_hour_simplification_feature: hour_feature = classify_hour(row['time']) else: hour_feature = row['time'] X = [[row['weekdays'], hour_feature, row['season'], row['grid_location_row'], row['grid_location_col'], row['co_liverpool'], row['co_prospect'], row['co_chullora'], row['co_rozelle']]] y = row['co'] #print X_train X = np.float64(X) y = np.float64(y) y_true.append(y) y_pred.append(pipeline.predict(X)[0]) if predict_full_grid: y_pred_full = [] for i in xrange(100): for j in xrange(100): X[0][3] = i X[0][4] = j y_val = pipeline.predict(X)[0] y_pred_full.append(y_val) #print(i, j, y_val) #create the mesh here id_name = specified_date.replace(":","") #prediction_name = "_pred_zeroMean" #estimates_name = "_estimates_zeroMean" if not use_nn: prediction_name = "_pred_full_zeroMean_svm" y_pred_full = np.float64(y_pred_full) create_mesh(y_pred_full.reshape(100,100), images_base_dir + id_name + prediction_name, title_name="predicted_full_grid_with_svm") else: prediction_name = "_pred_full_zeroMean_nn" y_pred_full = np.float64(y_pred_full) create_mesh(y_pred_full.reshape(100,100), images_base_dir + id_name + prediction_name, title_name="predicted_full_grid_with_nn") #create_mesh(y_true.reshape(100,100), images_base_dir + id_name + estimates_name, title_name="estimates_nn") #create_mesh(y_pred.reshape(100,100), images_base_dir + id_name + prediction_name, title_name="predicted_nn") #print some stats #print("Mean squared error is: ", -mean_squared_error(y_true, y_pred)) print("Mean absolute error is: ", abs(mean_absolute_error(y_true, y_pred))) #print("R^2 score is: ", -r2_score(y_true, y_pred, multioutput='uniform_average')) print(pipeline.get_params()) #save a log of the parameters of the last run using the existing model with open(images_base_dir + id_name + "_existing_model_params.txt", "w") as text_file: text_file.write(specified_date+"\n") text_file.write(sql_string+"\n") text_file.write(str(pipeline)+"\n") text_file.write(str(pipeline.get_params())+"\n") #text_file.write("Mean squared error is: {0}\n".format(-mean_squared_error(y_true, y_pred))) text_file.write("Mean absolute error is: {0}\n".format(abs(mean_absolute_error(y_true, y_pred)))) #text_file.write("R^2 score is: {0}\n".format(-r2_score(y_true, y_pred, multioutput='uniform_average'))) fileObject.close() sys.exit()
def main(): """ Populating the Estimates was an initial method used by NN, then by SVM. Here, Interpolation was used to calculate values for all the grids. This data was then fed into the model to train. This technigue could be left for populating the samplesGridData and using the model to estimate points. i.e. no interpolation used for estimation """ # Open database connection db = MySQLdb.connect("localhost","pollution","pollution","pollution_monitoring" ) # prepare a cursor object using cursor() method cursor = db.cursor() # get the oldest date #sql_str = """select distinct date from Samples where user_id = 2 order by date asc limit 1;""" # start date #cursor.execute(sql_str) #start_date = cursor.fetchone()[0] start_date = datetime(2013,3,1) # get the newest date #sql_str = """select distinct date from Samples where user_id = 2 order by date desc limit 1;""" # end date #cursor.execute(sql_str) #end_date = cursor.fetchone()[0] #override end_date = datetime(2015,11,1) #Choose the data table to use zero_mean = True non_zero_grid_count_threshold = 10 #table which has data inserted for model training data_table = "Estimates_zeroMean" if zero_mean else "Estimates_old" #epochs are the time periods to iterate over #provide some buffer time (extra epoch) epochs = ((end_date - start_date).days*24 + 1) print "Start data and end date: {0} to {1}".format(start_date, end_date) print "Number of hours of data: {0}".format(epochs) first_date = start_date total_rows = 0 no_epoch_count = 0 skip_epoch_count = 0 for _ in xrange(epochs): #do a quick check to see if data for a datetime exists, skip if it does sql_str = """ select datetime from {0} where datetime="{1}" limit 1;""".format(data_table, first_date) cursor.execute(sql_str) if cursor.rowcount > 0: skip_epoch_count += 1 first_date += timedelta(seconds=3600) continue #is there sensor data, skip if not select_str = """select * from Samples where user_id != 2 and date like "{0}%" and co < 60 and co > 0 limit 1;""".format(first_date.strftime("%Y-%m-%d %H")) cursor.execute(select_str) if cursor.rowcount == 0: skip_epoch_count += 1 print "Skipped {0} due to lack of sensor data".format(first_date) first_date += timedelta(seconds=3600) continue; #get data for an hour select_str = """SELECT date as datetime, DATE_FORMAT(date,"%Y-%m-%d") AS date, DATE_FORMAT(date,"%H") as time, if(WEEKDAY(date)<5, true, false) AS weekdays, WEEKDAY(date) AS dayoftheweek, latitude, longitude, user_id, co FROM Samples WHERE user_id != 2 and date between "{0}" and date_add("{0}", interval 1 hour) and co is not null and latitude is not null and longitude is not null AND (latitude <= {1} AND latitude >= {2}) AND (longitude >= {3} AND longitude <= {4}) AND co > 0 AND co < 60 ORDER BY date asc """.format(first_date, NW_BOUND[0], SW_BOUND[0], NW_BOUND[1], NE_BOUND[1]) df_mysql = data_from_db(select_str, verbose=True, exit_on_zero=False) if df_mysql is None: print "No data returned for {0}".format(first_date) no_epoch_count += 1 first_date += timedelta(seconds=3600) continue #check the number of bins populated _, non_zero_grid_count = create_mean_value_grid(df_mysql) #discount grid if it doesn't have enough pixels (i.e. less than threshold) if non_zero_grid_count < non_zero_grid_count_threshold: skip_epoch_count += 1 print "Skipped {0} due to non zero grid count less than threshold".format(first_date) first_date += timedelta(seconds=3600) continue #interpolate to get a grid known, z, ask, _ = gridify_sydney(df_mysql, verbose=False, heatmap=False) if len(known) == 0: raise Exception("No data for {0}".format(first_date)) sys.exit() columns = df_mysql.columns.values vals = list(df_mysql.iloc[0]) row_dict = dict(zip(columns, vals)) relevant_columns = ['time','weekdays','dayoftheweek'] data_common = ['"{0}"'.format(row_dict['datetime'].strftime("%Y-%m-%d %H:00:00"))] + ['"{0}"'.format(row_dict['date'])] + ["{0}".format(row_dict[col]) for col in relevant_columns] + ["{0}".format(get_season(row_dict['datetime']))] if len(known) < 8: Nnear = len(known) else: Nnear = 8 # do the interpolation (interpolation_grid, interpol_name) = idw_interpol(known, z, ask, Nnear=Nnear) #implement for zero mean Estimates table if zero_mean: #do the zero mean bit interpolation_grid = interpolation_grid.flatten() interpolation_grid = (interpolation_grid - np.mean(interpolation_grid))/np.nanstd(interpolation_grid) #add each element to the db as a row for i in xrange(len(interpolation_grid)): total_rows += 1 # input data into sql data = data_common + ["{0}".format(x) for x in [i, interpolation_grid[i]]] #print data insert_str = """insert ignore into {0} () values ({1}); """.format(data_table, ','.join(data)) cursor.execute(insert_str) print "At {0}, Number of rows considered in total: {1}".format(first_date, total_rows) # commit at each epoch, i.e. every 10000 rows db.commit() first_date += timedelta(seconds=3600) db.close() print "No epoch count: {0} and Skip epoch counts {1}".format(no_epoch_count,skip_epoch_count)
def main(): """ SVM uses the known data, called SamplesGridData, training the model with this data and using the resultant model to infer unknown data points. There is no interpolation in this method.. """ # Open database connection db = MySQLdb.connect("localhost","pollution","pollution","pollution_monitoring" ) # prepare a cursor object using cursor() method cursor = db.cursor() # get the oldest date #sql_str = """select distinct date from Samples where user_id = 2 order by date asc limit 1;""" # start date #cursor.execute(sql_str) #start_date = cursor.fetchone()[0] start_date = datetime(2013,3,1) # get the newest date #sql_str = """select distinct date from Samples where user_id = 2 order by date desc limit 1;""" # end date #cursor.execute(sql_str) #end_date = cursor.fetchone()[0] #override end_date = datetime(2015,11,1) #Choose the data table to use zero_mean = True non_zero_grid_count_threshold = 10 #second pass is needed for inputting the mean and stddev for all the rows of the table populate_initially = True populate_second_pass = True #table which has data inserted for model training data_table = "samplesGridData" #epochs are time period to iterate over #provide some buffer time (extra epoch) epochs = ((end_date - start_date).days*24 + 1) print "Start data and end date: {0} to {1}".format(start_date, end_date) print "Number of hours of data: {0}".format(epochs) first_date = start_date total_rows = 0 no_epoch_count = 0 skip_epoch_count = 0 #populate the first stage of the process for samplesGridData if populate_initially: for _ in xrange(epochs): #do a quick check to see if data for a datetime exists, skip if it does sql_str = """ select datetime from {0} where datetime="{1}" limit 1;""".format(data_table, first_date) cursor.execute(sql_str) if cursor.rowcount > 0: skip_epoch_count += 1 first_date += timedelta(seconds=3600) continue #is there sensor data, skip if not select_str = """select * from Samples where user_id != 2 and date like "{0}%" and co < 60 and co > 0 limit 1;""".format(first_date.strftime("%Y-%m-%d %H")) cursor.execute(select_str) if cursor.rowcount == 0: skip_epoch_count += 1 print "Skipped {0} due to lack of sensor data".format(first_date) first_date += timedelta(seconds=3600) continue; #get data for an hour select_str = """SELECT date as datetime, DATE_FORMAT(date,"%Y-%m-%d") AS date, DATE_FORMAT(date,"%H") as time, if(WEEKDAY(date)<5, true, false) AS weekdays, WEEKDAY(date) AS dayoftheweek, latitude, longitude, user_id, co FROM Samples WHERE user_id != 2 and date between "{0}" and date_add("{0}", interval 1 hour) and co is not null and latitude is not null and longitude is not null AND (latitude <= {1} AND latitude >= {2}) AND (longitude >= {3} AND longitude <= {4}) AND co > 0 AND co < 60 ORDER BY date asc """.format(first_date, NW_BOUND[0], SW_BOUND[0], NW_BOUND[1], NE_BOUND[1]) df_mysql = data_from_db(select_str, verbose=True, exit_on_zero=False) if df_mysql is None: print "No data returned for {0}".format(first_date) no_epoch_count += 1 first_date += timedelta(seconds=3600) continue #check the number of bins or grid locations populated _, non_zero_grid_count = create_mean_value_grid(df_mysql) #discount grid if it doesn't have enough pixels (i.e. less than threshold) if non_zero_grid_count < non_zero_grid_count_threshold: skip_epoch_count += 1 print "Skipped {0} due to non zero grid count less than threshold".format(first_date) first_date += timedelta(seconds=3600) continue #interpolate to get a grid known, z, ask, _ = gridify_sydney(df_mysql, verbose=False, heatmap=False) if len(known) == 0: raise Exception("No data for {0}".format(first_date)) sys.exit() columns = df_mysql.columns.values vals = list(df_mysql.iloc[0]) row_dict = dict(zip(columns, vals)) relevant_columns = ['time','weekdays','dayoftheweek'] data_common = ['"{0}"'.format(row_dict['datetime'].strftime("%Y-%m-%d %H:00:00"))] + ['"{0}"'.format(row_dict['date'])] + ["{0}".format(row_dict[col]) for col in relevant_columns] + ["{0}".format(get_season(row_dict['datetime']))] if zero_mean: select_str = 'select date, location_name, co from Samples where user_id=2 and date="{0}" and (location_name="Prospect" or location_name="Rozelle" or location_name="Liverpool" or location_name="Chullora") order by location_name;'.format(row_dict['datetime'].strftime("%Y-%m-%d %H:00:00")) fixed_samples_data = data_from_db(select_str, verbose=False, exit_on_zero=False) assert len(fixed_samples_data) == 4 FIXED_LOCATIONS = ['Chullora', 'Liverpool', 'Prospect', 'Rozelle'] mean_fixed = np.nanmean([fixed_samples_data[fixed_samples_data.location_name==location]['co'].iloc[0] for location in FIXED_LOCATIONS]) co_chullora = fixed_samples_data[fixed_samples_data.location_name=='Chullora']['co'].iloc[0] if not np.isnan(fixed_samples_data[fixed_samples_data.location_name=='Chullora']['co'].iloc[0]) else mean_fixed co_liverpool = fixed_samples_data[fixed_samples_data.location_name=='Liverpool']['co'].iloc[0] if not np.isnan(fixed_samples_data[fixed_samples_data.location_name=='Liverpool']['co'].iloc[0]) else mean_fixed co_prospect = fixed_samples_data[fixed_samples_data.location_name=='Prospect']['co'].iloc[0] if not np.isnan(fixed_samples_data[fixed_samples_data.location_name=='Prospect']['co'].iloc[0]) else mean_fixed co_rozelle = fixed_samples_data[fixed_samples_data.location_name=='Rozelle']['co'].iloc[0] if not np.isnan(fixed_samples_data[fixed_samples_data.location_name=='Rozelle']['co'].iloc[0]) else mean_fixed for i, _ in enumerate(z): total_rows += 1 # input data into sql grid_location_row, grid_location_col = known[i] data = data_common + ["{0}".format(x) for x in [grid_location_row, grid_location_col, co_chullora, co_liverpool, co_prospect, co_rozelle, z[i]]] #print data insert_str = """insert ignore into {0} (datetime, date, time, weekdays, dayoftheweek, season, grid_location_row, grid_location_col, co_chullora, co_liverpool, co_prospect, co_rozelle, co_original) values ({1}); """.format(data_table, ','.join(data)) try: cursor.execute(insert_str) except: print insert_str pdb.set_trace() else: raise Exception("You should always be running this zero mean set") print "At {0}, Number of rows considered in total: {1}".format(first_date, total_rows) # commit db.commit() first_date += timedelta(seconds=3600) #after all the rows have been populated with the original co, we need to populate the normalised value, mean and std if populate_second_pass: select_str = """ select * from {0};""".format(data_table) df_mysql = data_from_db(select_str, verbose=True, exit_on_zero=False) co_mean, co_stddev = df_mysql['co_original'].mean(), df_mysql['co_original'].std(ddof=0) df_mysql['co_mean'] = co_mean df_mysql['co_stddev'] = co_stddev df_mysql['co'] = (df_mysql['co_original']-co_mean)/co_stddev for index, row in df_mysql.iterrows(): update_sql = "UPDATE {0} SET co={1}, co_mean={2}, co_stddev={3} WHERE datetime='{4}' AND grid_location_row={5} AND grid_location_col={6}".format(data_table, row['co'], row['co_mean'], row['co_stddev'], row['datetime'], row['grid_location_row'], row['grid_location_col']) cursor.execute(update_sql) db.commit() db.close() print "No epoch count: {0} and Skip epoch counts {1}".format(no_epoch_count,skip_epoch_count)
def main(granularity, start_date, end_date): """ SVM uses the known data, called SamplesGridData, training the model with this data and using the resultant model to infer unknown data points. There is no interpolation in this method.. """ global total_rows, skip_epoch_count, non_zero_grid_count, no_epoch_count # Open database connection #db = MySQLdb.connect("localhost","pollution","pollution","pollution_monitoring" ) # prepare a cursor object using cursor() method #cursor = db.cursor() non_zero_grid_count_threshold = 1 errors = [] #second pass is needed for inputting the mean and stddev for all the rows of the table populate_initially = True populate_second_pass = False interval = granularity['interval'] data_table = granularity['data_table'] interval_period = granularity['interval_period'] date_format = granularity["date_format"] epoch_variable = granularity['epoch_variable'] target_datetimes = get_time_periods_with_sensor_data(start_date, end_date, data_table, interval) #epochs are time period to iterate over epochs = len(target_datetimes) print "Start data and end date: {} to {}".format(start_date, end_date) print "Number of time periods of data: {}, granularity is {}".format(epochs, interval) db = MySQLdb.connect("localhost","pollution","pollution","pollution_monitoring" ) cursor = db.cursor() #populate the first stage of the process for samplesGridData if populate_initially: for target_datetime in target_datetimes: target_datetime = datetime.strptime(target_datetime, '%Y-%m-%d %H:%M') #get data for an time period select_str = """SELECT date as datetime, DATE_FORMAT(date,"%Y-%m-%d") AS date, DATE_FORMAT(date,"%H") as hour, DATE_FORMAT(date,"%i") as minute, if(WEEKDAY(date)<5, true, false) AS weekdays, WEEKDAY(date) AS dayoftheweek, latitude, longitude, user_id, co FROM Samples WHERE user_id != 2 AND date between "{0}" AND DATE_ADD("{0}", INTERVAL {5} SECOND) AND co is not null and latitude is not null and longitude is not null AND (latitude <= {1} AND latitude >= {2}) AND (longitude >= {3} AND longitude <= {4}) AND co > 0 AND co < 60 ORDER BY date asc """.format( target_datetime, NW_BOUND[0], SW_BOUND[0], NW_BOUND[1], NE_BOUND[1], interval_period ) df_mysql = data_from_db(select_str, verbose=True, exit_on_zero=False) if df_mysql is None: print "No data returned for {0}".format(target_datetime) no_epoch_count += 1 continue #check the number of bins or grid locations populated _, non_zero_grid_count = create_mean_value_grid(df_mysql) #discount grid if it doesn't have enough pixels (i.e. less than threshold) if non_zero_grid_count < non_zero_grid_count_threshold: skip_epoch_count += 1 print "Skipped {0} due to non zero grid count less than threshold".format(target_datetime) continue #interpolate to get a grid known, z, ask, _ = gridify_sydney(df_mysql, verbose=False, heatmap=False) if len(known) == 0: raise Exception("No data for {0}".format(target_datetime)) sys.exit() columns = df_mysql.columns.values vals = list(df_mysql.iloc[0]) row_dict = dict(zip(columns, vals)) relevant_columns = ['hour','minute', 'weekdays','dayoftheweek'] data_common = ['"{}"'.format(row_dict['datetime'].strftime("%Y-%m-%d %H:00:00"))] + \ ['"{}"'.format(row_dict['date'])] + \ ["{}".format(row_dict[col]) for col in relevant_columns] + \ ["{}".format(get_season(row_dict['datetime']))] # hour always needs to be used here to retrieve fixed station values select_str = """select date, location_name, co from Samples where user_id=2 and date="{0}" and (location_name="Prospect" or location_name="Rozelle" or location_name="Liverpool" or location_name="Chullora") order by location_name;""".format(row_dict['datetime'].strftime("%Y-%m-%d %H:00:00")) fixed_samples_data = data_from_db(select_str, verbose=False, exit_on_zero=False) try: assert len(set(fixed_samples_data.location_name)) == 4 except AssertionError: print "error: 4 fixed station values not found for {}".format(target_datetime) errors.append(target_datetime) continue FIXED_LOCATIONS = ['Chullora', 'Liverpool', 'Prospect', 'Rozelle'] mean_fixed = np.nanmean([fixed_samples_data[fixed_samples_data.location_name==location]['co'].iloc[0] for location in FIXED_LOCATIONS]) co_chullora = fixed_samples_data[fixed_samples_data.location_name=='Chullora']['co'].iloc[0] \ if not np.isnan(fixed_samples_data[fixed_samples_data.location_name=='Chullora']['co'].iloc[0]) else mean_fixed co_liverpool = fixed_samples_data[fixed_samples_data.location_name=='Liverpool']['co'].iloc[0] \ if not np.isnan(fixed_samples_data[fixed_samples_data.location_name=='Liverpool']['co'].iloc[0]) else mean_fixed co_prospect = fixed_samples_data[fixed_samples_data.location_name=='Prospect']['co'].iloc[0] \ if not np.isnan(fixed_samples_data[fixed_samples_data.location_name=='Prospect']['co'].iloc[0]) else mean_fixed co_rozelle = fixed_samples_data[fixed_samples_data.location_name=='Rozelle']['co'].iloc[0] \ if not np.isnan(fixed_samples_data[fixed_samples_data.location_name=='Rozelle']['co'].iloc[0]) else mean_fixed for i, _ in enumerate(z): total_rows += 1 # input data into sql grid_location_row, grid_location_col = known[i] data = data_common + \ ["{0}".format(x) for x in [grid_location_row, grid_location_col, co_chullora, co_liverpool, co_prospect, co_rozelle, z[i]]] insert_str = """ insert ignore into {0} (datetime, date, hour, minute, weekdays, dayoftheweek, season, grid_location_row, grid_location_col, co_chullora, co_liverpool, co_prospect, co_rozelle, co_original) values ({1}); """.format(data_table, ','.join(data)) try: cursor.execute(insert_str) except: print insert_str pdb.set_trace() print "At {0}, Number of rows considered in total: {1}".format(target_datetime, total_rows) # commit db.commit() print "No epoch count: {0} and Skip epoch counts {1}".format(no_epoch_count,skip_epoch_count) print "dates with no complete fixed station data are {}".format(errors) # after all the rows have been populated with the original co, # we need to populate the normalised value, mean and std if populate_second_pass: select_str = """ select * from {};""".format(data_table) df_mysql = data_from_db(select_str, verbose=True, exit_on_zero=False) if not df_mysql: print "no rows in {}. Script completed".format(data_table) return co_mean, co_stddev = df_mysql['co_original'].mean(), df_mysql['co_original'].std(ddof=0) df_mysql['co_mean'] = co_mean df_mysql['co_stddev'] = co_stddev df_mysql['co'] = (df_mysql['co_original']-co_mean)/co_stddev for index, row in df_mysql.iterrows(): update_sql = """ UPDATE {0} SET co={1}, co_mean={2}, co_stddev={3} WHERE datetime='{4}' AND grid_location_row={5} AND grid_location_col={6} """.format(data_table, row['co'], row['co_mean'], row['co_stddev'], row['datetime'], row['grid_location_row'], row['grid_location_col']) cursor.execute(update_sql) db.commit() db.close()