def main():

    if len(sys.argv) < 3:
        try:
            start_date = datetime(2013, 5, 26, 16)
            #load the default model
            file_name = "nn_current_model"
            fileObject = open(file_name, 'rb')
        except IOError:
            raise Exception("Please check the defaults defined in the script: the date must follow '%Y-%m-%d %H:%M:%S' and the saved model file must exist")
    else:
        try:
            #parse with datetime.strptime (not time.strptime) so start_date.hour works below
            start_date = datetime.strptime(sys.argv[2], "%Y-%m-%d %H:%M:%S")
            #load the model named on the command line
            file_name = sys.argv[1]
            fileObject = open(file_name, 'rb')
        except (ValueError, IOError):
            raise Exception("Please pass the date in '%Y-%m-%d %H:%M:%S' format and the correct name for the saved model")


    print("Using {}, for date: {}".format(file_name, start_date))
    pipeline = pickle.load(fileObject)

    #get data for model input
    sql_string = ('select date, location_name, if(WEEKDAY(date)<5, true, false) AS weekdays, '
                  'WEEKDAY(date) AS dayoftheweek, co from Samples '
                  'where user_id=2 and date="{0}" '
                  'and location_name in ("Prospect", "Rozelle", "Liverpool", "Chullora") '
                  'order by location_name;').format(start_date)

    fixed_samples_data = data_from_db(sql_string, exit_on_zero=False, verbose=False)
    try:
        #at least four rows (one per station) are required; sometimes 8 rows
        #are returned because of duplicate records
        assert fixed_samples_data is not None and len(fixed_samples_data) >= 4
    except AssertionError:
        raise Exception("No rows on {0}\n".format(start_date))

    try:
        specific_hour = start_date.hour

        if use_hour_simplification_feature:
            hour_feature = classify_hour(specific_hour)
        else:
            hour_feature = specific_hour

        FIXED_LOCATIONS = ['Chullora', 'Liverpool', 'Prospect', 'Rozelle']
        #impute a missing station reading with the mean across all stations
        mean_fixed = np.nanmean([fixed_samples_data[fixed_samples_data.location_name==location]['co'].iloc[0] for location in FIXED_LOCATIONS])
        co_values = {}
        for location in FIXED_LOCATIONS:
            value = fixed_samples_data[fixed_samples_data.location_name==location]['co'].iloc[0]
            co_values[location] = value if not np.isnan(value) else mean_fixed
        co_chullora = co_values['Chullora']
        co_liverpool = co_values['Liverpool']
        co_prospect = co_values['Prospect']
        co_rozelle = co_values['Rozelle']

        #prepare data to be inserted into svm_estimates table
        data = [fixed_samples_data['weekdays'].iloc[0], hour_feature, get_season(start_date), 0, 0, co_liverpool, co_prospect, co_chullora, co_rozelle]
    except Exception as ex:
        raise Exception("Error on {}; SQL statement is {}; error is {}\n".format(start_date, sql_string, str(ex)))
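
    #The original example ends here; the lines below are a minimal sketch of the
    #likely prediction step, mirroring the batch script further down (the grid
    #coordinates sit at indices 3 and 4 of the feature row).
    X = np.float64([data])
    for i in xrange(100):
        for j in xrange(100):
            X[0][3] = i
            X[0][4] = j
            print("({0}, {1}): co = {2}".format(i, j, pipeline.predict(X)[0]))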
def main():
    # Open database connection
    db = MySQLdb.connect("localhost","pollution","pollution","pollution_monitoring" )
    # prepare a cursor object using cursor() method
    cursor = db.cursor()

    #start_date = datetime(2015,5,1)
    start_date = datetime(2015,5,17)
    end_date = datetime(2016,5,2)


    #log file
    log_file_name = "populate_SVM_estimates_{0}_log.txt".format(datetime.now().strftime("%Y-%m-%d %H"))
    logObject = open(log_file_name,'w')
    logObject.write("Start date: {0}, End date: {1}\n".format(start_date, end_date));

    #load current model
    file_name = "svm_current_model"
    fileObject = open(file_name,'rb')
    pipeline = pickle.load(fileObject)

    #variable to store the number of rows committed to the db
    row_count = 0

    #precheck so hours already in the table can be skipped quickly
    sql_string = """select distinct datetime from {0};""".format(svm_estimates_table)
    cursor.execute(sql_string)
    #fetchall() returns 1-tuples; unpack them so the set difference below actually removes inserted hours
    inserted_datetimes = {row[0] for row in cursor.fetchall()}
    
    total_hours = (end_date - start_date).days * 24
    total_datetimes = {start_date + timedelta(hours=i) for i in xrange(total_hours)}

    remaining_datetimes = sorted(total_datetimes - inserted_datetimes)

    #iterate through all the hours
    for start_date in remaining_datetimes:
        #datetimes already in the table were removed above; only skip hours before 8am
        if start_date.hour < 8:
            continue

        #get data for model input
        sql_string = ('select date, location_name, if(WEEKDAY(date)<5, true, false) AS weekdays, '
                      'WEEKDAY(date) AS dayoftheweek, co from Samples '
                      'where user_id=2 and date="{0}" '
                      'and location_name in ("Prospect", "Rozelle", "Liverpool", "Chullora") '
                      'order by location_name;').format(start_date)

        fixed_samples_data = data_from_db(sql_string, exit_on_zero=False)
        try:
            #at least four rows (one per station) are required; sometimes 8 rows
            #are returned because of duplicate records
            assert fixed_samples_data is not None and len(fixed_samples_data) >= 4
        except AssertionError:
            logObject.write("No rows on {0}\n".format(start_date))
            continue

        try:
            specific_hour = start_date.hour

            if use_hour_simplification_feature:
                hour_feature = classify_hour(specific_hour)
            else:
                hour_feature = specific_hour

            FIXED_LOCATIONS = ['Chullora', 'Liverpool', 'Prospect', 'Rozelle']
            #impute a missing station reading with the mean across all stations
            mean_fixed = np.nanmean([fixed_samples_data[fixed_samples_data.location_name==location]['co'].iloc[0] for location in FIXED_LOCATIONS])
            co_values = {}
            for location in FIXED_LOCATIONS:
                value = fixed_samples_data[fixed_samples_data.location_name==location]['co'].iloc[0]
                co_values[location] = value if not np.isnan(value) else mean_fixed
            co_chullora = co_values['Chullora']
            co_liverpool = co_values['Liverpool']
            co_prospect = co_values['Prospect']
            co_rozelle = co_values['Rozelle']

            #prepare data to be inserted into svm_estimates table
            data = [fixed_samples_data['weekdays'].iloc[0], hour_feature, get_season(start_date), 0, 0, co_liverpool, co_prospect, co_chullora, co_rozelle]
        except Exception as ex:
            logObject.write("Error on {}; SQL statement is {}; error is {}\n".format(start_date, sql_string, str(ex)))
            continue

        #feature row; the grid coordinates at indices 3 and 4 are filled in below
        X = np.float64([data])

        #go through 100x100 grid pixels
        for i in xrange(100):
            for j in xrange(100):
                X[0][3] = i
                X[0][4] = j
                y_val = pipeline.predict(X)[0]

                #5.7464*y + 3.48652 presumably undoes the zero-mean scaling applied during training
                insert_data = ['"{0}"'.format(start_date), '"{0}"'.format(start_date.date()), start_date.hour, fixed_samples_data['weekdays'].iloc[0], fixed_samples_data['dayoftheweek'].iloc[0], get_season(start_date), i, j, co_liverpool, co_prospect, co_chullora, co_rozelle, 5.7464*y_val+3.48652]

                insert_str = """insert ignore into {0} (datetime, date, time, weekdays, dayoftheweek, season, grid_location_row, grid_location_col, co_chullora, co_liverpool, co_prospect, co_rozelle, co_original) values ({1}); """.format(svm_estimates_table, ','.join([ str(x) for x in insert_data]))
                #print insert_str
                try:
                    cursor.execute(insert_str)
                except MySQLdb.Error:
                    print(insert_str)
                    pdb.set_trace()
        #commit once per date; the grid loop above inserts 100*100 = 10000 rows
        db.commit()
        row_count += 10000
        print("Committed date: {0} with row count: {1}".format(start_date, row_count))
def main():
    """
    Train the model. Globals set below the imports decide whether the nn or
    the svm model is run, or whether the zero mean analysis runs instead.
    """


    if run_zero_mean_analysis:
        zero_mean_analysis()

    if run_existing_model:
        use_existing_model()

    ##################################
    ###### Train Model here
    ##################################

    #name of the image file
    name = images_base_dir + "time_vs_co_averages"

    #retrieve sql data for the period required
    sql_string = """select * from {0}  order by datetime asc, grid_location_row, grid_location_col asc; """.format(data_table)
    df_mysql = data_from_db(sql_string)


    X = [] 
    y = []

    #if use_nn:
    #    #output predicted here is 10000 co values
    #    grouped = df_mysql.groupby(['datetime'])
    #    #iterate through and group by datetime
    #    for name, group in grouped:
    #        X.append([group['dayoftheweek'].iloc[0], group['time'].iloc[0], group['season'].iloc[0]])
    #        assert(len(group['co'].values.flatten()) == 10000)
    #        y.append(group['co'].values.flatten())
    #else:

    #output predicted here is one co value
    for _, row in df_mysql.iterrows():
        if use_hour_simplification_feature:
            hour_feature = classify_hour(row['time'])
        else:
            hour_feature = row['time']

        X.append([row['weekdays'], hour_feature, row['season'], row['grid_location_row'], row['grid_location_col'], row['co_liverpool'], row['co_prospect'], row['co_chullora'], row['co_rozelle']])
        y.append(row['co'])

    Z = np.float64(X)
    y = np.float64(y)

    #test_size=0.0 keeps every row in the training set; scoring happens via cross validation below
    X_train, X_test, y_train, y_test = train_test_split(Z, y, test_size=0.0, random_state=0)
    if use_nn:
        pipeline = Pipeline([
            ('min/max scaler', preprocessing.MinMaxScaler(feature_range=(0.0, 1.0))),
            ('nn', Regressor(
                layers=[
                    Layer("Rectifier", units=150),
                    #Layer("Rectifier", units=100),
                    Layer("Linear")
                    ],
                learning_rate=0.001,
                #regularize='L2', 
                #weight_decay=0.0000005,
                n_iter=70,
                valid_size=.33,
                verbose=True))
             ])

        param_grid = {
            'nn__learning_rate': [0.001],#np.arange(0.001, 0.040, 0.010),
            'nn__hidden0__units': [150],#np.arange(500,5000,1000), 
           }
    else:
        pipeline = Pipeline([
            ('svm', SVR(
                C=1.0, 
                epsilon=0.2, 
                gamma='auto',
                kernel='rbf', 
                verbose=True,
                cache_size=3000
                ))
             ])
    
        param_grid = {
            'svm__C': [1.00],#np.arange(0.001, 0.040, 0.010),
            'svm__epsilon': [0.2],#np.arange(500,5000,1000), 
        }

    #run if optimisation is needed
    if do_optimisation:
        optimise(pipeline, X_train, y_train)


    #regular model training: with single-valued parameter lists this is not a real
    #grid search; GridSearchCV is used to set the scoring metric and 10-fold cross validation
    gs = GridSearchCV(pipeline, param_grid = param_grid, scoring="mean_squared_error", cv=10)
    gs.fit(X_train, y_train)

    print(gs.scorer_)


    print("Mean squared error for 10-fold cross validation is: ", -gs.best_score_)
    pipeline = gs

    #save the model that's trained
    if pickle_model:
        if use_nn:
            file_name = "nn_current_model"
        else:
            file_name = "svm_current_model"
        fileObject = open(file_name,'wb')
        pickle.dump(pipeline, fileObject)
        fileObject.close()
        print("Pickled the model.")
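
#An alternative sketch, not in the original: pipelines that wrap large numpy
#arrays generally persist faster and smaller with joblib than with raw pickle.
#Assumes the standalone `joblib` package is installed.
def save_model_joblib(pipeline, file_name):
    import joblib
    joblib.dump(pipeline, file_name + ".joblib")

def load_model_joblib(file_name):
    import joblib
    return joblib.load(file_name + ".joblib")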
def use_existing_model():
    """
    Used when the flag to reuse an existing model is set. The model is loaded
    from the current svm or nn pickle file in the same directory as this
    script.
    """

    #the full grid is predicted for both model types (the nn-only shortcut is disabled)
    predict_full_grid = True


    #Assume there exists existing model of name "current_model" pickle file
    if use_nn:
        file_name = "nn_current_model"
    else:
        file_name = "svm_current_model"

    fileObject = open(file_name,'rb')
    pipeline = pickle.load(fileObject)

    #use the datetime specified and get the rows of data
    #weekend date
    #specified_date = "2013-05-26 12:"
    #weekday date
    specified_date = "2015-09-03 12:"
    print("date is ", specified_date)
    sql_string = """select * from {1} where datetime like "{0}%" order by datetime asc, grid_location_row, grid_location_col asc; """.format(specified_date, data_table)

    print(sql_string)
    df_mysql = data_from_db(sql_string)


    y_true = []
    y_pred = []

    for _, row in df_mysql.iterrows():

        if use_hour_simplification_feature:
            hour_feature = classify_hour(row['time'])
        else:
            hour_feature = row['time']

        X = [[row['weekdays'], hour_feature, row['season'], row['grid_location_row'], row['grid_location_col'], row['co_liverpool'], row['co_prospect'], row['co_chullora'], row['co_rozelle']]]
        y = row['co']


        X = np.float64(X)
        y = np.float64(y)

        y_true.append(y)
        y_pred.append(pipeline.predict(X)[0])

    if predict_full_grid:
        #X still holds the feature row from the last iteration of the loop above;
        #only the grid coordinates (indices 3 and 4) are varied here
        y_pred_full = []

        for i in xrange(100):
            for j in xrange(100):
                X[0][3] = i
                X[0][4] = j
                y_val = pipeline.predict(X)[0]
                y_pred_full.append(y_val)
                #print(i, j, y_val)

    #create the mesh here
    id_name = specified_date.replace(":","")
    #prediction_name = "_pred_zeroMean"
    #estimates_name = "_estimates_zeroMean"

    if not use_nn:
        prediction_name = "_pred_full_zeroMean_svm"
        y_pred_full = np.float64(y_pred_full)
        create_mesh(y_pred_full.reshape(100,100), images_base_dir + id_name + prediction_name, title_name="predicted_full_grid_with_svm")
    else:
        prediction_name = "_pred_full_zeroMean_nn"
        y_pred_full = np.float64(y_pred_full)
        create_mesh(y_pred_full.reshape(100,100), images_base_dir + id_name + prediction_name, title_name="predicted_full_grid_with_nn")
        #create_mesh(y_true.reshape(100,100), images_base_dir + id_name + estimates_name, title_name="estimates_nn")
        #create_mesh(y_pred.reshape(100,100), images_base_dir + id_name + prediction_name, title_name="predicted_nn")

    #print some stats
    #print("Mean squared error is: ", -mean_squared_error(y_true, y_pred))
    print("Mean absolute error is: ", mean_absolute_error(y_true, y_pred))
    #print("R^2 score is: ", -r2_score(y_true, y_pred, multioutput='uniform_average'))
    print(pipeline.get_params())

    #save a log of the parameters of the last run using the existing model
    with open(images_base_dir + id_name + "_existing_model_params.txt", "w") as text_file:
        text_file.write(specified_date+"\n")
        text_file.write(sql_string+"\n")
        text_file.write(str(pipeline)+"\n")
        text_file.write(str(pipeline.get_params())+"\n")
        #text_file.write("Mean squared error is: {0}\n".format(-mean_squared_error(y_true, y_pred)))
        text_file.write("Mean absolute error is: {0}\n".format(mean_absolute_error(y_true, y_pred)))
        #text_file.write("R^2 score is: {0}\n".format(-r2_score(y_true, y_pred, multioutput='uniform_average')))
        
    fileObject.close()
    sys.exit()
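
#classify_hour() and get_season() are referenced throughout but defined
#elsewhere in the project. The sketches below are assumptions, shown only to
#make the listing self-contained; the real bucketing may differ.
def classify_hour(hour):
    """Hypothetical bucketing of the hour of day into coarse periods."""
    if 6 <= hour < 12:
        return 0  #morning
    elif 12 <= hour < 18:
        return 1  #afternoon
    else:
        return 2  #evening/night

def get_season(date):
    """Hypothetical Southern Hemisphere season index keyed on the month."""
    return {12: 0, 1: 0, 2: 0,   #summer
            3: 1, 4: 1, 5: 1,    #autumn
            6: 2, 7: 2, 8: 2,    #winter
            9: 3, 10: 3, 11: 3}[date.month]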