def store_weather(weather_data, table_name):
    """Flatten a raw weather payload, normalise missing fields and upsert
    a single row into *table_name*.

    First attempts an INSERT; if that raises (typically because a row with
    the same date already exists), falls back to an UPDATE keyed on the
    formatted date. Returns the exception when both attempts fail,
    otherwise None.
    """
    weather_data = flatten_dict(weather_data)
    # If there's no 1-hour rainfall measurement, fall back to the 3-hour
    # value (present in forecast data), else assume rainfall of 0.
    if "rain_1h" not in weather_data:
        weather_data["rain_1h"] = weather_data.get("rain_3h", 0.0)
    # If there's no timezone offset value then derive one from the timestamp.
    if "timezone" not in weather_data:
        gmt = pytz.timezone("GMT")
        dt = pytz.utc.localize(datetime.utcfromtimestamp(weather_data["dt"]))
        weather_data["timezone"] = dt.astimezone(gmt).utcoffset().total_seconds()
    # Select only the desired fields from the raw data.
    data = {
        "date": datetime.fromtimestamp(weather_data["dt"]).strftime("%Y-%m-%d %H:%M:%S"),
        "timezone": weather_data["timezone"],
        "temp": weather_data["main_temp"],
        "feels_like": weather_data["main_feels_like"],
        "wind_speed": weather_data["wind_speed"],
        "wind_deg": weather_data["wind_deg"],
        "weather_main": weather_data["weather_0_main"],
        "weather_description": weather_data["weather_0_description"],
        "rain": weather_data["rain_1h"],
    }
    # Push new data to the database: insert first, update on conflict.
    try:
        sql_query = db.construct_sql(query_type="insert", table_name=table_name,
                                     data=data)
        # BUGFIX: keyword was misspelled "retrieveing_data" here (every other
        # db.execute_sql call in this module uses "retrieving_data"); the
        # resulting TypeError was silently swallowed by the except blocks.
        db.execute_sql(sql_query, database=database, user=user, password=password,
                       host=host, port=port, retrieving_data=False)
    except Exception:
        try:
            sql_query = db.construct_sql(query_type="update", table_name=table_name,
                                         data=data, predicates={"date": data["date"]})
            db.execute_sql(sql_query, database=database, user=user, password=password,
                           host=host, port=port, retrieving_data=False)
            print("Update successful")
        except Exception as e:
            print(e)
            return e
def stops_on_route(route, main=False, direction=1):
    """Query the database and return the sequence of stops on a sub-route.

    When *main* is truthy, *route* is a top-level route ID and the stops of
    the 'main' sub-route travelling in *direction* are returned; otherwise
    *route* is a sub-route key of the form "<routeID>_<suffix>".
    """
    # The lookup key is the route ID itself, or the prefix of the sub-route key.
    route_id = route if main else route.split("_")[0]
    sql = db.construct_sql(table_name="routes", query_type="select_where",
                           data={"ID": route_id})
    response = db.execute_sql(sql, database, user, password, host, port,
                              retrieving_data=True)
    sub_routes = response[0][1]
    if not main:
        return sub_routes[route]["stops"]
    # Return the first sub-route flagged 'main' that runs in the requested direction.
    for info in sub_routes.values():
        if info["main"] and info["direction"] == direction:
            return info["stops"]
def get_weather_from_db():
    """Return the first row of 'current' weather data from our postgres database."""
    query = db.construct_sql(table_name="weather_data_current",
                             query_type="select_all")
    rows = db.execute_sql(query, database, user, password, host, port,
                          retrieving_data=True)
    return rows[0]
def get_mean_time(route, direction, segments, month, day, time):
    """Look up the per-route means table and return the summed values for
    the requested *segments* at the given month/weekday/timegroup.

    Returns None when no row matches; on a database (or summing) error the
    exception object itself is returned, preserving the original contract.
    """
    sql = db.construct_sql(
        table_name=f"route_{route}_{direction}_means",
        query_type="select_where",
        data={"month": month, "weekday": day, "timegroup": str(time)},
        column_names=segments,
        verbose=False,
    )
    try:
        rows = db.execute_sql(sql, database, user, password, host, port,
                              retrieving_data=True)
        if not rows:
            # No matching row for this month/day/timegroup combination.
            return None
        # Sum of all proportions + (number of missing values * average journey time).
        return sum_values(rows[0])
    except Exception as err:
        return err
def get_proportion(route, direction, startstop, endstop, weekday, month, time_group):
    """returns a proportion representing the amount of the total bus route journey
    that the users journey represents. It will first attempt to do this using
    calculated proportions for that day of the week, month and time_group, but will
    resort to a simple percentage of the amount of the stops travelled compared to
    the amount of stops there are (via quickanddirty) when the lookup fails or the
    computed proportion is not positive."""
    # Lookup tables mapping numeric day/month/timegroup values to the string
    # keys used in the proportions tables on the database.
    days = {
        0: "Monday", 1: "Tuesday", 2: "Wednesday", 3: "Thursday", 4: "Friday",
        5: "Saturday", 6: "Sunday"
    }
    # NOTE(review): "Febuary" is misspelled, but it is a runtime lookup key —
    # presumably the database rows carry the same spelling; confirm before fixing.
    months = {
        1: "January", 2: "Febuary", 3: "March", 4: "April", 5: "May", 6: "June",
        7: "July", 8: "August", 9: "September", 10: "October", 11: "November",
        12: "December"
    }
    # NOTE(review): `times` is never read below — kept for parity with the
    # other lookup tables / possible external use.
    times = {
        0: "0", 1: "1", 2: "2", 3: "3", 4: "4", 5: "5", 6: "6", 7: "7", 8: "8",
        9: "9", 10: "10", 11: "11", 12: "12", 13: "13", 14: "14", 15: "15",
        16: "16", 17: "17", 18: "18", 19: "19", 20: "20", 21: "21", 22: "22",
        23: "23", 24: "24", 25: "25", 26: "26", 27: "27", 28: "28"
    }
    # Call the proportions table — it holds a calculated average based on
    # previous journeys for a given month, weekday and time_group.
    try:
        # construct sql queries: one for the row of values, one for the column names
        table_name = "route_%s_%s_proportions" % (route.lower(), direction)
        sql_values = db.construct_sql(table_name=table_name, query_type="select_where", data={
            "month": months[month],
            "weekday": days[weekday],
            "timegroup": str(time_group)
        })
        sql_keys = db.construct_sql(table_name=table_name, query_type="attr_names")
        response_values = db.execute_sql(sql_values, database, user, password, host, port, retrieving_data=True)[0]
        response_keys = db.execute_sql(sql_keys, database, user, password, host, port, retrieving_data=True)
        # The first 3 columns are the month/weekday/timegroup keys — skip them.
        list_of_values = list(response_values[3:])
        list_of_keys = list(response_keys[3:])
        # Find the index of the segment that starts at the user's start stop
        # and the index of the segment that ends at the user's end stop.
        for item in list_of_keys:
            # NOTE(review): .index(item) can never equal len(list_of_keys)
            # (max index is len-1), so this guard is always true — possibly
            # intended to be len(list_of_keys) - 1; confirm before changing.
            if list_of_keys.index(item) != len(list_of_keys):
                splitsegment = str(item).split("_")
                # Segment names look like "<pfx><firststop>_<laststop><sfx>":
                # strip a 3-char prefix and a 2-char suffix to get stop IDs.
                # TODO confirm the exact naming convention against the schema.
                first_stop_segment = str(item).split("_")[0][3:]
                last_stop_segment = str(item).split("_")[1][:-2]
                if startstop == first_stop_segment:
                    index1 = list_of_keys.index(item)
                if endstop == last_stop_segment:
                    index2 = list_of_keys.index(item)
        # Sum the proportions of every segment between the two stops.
        # If either stop was not matched above, index1/index2 are unbound and
        # the resulting NameError drops us into the fallback handler below.
        total = 0
        for i in range(index1, index2 + 1):
            # this is to handle the odd NaN value in our proporitons datasets.
            # NaNs occur at an average incidence of 0.12% in the data.
            if list_of_values[i] is not None:
                value = list_of_values[i]
                total += value
        proportion = total
        if proportion > 0:
            return proportion
        else:
            # Nothing usable summed — fall back to the simple stop-count ratio.
            proportion = quickanddirty(route, direction, startstop, endstop)
            return proportion
    # otherwise simply return the percentage of the number of stops a user is
    # travelling (*eyeroll*)
    except Exception as e:
        print(e)
        proportion = quickanddirty(route, direction, startstop, endstop)
        return proportion
def generate_test_dataframe(route, direction, date, time):
    """Returns a one-row dataframe with the user-entered trip details, ready to
    feed to the pickled linear-regression model for this route/direction.

    Called from generate_predictions; in turn calls get_weather_from_db (or
    get_nearest_forecast), time_group_function and get_active_columns. The
    column list varies per route and is stored as JSON in the model_features
    table. Continuous features are written straight into the test dataframe;
    categorical features are staged in a temporary dataframe and one-hot
    encoded via get_active_columns, which names the columns to set to 1.
    """
    # Use *current* weather when the requested departure is within an hour of
    # now; otherwise use the nearest stored forecast.
    dt = datetime.fromisoformat("%s %s" % (str(date), time_from_seconds(time)))
    now = datetime.now()
    current = abs((dt - now).total_seconds()) < 3600
    if current:
        weather = get_weather_from_db()
    else:
        weather = get_nearest_forecast(dt)[0]
    # Extract the required parameters (positional layout of the weather row).
    temp = weather[2]
    feels_like = weather[3]
    wind_speed = weather[4]
    wind_deg = weather[5]
    main = weather[6]
    description = weather[7]
    rain = weather[8]
    # Fetch this route/direction's model feature list (the training columns).
    template_name = str(route) + "_" + str(direction)
    sql = db.construct_sql(table_name="model_features", query_type="select_where",
                           column_names=["features"], data={"id": template_name})
    response = db.execute_sql(sql, database, user, password, host, port,
                              retrieving_data=True)[0][0]
    # Empty test dataframe: a single row with 0 in every feature column.
    test_frame = pd.DataFrame([[0] * len(response)], columns=response)
    # Stage the user-entered data in a temporary dataframe...
    temp_dataframe = pd.DataFrame({"DAYOFSERVICE": [date], "TIME": [time]})
    # ...add the categorical features from the weather data...
    temp_dataframe['weather_main'] = main
    temp_dataframe['weather_description'] = description
    # ...convert the date string to datetime so month/weekday can be derived...
    temp_dataframe['DAYOFSERVICE'] = [
        datetime.strptime(value, '%Y-%m-%d')
        for value in temp_dataframe['DAYOFSERVICE']
    ]
    temp_dataframe['MONTH'] = temp_dataframe['DAYOFSERVICE'].dt.month
    temp_dataframe['DAYOFWEEK'] = temp_dataframe['DAYOFSERVICE'].dt.dayofweek
    # ...derive the time-group feature from the departure time...
    temp_dataframe['TIME_GROUP'] = time_group_function(time)
    # ...and drop the raw date/time features the model doesn't use.
    temp_dataframe = temp_dataframe.drop(columns=['DAYOFSERVICE', 'TIME'])
    # One-hot encode: get_active_columns names the categorical columns that
    # must be 1 instead of 0 in the single-row test dataframe.
    active_columns = get_active_columns(temp_dataframe)
    # BUGFIX: assign through test_frame.loc instead of mutating the Series
    # yielded by .items() — writing `row.iloc[0] = 1` on that Series is
    # chained assignment and is not guaranteed to propagate back into the
    # dataframe.
    for column in test_frame.columns:
        if column in active_columns:
            test_frame.loc[0, column] = 1
    # Add the continuous weather features directly to the test dataframe.
    test_frame['temp'] = temp
    test_frame['feels_like'] = feels_like
    test_frame['wind_speed'] = wind_speed
    test_frame['wind_deg'] = wind_deg
    test_frame['rain'] = rain
    # Return the finished dataframe to the pickled linear regression.
    return test_frame
def store_incidents(incidents):
    """Persist traffic incidents that affect at least one bus route.

    For each incident that has not yet ended, finds bus routes whose path
    passes within ~500m of the incident; if any match, populates the
    incident_lookup table (one row per affected route) and stores the
    incident itself in incident_data.
    """
    # SQL for checking if an incident lies within ~500m (0.006 degrees) of a
    # bus route path.
    # NOTE(review): incident_path is interpolated straight into the SQL text;
    # if incidents come from an untrusted feed this should be parameterized.
    path_sql = """ select route_id from db_gtfs_shapes where (route_path <-> path'%s') < 0.006; """
    now = datetime.now()
    for incident in incidents:
        # Skip incidents that are already over.
        ends_at = datetime.strptime(incident["end_time"], "%m/%d/%Y %H:%M:%S")
        if now > ends_at:
            continue
        # Which bus routes does this incident's path intersect with?
        affected = db.execute_sql(path_sql % incident["incident_path"],
                                  database, user, password, host, port,
                                  retrieving_data=True)
        if not affected:
            continue
        # Populate the lookup table: one row per affected route.
        for route in affected:
            lookup_sql = db.construct_sql(
                table_name="incident_lookup",
                query_type="insert",
                data={"incident_id": incident["incident_id"],
                      "route_id": route[0]},
            )
            db.execute_sql(lookup_sql, database, user, password, host, port,
                           retrieving_data=False)
        # Store the incident itself in the incident_data table.
        data_sql = db.construct_sql(table_name="incident_data",
                                    query_type="insert", data=incident)
        db.execute_sql(data_sql, database, user, password, host, port,
                       retrieving_data=False)