from collections import deque

# Project helper modules referenced throughout; assumed importable as-is
# from this repo (adjust paths to the actual package layout):
import dbh   # database connection helper (dbh.get_db)
import defo  # timestamp/id parsing, formatting, and arithmetic helpers
import depr  # regression/prediction routines (depr.lin_reg_by_hour)
import depl  # plotting routines


def add_single_login(login_timestamp):
    """Loads one client login data point, e.g. "2012-03-01T00:05:55+00:00",
    into the database. If the hour entry exists, adds 1 to the existing
    value. Returns an error message if anything goes wrong."""
    login_dt = defo.validate_login_string(login_timestamp)
    if login_dt is None:
        return {'error': 'Invalid timestamp',
                'timestamp_example': '2012-03-01T00:05:55+00:00'}
    db = dbh.get_db()
    cur = db.cursor()
    cur.execute('SELECT * FROM login_history WHERE id=?', (login_dt,))
    match = cur.fetchone()
    print login_dt
    print 'Match is:'
    print match
    added_login = {}
    if match:
        # Hour entry exists: add 1 to the existing value
        print match['num_logins']
        cur.execute('UPDATE login_history SET num_logins=? WHERE id=?',
                    (1 + match['num_logins'], login_dt))
        added_login['update'] = 1
    else:
        # Entry does not exist: insert a fresh row for this hour
        cur.execute('INSERT INTO login_history '
                    '(id, day_name, hour, num_logins) '
                    'values (?, ?, ?, ?)',
                    (login_dt, defo.get_day_2char(login_dt),
                     defo.get_hour(login_dt), 1))
        added_login['insert'] = 1
    db.commit()
    added_login['timestamp'] = login_timestamp
    return added_login
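# Usage sketch (hypothetical values; assumes the dbh-managed database and its
# login_history table are already initialized):
#   add_single_login('2012-03-01T00:05:55+00:00')
#   -> {'insert': 1, 'timestamp': '2012-03-01T00:05:55+00:00'} on first call,
#   -> {'update': 1, 'timestamp': '2012-03-01T00:05:55+00:00'} for another
#      login falling in the same hour bucket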
def mark_outlier(outlier_id, reason='DefaultOutlier'):
    """Add analyzed (manual input) insights to the database, marking outliers
    to ignore in predictions. Returns an error message string on failure,
    None on success."""
    print "Marking outlier: %s" % outlier_id
    # Error check user input
    outlier_id = defo.validate_id(outlier_id)
    if outlier_id is None:
        return "Outlier ID Format Invalid"
    db = dbh.get_db()
    cur = db.cursor()
    cur.execute('SELECT * FROM login_history WHERE id=?', (outlier_id,))
    match = cur.fetchone()
    if not match:
        return "ID not in database"
    print "Outlier Demand=%d" % match['num_logins']
    cur.execute('SELECT * FROM history_outliers WHERE id=?', (outlier_id,))
    match = cur.fetchone()
    if match:
        # Replace the matching entry in the outlier table
        cur.execute('UPDATE history_outliers SET reason=? WHERE id=?',
                    (str(reason), outlier_id))
    else:
        cur.execute('INSERT INTO history_outliers (id, reason) values (?, ?)',
                    (outlier_id, str(reason)))
    db.commit()
    return None
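# Usage sketch (hypothetical hour id in the yyyy-mm-ddThh format the tables
# use; a None return means success):
#   err = mark_outlier('2012-03-05T14', reason='SiteOutage')
#   if err:
#       print err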
def fill_missing_hours():
    """Reads login data from the database and fills in any missing hours.
    Inserts new entries with the number of login counts set to 0."""
    db = dbh.get_db()
    cur = db.cursor()
    cur.execute('SELECT id FROM login_history ORDER BY id ASC')
    all_data = cur.fetchall()
    if all_data:
        hours_missing = False
        # Start at 1 hour before the first entry
        prev_id = defo.add_x_hours(all_data[0]['id'], -1)
        for hour in all_data:
            if hour['id'] != defo.add_x_hours(prev_id, 1):
                print 'prev: %s' % prev_id
                print 'next: %s' % hour['id']
                if defo.dy_subtract_ids(hour['id'], prev_id) < 3:
                    # Only insert 0's for missing entries when the gap
                    # between data is less than 3 days
                    missing_hour = defo.add_x_hours(prev_id, 1)
                    while missing_hour < hour['id']:
                        cur.execute('INSERT INTO login_history '
                                    '(id, day_name, hour, num_logins) '
                                    'values (?, ?, ?, ?)',
                                    (missing_hour,
                                     defo.get_day_2char(missing_hour),
                                     defo.get_hour(missing_hour), 0))
                        missing_hour = defo.add_x_hours(missing_hour, 1)
                    hours_missing = True
            prev_id = hour['id']
        if hours_missing:
            db.commit()
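# Illustration of the gap walk above (hypothetical ids): if the table holds
# '2012-03-01T05' followed by '2012-03-01T08', the loop inserts zero-count
# rows for '2012-03-01T06' and '2012-03-01T07'. A gap of 3 days or more is
# left as-is so long outages are not flattened into zeros.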
def add_multiple_logins(login_data):
    """Loads a batch of login timestamps, grouped by hour, into the database.
    Existing hour entries are incremented; missing ones are inserted."""
    login_dict = defo.datetimes_to_dict(login_data)
    if not login_dict:
        return {'error': 'No valid timestamps',
                'timestamps_example':
                '["2012-03-01T00:05:55+00:00", "2012-03-01T00:06:23+00:00"]'}
    db = dbh.get_db()
    cur = db.cursor()
    added_logins = {}
    for id_str, hour in login_dict.items():
        cur_hour = len(hour)  # simple count of logins in this hour
        cur.execute('SELECT * FROM login_history WHERE id=?', (id_str,))
        match = cur.fetchone()
        if match:
            print 'Updating hour count: %d + %d' % (match['num_logins'],
                                                    cur_hour)
            cur.execute('UPDATE login_history SET num_logins=? WHERE id=?',
                        (match['num_logins'] + cur_hour, id_str))
            added_logins['update'] = added_logins.get('update', 0) + 1
        else:
            print 'Adding %s with %d logins' % (id_str, cur_hour)
            cur.execute('INSERT INTO login_history '
                        '(id, day_name, hour, num_logins) '
                        'values (?, ?, ?, ?)',
                        (id_str, defo.get_day_2char(id_str),
                         defo.get_hour(id_str), cur_hour))
            added_logins['insert'] = added_logins.get('insert', 0) + 1
    # Commit changes
    db.commit()
    added_logins['timestamps'] = login_dict.keys()
    return added_logins
def predict_demand(year, month, day, num_days, enable_plots=None):
    """Given a populated login_history table, runs the enabled algorithms to
    produce predictions for the num_days days starting at (year, month, day).
    Returns a dict mapping hour ids to predicted login counts, or an error
    dict if something goes wrong."""
    print "Predicting Demand for %d days starting on %d/%d/%d" % (
        num_days, month, day, year)
    db = dbh.get_db()
    cur = db.cursor()
    # For now (smaller dataset), loading all 3 tables in memory is not a problem
    cur.execute('SELECT * FROM history_outliers')
    outlier_data = cur.fetchall()
    cur.execute('SELECT * FROM prediction_outliers')
    predicted_outlier_data = cur.fetchall()
    cur.execute('SELECT * FROM login_history ORDER BY id ASC')
    all_data = cur.fetchall()
    if not all_data:
        return {'error': 'No data in login_history DB'}
    if len(all_data) < 7 * 24:
        return {'error': 'Not enough data to accurately predict demand'}
    predicted_ids, predictions, predicted_slopes = depr.lin_reg_by_hour(
        all_data, outlier_data)
    cur_pred_id = defo.get_id_str(year, month, day, 0)
    end_pred_id = defo.add_x_hours(cur_pred_id, 24 * (num_days + 1))
    delta_days = defo.dy_delta_days(predicted_ids[0], cur_pred_id)
    # Filter predicted outlier ids to those within the prediction timespan
    ol_dict = {}
    demand_predictions = {}
    for row in predicted_outlier_data:
        if cur_pred_id <= row['id'] < end_pred_id:
            ol_dict[str(row['id'])] = float(row['multiplier'])
    for count in range(num_days):
        # Number of whole weeks to extrapolate beyond the regression window.
        # (Parenthesized relative to the original "delta_days + count/7",
        # which added days to weeks because of operator precedence.)
        extrap_weeks = int((delta_days + count) / 7)
        pred_data = []
        for hour in range(24):
            cur_pred_id = defo.get_id_str(year, month, day, hour)
            # Position of this hour within the weekly (24*7 slot) profile
            offset = int(defo.hr_subtract_ids(cur_pred_id,
                                              predicted_ids[0]) % (24 * 7))
            prediction = (predictions[offset]
                          + extrap_weeks * predicted_slopes[offset])
            if cur_pred_id in ol_dict:
                prediction *= ol_dict[cur_pred_id]
            pred_data.append((cur_pred_id, prediction))
            demand_predictions[cur_pred_id] = prediction
        # Add to database one day (always 24 hourly entries) at a time
        cur.executemany('INSERT or REPLACE into login_predictions '
                        '(id, num_logins) values (?, ?)', pred_data)
        # Move to the next day
        year, month, day = defo.tp_add_x_days(year, month, day, 1)
    db.commit()
    return demand_predictions
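# Worked example of the weekly-profile offset used above (plain numbers, no
# defo helpers): with the regression window starting Monday 00:00, predicting
# Tuesday 03:00 two weeks later gives an hour delta of 2*168 + 27 = 363, and
# 363 % (24*7) = 27, i.e. the Tuesday-03:00 slot of the 168-hour profile; the
# prediction is then predictions[27] + extrap_weeks*predicted_slopes[27].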
def clear_existing_predictions(year, month, day):
    """Delete all predictions associated with the input day from the
    login_predictions table."""
    if defo.validate_id(defo.get_id_str(year, month, day, 0)) is not None:
        db = dbh.get_db()
        cur = db.cursor()
        # Match every hour of the day: strip the 2-digit hour from the id and
        # put two single-character SQL wildcards in its place. (The original
        # spliced the pattern and a stray quote into the SQL string itself;
        # binding it as a parameter is the fix.)
        day_pattern = str(defo.get_id_str(year, month, day, 0))[:-2] + '__'
        cur.execute('DELETE FROM login_predictions WHERE id LIKE ?',
                    (day_pattern,))
        db.commit()
    else:
        print "Invalid Input to clear_existing_predictions"
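# Example of the LIKE pattern built above (hypothetical day): for (2012, 3, 5)
# the hour-0 id is '2012-03-05T00', the [:-2] slice leaves '2012-03-05T', and
# the pattern '2012-03-05T__' matches all 24 hourly ids for that day.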
def mark_predicted_outlier(outlier_id, multiplier, reason='DefaultOutlier'):
    """Add analyzed (manual input) insights to the database, marking
    predicted future outliers used to adjust predictions."""
    # Error check user input
    outlier_id = defo.validate_id(outlier_id)
    if outlier_id is None:
        return "Outlier ID Format Invalid"
    multiplier = float(multiplier)
    print "Marking predicted (future) outlier: %s, with %f multiplier" % (
        outlier_id, multiplier)
    db = dbh.get_db()
    cur = db.cursor()
    cur.execute('INSERT or REPLACE into prediction_outliers '
                '(id, multiplier, reason) values (?, ?, ?)',
                (outlier_id, multiplier, reason))
    db.commit()
    return None
def delete_predictions_with_actuals():
    """Finds any predicted hours that have actual data in the login_history
    table and removes the matching entries from login_predictions."""
    db = dbh.get_db()
    cur = db.cursor()
    cur.execute('SELECT id FROM login_predictions')
    pred_ids = cur.fetchall()
    cur.execute('SELECT id FROM login_history ORDER BY id ASC')
    hist_ids = cur.fetchall()
    if hist_ids:
        # Set membership keeps the lookup O(1) per prediction
        all_ids = set(x['id'] for x in hist_ids)
        for pred in pred_ids:
            if pred['id'] in all_ids:
                cur.execute('DELETE FROM login_predictions WHERE id=?',
                            (pred['id'],))
        db.commit()
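# A single-statement alternative (sketch; standard SQLite, same effect):
#   DELETE FROM login_predictions
#   WHERE id IN (SELECT id FROM login_history)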
def api_predict(num_days_to_predict):
    """Returns the predicted values that are in the database. If
    num_days_to_predict is None, returns all predictions; otherwise it
    specifies the number of predicted days to return, counted from the first
    prediction in the database. Returns an error dict if no predictions
    exist."""
    if num_days_to_predict is not None:
        if num_days_to_predict <= 0:
            return {'error': 'Number of days to predict must be positive'}
        if num_days_to_predict >= 100:
            return {'error': 'Cannot predict more than 99 days forward'}
    no_preds_error = {'error': 'No predictions in DB - '
                               'try to PUT api/predict resource first'}
    try:
        db = dbh.get_db()
        cur = db.cursor()
        if num_days_to_predict is not None:
            # Get the first prediction id and cap the span N days after it
            cur.execute('SELECT id FROM login_predictions ORDER BY id ASC')
            first_pred = cur.fetchone()
            if not first_pred:
                return no_preds_error
            last_year, last_month, last_day = defo.tp_add_x_days_to_id(
                first_pred['id'], num_days_to_predict)
            last_pred = defo.get_id_str(last_year, last_month, last_day, 0)
            cur.execute('SELECT id, num_logins FROM login_predictions '
                        'WHERE id<? ORDER BY id ASC', (last_pred,))
        else:
            cur.execute('SELECT id, num_logins FROM login_predictions '
                        'ORDER BY id ASC')
        predictions = cur.fetchall()
        if not predictions:
            return no_preds_error
        return dict((pred['id'], pred['num_logins']) for pred in predictions)
    except ValueError:
        return no_preds_error
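# Return-shape sketch (hypothetical ids and values): a successful call yields
# a flat dict of hour id -> predicted login count, e.g.
#   api_predict(1)
#   -> {'2012-04-01T00': 12.4, '2012-04-01T01': 9.8, ...}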
def run_analytics(debug=1):
    """Runs linear regression and smoothing models plus outlier
    identification, and stores the results in the database for prediction to
    use. Pass an empty array [] to turn off debug printouts."""
    db = dbh.get_db()
    cur = db.cursor()
    cur.execute('SELECT * FROM login_history ORDER BY id ASC')
    all_data = cur.fetchall()
    cur.execute('SELECT * FROM history_outliers')
    outlier_data = cur.fetchall()
    if not all_data:
        print "No data loaded in DB"
        return
    predicted_ids, predictions, predicted_slopes = depr.lin_reg_by_hour(
        all_data, outlier_data, debug)
    depl.scatter_plot(range(len(predicted_slopes)), predicted_slopes,
                      'Predicted_Slopes', 'Hour', 'Slope', predicted_ids[-1])
def api_update_predictions(num_days_to_predict):
    """Updates the predictions based on the historic logins contained in the
    database. Deletes existing predictions that have actual data for matching
    days and loads the predetermined outliers. The input parameter
    num_days_to_predict specifies the number of days to predict, starting at
    the day following the latest actual (historic) timestamp."""
    if num_days_to_predict <= 0:
        return {'error': 'Number of days to predict must be positive'}
    if num_days_to_predict >= 100:
        return {'error': 'Cannot predict more than 99 days forward'}
    delete_predictions_with_actuals()
    mark_predetermined_outliers()
    db = dbh.get_db()
    cur = db.cursor()
    # Get the latest id so predictions can start on the following day
    cur.execute('SELECT id FROM login_history ORDER BY id DESC')
    latest = cur.fetchone()
    if latest:
        next_year, next_month, next_day = defo.tp_add_x_days_to_id(
            latest['id'], 1)
        return predict_demand(next_year, next_month, next_day,
                              num_days_to_predict)
    else:
        return {'error': 'No data in login_history DB'}
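# Usage sketch: api_update_predictions(15) drops predictions already covered
# by actuals, re-applies the stored prediction outliers, and regenerates 15
# days of hourly predictions starting the day after the newest history row.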
def plot_logins():
    """Uses the loaded history of client login data to create plots, which
    are saved within the predict_demand/plots folder. Used for manual
    analysis."""
    print "Running analytics on DB\n"
    db = dbh.get_db()
    cur = db.cursor()
    cur.execute('SELECT * FROM login_history ORDER BY id ASC')
    all_data = cur.fetchall()
    if not all_data:
        print "No data loaded in DB"
        return
    # Tabulate by hour
    hour_x = []
    hour_y = []
    base_day = None
    ## Get first predicted day (1 day past last history day)
    pred_year, pred_month, pred_day = defo.tp_add_x_days_to_id(
        all_data[-1]['id'], 1)
    ## Plot trends per day over time (for first week predictions)
    for i in range(7):
        pred_day_str = defo.get_day_str(pred_year, pred_month, pred_day)
        hist_day = [x for x in all_data if x['day_name'] == pred_day_str]
        pred_id = defo.get_id_str(pred_year, pred_month, pred_day, 0)
        depl.plot_day_trend(pred_id, hist_day)
        pred_year, pred_month, pred_day = defo.tp_add_x_days(
            pred_year, pred_month, pred_day, 1)
    # Weekday analysis
    depl.plot_weekdays([(x['id'], x['num_logins']) for x in all_data
                        if x['day_name'] in ['Mo', 'Tu', 'We', 'Th']])
    ## Tabulate by day
    depl.plot_each_day(all_data)
    # Create a dictionary per weekday, keyed by each hour of that day, with
    # values that are lists of (id, count) tuple pairs
    cur.execute('SELECT MAX(num_logins) FROM login_history')
    max_login = cur.fetchone()[0]
    weekdays = [('Mo', '1_Monday'), ('Tu', '2_Tuesday'),
                ('We', '3_Wednesday'), ('Th', '4_Thursday'),
                ('Fr', '5_Friday'), ('Sa', '6_Saturday'), ('Su', '7_Sunday')]
    for day_2char, day_label in weekdays:
        hours_dict = defo.get_hours_dict()
        for x in all_data:
            if x['day_name'] == day_2char:
                hours_dict[x['hour']].append((x['id'], x['num_logins']))
        depl.plot_day_dict(hours_dict, day_label, max_login)
    day_dict = {}
    base_year = None
    ## Tabulate by week
    # Track the most recent run of 7 consecutive days
    last_complete_week_id = None
    temp_complete = deque([])
    min_id = None
    for entry in all_data:
        # Collect ids of the most recent consecutive week
        if not temp_complete:
            temp_complete.append(entry['id'])
        else:
            day_delta = defo.dy_subtract_ids(entry['id'], temp_complete[-1])
            if day_delta == 1:
                temp_complete.append(entry['id'])  # add subsequent day
                if len(temp_complete) > 7:
                    temp_complete.popleft()
                # Save the id of a day (hour 0) that has a complete week's
                # worth of previous data
                if len(temp_complete) == 7:
                    last_complete_week_id = entry['id']
            elif day_delta != 0:
                # Current entry is not the same or a subsequent day
                temp_complete = deque([entry['id']])
        # Save minimum (start) ID key
        if min_id is None or min_id > entry['id']:
            min_id = entry['id']
        # Key the day dictionary by day of year; handle multiple years in the
        # database (years assumed monotonically increasing, leap years ignored)
        year_str = defo.get_year(entry['id'])
        if base_year is None:
            base_year = year_str
        if year_str != base_year:
            days = (int(year_str) - int(base_year)) * 365
        else:
            days = 0
        days = days + int(defo.get_day_of_year(entry['id']))
        day_dict[days] = day_dict.get(days, 0) + entry['num_logins']
        if base_day is None:
            base_day = days
        hours = (days - base_day) * 24 + entry['hour']
        hour_x.append(hours)
        hour_y.append(entry['num_logins'])
    depl.scatter_plot(hour_x, hour_y)
    depl.plot_by_day(day_dict)
    # Need at least 1 consecutive week's worth of data
    if last_complete_week_id is not None:
        print 'Plotting week data...'
        end_id = last_complete_week_id[:-2] + '23'  # last hour of the day
        start_id = defo.subtract_one_week(end_id)
        print "%s to %s" % (start_id, end_id)
        # Plot full weeks starting at the latest complete week, where complete
        # means there is at least one data point for 7 consecutive days.
        # Lexicographical (default) string comparison works with the id
        # format yyyy-mm-ddThh
        while end_id > min_id:
            cur.execute('SELECT id, num_logins FROM login_history '
                        'WHERE id>? AND id<=? ORDER BY id ASC',
                        (start_id, end_id))
            wk_data = cur.fetchall()  # one week of data, oldest first
            # Compute the hour deltas as negative x values so the most recent
            # point lands on the right
            if wk_data:
                last_time = wk_data[-1][0]
                time_delta = [defo.hr_subtract_ids(entry[0], last_time)
                              for entry in wk_data]
                id_list, val_list = [list(entry) for entry in zip(*wk_data)]
                depl.plot_by_week(time_delta, val_list, id_list, max_login)
            end_id = start_id
            start_id = defo.subtract_one_week(end_id)
    else:
        print('WARNING: Database does not have continuous week of data')
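# Note on the week windows above (hypothetical ids): subtract_one_week walks
# the plot range backwards in 168-hour steps, so e.g. end_id '2012-04-14T23'
# pairs with start_id '2012-04-07T23', and the query grabs the half-open
# (start, end] span of hourly rows.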
def plot_predictions(update_plots=None):
    """Updates the predictions (if update_plots is not None), which also
    plots the linear regression predictions against past data, then plots
    (saved to file) each predicted day in login_predictions."""
    db = dbh.get_db()
    cur = db.cursor()
    if update_plots is not None:
        num_days_predicted = 15
        delete_predictions_with_actuals()
        # Find the start day for predictions (= 1 + last day of actuals)
        cur.execute('SELECT id FROM login_history ORDER BY id DESC')
        latest = cur.fetchone()
        if latest:
            start_year, start_month, start_day = defo.tp_add_x_days_to_id(
                latest[0], 1)
            predict_demand(start_year, start_month, start_day,
                           num_days_predicted, 1)
    cur.execute('SELECT * FROM login_predictions ORDER BY id ASC')
    pred_data = cur.fetchall()
    if not pred_data:
        print "No predictions in database! Nothing to plot"
        return
    # Group predictions by day; track the max prediction so every plot can
    # share the same y-axis scale
    pred_dict = {}
    max_pred = 0
    for hours in pred_data:
        if hours['num_logins'] > max_pred:
            max_pred = hours['num_logins']
        day_id = hours['id'][:-3]
        pred_dict.setdefault(day_id, []).append(hours)
    for day_id, pred_list in pred_dict.items():
        depl.plot_single_day(
            pred_list,
            'predicted/' + defo.get_year_month_day_str(day_id + 'T00'),
            max_pred)
    pred_start = pred_data[0]['id']
    pred_end = pred_data[-1]['id']
    pred_week_start = defo.subtract_one_week(pred_end)
    # Plot fully predicted weeks, newest window first
    while pred_week_start > pred_start:
        pred_y = [x['num_logins'] for x in pred_data
                  if pred_week_start < x['id'] <= pred_end]
        pred_id = [x['id'] for x in pred_data
                   if pred_week_start < x['id'] <= pred_end]
        if pred_id:
            depl.plot_by_week(x_list=range(-1 * len(pred_y), 0),
                              y_list=pred_y, id_list=pred_id,
                              fix_y=max_pred, predicted_color=1,
                              savename='predicted/Week_' + pred_end[:10])
        pred_end = pred_week_start
        pred_week_start = defo.subtract_one_week(pred_end)
    if pred_end > pred_start:
        # Plot the remaining window: part predicted, part actual
        hist_start = defo.subtract_one_week(pred_end)
        cur.execute('SELECT id, num_logins FROM login_history '
                    'WHERE id>? ORDER BY id ASC', (hist_start,))
        hist_data = cur.fetchall()
        if hist_data:
            # Actual and predicted ids should not overlap
            hist_y = [x['num_logins'] for x in hist_data]
            hist_id = [x['id'] for x in hist_data]
            pred_y = [x['num_logins'] for x in pred_data
                      if x['id'] <= pred_end]
            pred_id = [x['id'] for x in pred_data if x['id'] <= pred_end]
            plot_y = hist_y + pred_y
            depl.plot_by_week(x_list=range(-1 * len(plot_y), 0),
                              y_list=plot_y, id_list=hist_id + pred_id,
                              fix_y=max_pred,
                              savename='predicted/Week_' + pred_end[:10],
                              split=len(hist_y))