def draw_stacked_chart(df, file_name, minimum_feature_contribution):
    """Save a stacked bar chart of the significant features of ``df`` as HTML.

    The significant columns are chosen by
    ``va_utils.get_significant_features``; every other feature is shown as a
    single combined series named ``'everything-else'``. The chart is written
    to ``<cwd>/<OUTPUT_DIR_NAME>/<file_name minus last 4 chars>_stacked_chart.html``.

    Args:
        df: DataFrame with a ``'Date'`` column plus one column per feature.
            The first and last rows (by position) supply the date range
            shown in the chart title.
        file_name: Name of the input file. Its last four characters are
            dropped to form the feature name -- presumably a 3-letter
            extension such as ``.csv``; confirm against callers.
        minimum_feature_contribution: Threshold passed through unchanged to
            ``va_utils.get_significant_features``.

    Returns:
        None. The chart is written to disk and the location is logged.
    """
    logger.info('drawing stacked chart for ' + file_name)

    # The middle element of the returned triple is not needed for the chart.
    col_list, _remaining_features, remaining_features_values = \
        va_utils.get_significant_features(df, minimum_feature_contribution)

    dates = df['Date']
    x_labels = list(dates)

    # OrderedDict keeps the series in the order the columns were returned,
    # with the combined remainder added last.
    data = OrderedDict()
    for col in col_list:
        data[col] = df[col]
    data['everything-else'] = remaining_features_values

    # Strip the trailing 4 characters once and reuse the result for both the
    # chart title and the output file name.
    feature_name = file_name[:-4]
    # os.getcwd() is already absolute, so no leading separator is needed.
    chart_file_name = os.path.join(os.getcwd(), OUTPUT_DIR_NAME,
                                   feature_name + '_stacked_chart.html')
    output_file(chart_file_name)

    # Positional access: label-based df['Date'][0] breaks (KeyError or wrong
    # row) whenever df does not carry a default 0..n-1 index.
    title = (feature_name.upper() + ' distribution from ' +
             str(dates.iloc[0]) + ' to ' + str(dates.iloc[-1]))

    bar = Bar(data, x_labels, title=title, stacked=True, legend='bottom_left',
              tools='hover,pan,wheel_zoom,box_zoom,reset,resize',
              width=1300, height=500)
    save(bar)
    logger.info('saved the chart in ' + chart_file_name)
def model_tsa(df, file_name, minimum_feature_contribution): #first get a list of features to model #each significant feature (protocol or application) would be modeled as an ARIMA (auto regressive moving average) #the modeling artifacts i.e. charts, sumary etc would be stored in a directory by feature name logger.info('Begin feature extraction...') col_list, remaining_features, remaining_features_values = va_utils.get_significant_features(df, minimum_feature_contribution) #store the results in a dataframe df_output = pd.DataFrame() feature_name_list = [] model_name_list = [] MAE_list = [] model_selection_list = [] predicted_col_list_with_data = ['Date'] + col_list df_predictions = pd.DataFrame(columns = predicted_col_list_with_data ) logger.info('columns in oredicted df...') logger.info(predicted_col_list_with_data) df3 = df_predictions #df_w_predictions = copy.deepcopy(df[col_list]) for col in col_list: logger.info('-------- modeling ' + col + '-------------') curr_dir = os.getcwd() try: feature, model_names, models, results, MAE, predicted_dates, predicted, model_selection = model_feature(file_name, df, col) feature_name_list += [feature]*len(model_names) model_name_list += model_names MAE_list += MAE model_selection_list += model_selection df_predictions['Date'] = predicted_dates df_predictions[feature] = predicted except Exception, e: logger.info('Could not model feature ' + col) logger.info('Exception: ' + str(e)) os.chdir(curr_dir)