def Predict(self):
    """Score data with a trained Decision Tree Regression model and run follow-up analyses.

    Depending on ``self._mlEnv`` the model is loaded either as a Spark
    pipeline + model pair or as a pickled sklearn estimator.  The scored data
    is written as CSV to the configured score path, and the Descriptive Stats
    and Two-Way ANOVA scripts are then run over the scored Spark dataframe.
    Progress messages are emitted at each stage via CommonUtils.
    """
    self._scriptWeightDict = self._dataframe_context.get_ml_model_prediction_weight()
    # Relative stage weights used for progress reporting (weights sum to 10).
    self._scriptStages = {
        "initialization": {
            "summary": "Initialized the Decision Tree Regression Scripts",
            "weight": 2
        },
        "predictionStart": {
            "summary": "Decision Tree Regression Model Prediction Started",
            "weight": 2
        },
        "predictionFinished": {
            "summary": "Decision Tree Regression Model Prediction Finished",
            "weight": 6
        }
    }
    CommonUtils.create_update_and_save_progress_message(self._dataframe_context, self._scriptWeightDict, self._scriptStages, self._slug, "initialization", "info", display=True, emptyBin=False, customMsg=None, weightKey="total")
    SQLctx = SQLContext(sparkContext=self._spark.sparkContext, sparkSession=self._spark)
    dataSanity = True
    # Categorical columns exclude the uid column (when it is an ignored
    # suggestion) and all date columns.
    categorical_columns = self._dataframe_helper.get_string_columns()
    uid_col = self._dataframe_context.get_uid_column()
    if self._metaParser.check_column_isin_ignored_suggestion(uid_col):
        categorical_columns = list(set(categorical_columns) - {uid_col})
    allDateCols = self._dataframe_context.get_date_columns()
    categorical_columns = list(set(categorical_columns) - set(allDateCols))
    numerical_columns = self._dataframe_helper.get_numeric_columns()
    result_column = self._dataframe_context.get_result_column()
    test_data_path = self._dataframe_context.get_input_file()
    if self._mlEnv == "spark":
        score_data_path = self._dataframe_context.get_score_path() + "/data.csv"
        trained_model_path = "file://" + self._dataframe_context.get_model_path()
        trained_model_path += "/model"
        # The preprocessing pipeline is stored as a sibling of the model dir.
        pipeline_path = "/".join(trained_model_path.split("/")[:-1]) + "/pipeline"
        print("trained_model_path", trained_model_path)
        print("pipeline_path", pipeline_path)
        print("score_data_path", score_data_path)
        pipelineModel = MLUtils.load_pipeline(pipeline_path)
        trained_model = MLUtils.load_dtree_regresssion_pyspark_model(trained_model_path)
        df = self._data_frame
        indexed = pipelineModel.transform(df)
        transformed = trained_model.transform(indexed)
        # Preserve the original target (if present) as "originalLabel" and
        # expose the model's "prediction" column under the result column name.
        if result_column in transformed.columns:
            transformed = transformed.withColumnRenamed(result_column, "originalLabel")
        transformed = transformed.withColumnRenamed("prediction", result_column)
        pandas_scored_df = transformed.select(list(set(self._data_frame.columns + [result_column]))).toPandas()
        # Strip a leading "file://" scheme before handing the path to pandas.
        if score_data_path.startswith("file"):
            score_data_path = score_data_path[7:]
        pandas_scored_df.to_csv(score_data_path, header=True, index=False)
        print("STARTING Measure ANALYSIS ...")
        columns_to_keep = []
        columns_to_drop = []
        columns_to_keep = self._dataframe_context.get_score_consider_columns()
        if len(columns_to_keep) > 0:
            columns_to_drop = list(set(df.columns) - set(columns_to_keep))
        else:
            columns_to_drop += ["predicted_probability"]
        columns_to_drop = [x for x in columns_to_drop if x in df.columns and x != result_column]
        print("columns_to_drop", columns_to_drop)
        spark_scored_df = transformed.select(list(set(columns_to_keep + [result_column])))
    elif self._mlEnv == "sklearn":
        CommonUtils.create_update_and_save_progress_message(self._dataframe_context, self._scriptWeightDict, self._scriptStages, self._slug, "predictionStart", "info", display=True, emptyBin=False, customMsg=None, weightKey="total")
        score_data_path = self._dataframe_context.get_score_path() + "/data.csv"
        trained_model_path = "file://" + self._dataframe_context.get_model_path()
        trained_model_path += "/" + self._dataframe_context.get_model_for_scoring() + ".pkl"
        print("trained_model_path", trained_model_path)
        print("score_data_path", score_data_path)
        if trained_model_path.startswith("file"):
            trained_model_path = trained_model_path[7:]
        trained_model = joblib.load(trained_model_path)
        model_columns = self._dataframe_context.get_model_features()
        print("model_columns", model_columns)
        df = self._data_frame.toPandas()
        # One-hot encode categoricals and align the frame with the training
        # feature set before predicting.
        pandas_df = MLUtils.create_dummy_columns(df, [x for x in categorical_columns if x != result_column])
        pandas_df = MLUtils.fill_missing_columns(pandas_df, model_columns, result_column)
        if uid_col:
            pandas_df = pandas_df[[x for x in pandas_df.columns if x != uid_col]]
        y_score = trained_model.predict(pandas_df)
        scoreKpiArray = MLUtils.get_scored_data_summary(y_score)
        kpiCard = NormalCard()
        kpiCardData = [KpiData(data=x) for x in scoreKpiArray]
        kpiCard.set_card_data(kpiCardData)
        # NOTE(review): "cente" looks misspelt but must match the NormalCard API.
        kpiCard.set_cente_alignment(True)
        print(CommonUtils.convert_python_object_to_json(kpiCard))
        self._result_setter.set_kpi_card_regression_score(kpiCard)
        pandas_df[result_column] = y_score
        df[result_column] = y_score
        df.to_csv(score_data_path, header=True, index=False)
        CommonUtils.create_update_and_save_progress_message(self._dataframe_context, self._scriptWeightDict, self._scriptStages, self._slug, "predictionFinished", "info", display=True, emptyBin=False, customMsg=None, weightKey="total")
        print("STARTING Measure ANALYSIS ...")
        columns_to_keep = []
        columns_to_drop = []
        columns_to_keep = self._dataframe_context.get_score_consider_columns()
        if len(columns_to_keep) > 0:
            columns_to_drop = list(set(df.columns) - set(columns_to_keep))
        else:
            columns_to_drop += ["predicted_probability"]
        columns_to_drop = [x for x in columns_to_drop if x in df.columns and x != result_column]
        print("columns_to_drop", columns_to_drop)
        pandas_scored_df = df[list(set(columns_to_keep + [result_column]))]
        spark_scored_df = SQLctx.createDataFrame(pandas_scored_df)
        # spark_scored_df.write.csv(score_data_path+"/data",mode="overwrite",header=True)
    # TODO update metadata for the newly created dataframe
    self._dataframe_context.update_consider_columns(columns_to_keep)
    # printSchema() prints the schema itself and returns None.
    print(spark_scored_df.printSchema())
    df_helper = DataFrameHelper(spark_scored_df, self._dataframe_context, self._metaParser)
    df_helper.set_params()
    df = df_helper.get_data_frame()
    try:
        fs = time.time()
        descr_stats_obj = DescriptiveStatsScript(df, df_helper, self._dataframe_context, self._result_setter, self._spark, self._prediction_narrative, scriptWeight=self._scriptWeightDict, analysisName="Descriptive analysis")
        descr_stats_obj.Run()
        print("DescriptiveStats Analysis Done in ", time.time() - fs, " seconds.")
    except:
        # Best-effort: a failed narrative analysis must not abort scoring.
        print("Frequency Analysis Failed ")
    try:
        fs = time.time()
        two_way_obj = TwoWayAnovaScript(df, df_helper, self._dataframe_context, self._result_setter, self._spark, self._prediction_narrative, self._metaParser, scriptWeight=self._scriptWeightDict, analysisName="Measure vs. Dimension")
        two_way_obj.Run()
        print("OneWayAnova Analysis Done in ", time.time() - fs, " seconds.")
    except:
        print("Anova Analysis Failed")
def __init__(self, df_helper, df_context, result_setter, spark, story_narrative, meta_parser):
    """Build the Trend narrative: detect the date column/format, then generate
    measure- or dimension-trend cards (including a Holt-Winters forecast for
    measures) and attach them to the story narrative tree.

    Fixes applied in this revision: the failure branches previously referenced
    the undefined local names ``completionStatus`` and ``messageURL`` (raising
    NameError exactly when an error was being reported); they now use
    ``self._completionStatus`` and ``self._messageURL``.
    """
    self._story_narrative = story_narrative
    self._result_setter = result_setter
    self._spark = spark
    self._dataframe_helper = df_helper
    self._dataframe_context = df_context
    self._pandas_flag = df_context._pandas_flag
    self._data_frame = df_helper.get_data_frame()
    self._num_significant_digits = NarrativesUtils.get_significant_digit_settings("trend")
    self._metaParser = meta_parser
    self._result_column = self._dataframe_context.get_result_column()
    self._string_columns = self._dataframe_helper.get_string_columns()
    self._timestamp_columns = self._dataframe_helper.get_timestamp_columns()
    self._selected_date_columns = self._dataframe_context.get_selected_date_columns()
    self._all_date_columns = self._dataframe_context.get_date_columns()
    # Date columns are not treated as plain dimensions.
    self._string_columns = list(set(self._string_columns) - set(self._all_date_columns))
    self._dateFormatDetected = False
    self._existingDateFormat = None
    self._dateFormatConversionDict = NarrativesUtils.date_formats_mapping_dict()
    self._dateColumnFormatDict = df_context.get_date_format_dict()
    if self._dataframe_context.get_requested_date_format() != None:
        self._requestedDateFormat = df_context.get_requested_date_format()
    else:
        self._requestedDateFormat = None
    self._analysistype = self._dataframe_context.get_analysis_type()
    self._trendSettings = self._dataframe_context.get_trend_settings()
    # When the trend is configured on a specific measure (not "Count"), a
    # dimension analysis is re-targeted as a measure analysis on that column.
    self._trendSpecificMeasure = False
    if self._trendSettings != None:
        if self._analysistype == "dimension" and self._trendSettings["name"] != "Count":
            self._trendSpecificMeasure = True
            self._analysistype = "measure"
            self._result_column = self._trendSettings["selectedMeasure"]
        elif self._analysistype == "measure" and self._trendSettings["name"] != "Count":
            self._result_column = self._trendSettings["selectedMeasure"]
    self._trend_subsection = self._result_setter.get_trend_section_name()
    self._regression_trend_card = None
    self._blockSplitter = GLOBALSETTINGS.BLOCKSPLITTER
    self._highlightFlag = "|~HIGHLIGHT~|"
    self._trend_on_td_column = False
    self._number_of_dimensions_to_consider = 10
    self._completionStatus = self._dataframe_context.get_completion_status()
    self._analysisName = self._dataframe_context.get_analysis_name()
    self._messageURL = self._dataframe_context.get_message_url()
    # Progress-stage weights depend on the analysis type.
    if self._analysistype == "dimension":
        self._scriptWeightDict = self._dataframe_context.get_dimension_analysis_weight()
        self._scriptStages = {
            "initialization": {
                "summary": "Initialized The Frequency Narratives",
                "weight": 0
            },
            "summarygeneration": {
                "summary": "Summary Generation Finished",
                "weight": 4
            },
            "completion": {
                "summary": "Frequency Stats Narratives Done",
                "weight": 0
            },
        }
    elif self._analysistype == "measure":
        if self._trendSpecificMeasure:
            self._scriptWeightDict = self._dataframe_context.get_dimension_analysis_weight()
        else:
            self._scriptWeightDict = self._dataframe_context.get_measure_analysis_weight()
        self._scriptStages = {
            "trendNarrativeStart": {
                "summary": "Started The Descriptive Stats Narratives",
                "weight": 1
            },
            "trendNarrativeEnd": {
                "summary": "Narratives For Descriptive Stats Finished",
                "weight": 0
            },
        }
    self._base_dir = "/trend/"
    # Pandas-only fallback: if no date format metadata exists, sniff a format
    # for each selected date column from its values.
    if self._pandas_flag and self._selected_date_columns and not self._dateColumnFormatDict and not self._timestamp_columns:
        for column in self._selected_date_columns:
            uniqueVals = self._data_frame[column].astype(str).unique().tolist()
            metaHelperInstance = MetaDataHelper(self._data_frame, self._data_frame.shape[0])
            if len(uniqueVals) > 0 and metaHelperInstance.get_datetime_format_pandas([self._data_frame.sort_values(by=column, ascending=False)[column][0]]) != None:
                dateColumnFormat = metaHelperInstance.get_datetime_format_pandas(uniqueVals)
                self._dateColumnFormatDict.update({column: dateColumnFormat})
    dateColCheck = NarrativesUtils.check_date_column_formats(self._selected_date_columns,
                                                            self._timestamp_columns,
                                                            self._dateColumnFormatDict,
                                                            self._dateFormatConversionDict,
                                                            self._requestedDateFormat)
    print(dateColCheck)
    self._dateFormatDetected = dateColCheck["dateFormatDetected"]
    self._trend_on_td_column = dateColCheck["trendOnTdCol"]
    if self._dateFormatDetected:
        self._requestedDateFormat = dateColCheck["requestedDateFormat"]
        self._existingDateFormat = dateColCheck["existingDateFormat"]
        # self._date_column_suggested is the column used for trend
        self._date_column_suggested = dateColCheck["suggestedDateColumn"]
    if self._existingDateFormat:
        self._data_frame, dataRangeStats = NarrativesUtils.calculate_data_range_stats(self._data_frame, self._existingDateFormat, self._date_column_suggested, self._trend_on_td_column, self._pandas_flag)
        print(dataRangeStats)
        self._durationString = dataRangeStats["durationString"]
        self._duration = dataRangeStats["duration"]
        # "day" or "month" — drives grouping and date formatting below.
        self._dataLevel = dataRangeStats["dataLevel"]
        first_date = dataRangeStats["firstDate"]
        last_date = dataRangeStats["lastDate"]
    if self._timestamp_columns != None:
        if self._selected_date_columns == None:
            self._selected_date_columns = self._timestamp_columns
        else:
            self._selected_date_columns += self._timestamp_columns
    # Regression trend subsection (card3) — Spark path only.
    if self._pandas_flag:
        pass
    else:
        if self._trend_subsection == "regression":
            if self._selected_date_columns != None:
                if self._dateFormatDetected:
                    trend_subsection_data = self._result_setter.get_trend_section_data()
                    measure_column = trend_subsection_data["measure_column"]
                    result_column = trend_subsection_data["result_column"]
                    base_dir = trend_subsection_data["base_dir"]
                    card3heading = 'How ' + result_column + ' and ' + measure_column + ' changed over time'
                    if self._dataLevel == "day":
                        grouped_data = self._data_frame.groupBy("suggestedDate").agg({measure_column: 'sum', result_column: 'sum'})
                        # agg() output column order is not fixed; rename by position.
                        grouped_data = grouped_data.withColumnRenamed(grouped_data.columns[-1], result_column)
                        grouped_data = grouped_data.withColumnRenamed(grouped_data.columns[-2], measure_column)
                        grouped_data = grouped_data.withColumn("year_month", udf(lambda x: x.strftime("%b-%y"))("suggestedDate"))
                        grouped_data = grouped_data.orderBy("suggestedDate", ascending=True)
                        grouped_data = grouped_data.withColumnRenamed(grouped_data.columns[0], "key")
                        grouped_data = grouped_data.toPandas()
                    elif self._dataLevel == "month":
                        grouped_data = self._data_frame.groupBy("year_month").agg({measure_column: 'sum', result_column: 'sum'})
                        grouped_data = grouped_data.withColumnRenamed(grouped_data.columns[-1], result_column)
                        grouped_data = grouped_data.withColumnRenamed(grouped_data.columns[-2], measure_column)
                        grouped_data = grouped_data.withColumn("suggestedDate", udf(lambda x: datetime.strptime(x, "%b-%y"))("year_month"))
                        grouped_data = grouped_data.orderBy("suggestedDate", ascending=True)
                        grouped_data = grouped_data.withColumnRenamed("suggestedDate", "key")
                        grouped_data = grouped_data.select(["key", measure_column, result_column, "year_month"]).toPandas()
                        grouped_data["key"] = grouped_data["year_month"].apply(lambda x: datetime.strptime(x, "%b-%y").date())
                    trend_narrative_obj = TrendNarrative(self._result_column, self._date_column_suggested, grouped_data, self._existingDateFormat, self._requestedDateFormat, self._base_dir, self._metaParser)
                    card3data = trend_narrative_obj.generate_regression_trend_data(grouped_data, measure_column, result_column, self._dataLevel, self._durationString)
                    card3narrative = NarrativesUtils.get_template_output(base_dir,
                                                                        'regression_card3.html', card3data)
                    card3chart = trend_narrative_obj.generate_regression_trend_chart(grouped_data, self._dataLevel)
                    card3paragraphs = NarrativesUtils.paragraph_splitter(card3narrative)
                    card2 = {'charts': card3chart, 'paragraphs': card3paragraphs, 'heading': card3heading}
                    self.set_regression_trend_card_data(card2)
                else:
                    print("NO DATE FORMAT DETECTED")
            else:
                print("NO DATE COLUMNS PRESENT")
    if self._analysistype == "measure":
        self._completionStatus += old_div(self._scriptWeightDict[self._analysisName]["total"] * self._scriptStages["trendNarrativeStart"]["weight"], 10)
        progressMessage = CommonUtils.create_progress_message_object(self._analysisName,
                                                                    "trendNarrativeStart",
                                                                    "info",
                                                                    self._scriptStages["trendNarrativeStart"]["summary"],
                                                                    self._completionStatus,
                                                                    self._completionStatus)
        CommonUtils.save_progress_message(self._messageURL, progressMessage)
        self._dataframe_context.update_completion_status(self._completionStatus)
        # self._startMeasureTrend = self._result_setter.get_trend_section_completion_status()
        self._startMeasureTrend = True
        if self._startMeasureTrend == True:
            self.narratives = {
                "SectionHeading": "",
                "card1": {},
                "card2": {},
                "card3": {}
            }
            if self._selected_date_columns != None:
                if self._dateFormatDetected:
                    grouped_data = NarrativesUtils.get_grouped_data_for_trend(self._data_frame, self._dataLevel, self._result_column, self._analysistype, self._pandas_flag)
                    if self._pandas_flag:
                        self._data_frame = self._data_frame.drop(self._date_column_suggested, axis=1)
                    else:
                        self._data_frame = self._data_frame.drop(self._date_column_suggested)
                    # Rank dimensions by significance; fall back to the first
                    # string columns when no significance info is available.
                    significant_dimensions = []
                    significant_dimension_dict = df_helper.get_significant_dimension()
                    if significant_dimension_dict != {} and significant_dimension_dict != None:
                        significant_dimension_tuple = tuple(significant_dimension_dict.items())
                        significant_dimension_tuple = sorted(significant_dimension_tuple, key=lambda x: x[1], reverse=True)
                        significant_dimensions = [x[0] for x in significant_dimension_tuple[:self._number_of_dimensions_to_consider]]
                    else:
                        significant_dimensions = self._string_columns[:self._number_of_dimensions_to_consider]
                    print("significant_dimensions", significant_dimensions)
                    trend_narrative_obj = TrendNarrative(self._result_column, self._date_column_suggested, grouped_data, self._existingDateFormat, self._requestedDateFormat, self._base_dir, self._metaParser)
                    dataDict = trend_narrative_obj.generateDataDict(grouped_data, self._dataLevel, self._durationString)
                    reference_time = dataDict["reference_time"]
                    dataDict["duration"] = self._duration
                    dataDict["dataLevel"] = self._dataLevel
                    dataDict["durationString"] = self._durationString
                    dataDict["significant_dimensions"] = significant_dimensions
                    if len(significant_dimensions) > 0:
                        if self._dataLevel == "day":
                            datetimeformat = self._existingDateFormat
                        elif self._dataLevel == "month":
                            datetimeformat = "%b-%y"
                        xtraData = trend_narrative_obj.get_xtra_calculations(self._data_frame, grouped_data, significant_dimensions, self._date_column_suggested, self._result_column, datetimeformat, reference_time, self._dataLevel, self._pandas_flag)
                        if xtraData != None:
                            dataDict.update(xtraData)
                    self._result_setter.update_executive_summary_data(dataDict)
                    dataDict.update({
                        "blockSplitter": self._blockSplitter,
                        "highlightFlag": self._highlightFlag
                    })
                    summary1 = NarrativesUtils.get_template_output(self._base_dir,
                                                                  'measure_trend_card1.html', dataDict)
                    summary2 = NarrativesUtils.get_template_output(self._base_dir,
                                                                  'measure_trend_card2.html', dataDict)
                    measureTrendCard = NormalCard()
                    measureTrendcard1Data = NarrativesUtils.block_splitter(summary1, self._blockSplitter, highlightFlag=self._highlightFlag)
                    measureTrendcard2Data = NarrativesUtils.block_splitter(summary2, self._blockSplitter)
                    bubbledata = dataDict["bubbleData"]
                    card1BubbleData = "<div class='col-md-6 col-xs-12 xs-p-20'><h2 class='text-center'><span>{}</span><br/><small>{}</small></h2></div><div class='col-md-6 col-xs-12 xs-p-20'><h2 class='text-center'><span>{}</span><br/><small>{}</small></h2></div>".format(
                        bubbledata[0]["value"], bubbledata[0]["text"],
                        bubbledata[1]["value"], bubbledata[1]["text"])
                    trend_chart_data = list(grouped_data[["key", "value"]].T.to_dict().values())
                    trend_chart_data = sorted(trend_chart_data, key=lambda x: x["key"])
                    card1chartdata = {"actual": [], "predicted": []}
                    if self._dataLevel == "day":
                        card1chartdata["actual"] = [{"key": str(val["key"]), "value": val["value"]} for val in trend_chart_data]
                    elif self._dataLevel == "month":
                        card1chartdata["actual"] = [{"key": val["key"].strftime("%b-%y"), "value": val["value"]} for val in trend_chart_data]
                    # Forecast horizon: 3 periods under a year of data, else 6.
                    if self._duration < 365:
                        prediction_window = 3
                    else:
                        prediction_window = 6
                    predicted_values = trend_narrative_obj.get_forecast_values(grouped_data["value"], prediction_window)[len(grouped_data["value"]):]
                    predicted_values = [round(x, self._num_significant_digits) for x in predicted_values]
                    # Seed the forecast series with the last observed point so
                    # the chart lines join up.
                    forecasted_data = []
                    forecasted_data.append(card1chartdata["actual"][-1])
                    forecasted_dates = []
                    if self._dataLevel == "month":
                        forecast_start_time = datetime.strptime(card1chartdata["actual"][-1]["key"], "%b-%y")
                    elif self._dataLevel == "day":
                        try:
                            forecast_start_time = datetime.strptime(card1chartdata["actual"][-1]["key"], "%Y-%m-%d")
                        except:
                            forecast_start_time = datetime.strptime(card1chartdata["actual"][-1]["key"], '%Y-%m-%d %H:%M:%S')
                    for val in range(prediction_window):
                        if self._dataLevel == "month":
                            key = forecast_start_time + relativedelta(months=1 + val)
                            forecasted_dates.append(key)
                        elif self._dataLevel == "day":
                            key = forecast_start_time + relativedelta(days=1 + val)
                            forecasted_dates.append(key)
                    forecasted_list = list(zip(forecasted_dates, predicted_values))
                    if self._dataLevel == "month":
                        forecasted_list = [{"key": val[0].strftime("%b-%y"), "value": val[1]} for val in forecasted_list]
                    elif self._dataLevel == "day":
                        forecasted_list = [{"key": val[0].strftime("%Y-%m-%d"), "value": val[1]} for val in forecasted_list]
                    forecasted_data += forecasted_list
                    card1chartdata["predicted"] = forecasted_data
                    card1chartdata = ScatterChartData(data=card1chartdata)
                    chartJson = ChartJson()
                    chartJson.set_data(card1chartdata.get_data())
                    chartJson.set_label_text({'x': ' ', 'y': 'No. of Observations'})
                    chartJson.set_legend({"actual": "Observed", "predicted": "Forecast"})
                    chartJson.set_chart_type("scatter_line")
                    chartJson.set_axes({"x": "key", "y": "value"})
                    chartJson.set_yaxis_number_format(".2f")
                    st_info = ["Trend Analysis", "Forecast Method : Holt Winters Method"]
                    measureTrendcard1Data.insert(1, C3ChartData(data=chartJson, info=st_info))
                    measureTrendcard1Data.append(HtmlData(data=card1BubbleData))
                    cardData = measureTrendcard1Data + measureTrendcard2Data
                    measureTrendCard.set_card_data(cardData)
                    measureTrendCard.set_card_name("Trend Analysis")
                    trendStoryNode = NarrativesTree("Trend", None, [], [measureTrendCard])
                    self._story_narrative.add_a_node(trendStoryNode)
                    self._result_setter.set_trend_node(trendStoryNode)
                    self._completionStatus += old_div(self._scriptWeightDict[self._analysisName]["total"] * self._scriptStages["trendNarrativeEnd"]["weight"], 10)
                    progressMessage = CommonUtils.create_progress_message_object(self._analysisName,
                                                                                "trendNarrativeEnd",
                                                                                "info",
                                                                                self._scriptStages["trendNarrativeEnd"]["summary"],
                                                                                self._completionStatus,
                                                                                self._completionStatus)
                    CommonUtils.save_progress_message(self._messageURL, progressMessage)
                    self._dataframe_context.update_completion_status(self._completionStatus)
                else:
                    # self._result_setter.update_executive_summary_data({"trend_present":False})
                    print("Trend Analysis for Measure Failed")
                    print("#" * 20 + "Trend Analysis Error" + "#" * 20)
                    # NOTE(review): self._date_column_suggested is only assigned
                    # when a date format was detected — confirm it is always set
                    # before this branch runs.
                    print("No date format for the date column %s was detected." % (self._date_column_suggested))
                    print("#" * 60)
                    self._completionStatus += self._scriptWeightDict[self._analysisName]["total"]
                    # Fixed: previously used undefined locals `completionStatus`
                    # and `messageURL` (NameError on this error path).
                    self._dataframe_context.update_completion_status(self._completionStatus)
                    progressMessage = CommonUtils.create_progress_message_object("Trend", "failedState", "error",
                                                                                "Trend Failed As " + "No Date Format For The Date Column %s Was Detected !!!" % (self._date_column_suggested),
                                                                                self._completionStatus, self._completionStatus)
                    CommonUtils.save_progress_message(self._messageURL, progressMessage)
                    self._dataframe_context.update_completion_status(self._completionStatus)
            else:
                # self._result_setter.update_executive_summary_data({"trend_present":False})
                print("Trend Analysis for Measure Failed")
                print("#" * 20 + "Trend Analysis Error" + "#" * 20)
                print("No date column present for Trend Analysis.")
                print("#" * 60)
                self._completionStatus += self._scriptWeightDict[self._analysisName]["total"]
                # Fixed: previously used undefined locals `completionStatus`
                # and `messageURL` (NameError on this error path).
                self._dataframe_context.update_completion_status(self._completionStatus)
                progressMessage = CommonUtils.create_progress_message_object("Trend", "failedState", "error",
                                                                            "No Date Column Present",
                                                                            self._completionStatus, self._completionStatus)
                CommonUtils.save_progress_message(self._messageURL, progressMessage)
                self._dataframe_context.update_completion_status(self._completionStatus)
        else:
            print("overall Trend not Started YET")
    elif self._analysistype == "dimension":
        print("Dimension Trend Started")
        self._completionStatus += old_div(self._scriptWeightDict[self._analysisName]["total"] * self._scriptStages["initialization"]["weight"], 10)
        progressMessage = CommonUtils.create_progress_message_object(self._analysisName,
                                                                    "initialization",
                                                                    "info",
                                                                    self._scriptStages["initialization"]["summary"],
                                                                    self._completionStatus,
                                                                    self._completionStatus)
        CommonUtils.save_progress_message(self._messageURL, progressMessage)
        self._dataframe_context.update_completion_status(self._completionStatus)
        self.narratives = {"card0": {}}
        if self._selected_date_columns != None:
            if self._dateFormatDetected:
                # Prefer the meta parser for level names; fall back to a scan.
                try:
                    result_column_levels = self._metaParser.get_unique_level_names(self._result_column)
                except:
                    if self._pandas_flag:
                        result_column_levels = list(self._data_frame[self._result_column].unique())
                    else:
                        result_column_levels = [x[0] for x in self._data_frame.select(self._result_column).distinct().collect()]
                print("-" * 100)
                # TODO Implement meta parser getter here
                print(result_column_levels)
                # The dimension trend is drawn for the two most frequent levels.
                if self._pandas_flag:
                    level_count_df = self._data_frame[self._result_column].value_counts()[0:2]
                    top2levels = list(level_count_df.index)
                else:
                    level_count_df = self._data_frame.groupBy(self._result_column).count().orderBy("count", ascending=False)
                    level_count_df_rows = level_count_df.collect()
                    top2levels = [level_count_df_rows[0][0], level_count_df_rows[1][0]]
                cardData = []
                chart_data = {}
                cardData1 = []
                c3_chart = {"dataType": "c3Chart", "data": {}}
                print("#" * 40)
                overall_count = NarrativesUtils.get_grouped_count_data_for_dimension_trend(self._data_frame, self._dataLevel, self._result_column, self._pandas_flag)
                print("#" * 40)
                for idx, level in enumerate(top2levels):
                    print("calculations in progress for the level :- ", level)
                    if self._pandas_flag:
                        leveldf = self._data_frame[self._data_frame[self._result_column] == level]
                    else:
                        leveldf = self._data_frame.filter(col(self._result_column) == level)
                    grouped_data = NarrativesUtils.get_grouped_data_for_trend(leveldf, self._dataLevel, self._result_column, self._analysistype, self._pandas_flag)
                    grouped_data.rename(columns={"value": "value_count"}, inplace=True)
                    grouped_data = pd.merge(grouped_data, overall_count, on='key', how='left')
                    # Per-period share of this level, as a percentage.
                    grouped_data["value"] = old_div(grouped_data["value_count"], grouped_data["totalCount"])
                    grouped_data["value"] = grouped_data["value"].apply(lambda x: round(x * 100, self._num_significant_digits))
                    if self._pandas_flag:
                        leveldf = leveldf.drop(self._date_column_suggested, axis=1)
                        leveldf = leveldf.rename(columns={"year_month": self._date_column_suggested})
                        if "year_month" not in leveldf.columns:
                            leveldf["year_month"] = leveldf[self._date_column_suggested]
                        leveldf["value_col"] = 1
                    else:
                        leveldf = leveldf.drop(self._date_column_suggested)
                        leveldf = leveldf.withColumnRenamed("year_month", self._date_column_suggested)
                        if "year_month" not in leveldf.columns:
                            leveldf = leveldf.withColumn("year_month", col(self._date_column_suggested))
                        leveldf = leveldf.withColumn('value_col', lit(1))
                    trend_narrative_obj = TrendNarrative(self._result_column, self._date_column_suggested, grouped_data, self._existingDateFormat, self._requestedDateFormat, self._base_dir, self._metaParser)
                    dataDict = trend_narrative_obj.generateDataDict(grouped_data, self._dataLevel, self._durationString)
                    dataDict["target_column"] = dataDict["measure"]
                    dataDict["measure"] = level
                    dataDict["duration"] = self._duration
                    dataDict["dataLevel"] = self._dataLevel
                    dataDict["durationString"] = self._durationString
                    significant_dimensions = []
                    significant_dimension_dict = df_helper.get_chisquare_significant_dimension()
                    if significant_dimension_dict != {} and significant_dimension_dict != None:
                        significant_dimension_tuple = tuple(significant_dimension_dict.items())
                        significant_dimension_tuple = sorted(significant_dimension_tuple, key=lambda x: x[1], reverse=True)
                        significant_dimensions = [x[0] for x in significant_dimension_tuple[:self._number_of_dimensions_to_consider]]
                    else:
                        significant_dimensions = self._string_columns[:self._number_of_dimensions_to_consider]
                    print("significant_dimensions", significant_dimensions)
                    reference_time = dataDict["reference_time"]
                    dataDict["significant_dimensions"] = significant_dimensions
                    if len(significant_dimensions) > 0:
                        st = time.time()
                        xtraData = trend_narrative_obj.get_xtra_calculations(leveldf, grouped_data, significant_dimensions, self._date_column_suggested, "value_col", self._existingDateFormat, reference_time, self._dataLevel, self._pandas_flag)
                        print("time for get_xtra_calculations", time.time() - st)
                        if xtraData != None:
                            dataDict.update(xtraData)
                    dimensionCount = trend_narrative_obj.generate_dimension_extra_narrative(grouped_data, dataDict, self._dataLevel)
                    if dimensionCount != None:
                        dataDict.update(dimensionCount)
                    dataDict.update({
                        "level_index": idx,
                        "blockSplitter": self._blockSplitter,
                        "highlightFlag": self._highlightFlag
                    })
                    self._result_setter.update_executive_summary_data(dataDict)
                    trendStory = NarrativesUtils.get_template_output(self._base_dir,
                                                                    'dimension_trend.html', dataDict)
                    blocks = NarrativesUtils.block_splitter(trendStory, self._blockSplitter)
                    # The first two blocks (heading) are shared; only keep them once.
                    if idx != 0:
                        cardData1 += blocks[2:]
                    else:
                        cardData1 += blocks
                    trend_chart_data = [x for x in list(grouped_data[["key", "value"]].T.to_dict().values()) if x['key'] != None]
                    trend_chart_data = sorted(trend_chart_data, key=lambda x: x["key"])
                    card1chartdata = trend_chart_data
                    if self._dataLevel == "day":
                        card1chartdata = [{"key": str(val["key"]), "value": val["value"]} for val in card1chartdata]
                    elif self._dataLevel == "month":
                        card1chartdata = [{"key": val["key"].strftime("%b-%y"), "value": val["value"]} for val in card1chartdata]
                    chart_data[level] = card1chartdata
                labels = {
                    "x": "key",
                    "y": list(chart_data.keys())[0],
                    "y2": list(chart_data.keys())[1]
                }
                c3Chart = {
                    "data": chart_data,
                    "format": "%b-%y",
                    "label": labels,
                    "label_text": {
                        "x": "Time",
                        "y": "Percentage of " + labels["y"],
                        "y2": "Percentage of " + labels["y2"]
                    }
                }
                c3_chart["data"] = c3Chart
                # Merge both level series into one multi-line dataset, padding
                # the second series with 0 where it has no point at that index.
                multiLineData = []
                for idx in range(len(chart_data[top2levels[0]])):
                    key = chart_data[top2levels[0]][idx]["key"]
                    value = chart_data[top2levels[0]][idx]["value"]
                    try:
                        value1 = chart_data[top2levels[1]][idx]["value"]
                    except:
                        value1 = 0
                    multiLineData.append({"key": key, top2levels[0]: value, top2levels[1]: value1})
                chartData = NormalChartData(multiLineData)
                chartJson = ChartJson()
                chartJson.set_data(chartData.get_data())
                chartJson.set_label_text(c3Chart["label_text"])
                chartJson.set_legend(c3Chart["label"])
                chartJson.set_chart_type("line")
                chartJson.set_yaxis_number_format(".2f")
                chartJson.set_axes(labels)
                st_info = ["Trend Analysis", "Forecast Method : Holt Winters Method"]
                cardData1.insert(1, C3ChartData(data=chartJson, info=st_info))
                trendCard = NormalCard(name="Trend Analysis", slug=None, cardData=cardData1)
                trendStoryNode = NarrativesTree("Trend", None, [], [trendCard])
                self._story_narrative.add_a_node(trendStoryNode)
                self._result_setter.set_trend_node(trendStoryNode)
                self._completionStatus += old_div(self._scriptWeightDict[self._analysisName]["total"] * self._scriptStages["summarygeneration"]["weight"], 10)
                progressMessage = CommonUtils.create_progress_message_object(self._analysisName,
                                                                            "summarygeneration",
                                                                            "info",
                                                                            self._scriptStages["summarygeneration"]["summary"],
                                                                            self._completionStatus,
                                                                            self._completionStatus)
                CommonUtils.save_progress_message(self._messageURL, progressMessage)
                self._dataframe_context.update_completion_status(self._completionStatus)
                self._completionStatus += old_div(self._scriptWeightDict[self._analysisName]["total"] * self._scriptStages["completion"]["weight"], 10)
                progressMessage = CommonUtils.create_progress_message_object(self._analysisName,
                                                                            "completion",
                                                                            "info",
                                                                            self._scriptStages["completion"]["summary"],
                                                                            self._completionStatus,
                                                                            self._completionStatus)
                CommonUtils.save_progress_message(self._messageURL, progressMessage)
                self._dataframe_context.update_completion_status(self._completionStatus)
            else:
                self._result_setter.update_executive_summary_data({"trend_present": False})
                print("Trend Analysis for Dimension Failed")
                print("#" * 20 + "Trend Analysis Error" + "#" * 20)
                # NOTE(review): self._date_column_suggested may be unset when no
                # date format was detected — confirm this guard cannot raise.
                if self._date_column_suggested:
                    print("No date format for the date column %s was detected." % (self._date_column_suggested))
                print("#" * 60)
                self._completionStatus += self._scriptWeightDict[self._analysisName]["total"]
                self._dataframe_context.update_completion_status(self._completionStatus)
                progressMessage = CommonUtils.create_progress_message_object("Trend", "failedState", "error",
                                                                            "Trend Failed As " + "No Date Format For The Date Column %s Was Detected !!!" % (self._date_column_suggested),
                                                                            self._completionStatus, self._completionStatus)
                # Fixed: previously used undefined local `messageURL` (NameError).
                CommonUtils.save_progress_message(self._messageURL, progressMessage)
                self._dataframe_context.update_completion_status(self._completionStatus)
        else:
            self._result_setter.update_executive_summary_data({"trend_present": False})
            print("Trend Analysis for Dimension Failed")
            print("#" * 20 + "Trend Analysis Error" + "#" * 20)
            print("No date column present for Trend Analysis.")
            print("#" * 60)
            self._completionStatus += self._scriptWeightDict[self._analysisName]["total"]
            self._dataframe_context.update_completion_status(self._completionStatus)
            progressMessage = CommonUtils.create_progress_message_object("Trend", "failedState", "error",
                                                                        "No Date Column Present",
                                                                        self._completionStatus, self._completionStatus)
            # Fixed: previously used undefined local `messageURL` (NameError).
            CommonUtils.save_progress_message(self._messageURL, progressMessage)
            self._dataframe_context.update_completion_status(self._completionStatus)
def Predict(self):
    """Score new data with the trained Random Forest classifier (sklearn path).

    Loads the pickled model, builds dummy/derived feature columns to match the
    training feature set, predicts class + probability for every row, writes
    the scored CSV, publishes a UID sample table, and then either runs the
    decision-tree driver analysis (>= 2 predicted classes) or emits a summary
    card for the single-class case.  Side effects only; returns None.
    """
    self._scriptWeightDict = self._dataframe_context.get_ml_model_prediction_weight()
    self._scriptStages = {
        "initialization": {
            "summary": "Initialized the Random Forest Scripts",
            "weight": 2
        },
        "prediction": {
            "summary": "Random Forest Model Prediction Finished",
            "weight": 2
        },
        "frequency": {
            "summary": "descriptive analysis finished",
            "weight": 2
        },
        "chisquare": {
            "summary": "chi Square analysis finished",
            "weight": 4
        },
        "completion": {
            "summary": "all analysis finished",
            "weight": 4
        },
    }
    # Report the "initialization" stage to the progress-message service.
    self._completionStatus += old_div(
        self._scriptWeightDict[self._analysisName]["total"] *
        self._scriptStages["initialization"]["weight"], 10)
    progressMessage = CommonUtils.create_progress_message_object(
        self._analysisName, "initialization", "info",
        self._scriptStages["initialization"]["summary"],
        self._completionStatus, self._completionStatus)
    CommonUtils.save_progress_message(self._messageURL, progressMessage,
                                      ignore=self._ignoreMsg)
    self._dataframe_context.update_completion_status(self._completionStatus)

    # Match with the level_counts and then clean the data.
    dataSanity = True
    level_counts_train = self._dataframe_context.get_level_count_dict()
    cat_cols = self._dataframe_helper.get_string_columns()

    # Categorical columns: drop the UID column (if it is an ignored
    # suggestion) and any date columns before building dummies.
    categorical_columns = self._dataframe_helper.get_string_columns()
    uid_col = self._dataframe_context.get_uid_column()
    if self._metaParser.check_column_isin_ignored_suggestion(uid_col):
        categorical_columns = list(set(categorical_columns) - {uid_col})
    allDateCols = self._dataframe_context.get_date_columns()
    categorical_columns = list(set(categorical_columns) - set(allDateCols))
    numerical_columns = self._dataframe_helper.get_numeric_columns()
    result_column = self._dataframe_context.get_result_column()
    test_data_path = self._dataframe_context.get_input_file()

    if self._mlEnv == "spark":
        # Spark scoring is not implemented for this script.
        pass
    elif self._mlEnv == "sklearn":
        # Resolve I/O paths; strip the "file://" scheme prefix where present.
        score_data_path = self._dataframe_context.get_score_path() + "/data.csv"
        if score_data_path.startswith("file"):
            score_data_path = score_data_path[7:]
        trained_model_path = self._dataframe_context.get_model_path()
        trained_model_path += "/" + self._dataframe_context.get_model_for_scoring() + ".pkl"
        if trained_model_path.startswith("file"):
            trained_model_path = trained_model_path[7:]
        score_summary_path = self._dataframe_context.get_score_path() + "/Summary/summary.json"
        if score_summary_path.startswith("file"):
            score_summary_path = score_summary_path[7:]

        trained_model = joblib.load(trained_model_path)
        df = self._data_frame.toPandas()
        model_columns = self._dataframe_context.get_model_features()
        # Recreate the training-time feature matrix: dummy-encode
        # categoricals and add any model columns missing from this dataset.
        pandas_df = MLUtils.create_dummy_columns(
            df, [x for x in categorical_columns if x != result_column])
        pandas_df = MLUtils.fill_missing_columns(pandas_df, model_columns,
                                                 result_column)
        if uid_col:
            # The UID column is an identifier, not a feature.
            pandas_df = pandas_df[[x for x in pandas_df.columns if x != uid_col]]

        y_score = trained_model.predict(pandas_df)
        y_prob = trained_model.predict_proba(pandas_df)
        y_prob = MLUtils.calculate_predicted_probability(y_prob)
        y_prob = list([round(x, 2) for x in y_prob])
        score = {
            "predicted_class": y_score,
            "predicted_probability": y_prob
        }
        df["predicted_class"] = score["predicted_class"]
        labelMappingDict = self._dataframe_context.get_label_map()
        # Map numeric class indices back to the original labels.
        df["predicted_class"] = df["predicted_class"].apply(
            lambda x: labelMappingDict[x] if x != None else "NA")
        df["predicted_probability"] = score["predicted_probability"]
        self._score_summary["prediction_split"] = \
            MLUtils.calculate_scored_probability_stats(df)
        self._score_summary["result_column"] = result_column
        # The prediction replaces any pre-existing target column.
        if result_column in df.columns:
            df.drop(result_column, axis=1, inplace=True)
        df = df.rename(index=str, columns={"predicted_class": result_column})
        df.to_csv(score_data_path, header=True, index=False)

        # Build a small "top-5 per predicted class" UID table, if a UID
        # column exists or one can be suggested by the meta parser.
        uidCol = self._dataframe_context.get_uid_column()
        if uidCol == None:
            uidCols = self._metaParser.get_suggested_uid_columns()
            if len(uidCols) > 0:
                uidCol = uidCols[0]
        uidTableData = []
        predictedClasses = list(df[result_column].unique())
        if uidCol:
            if uidCol in df.columns:
                for level in predictedClasses:
                    levelDf = df[df[result_column] == level]
                    levelDf = levelDf[[uidCol, "predicted_probability", result_column]]
                    levelDf.sort_values(by="predicted_probability",
                                        ascending=False, inplace=True)
                    levelDf["predicted_probability"] = levelDf["predicted_probability"].apply(
                        lambda x: humanize.apnumber(x * 100) + "%"
                        if x * 100 >= 10 else str(int(x * 100)) + "%")
                    uidTableData.append(levelDf[:5])
                uidTableData = pd.concat(uidTableData)
                uidTableData = [list(arr) for arr in list(uidTableData.values)]
                uidTableData = [[uidCol, "Probability", result_column]] + uidTableData
                uidTable = TableData()
                uidTable.set_table_width(25)
                uidTable.set_table_data(uidTableData)
                uidTable.set_table_type("normalHideColumn")
                self._result_setter.set_unique_identifier_table(
                    json.loads(CommonUtils.convert_python_object_to_json(uidTable)))

        # Report the "prediction" stage.
        self._completionStatus += old_div(
            self._scriptWeightDict[self._analysisName]["total"] *
            self._scriptStages["prediction"]["weight"], 10)
        progressMessage = CommonUtils.create_progress_message_object(
            self._analysisName, "prediction", "info",
            self._scriptStages["prediction"]["summary"],
            self._completionStatus, self._completionStatus)
        CommonUtils.save_progress_message(self._messageURL, progressMessage,
                                          ignore=self._ignoreMsg)
        self._dataframe_context.update_completion_status(self._completionStatus)

        print("STARTING DIMENSION ANALYSIS ...")
        columns_to_keep = []
        columns_to_drop = []
        columns_to_keep = self._dataframe_context.get_score_consider_columns()
        if len(columns_to_keep) > 0:
            columns_to_drop = list(set(df.columns) - set(columns_to_keep))
        else:
            columns_to_drop += ["predicted_probability"]
        # Never drop the result column.
        columns_to_drop = [
            x for x in columns_to_drop if x in df.columns and x != result_column
        ]
        print("columns_to_drop", columns_to_drop)
        df.drop(columns_to_drop, axis=1, inplace=True)

        resultColLevelCount = dict(df[result_column].value_counts())
        self._metaParser.update_column_dict(
            result_column, {
                "LevelCount": resultColLevelCount,
                "numberOfUniqueValues": len(list(resultColLevelCount.keys()))
            })
        self._dataframe_context.set_story_on_scored_data(True)
        # Convert the scored pandas frame back to Spark for downstream
        # dimension analysis.
        SQLctx = SQLContext(sparkContext=self._spark.sparkContext,
                            sparkSession=self._spark)
        spark_scored_df = SQLctx.createDataFrame(df)
        self._dataframe_context.update_consider_columns(columns_to_keep)
        df_helper = DataFrameHelper(spark_scored_df, self._dataframe_context,
                                    self._metaParser)
        df_helper.set_params()
        spark_scored_df = df_helper.get_data_frame()

        if len(predictedClasses) >= 2:
            try:
                fs = time.time()
                df_decision_tree_obj = DecisionTrees(
                    spark_scored_df, df_helper, self._dataframe_context,
                    self._spark, self._metaParser,
                    scriptWeight=self._scriptWeightDict,
                    analysisName=self._analysisName).test_all(
                        dimension_columns=[result_column])
                narratives_obj = CommonUtils.as_dict(
                    DecisionTreeNarrative(result_column, df_decision_tree_obj,
                                          self._dataframe_helper,
                                          self._dataframe_context,
                                          self._metaParser,
                                          self._result_setter,
                                          story_narrative=None,
                                          analysisName=self._analysisName,
                                          scriptWeight=self._scriptWeightDict))
                print(narratives_obj)
            except:
                print("DecisionTree Analysis Failed ")
        else:
            # Only one class was predicted: emit a summary card instead of
            # a decision tree.
            data_dict = {
                "npred": len(predictedClasses),
                "nactual": len(list(labelMappingDict.values()))
            }
            # BUGFIX: levelCountDict was previously unbound when
            # nactual > 2, raising UnboundLocalError on first assignment.
            levelCountDict = {}
            if data_dict["nactual"] > 2:
                levelCountDict[predictedClasses[0]] = resultColLevelCount[predictedClasses[0]]
                levelCountDict["Others"] = sum([
                    v for k, v in list(resultColLevelCount.items())
                    if k != predictedClasses[0]
                ])
            else:
                levelCountDict = resultColLevelCount
                # Give the never-predicted class an explicit zero count.
                otherClass = list(
                    set(labelMappingDict.values()) - set(predictedClasses))[0]
                levelCountDict[otherClass] = 0
            print(levelCountDict)
            total = float(
                sum([x for x in list(levelCountDict.values()) if x != None]))
            levelCountTuple = [({
                "name": k,
                "count": v,
                "percentage": humanize.apnumber(old_div(v * 100, total)) + "%"
                if old_div(v * 100, total) >= 10
                else str(int(old_div(v * 100, total))) + "%"
            }) for k, v in list(levelCountDict.items()) if v != None]
            levelCountTuple = sorted(levelCountTuple,
                                     key=lambda x: x["count"],
                                     reverse=True)
            data_dict["blockSplitter"] = "|~NEWBLOCK~|"
            data_dict["targetcol"] = result_column
            data_dict["nlevel"] = len(list(levelCountDict.keys()))
            data_dict["topLevel"] = levelCountTuple[0]
            data_dict["secondLevel"] = levelCountTuple[1]
            maincardSummary = NarrativesUtils.get_template_output(
                "/apps/", 'scorewithoutdtree.html', data_dict)
            main_card = NormalCard()
            main_card_data = []
            main_card_narrative = NarrativesUtils.block_splitter(
                maincardSummary, "|~NEWBLOCK~|")
            main_card_data += main_card_narrative
            chartData = NormalChartData([levelCountDict]).get_data()
            chartJson = ChartJson(data=chartData)
            chartJson.set_title(result_column)
            chartJson.set_chart_type("donut")
            mainCardChart = C3ChartData(data=chartJson)
            mainCardChart.set_width_percent(33)
            main_card_data.append(mainCardChart)
            uidTable = self._result_setter.get_unique_identifier_table()
            if uidTable != None:
                main_card_data.append(uidTable)
            main_card.set_card_data(main_card_data)
            main_card.set_card_name(
                "Predicting Key Drivers of {}".format(result_column))
            self._result_setter.set_score_dtree_cards([main_card], {})
def Predict(self):
    """Score new data with the trained Spark ML Naive Bayes pipeline.

    Runs the saved pipeline on the input Spark DataFrame, maps the numeric
    prediction back to string labels, writes the scored CSV, publishes a UID
    sample table, and then either runs the decision-tree driver analysis
    (>= 2 predicted classes) or emits a single-class summary card.
    Side effects only; returns None.
    """
    self._scriptWeightDict = self._dataframe_context.get_ml_model_prediction_weight()
    self._scriptStages = {
        "initialization": {
            "summary": "Initialized the Naive Bayes Scripts",
            "weight": 2
        },
        "prediction": {
            "summary": "Spark ML Naive Bayes Model Prediction Finished",
            "weight": 2
        },
        "frequency": {
            "summary": "descriptive analysis finished",
            "weight": 2
        },
        "chisquare": {
            "summary": "chi Square analysis finished",
            "weight": 4
        },
        "completion": {
            "summary": "all analysis finished",
            "weight": 4
        },
    }
    # Report the "initialization" stage.
    self._completionStatus += self._scriptWeightDict[self._analysisName][
        "total"] * self._scriptStages["initialization"]["weight"] / 10
    progressMessage = CommonUtils.create_progress_message_object(
        self._analysisName, "initialization", "info",
        self._scriptStages["initialization"]["summary"],
        self._completionStatus, self._completionStatus)
    CommonUtils.save_progress_message(self._messageURL, progressMessage)
    self._dataframe_context.update_completion_status(self._completionStatus)

    SQLctx = SQLContext(sparkContext=self._spark.sparkContext,
                        sparkSession=self._spark)
    # Sanity check: score-time categorical levels must match training-time
    # levels; any mismatch flips dataSanity to False.
    dataSanity = True
    level_counts_train = self._dataframe_context.get_level_count_dict()
    categorical_columns = self._dataframe_helper.get_string_columns()
    numerical_columns = self._dataframe_helper.get_numeric_columns()
    time_dimension_columns = self._dataframe_helper.get_timestamp_columns()
    result_column = self._dataframe_context.get_result_column()
    categorical_columns = [
        x for x in categorical_columns if x != result_column
    ]
    level_counts_score = CommonUtils.get_level_count_dict(
        self._data_frame,
        categorical_columns,
        self._dataframe_context.get_column_separator(),
        output_type="dict",
        dataType="spark")
    for key in level_counts_train:
        if key in level_counts_score:
            if level_counts_train[key] != level_counts_score[key]:
                dataSanity = False
        else:
            dataSanity = False

    test_data_path = self._dataframe_context.get_input_file()
    score_data_path = self._dataframe_context.get_score_path() + "/data.csv"
    # The saved pipeline lives under <model_root>/<slug>/<model_name>.
    trained_model_path = self._dataframe_context.get_model_path()
    trained_model_path = "/".join(
        trained_model_path.split("/")[:-1]
    ) + "/" + self._slug + "/" + self._dataframe_context.get_model_for_scoring()

    pipelineModel = MLUtils.load_pipeline(trained_model_path)
    df = self._data_frame
    transformed = pipelineModel.transform(df)
    # Map the numeric "prediction" column back to original string labels
    # via the saved StringIndexer mapping.
    label_indexer_dict = MLUtils.read_string_indexer_mapping(
        trained_model_path, SQLctx)
    prediction_to_levels = udf(lambda x: label_indexer_dict[x], StringType())
    transformed = transformed.withColumn(
        result_column, prediction_to_levels(transformed.prediction))

    if "probability" in transformed.columns:
        probability_dataframe = transformed.select(
            [result_column, "probability"]).toPandas()
        probability_dataframe = probability_dataframe.rename(
            index=str, columns={result_column: "predicted_class"})
        # The winning-class probability is the max of the probability vector.
        probability_dataframe["predicted_probability"] = \
            probability_dataframe["probability"].apply(lambda x: max(x))
        self._score_summary["prediction_split"] = \
            MLUtils.calculate_scored_probability_stats(probability_dataframe)
        self._score_summary["result_column"] = result_column
        scored_dataframe = transformed.select(
            categorical_columns + time_dimension_columns + numerical_columns +
            [result_column, "probability"]).toPandas()
        scored_dataframe['predicted_probability'] = \
            probability_dataframe["predicted_probability"].values
    else:
        # Model exposes no probability column; skip probability stats.
        self._score_summary["prediction_split"] = []
        self._score_summary["result_column"] = result_column
        scored_dataframe = transformed.select(
            categorical_columns + time_dimension_columns + numerical_columns +
            [result_column]).toPandas()

    labelMappingDict = self._dataframe_context.get_label_map()
    if score_data_path.startswith("file"):
        score_data_path = score_data_path[7:]
    scored_dataframe.to_csv(score_data_path, header=True, index=False)

    # Build a small "top-5 per predicted class" UID table, if a UID column
    # exists or one can be suggested by the meta parser.
    uidCol = self._dataframe_context.get_uid_column()
    if uidCol == None:
        uidCols = self._metaParser.get_suggested_uid_columns()
        if len(uidCols) > 0:
            uidCol = uidCols[0]
    uidTableData = []
    predictedClasses = list(scored_dataframe[result_column].unique())
    if uidCol:
        if uidCol in df.columns:
            for level in predictedClasses:
                levelDf = scored_dataframe[scored_dataframe[result_column] == level]
                levelDf = levelDf[[uidCol, "predicted_probability", result_column]]
                levelDf.sort_values(by="predicted_probability",
                                    ascending=False, inplace=True)
                levelDf["predicted_probability"] = levelDf["predicted_probability"].apply(
                    lambda x: humanize.apnumber(x * 100) + "%"
                    if x * 100 >= 10 else str(int(x * 100)) + "%")
                uidTableData.append(levelDf[:5])
            uidTableData = pd.concat(uidTableData)
            uidTableData = [list(arr) for arr in list(uidTableData.values)]
            uidTableData = [[uidCol, "Probability", result_column]] + uidTableData
            uidTable = TableData()
            uidTable.set_table_width(25)
            uidTable.set_table_data(uidTableData)
            uidTable.set_table_type("normalHideColumn")
            self._result_setter.set_unique_identifier_table(
                json.loads(CommonUtils.convert_python_object_to_json(uidTable)))

    # Report the "prediction" stage.
    self._completionStatus += self._scriptWeightDict[self._analysisName][
        "total"] * self._scriptStages["prediction"]["weight"] / 10
    progressMessage = CommonUtils.create_progress_message_object(
        self._analysisName, "prediction", "info",
        self._scriptStages["prediction"]["summary"],
        self._completionStatus, self._completionStatus)
    CommonUtils.save_progress_message(self._messageURL, progressMessage)
    self._dataframe_context.update_completion_status(self._completionStatus)

    print("STARTING DIMENSION ANALYSIS ...")
    columns_to_keep = []
    columns_to_drop = []
    columns_to_keep = self._dataframe_context.get_score_consider_columns()
    if len(columns_to_keep) > 0:
        columns_to_drop = list(set(df.columns) - set(columns_to_keep))
    else:
        columns_to_drop += ["predicted_probability"]
    scored_df = transformed.select(categorical_columns +
                                   time_dimension_columns +
                                   numerical_columns + [result_column])
    columns_to_drop = [x for x in columns_to_drop if x in scored_df.columns]
    modified_df = scored_df.select(
        [x for x in scored_df.columns if x not in columns_to_drop])

    resultColLevelCount = dict(
        modified_df.groupby(result_column).count().collect())
    self._metaParser.update_column_dict(
        result_column, {
            "LevelCount": resultColLevelCount,
            "numberOfUniqueValues": len(resultColLevelCount.keys())
        })
    self._dataframe_context.set_story_on_scored_data(True)
    self._dataframe_context.update_consider_columns(columns_to_keep)
    df_helper = DataFrameHelper(modified_df, self._dataframe_context,
                                self._metaParser)
    df_helper.set_params()
    spark_scored_df = df_helper.get_data_frame()

    if len(predictedClasses) >= 2:
        try:
            fs = time.time()
            df_decision_tree_obj = DecisionTrees(
                spark_scored_df, df_helper, self._dataframe_context,
                self._spark, self._metaParser,
                scriptWeight=self._scriptWeightDict,
                analysisName=self._analysisName).test_all(
                    dimension_columns=[result_column])
            narratives_obj = CommonUtils.as_dict(
                DecisionTreeNarrative(result_column, df_decision_tree_obj,
                                      self._dataframe_helper,
                                      self._dataframe_context,
                                      self._metaParser,
                                      self._result_setter,
                                      story_narrative=None,
                                      analysisName=self._analysisName,
                                      scriptWeight=self._scriptWeightDict))
            print(narratives_obj)
        except Exception as e:
            print("DecisionTree Analysis Failed ", str(e))
    else:
        # Only one class was predicted: emit a summary card instead of a
        # decision tree.
        data_dict = {
            "npred": len(predictedClasses),
            "nactual": len(labelMappingDict.values())
        }
        # BUGFIX: levelCountDict was previously unbound when nactual > 2,
        # raising UnboundLocalError on first assignment.
        levelCountDict = {}
        if data_dict["nactual"] > 2:
            levelCountDict[predictedClasses[0]] = resultColLevelCount[predictedClasses[0]]
            levelCountDict["Others"] = sum([
                v for k, v in resultColLevelCount.items()
                if k != predictedClasses[0]
            ])
        else:
            levelCountDict = resultColLevelCount
            # Give the never-predicted class an explicit zero count.
            otherClass = list(
                set(labelMappingDict.values()) - set(predictedClasses))[0]
            levelCountDict[otherClass] = 0
        print(levelCountDict)
        total = float(sum([x for x in levelCountDict.values() if x != None]))
        levelCountTuple = [({
            "name": k,
            "count": v,
            "percentage": humanize.apnumber(v * 100 / total) + "%"
        }) for k, v in levelCountDict.items() if v != None]
        levelCountTuple = sorted(levelCountTuple,
                                 key=lambda x: x["count"],
                                 reverse=True)
        data_dict["blockSplitter"] = "|~NEWBLOCK~|"
        data_dict["targetcol"] = result_column
        data_dict["nlevel"] = len(levelCountDict.keys())
        data_dict["topLevel"] = levelCountTuple[0]
        data_dict["secondLevel"] = levelCountTuple[1]
        maincardSummary = NarrativesUtils.get_template_output(
            "/apps/", 'scorewithoutdtree.html', data_dict)
        main_card = NormalCard()
        main_card_data = []
        main_card_narrative = NarrativesUtils.block_splitter(
            maincardSummary, "|~NEWBLOCK~|")
        main_card_data += main_card_narrative
        chartData = NormalChartData([levelCountDict]).get_data()
        chartJson = ChartJson(data=chartData)
        chartJson.set_title(result_column)
        chartJson.set_chart_type("donut")
        mainCardChart = C3ChartData(data=chartJson)
        mainCardChart.set_width_percent(33)
        main_card_data.append(mainCardChart)
        uidTable = self._result_setter.get_unique_identifier_table()
        if uidTable != None:
            main_card_data.append(uidTable)
        main_card.set_card_data(main_card_data)
        main_card.set_card_name(
            "Predicting Key Drivers of {}".format(result_column))
        self._result_setter.set_score_dtree_cards([main_card], {})
def generate_narratives(self):
    """Build the linear-regression narrative cards and attach them to the story.

    Produces one "Key Influencers" main card (coefficient bar chart plus an
    inference line) and, per significant measure, an impact card — optionally
    paired with a dimension-level "Key Areas where it Matters" card when
    ``self._run_dimension_level_regression`` is set.  Side effects only;
    returns None.
    """
    regression_narrative_obj = LinearRegressionNarrative(
        self._df_regression_result,
        self._correlations,
        self._dataframe_helper,
        self._dataframe_context,
        self._metaParser,
        self._spark
    )
    main_card_data = regression_narrative_obj.generate_main_card_data()
    main_card_narrative = NarrativesUtils.get_template_output(
        self._base_dir, 'regression_main_card.html', main_card_data)
    # Legacy dict-style narrative payload, kept alongside the card objects.
    self.narratives['main_card'] = {}
    self.narratives["main_card"]['paragraphs'] = NarrativesUtils.paragraph_splitter(main_card_narrative)
    self.narratives["main_card"]['header'] = 'Key Measures that affect ' + self.result_column
    self.narratives["main_card"]['chart'] = {}
    self.narratives["main_card"]['chart']['heading'] = ''
    self.narratives["main_card"]['chart']['data'] = [
        [i for i, j in self._all_coeffs],
        [j['coefficient'] for i, j in self._all_coeffs]
    ]
    self.narratives["main_card"]['chart']['label'] = {
        'x': 'Measure Name',
        'y': 'Change in ' + self.result_column + ' per unit increase'
    }

    # Card-object representation of the same main card.
    main_card = NormalCard()
    main_card_header = HtmlData(
        data='<h3>Key Measures that affect ' + self.result_column + "</h3>")
    main_card_paragraphs = NarrativesUtils.block_splitter(
        main_card_narrative, self._blockSplitter)
    main_card_chart_data = [
        {"key": val[0], "value": val[1]}
        for val in zip([i for i, j in self._all_coeffs],
                       [j['coefficient'] for i, j in self._all_coeffs])
    ]
    main_card_chart = NormalChartData(data=main_card_chart_data)
    mainCardChartJson = ChartJson()
    mainCardChartJson.set_data(main_card_chart.get_data())
    mainCardChartJson.set_label_text({
        'x': 'Influencing Factors',
        'y': 'Change in ' + self.result_column + ' per unit increase'
    })
    mainCardChartJson.set_chart_type("bar")
    mainCardChartJson.set_axes({"x": "key", "y": "value"})
    mainCardChartJson.set_yaxis_number_format(".2f")

    chart_data = sorted(main_card_chart_data,
                        key=lambda x: x["value"], reverse=True)
    statistical_info_array = [
        ("Test Type", "Regression"),
        ("Effect Size", "Coefficients"),
        ("Max Effect Size", chart_data[0]["key"]),
        ("Min Effect Size", chart_data[-1]["key"]),
    ]
    # BUGFIX: initializer variable was misspelled ("statistical_inferenc"),
    # leaving the intended default unset.
    statistical_inference = ""
    if len(chart_data) == 1:
        statistical_inference = "{} is the only variable that have significant influence over {} (Target) having an \
Effect size of {}".format(chart_data[0]["key"],
                          self._dataframe_context.get_result_column(),
                          round(chart_data[0]["value"], 4))
    elif len(chart_data) == 2:
        statistical_inference = "There are two variables ({} and {}) that have significant influence over {} (Target) and the \
Effect size ranges are {} and {} respectively".format(
            chart_data[0]["key"], chart_data[1]["key"],
            self._dataframe_context.get_result_column(),
            round(chart_data[0]["value"], 4), round(chart_data[1]["value"], 4))
    else:
        statistical_inference = "There are {} variables that have significant influence over {} (Target) and the \
Effect size ranges from {} to {}".format(
            len(chart_data), self._dataframe_context.get_result_column(),
            round(chart_data[0]["value"], 4), round(chart_data[-1]["value"], 4))
    if statistical_inference != "":
        statistical_info_array.append(("Inference", statistical_inference))
    statistical_info_array = NarrativesUtils.statistical_info_array_formatter(
        statistical_info_array)

    main_card.set_card_data(
        data=[main_card_header] + main_card_paragraphs +
        [C3ChartData(data=mainCardChartJson, info=statistical_info_array)])
    main_card.set_card_name("Key Influencers")
    self._regressionNode.add_a_card(main_card)

    count = 0
    for measure_column in self.significant_measures:
        sigMeasureNode = NarrativesTree()
        sigMeasureNode.set_name(measure_column)
        measureCard1 = NormalCard()
        measureCard1.set_card_name("{}: Impact on {}".format(
            measure_column, self.result_column))
        measureCard1Data = []
        if self._run_dimension_level_regression:
            measureCard2 = NormalCard()
            measureCard2.set_card_name("Key Areas where it Matters")
            measureCard2Data = []

        measure_column_cards = {}
        card0 = {}
        # Card 1: overall impact of this measure on the target.
        card1data = regression_narrative_obj.generate_card1_data(measure_column)
        card1heading = "<h3>Impact of " + measure_column + " on " + self.result_column + "</h3>"
        measureCard1Header = HtmlData(data=card1heading)
        card1data.update({"blockSplitter": self._blockSplitter})
        card1narrative = NarrativesUtils.get_template_output(
            self._base_dir, 'regression_card1.html', card1data)
        card1paragraphs = NarrativesUtils.block_splitter(
            card1narrative, self._blockSplitter)
        card0 = {"paragraphs": card1paragraphs}
        card0["charts"] = {}
        card0['charts']['chart2'] = {}
        card0['charts']['chart1'] = {}
        card0["heading"] = card1heading
        measure_column_cards['card0'] = card0
        measureCard1Header = HtmlData(data=card1heading)
        measureCard1Data += [measureCard1Header]
        measureCard1para = card1paragraphs
        measureCard1Data += measureCard1para

        if self._run_dimension_level_regression:
            # Card 2: dimension-level regression ("key areas") narrative.
            print("running narratives for key area dict")
            self._dim_regression = self.run_regression_for_dimension_levels()
            card2table, card2data = regression_narrative_obj.generate_card2_data(
                measure_column, self._dim_regression)
            card2data.update({"blockSplitter": self._blockSplitter})
            card2narrative = NarrativesUtils.get_template_output(
                self._base_dir, 'regression_card2.html', card2data)
            card2paragraphs = NarrativesUtils.block_splitter(
                card2narrative, self._blockSplitter)
            card1 = {
                'tables': card2table,
                'paragraphs': card2paragraphs,
                'heading': 'Key Areas where ' + measure_column + ' matters'
            }
            measure_column_cards['card1'] = card1
            measureCard2Data += card2paragraphs
            if "table1" in card2table:
                table1data = regression_narrative_obj.convert_table_data(
                    card2table["table1"])
                card2Table1 = TableData()
                card2Table1.set_table_data(table1data)
                card2Table1.set_table_type("heatMap")
                card2Table1.set_table_top_header(card2table["table1"]["heading"])
                card2Table1Json = json.loads(
                    CommonUtils.convert_python_object_to_json(card2Table1))
                measureCard2Data.insert(3, card2Table1Json)
            if "table2" in card2table:
                table2data = regression_narrative_obj.convert_table_data(
                    card2table["table2"])
                card2Table2 = TableData()
                card2Table2.set_table_data(table2data)
                card2Table2.set_table_type("heatMap")
                card2Table2.set_table_top_header(card2table["table2"]["heading"])
                card2Table2Json = json.loads(
                    CommonUtils.convert_python_object_to_json(card2Table2))
                measureCard2Data.append(card2Table2Json)

        progressMessage = CommonUtils.create_progress_message_object(
            self._analysisName, "custom", "info", "Analyzing Key Influencers",
            self._completionStatus, self._completionStatus, display=True)
        CommonUtils.save_progress_message(self._messageURL, progressMessage,
                                          ignore=False)

        # Card 4: sensitivity analysis for this measure.
        card4data = regression_narrative_obj.generate_card4_data(
            self.result_column, measure_column)
        card4data.update({"blockSplitter": self._blockSplitter})
        card4narrative = NarrativesUtils.get_template_output(
            self._base_dir, 'regression_card4.html', card4data)
        card4paragraphs = NarrativesUtils.block_splitter(
            card4narrative, self._blockSplitter)
        card0['paragraphs'] = card1paragraphs + card4paragraphs
        card4Chart = card4data["charts"]
        statistical_info_array = [
            ("Test Type", "Regression"),
            ("Coefficient", str(round(self._df_regression_result.get_coeff(measure_column), 2))),
            ("P-Value", "<= 0.05"),
            ("Intercept", str(round(self._df_regression_result.get_intercept(), 2))),
            ("R Square ", str(round(self._df_regression_result.get_rsquare(), 2))),
        ]
        inferenceTuple = ()
        coeff = self._df_regression_result.get_coeff(measure_column)
        if coeff > 0:
            inferenceTuple = ("Inference", "For every additional unit of increase in {} there will be an increase of {} units in {} (target).".format(measure_column, str(round(coeff, 2)), self._dataframe_context.get_result_column()))
        else:
            inferenceTuple = ("Inference", "For every additional unit of decrease in {} there will be an decrease of {} units in {} (target).".format(measure_column, str(round(coeff, 2)), self._dataframe_context.get_result_column()))
        if len(inferenceTuple) > 0:
            statistical_info_array.append(inferenceTuple)
        statistical_info_array = NarrativesUtils.statistical_info_array_formatter(
            statistical_info_array)
        card4paragraphs.insert(
            2, C3ChartData(data=card4Chart, info=statistical_info_array))
        measureCard1Data += card4paragraphs

        self.narratives['cards'].append(measure_column_cards)
        if count == 0:
            # Only the first measure contributes to the executive summary;
            # charts are not serializable there.
            card4data.pop("charts")
            self._result_setter.update_executive_summary_data(card4data)
        count += 1
        measureCard1.set_card_data(measureCard1Data)
        # BUGFIX: measureCard1 was previously added a second time
        # unconditionally after the dimension-level branch had already
        # attached [measureCard1, measureCard2]; make the plain add the
        # else branch instead.
        if self._run_dimension_level_regression:
            measureCard2.set_card_data(measureCard2Data)
            sigMeasureNode.add_cards([measureCard1, measureCard2])
        else:
            sigMeasureNode.add_cards([measureCard1])
        self._regressionNode.add_a_node(sigMeasureNode)
    self._story_narrative.add_a_node(self._regressionNode)
class OneWayAnovaNarratives:
    """Builds narrative cards describing the impact of one dimension on a measure,
    based on a one-way ANOVA result.

    Produces up to three cards:
      * card 1 - overview of the dimension's effect (totals/averages per level,
        ANOVA statistics, top/bottom contributing groups),
      * card 2 - trend-over-time comparison of the top level vs the overall measure
        (only when trend data with a positive duration is available),
      * card 3 - a share-vs-growth "decision matrix" (only when the dimension has
        >= 5 levels).

    NOTE(review): this is Python 2 code (``print`` statements) and uses the
    long-removed pandas ``.ix`` indexer and positional ``Series.argmax`` semantics
    throughout - it will not run on Python 3 / modern pandas without porting.
    """

    # Share threshold constant; not referenced anywhere in this class as visible here.
    THRESHHOLD_TOTAL = 0.75
    # Significance level quoted in the narrative text (p-value cutoff).
    ALPHA = 0.05

    #@accepts(object, (str, basestring), (str, basestring), OneWayAnovaResult)
    def __init__(self, df_context, measure_column, dimension_column,
                 measure_anova_result, trend_result, result_setter,
                 dimensionNode, base_dir):
        """Capture inputs, derive trend metadata, and immediately generate narratives.

        Parameters
        ----------
        df_context : project context object; queried for job type and custom analysis.
        measure_column : str, target measure name.
        dimension_column : str, dimension analyzed against the measure.
        measure_anova_result : project ANOVA result object for the measure.
        trend_result : accepted but never stored - NOTE(review):
            ``generate_trending_comments`` reads ``self._trend_result``, which is
            never assigned here, so that method would raise AttributeError if its
            (currently commented-out) call site were re-enabled.
        result_setter : collector for cards/narratives shared across the story.
        dimensionNode : story-tree node that finished cards are attached to.
        base_dir : directory containing the HTML narrative templates.
        """
        self._dataframe_context = df_context
        self._dimensionNode = dimensionNode
        self._result_setter = result_setter
        self._measure_column = measure_column
        # Capitalize only the first character, preserving the rest of the name.
        self._measure_column_capitalized = '%s%s' % (measure_column[0].upper(),
                                                     measure_column[1:])
        self._dimension_column = dimension_column
        self._dimension_column_capitalized = '%s%s' % (dimension_column[0].upper(),
                                                       dimension_column[1:])
        self._measure_anova_result = measure_anova_result
        self._dimension_anova_result = self._measure_anova_result.get_one_way_anova_result(
            self._dimension_column)
        self._overall_trend_data = self._measure_anova_result.get_trend_data()
        # No trend data => duration 0, which suppresses card 2 (and card 3 attachment).
        if self._overall_trend_data:
            self._dataLevel = self._overall_trend_data.get_data_level()
            self._trendDuration = self._overall_trend_data.get_duration()
        else:
            self._trendDuration = 0
            self._dataLevel = None
        self._dimension_trend_data = self._measure_anova_result.get_topLevelDfAnovaResult(
            self._dimension_column).get_trend_data()
        # Markers consumed by NarrativesUtils.block_splitter when chunking HTML output.
        self._blockSplitter = "|~NEWBLOCK~|"
        self._highlightFlag = "|~HIGHLIGHT~|"
        # self.effect_size = anova_result.get_effect_size()
        self.card1 = ''
        self.card2 = ''
        self.card3 = ''
        self._base_dir = base_dir
        # True when this dimension is one the user asked to have binned; selects the
        # "*_binned_IV" template variants below.
        self._binAnalyzedCol = False
        customAnalysis = self._dataframe_context.get_custom_analysis_details()
        if customAnalysis != None:
            binnedColObj = [x["colName"] for x in customAnalysis]
            if binnedColObj != None and (self._dimension_column in binnedColObj):
                self._binAnalyzedCol = True
        print "BinAnalyzedCol..........."
        print self._binAnalyzedCol
        # All narrative generation happens eagerly at construction time.
        self._generate_narratives()

    def _generate_narratives(self):
        """Drive card generation and attach finished cards to the dimension node.

        Card 1 is always generated (it also feeds scored-data narratives for
        prediction jobs and sets self._card3_required). Cards 2/3 and node
        attachment only happen for non-prediction jobs.
        """
        self._card3_required = False
        self._generate_card1()
        if self._dataframe_context.get_job_type() != "prediction":
            print "duration is ", self._trendDuration
            if self._trendDuration > 0:
                self._generate_card2()
            if self._card3_required:
                self._generate_card3()
            self._dimensionNode.add_a_card(self._anovaCard1)
            # Card 3 is attached only when trend data also exists.
            if self._card3_required and self._trendDuration > 0:
                self._dimensionNode.add_a_card(self._anovaCard3)

    def _generate_title(self):
        """Set the narrative title string. Not called from within this class."""
        self.title = 'Impact of %s on %s' % (self._dimension_column_capitalized,
                                             self._measure_column_capitalized)

    def _get_c3chart_card1_chart1(self, total, average):
        """Build the card-1 bar ChartJson: per-level totals (y) and averages (y2).

        ``total`` and ``average`` are dicts keyed by dimension level; rows are
        sorted by descending total.
        """
        data = []
        for key in total:
            data.append({
                'dimension': str(key),
                'total': total[key],
                'average': average[key]
            })
        data = sorted(data, key=lambda x: x["total"], reverse=True)
        output = ChartJson(data=NormalChartData(data).get_data(),
                           axes={
                               'x': 'dimension',
                               'y': 'total',
                               'y2': 'average'
                           },
                           label_text={
                               'x': self._dimension_column_capitalized,
                               'y': 'Total ' + self._measure_column_capitalized,
                               'y2': 'Average ' + self._measure_column_capitalized
                           },
                           chart_type='bar')
        return output

    def _get_c3chart_trend(self, data, x, y, y2):
        """Build the card-2 line ChartJson from the trend dict ``data``.

        ``x``/``y``/``y2`` are the keys of ``data`` (time period, overall measure,
        subset measure); rows are re-keyed to k1/k2/k3 with a legend mapping back
        to the original labels.
        """
        key_list = ['k1', 'k2', 'k3']
        data_c3 = []
        for row in zip(data[x], data[y], data[y2]):
            row_data = dict(zip(key_list, row))
            # Timestamps become ISO-ish date strings; anything else falls back to str().
            # NOTE(review): bare except silently swallows all errors here.
            try:
                row_data["k1"] = str(row_data["k1"].to_datetime().date())
            except:
                row_data["k1"] = str(row_data["k1"])
            data_c3.append(row_data)
        json_chart = ChartJson(data=NormalChartData(data_c3).get_data(),
                               axes={
                                   'x': 'k1',
                                   'y': 'k2',
                                   'y2': 'k3'
                               },
                               label_text={
                                   'x': x,
                                   'y': y,
                                   'y2': y2
                               },
                               legend={
                                   "k1": x,
                                   "k2": y,
                                   "k3": y2
                               },
                               chart_type='line')
        json_chart.set_y2axis_number_format(".2s")
        json_chart.set_yaxis_number_format(".2s")
        return json_chart

    def _get_card3_scatterchart(self, data_c3):
        """Wrap pre-built c3 column data in a scatter ChartJson (tooltip variant)."""
        return ChartJson(data=NormalChartData(data_c3).get_data(),
                         chart_type='scatter_tooltip')

    def _generate_card1(self):
        """Build the overview card: per-level chart, ANOVA stats, and the
        top/bottom-contributor narrative chosen by a uniformity heuristic.

        Side effects: sets self._anovaCard1, self.card1, self._card3_required,
        and pushes chart + narrative into the result setter (used for scoring).
        """
        self._anovaCard1 = NormalCard(name='Impact on ' +
                                      self._measure_column_capitalized)
        lines = []
        lines += NarrativesUtils.block_splitter(
            '<h3>' + self._measure_column_capitalized + ': Impact of ' +
            self._dimension_column_capitalized + ' on ' +
            self._measure_column_capitalized + '</h3>', self._blockSplitter)
        self.card1 = Card('Impact of ' + self._dimension_column_capitalized +
                          ' on ' + self._measure_column_capitalized)
        # Level dataframe columns: levels / total / average / count.
        # NOTE(review): presumably pandas Series (argmax is used below) - confirm.
        dim_table = self._dimension_anova_result.get_level_dataframe()
        # print dim_table
        keys = dim_table['levels']
        totals = dim_table['total']
        means = dim_table['average']
        counts = dim_table['count']  # NOTE(review): unused below.
        # The decision-matrix card is only worthwhile with at least 5 levels.
        if len(keys) >= 5:
            self._card3_required = True
        group_by_total = {}
        group_by_mean = {}
        for k, t, m in zip(keys, totals, means):
            group_by_total[k] = t
            group_by_mean[k] = m
        chart1 = chart(data=group_by_total, labels={
            self._dimension_column_capitalized: self._measure_column_capitalized
        })
        chart2 = chart(data=group_by_mean, labels={
            self._dimension_column_capitalized: self._measure_column_capitalized
        })
        self.card1.add_chart('group_by_total', chart1)
        self.card1.add_chart('group_by_mean', chart2)
        # st_info = ["Test : ANOVA", "p-value: 0.05", "F-stat: "+str(round(self._dimension_anova_result.get_f_value(),2))]
        statistical_info_array = [
            ("Test Type", "ANOVA"),
            ("P-Value", "0.05"),
            ("F Value", str(round(self._dimension_anova_result.get_f_value(), 2))),
            ("Inference",
             "There is a significant effect of {} on {} (target).".format(
                 self._dimension_column_capitalized,
                 self._measure_column_capitalized))
        ]
        statistical_info_array = NarrativesUtils.statistical_info_array_formatter(
            statistical_info_array)
        card1_chart1 = C3ChartData(data=self._get_c3chart_card1_chart1(
            group_by_total, group_by_mean), info=statistical_info_array)
        # Also published for the scored-data (prediction) view of this analysis.
        self._result_setter.set_anova_chart_on_scored_data(
            {self._dimension_column: card1_chart1})
        lines += [card1_chart1]
        # top_group_by_total = keys[totals.index(max(totals))]
        # NOTE(review): positional Series.argmax - deprecated; idxmax/values.argmax
        # would be the modern equivalents.
        top_group_by_total = keys[totals.argmax()]
        sum_top_group_by_total = max(totals)
        avg_top_group_by_total = means[totals.argmax()]
        bubble1 = BubbleData(
            NarrativesUtils.round_number(sum_top_group_by_total, 1),
            top_group_by_total + ' is the largest contributor to ' +
            self._measure_column)
        # self.card1.add_bubble_data(bubble1)
        top_group_by_mean = keys[means.argmax()]
        sum_top_group_by_mean = totals[means.argmax()]
        avg_top_group_by_mean = max(means)
        bubble2 = BubbleData(
            NarrativesUtils.round_number(avg_top_group_by_mean, 1),
            top_group_by_mean + ' has the highest average ' +
            self._measure_column)
        # self.card1.add_bubble_data(bubble2)
        # Uniformity heuristic: totals are "uniform" unless the top half of levels
        # out-contributes the bottom half by at least 15% of the grand total.
        groups_by_total = sorted(zip(totals, keys), reverse=True)
        sum_total = sum(totals)
        uniformly_distributed = True
        five_percent_total = 0.05 * sum_total
        fifteen_percent_total = 0.15 * sum_total
        sorted_total = sorted(totals, reverse=True)
        if len(groups_by_total) % 2 == 0:
            fifty_percent_index = int(len(groups_by_total) / 2)
            top_fifty_total = sum(sorted_total[:fifty_percent_index])
            bottom_fifty_total = sum(sorted_total[fifty_percent_index:])
            if top_fifty_total - bottom_fifty_total >= fifteen_percent_total:
                uniformly_distributed = False
        else:
            # Odd count: the median level is counted in both halves.
            fifty_percent_index = int(len(groups_by_total) / 2) + 1
            top_fifty_total = sum(sorted_total[:fifty_percent_index])
            bottom_fifty_total = sum(sorted_total[fifty_percent_index - 1:])
            if top_fifty_total - bottom_fifty_total >= fifteen_percent_total:
                uniformly_distributed = False
        top_groups = None
        top_groups_contribution = None
        if (not uniformly_distributed) and len(groups_by_total) > 2:
            max_diff = 0  # NOTE(review): unused.
            # Largest gap between consecutive sorted totals (ignoring the first gap)
            # marks the cut between "top groups" and the rest.
            diffs = [
                sorted_total[i] - sorted_total[i + 1]
                for i in range(fifty_percent_index)
            ]
            max_diff_index = diffs.index(max(diffs[1:]))
            top_groups = [k for t, k in groups_by_total[:max_diff_index + 1]]
            top_groups_contribution = sum(
                sorted_total[:max_diff_index + 1]) * 100 / sum_total
            # Bottom groups: accumulate smallest levels until they reach 5% of total.
            bottom_groups = []
            bottom_groups_contribution = 0
            for t, k in groups_by_total[:0:-1]:
                bottom_groups.append(k)
                bottom_groups_contribution = bottom_groups_contribution + t
                if bottom_groups_contribution >= five_percent_total:
                    break
            bottom_groups_contribution = bottom_groups_contribution * 100 / sum_total
        elif not uniformly_distributed:
            # Exactly two levels: top is the larger, bottom the smaller.
            top_groups = [groups_by_total[0][1]]
            top_groups_contribution = groups_by_total[0][0] * 100 / sum_total
            bottom_groups = [groups_by_total[1][1]]
            bottom_groups_contribution = groups_by_total[1][0] * 100 / sum_total
        elif uniformly_distributed:
            top_groups = []
            top_groups_contribution = 0
            bottom_groups = []
            bottom_groups_contribution = 0
        num_groups = len(keys)
        # Template context for anova_template_3[_binned_IV].html.
        data_dict = {
            'uniformly_distributed': uniformly_distributed,
            'top_groups': top_groups,
            'num_top_groups': len(top_groups),
            'top_groups_percent':
            NarrativesUtils.round_number(top_groups_contribution, 2),
            'dimension_name': self._dimension_column,
            'plural_dimension_name':
            NarrativesUtils.pluralize(self._dimension_column),
            'measure_name': self._measure_column,
            'best_category_by_mean': top_group_by_mean,
            'best_category_by_mean_cont':
            round(100.0 * sum_top_group_by_mean / sum(totals), 2),
            'best_category_by_mean_avg':
            NarrativesUtils.round_number(avg_top_group_by_mean, 2),
            'best_category_by_total': top_group_by_total,
            'best_category_by_total_cont':
            round(100.0 * sum_top_group_by_total / sum(totals), 2),
            'best_category_by_total_avg':
            NarrativesUtils.round_number(avg_top_group_by_total, 2),
            'best_category_by_total_sum':
            NarrativesUtils.round_number(sum_top_group_by_total, 2),
            'bottom_groups': bottom_groups,
            'num_bottom_groups': len(bottom_groups),
            'bottom_groups_percent':
            NarrativesUtils.round_number(bottom_groups_contribution, 2),
            'num_groups': num_groups
        }
        output = {'header': 'Overview', 'content': []}
        if self._binAnalyzedCol == True:
            narrativeText = NarrativesUtils.get_template_output(
                self._base_dir, 'anova_template_3_binned_IV.html', data_dict)
            output['content'].append(narrativeText)
            self._result_setter.set_anova_narrative_on_scored_data(
                {self._dimension_column: narrativeText})
        else:
            narrativeText = NarrativesUtils.get_template_output(
                self._base_dir, 'anova_template_3.html', data_dict)
            output['content'].append(narrativeText)
            self._result_setter.set_anova_narrative_on_scored_data(
                {self._dimension_column: narrativeText})
        for cnt in output['content']:
            lines += NarrativesUtils.block_splitter(cnt, self._blockSplitter)
        self._anovaCard1.set_card_data(lines)
        self.card1.add_paragraph(dict(output))
        self._result_setter.set_anova_cards_regression_score(self.card1)
        # self.generate_top_dimension_narratives()

    def generate_top_dimension_narratives(self):
        """Narrate the key factors influencing the measure within the top level
        of this dimension (anova_template_4 variants).

        Appends to self._anovaCard1 / self.card1. Currently only referenced from
        a commented-out call at the end of _generate_card1.
        """
        topLevelAnova = self._measure_anova_result.get_topLevelDfAnovaResult(
            self._dimension_column)
        # print topLevelAnova
        top_level = topLevelAnova.get_top_level_name()
        # print top_level
        # tuple of (dimension name,anovaResult,effect_size)
        top_level_sig_dimensions = topLevelAnova.get_top_significant_dimensions(
            3)
        significant_dimensions = [x[0] for x in top_level_sig_dimensions]
        print significant_dimensions
        # Per significant dimension: its level contributions and the combined
        # share of its top 3 levels.
        contributorDict = {}
        for idx, obj in enumerate(top_level_sig_dimensions):
            leveldf = obj[1].get_level_dataframe()
            levelContribution = self.compute_level_contributions(leveldf)
            contributorDict[obj[0]] = {"level": levelContribution}
            totalCont = round(np.sum([c[1] for c in levelContribution[:3]]), 2)
            contributorDict[obj[0]].update({"total": totalCont})
        print contributorDict
        print "data dict started"
        data_dict = {
            'sig_dims': significant_dimensions,
            'num_sig_dims': len(significant_dimensions),
            'contributorDict': contributorDict,
            # 'top1_contributors' : top1_contributors,
            # 'top1_contribution' : NarrativesUtils.round_number(top1_contribution,2),
            # 'num_top1_contributors' : len(top1_contributors),
            # 'top2_contributors' : top2_contributors,
            # 'top2_contribution' : NarrativesUtils.round_number(top2_contribution,2),
            # 'num_top2_contributors' : len(top2_contributors),
            # 'top3_contributors' : top3_contributors,
            # 'top3_contribution' : NarrativesUtils.round_number(top3_contribution,2),
            # 'num_top3_contributors' : len(top3_contributors),
            'target': self._measure_column,
            'dimension': self._dimension_column,
            'top_level': top_level,
            'highlightFlag': self._highlightFlag,
            'blockSplitter': self._blockSplitter
        }
        # The initial assignment is immediately overwritten in both branches below.
        output = {
            'header':
            'Key Factors influencing ' + self._measure_column + ' from ' +
            top_level,
            'content': []
        }
        if self._binAnalyzedCol == True:
            output = {
                'header':
                'Key Factors influencing ' + self._measure_column + ' from ' +
                self._dimension_column + ' - ' + top_level,
                'content': []
            }
            output['content'].append(
                NarrativesUtils.get_template_output(
                    self._base_dir, 'anova_template_4_binned_IV.html',
                    data_dict))
        else:
            output = {
                'header':
                'Key Factors influencing ' + self._measure_column + ' from ' +
                top_level,
                'content': []
            }
            output['content'].append(
                NarrativesUtils.get_template_output(self._base_dir,
                                                    'anova_template_4.html',
                                                    data_dict))
        lines = []
        lines += NarrativesUtils.block_splitter(
            '<h4>' + output['header'] + '</h4>', self._blockSplitter)
        for cnt in output['content']:
            lines += NarrativesUtils.block_splitter(
                cnt, self._blockSplitter, highlightFlag=self._highlightFlag)
        self._anovaCard1.add_card_data(lines)
        self.card1.add_paragraph(dict(output))

    def get_contributions_for_dimension(self, significant_dimensions, n,
                                        top_dimension_stats):
        """Return (contributions, total) for the n-th significant dimension.

        Contributions are (percent, level) pairs sorted descending and truncated
        at the largest drop between consecutive values. Returns ('', 0.0) when
        fewer than n+1 significant dimensions exist.
        NOTE(review): the two return branches have different types (list vs str).
        """
        if len(significant_dimensions) > n:
            dimension = significant_dimensions[n]
            contributions = top_dimension_stats.get_contributions(dimension)
            contributions = [(v * 100, k) for k, v in contributions.items()]
            contributions = sorted(contributions, reverse=True)
            diffs = [
                contributions[i][0] - contributions[i + 1][0]
                for i in range(len(contributions) - 1)
            ]
            cutoff = diffs.index(max(diffs))
            contributions = contributions[:cutoff + 1]
            total_contribution = sum([v for v, k in contributions])
            contributions = [(round(v, 2), k) for v, k in contributions]
            return contributions, total_contribution
        return '', 0.0

    def compute_level_contributions(self, df):
        """Return (level, percent-of-total) pairs for the dominant levels of ``df``.

        Levels are sorted by total descending and cut at the row where the
        consecutive difference in totals is largest.
        NOTE(review): diff(1) yields NaN at index 0 and positional argmax is
        deprecated in modern pandas - behavior should be re-verified on upgrade.
        """
        df = df.sort_values(by=['total'], ascending=False)
        df.reset_index(drop=True, inplace=True)
        df['percent'] = (df['total'] * 100 / float(df["total"].sum())).round()
        # calculating the point where maximum difference is occuring
        max_diff_index = df.total.diff(1).argmax()
        df = df.iloc[:max_diff_index + 1]
        return sorted(zip(df['levels'], df['percent']),
                      key=lambda x: x[1],
                      reverse=True)

    def _generate_card2(self):
        """Build the trend card: top level vs overall measure over time, with
        correlation, peaks, and longest above-average streaks (anova_template_6).

        NOTE(review): ``data``/``data_c3`` are only assigned when
        self._dataLevel is "month" or "day"; any other level would raise
        NameError at the chart() call - confirm those are the only levels.
        """
        subset_df = self._dimension_trend_data.get_grouped_data()
        overall_df = self._overall_trend_data.get_grouped_data()
        total_measure = 'Total ' + self._measure_column_capitalized
        # Grouped frames arrive with 2 or 3 columns; rename positionally.
        if len(overall_df.columns) == 3:
            overall_df.columns = ["key", total_measure, "year_month"]
        else:
            overall_df.columns = ["key", total_measure]
        top_level_name = self._measure_anova_result.get_topLevelDfAnovaResult(
            self._dimension_column).get_top_level_name()
        subset_measure = top_level_name + ' ' + self._measure_column_capitalized
        if len(subset_df.columns) == 3:
            subset_df.columns = ['key', subset_measure, "year_month"]
        else:
            subset_df.columns = ['key', subset_measure]
        # Align overall and subset series on their shared time key.
        inner_join = overall_df.merge(subset_df[['key', subset_measure]],
                                      how='inner',
                                      on='key')
        inner_join["key"] = inner_join["key"].apply(lambda x: str(x))
        # print "inner_join", inner_join
        correlation = inner_join[[total_measure, subset_measure
                                  ]].corr()[total_measure][subset_measure]
        if self._dataLevel == "month":
            data = {
                'Time Period': list(inner_join['year_month']),
                total_measure: list(inner_join[total_measure]),
                subset_measure: list(inner_join[subset_measure])
            }
            data_c3 = [['Time Period'] + list(inner_join['year_month']),
                       [total_measure] + list(inner_join[total_measure]),
                       [subset_measure] + list(inner_join[subset_measure])]
        elif self._dataLevel == "day":
            data = {
                'Time Period': list(inner_join['key']),
                total_measure: list(inner_join[total_measure]),
                subset_measure: list(inner_join[subset_measure])
            }
            data_c3 = [['Time Period'] + list(inner_join['key']),
                       [total_measure] + list(inner_join[total_measure]),
                       [subset_measure] + list(inner_join[subset_measure])]
        chart1 = chart(data=data)
        chart1.add_data_c3(data_c3)
        # self.card2.add_chart('trend_chart',chart1)
        self.card1.add_chart('trend_chart', chart1)
        # Percent growth from first to last period, overall and for the top level.
        overall_increase_percent = (overall_df[total_measure].iloc[-1] * 100 /
                                    overall_df[total_measure].iloc[0]) - 100
        subset_increase_percent = (subset_df[subset_measure].iloc[-1] * 100 /
                                   subset_df[subset_measure].iloc[0]) - 100
        overall_peak_index = overall_df[total_measure].argmax()
        overall_peak_value = overall_df[total_measure].ix[overall_peak_index]
        if self._dataLevel == "month":
            overall_peak_date = overall_df['year_month'].ix[overall_peak_index]
        elif self._dataLevel == "day":
            overall_peak_date = overall_df['key'].ix[overall_peak_index]
        subset_peak_index = subset_df[subset_measure].argmax()
        subset_peak_value = subset_df[subset_measure].ix[subset_peak_index]
        if self._dataLevel == "month":
            subset_peak_date = subset_df['year_month'].ix[subset_peak_index]
        elif self._dataLevel == "day":
            subset_peak_date = subset_df['key'].ix[subset_peak_index]
        overall_df['prev'] = overall_df[total_measure].shift(1)
        subset_df['prev'] = subset_df[subset_measure].shift(1)
        if math.isnan(overall_df['prev'].ix[overall_peak_index]):
            overall_peak_increase = 0
        else:
            # NOTE(review): apparent copy-paste bug - the "overall" peak increase
            # is computed from the SUBSET series/index; it duplicates
            # subset_peak_increase below. Presumably should use overall_df /
            # total_measure / overall_peak_index. Left unchanged here.
            overall_peak_increase = (
                subset_df[subset_measure].ix[subset_peak_index] /
                subset_df['prev'].ix[subset_peak_index]) * 100 - 100
        if math.isnan(subset_df['prev'].ix[subset_peak_index]):
            subset_peak_increase = 0
        else:
            subset_peak_increase = (
                subset_df[subset_measure].ix[subset_peak_index] /
                subset_df['prev'].ix[subset_peak_index]) * 100 - 100
        # Longest streak of consecutive above-average periods (see streaks()).
        overall_df['avg_diff'] = overall_df[total_measure] - overall_df[
            total_measure].mean()
        subset_df['avg_diff'] = subset_df[subset_measure] - subset_df[
            subset_measure].mean()
        overall_df = self.streaks(overall_df, 'avg_diff')
        subset_df = self.streaks(subset_df, 'avg_diff')
        overall_longest_streak_end_index = overall_df['u_streak'].argmax()
        overall_longest_streak_contribution = overall_df[total_measure].ix[
            overall_longest_streak_end_index]
        overall_streak_length = int(
            overall_df['u_streak'].ix[overall_longest_streak_end_index])
        # NOTE(review): each iteration OVERWRITES the contribution with the value
        # shifted by i, so only the last iteration survives; an accumulating "+="
        # (summing the streak's values) looks intended. Left unchanged.
        for i in range(1, int(overall_streak_length)):
            overall_longest_streak_contribution = overall_df[
                total_measure].shift(i).ix[overall_longest_streak_end_index]
        overall_longest_streak_contribution = overall_longest_streak_contribution * 100 / overall_df[
            total_measure].sum()
        if self._dataLevel == "month":
            overall_longest_streak_end_date = overall_df['year_month'].ix[
                overall_longest_streak_end_index]
            overall_longest_streak_start_date = overall_df['year_month'].shift(
                overall_streak_length - 1).ix[overall_longest_streak_end_index]
        elif self._dataLevel == "day":
            overall_longest_streak_end_date = overall_df['key'].ix[
                overall_longest_streak_end_index]
            overall_longest_streak_start_date = overall_df['key'].shift(
                overall_streak_length - 1).ix[overall_longest_streak_end_index]
        subset_longest_streak_end_index = subset_df['u_streak'].argmax()
        subset_longest_streak_contribution = subset_df[subset_measure].ix[
            subset_longest_streak_end_index]
        subset_streak_length = int(
            subset_df['u_streak'].ix[subset_longest_streak_end_index])
        # NOTE(review): same overwrite-vs-accumulate concern as above.
        for i in range(1, int(subset_streak_length)):
            subset_longest_streak_contribution = subset_df[
                subset_measure].shift(i).ix[subset_longest_streak_end_index]
        subset_longest_streak_contribution = subset_longest_streak_contribution * 100 / subset_df[
            subset_measure].sum()
        if self._dataLevel == "month":
            subset_longest_streak_end_date = subset_df['year_month'].ix[
                subset_longest_streak_end_index]
            subset_longest_streak_start_date = subset_df['year_month'].shift(
                subset_streak_length - 1).ix[subset_longest_streak_end_index]
        elif self._dataLevel == "day":
            subset_longest_streak_end_date = subset_df['key'].ix[
                subset_longest_streak_end_index]
            subset_longest_streak_start_date = subset_df['key'].shift(
                subset_streak_length - 1).ix[subset_longest_streak_end_index]
        # Template context for anova_template_6[_binned_IV].html.
        data_dict = {
            'correlation': correlation,
            'overall_increase_percent': round(overall_increase_percent, 2),
            'subset_increase_percent': round(subset_increase_percent, 2),
            'overall_peak_value':
            NarrativesUtils.round_number(overall_peak_value, 2),
            'overall_peak_date': overall_peak_date,
            'overall_peak_increase': round(overall_peak_increase, 2),
            'overall_streak_length': overall_streak_length,
            'overall_streak_start_date': overall_longest_streak_start_date,
            'overall_streak_end_date': overall_longest_streak_end_date,
            'overall_streak_contribution':
            round(overall_longest_streak_contribution, 2),
            'subset_peak_value':
            NarrativesUtils.round_number(subset_peak_value, 2),
            'subset_peak_date': subset_peak_date,
            'subset_peak_increase': round(subset_peak_increase, 2),
            'subset_streak_length': subset_streak_length,
            'subset_streak_start_date': subset_longest_streak_start_date,
            'subset_streak_end_date': subset_longest_streak_end_date,
            'subset_streak_contribution':
            round(subset_longest_streak_contribution, 2),
            'target': self._measure_column,
            'top_dimension': top_level_name,
            'dimension': self._dimension_column,
        }
        print "data_dict - For anova_template_6 -------------------"
        print data_dict
        # print json.dumps(data_dict,indent=2)
        if self._binAnalyzedCol == True:
            print "Binned IV"
            output = {}
            output[
                'header'] = "<h4>" + self._dimension_column + " - " + top_level_name + "'s " + self._measure_column + " Performance over time" + "</h4>"
            output['content'] = []
            output['content'].append(
                NarrativesUtils.get_template_output(
                    self._base_dir, 'anova_template_6_binned_IV.html',
                    data_dict))
        else:
            output = {}
            output[
                'header'] = "<h4>" + top_level_name + "'s " + self._measure_column + " Performance over time" + "</h4>"
            output['content'] = []
            output['content'].append(
                NarrativesUtils.get_template_output(self._base_dir,
                                                    'anova_template_6.html',
                                                    data_dict))
        # self.card2.add_paragraph(output)
        lines = []
        lines += [HtmlData(data=output['header'])]
        lines += [
            C3ChartData(
                self._get_c3chart_trend(data, 'Time Period', total_measure,
                                        subset_measure))
        ]
        for cnt in output['content']:
            lines += NarrativesUtils.block_splitter(cnt, self._blockSplitter)
        self._anovaCard1.add_card_data(lines)
        self.card1.add_paragraph(dict(output))
        # self.generate_trending_comments()

    def generate_trending_comments(self):
        """Classify dimension levels into positive (>3%), negative (<-2%) and
        stable growth buckets and render anova_template_7.

        NOTE(review): dead code as shipped - only called from a commented-out
        line, and it reads self._trend_result which __init__ never assigns
        (the trend_result ctor arg is dropped), so calling it would raise
        AttributeError. The rendered output is also never attached to a card.
        """
        grouped_data_frame = self._trend_result.get_grouped_data(
            self._dimension_column)
        grouped_data_frame['increase'] = (
            grouped_data_frame['measure']['last'] -
            grouped_data_frame['measure']['first']
        ) * 100 / grouped_data_frame['measure']['first']
        positive_growth_dimensions = grouped_data_frame['dimension'].ix[
            grouped_data_frame['increase'] > 3]
        negative_growth_dimensions = grouped_data_frame['dimension'].ix[
            grouped_data_frame['increase'] < -2]
        stable_growth_dimensions = grouped_data_frame['dimension'].ix[
            (grouped_data_frame['increase'] >= -2)
            & (grouped_data_frame['increase'] <= 3)]
        positive_growth_values = grouped_data_frame['increase'].ix[
            grouped_data_frame['increase'] > 3]
        negative_growth_values = grouped_data_frame['increase'].ix[
            grouped_data_frame['increase'] < -2]
        # stable_growth_values = grouped_data_frame['increase'].ix[(grouped_data_frame['increase']>=-2) & (grouped_data_frame['increase']<=3)]
        # Order dimension names by their growth values (desc for positive, asc for negative).
        positive_growth_dimensions = [
            i for j, i in sorted(zip(positive_growth_values,
                                     positive_growth_dimensions),
                                 reverse=True)
        ]
        negative_growth_dimensions = [
            i for j, i in sorted(
                zip(negative_growth_values, negative_growth_dimensions))
        ]
        positive_growth_values = sorted(positive_growth_values, reverse=True)
        negative_growth_values = sorted(negative_growth_values)
        overall_growth_rate = self._trend_result.get_overall_growth_percent()
        data_dict = {
            'positive_growth_dimensions': positive_growth_dimensions,
            'negative_growth_dimensions': negative_growth_dimensions,
            'stable_growth_dimensions': stable_growth_dimensions,
            'positive_growth_values': [
                NarrativesUtils.round_number(i, 2)
                for i in positive_growth_values
            ],
            'negative_growth_values': [
                NarrativesUtils.round_number(i, 2)
                for i in negative_growth_values
            ],
            'num_positive_growth_dimensions': len(positive_growth_dimensions),
            'num_negative_growth_dimensions': len(negative_growth_dimensions),
            'num_stable_growth_dimensions': len(stable_growth_dimensions),
            'target': self._measure_column,
            'dimension': self._dimension_column,
            'overall_growth_rate':
            NarrativesUtils.round_number(overall_growth_rate),
        }
        output = {'header': "", 'content': []}
        output['content'].append(
            NarrativesUtils.get_template_output(self._base_dir,
                                                'anova_template_7.html',
                                                data_dict))
        # self.card2.add_paragraph(output)

    def streaks(self, df, col):
        """Annotate ``df`` with running streak lengths of the sign of ``col``.

        u_streak counts consecutive positive values, d_streak (as a positive
        number) counts consecutive negative values; both reset when the sign
        flips. Returns a new frame (df.assign), leaving the input unmodified.
        """
        sign = np.sign(df[col])
        # Group id increments whenever the sign changes; cumsum within a group
        # gives the running streak length (negative for down-streaks).
        s = sign.groupby((sign != sign.shift()).cumsum()).cumsum()
        return df.assign(u_streak=s.where(s > 0, 0.0),
                         d_streak=s.where(s < 0, 0.0).abs())

    def get_category(self, x):
        """Map a row with 'increase' and 'contribution' to a decision-matrix
        quadrant, using the thresholds set in _generate_card3.

        high growth + high share  -> 'Leaders Club'
        high growth + low share   -> 'Playing Safe'
        low growth  + high share  -> 'Opportunity Bay'
        low growth  + low share   -> 'Red Alert'
        """
        if x['increase'] >= self._increase_limit:
            if x['contribution'] >= self._contribution_limit:
                return 'Leaders Club'
            else:
                return 'Playing Safe'
        else:
            if x['contribution'] >= self._contribution_limit:
                return 'Opportunity Bay'
            else:
                return 'Red Alert'

    def _generate_card3(self):
        """Build the decision-matrix card: per-level share vs growth scatter plus
        the anova_template_5 narrative; publishes red-alert levels to the
        executive summary.

        Side effects: sets self._anovaCard3, self.card3, and the
        _increase_limit / _contribution_limit thresholds used by get_category.
        """
        self._anovaCard3 = NormalCard(name=self._dimension_column_capitalized +
                                      '- Decision Matrix')
        self.card3 = Card(self._dimension_column_capitalized + '-' +
                          self._measure_column_capitalized +
                          ' Performance Decision Matrix')
        self.card3.add_paragraph({
            'header':
            '',
            'content':
            'Based on the absolute ' + self._measure_column +
            ' values and the overall growth rates, mAdvisor presents the decision matrix for '
            + self._measure_column + ' for ' + self._dimension_column +
            ' as displayed below.'
        })
        lines = []
        lines += NarrativesUtils.block_splitter(
            '<h3>' + self._dimension_column_capitalized + '-' +
            self._measure_column_capitalized +
            ' Performance Decision Matrix</h3><br>' + 'Based on the absolute ' +
            self._measure_column +
            ' values and the overall growth rates, mAdvisor presents the decision matrix for '
            + self._measure_column + ' for ' + self._dimension_column +
            ' as displayed below.', self._blockSplitter)
        grouped_data_frame = self._dimension_trend_data.get_grouped_data()
        pivot_df = self._dimension_trend_data.get_level_pivot()
        # Period-over-period percent change; first period pinned to 0.
        grouped_data_frame['increase'] = [0] + [
            round((x - y) * 100 / float(y), 2)
            for x, y in zip(grouped_data_frame["value"].iloc[1:],
                            grouped_data_frame["value"])
        ]
        grouped_data_frame['contribution'] = grouped_data_frame[
            'value'] * 100 / float(grouped_data_frame['value'].sum())
        # Quadrant thresholds for get_category: mean share, and mean growth
        # floored at 0 so negative average growth doesn't lower the bar.
        self._contribution_limit = grouped_data_frame['contribution'].mean()
        self._increase_limit = max(0.0, grouped_data_frame['increase'].mean())
        dimensionLevel = list(set(pivot_df.columns) - {"year_month", "key"})
        print dimensionLevel
        share = []
        growth = []
        for lvl in dimensionLevel:
            # Share of the level's total in the overall total (NaN-safe sums).
            lvl_share = float(np.nansum(pivot_df[lvl])) * 100 / np.nansum(
                grouped_data_frame["value"])
            share.append(lvl_share)
            # Growth from the level's first to last non-NaN value.
            lvl_val_array = list(pivot_df[lvl][~np.isnan(pivot_df[lvl])])
            lvl_growth = float(lvl_val_array[-1] -
                               lvl_val_array[0]) * 100 / lvl_val_array[0]
            growth.append(lvl_growth)
        tempDf = pd.DataFrame({
            "dimension": dimensionLevel,
            "increase": growth,
            "contribution": share
        })
        tempDf['category'] = tempDf.apply(self.get_category, axis=1)
        data = {
            'Share of ' + self._measure_column: list(tempDf['contribution']),
            self._measure_column_capitalized + ' growth':
            list(tempDf['increase']),
            self._dimension_column: list(tempDf['dimension']),
            'Category': list(tempDf['category']),
        }
        # data_c3 = [[self._measure_column_capitalized+' growth'] + list(grouped_data_frame['increase']),
        # ['Share of '+self._measure_column] + list(grouped_data_frame['contribution']),
        # [self._dimension_column] + list(grouped_data_frame['dimension']),
        # ['Category'] + list(grouped_data_frame['category'])]
        growth = list(tempDf['increase'])
        share = list(tempDf['contribution'])
        label = list(tempDf['dimension'])
        category_legend = list(tempDf['category'])
        # Sort all four parallel lists by ascending share.
        all_data = sorted(zip(share, growth, label, category_legend))
        share = [i[0] for i in all_data]
        growth = [i[1] for i in all_data]
        label = [i[2] for i in all_data]
        category_legend = [i[3] for i in all_data]
        # Swap the two middle quadrant labels for display only.
        # NOTE(review): presumably compensates for a front-end legend quirk, since
        # the data_dict below keeps the original (unswapped) categories - confirm.
        modified_category_legend = []
        for val in category_legend:
            if val == "Playing Safe":
                modified_category_legend.append("Opportunity Bay")
            elif val == "Opportunity Bay":
                modified_category_legend.append("Playing Safe")
            else:
                modified_category_legend.append(val)
        category_legend = modified_category_legend
        data_c3 = [['Growth'] + growth, ['Share'] + share,
                   [self._dimension_column] + label,
                   ['Category'] + category_legend]
        decisionMatrixChartJson = ChartJson(
            data=NormalChartData(data_c3).get_data(),
            chart_type='scatter_tooltip')
        decisionMatrixChartJson.set_legend(
            {"legendWillNotBeUsed": "legendWillNotBeUsed"})
        decisionMatrixChartJson.set_label_text({
            'x': 'Percentage share of ' + self._measure_column,
            'y': "Growth over time"
        })
        lines += [C3ChartData(decisionMatrixChartJson)]
        chart_data = chart(data=data, labels={})
        chart_data.add_data_c3(data_c3)
        self.card3.add_chart('decision_matrix', chart_data)
        leaders_club = list(
            tempDf['dimension'][tempDf['category'] == 'Leaders Club'])
        playing_safe = list(
            tempDf['dimension'][tempDf['category'] == 'Playing Safe'])
        opportunity_bay = list(
            tempDf['dimension'][tempDf['category'] == 'Opportunity Bay'])
        red_alert = list(
            tempDf['dimension'][tempDf['category'] == 'Red Alert'])
        # Template context for anova_template_5.html.
        data_dict = {
            'leaders_club': leaders_club,
            'playing_safe': playing_safe,
            'opportunity_bay': opportunity_bay,
            'red_alert': red_alert,
            'num_leaders_club': len(leaders_club),
            'num_playing_safe': len(playing_safe),
            'num_opportunity_bay': len(opportunity_bay),
            'num_red_alert': len(red_alert),
            'target': self._measure_column,
            'dimension': self._dimension_column
        }
        # Red-alert levels surface in the executive summary for this dimension.
        executive_summary_data = {}
        executive_summary_data[self._dimension_column] = {
            "num_red_alert": len(red_alert),
            "red_alert": red_alert
        }
        self._result_setter.update_executive_summary_data(
            executive_summary_data)
        output = {'header': '', 'content': []}
        output['content'].append(
            NarrativesUtils.get_template_output(self._base_dir,
                                                'anova_template_5.html',
                                                data_dict))
        self.card3.add_paragraph(output)
        for cnt in output['content']:
            lines += NarrativesUtils.block_splitter(cnt, self._blockSplitter)
        self._anovaCard3.set_card_data(lines)
def _generate_narratives(self):
    """
    generate main card narrative and remaining cards are generated by calling
    ChiSquareAnalysis class for each of analyzed dimensions

    For every target dimension in the chi-square results this builds:
      * a "Key Influencers" main card (effect-size bar chart + templated narrative),
      * one ChiSquareAnalysis card per significant analysed dimension.
    Results are pushed into self.narratives, self._chiSquareNode and
    self._result_setter.  (Python 2 code: uses print statements.)
    """
    for target_dimension in self._df_chisquare_result.keys():
        target_chisquare_result = self._df_chisquare_result[target_dimension]
        analysed_variables = target_chisquare_result.keys()  ## List of all analyzed var.
        # List of significant var out of analyzed var.
        # Significance test: p-value <= 0.05.
        significant_variables = [
            dim for dim in target_chisquare_result.keys()
            if target_chisquare_result[dim].get_pvalue() <= 0.05
        ]
        effect_sizes = [
            target_chisquare_result[dim].get_effect_size()
            for dim in significant_variables
        ]
        effect_size_dict = dict(zip(significant_variables, effect_sizes))
        # Re-order the significant variables by descending effect size.
        significant_variables = [
            y for (x, y) in sorted(zip(effect_sizes, significant_variables),
                                   reverse=True)
        ]
        #insignificant_variables = [i for i in self._df_chisquare_result[target_dimension] if i['pv']>0.05]
        num_analysed_variables = len(analysed_variables)
        num_significant_variables = len(significant_variables)
        self.narratives['main_card'] = {}
        self.narratives['main_card']['heading'] = 'Relationship between ' + target_dimension + ' and other factors'
        self.narratives['main_card']['paragraphs'] = {}
        data_dict = {
            'num_variables': num_analysed_variables,
            'num_significant_variables': num_significant_variables,
            'significant_variables': significant_variables,
            'target': target_dimension,
            'analysed_dimensions': analysed_variables,
            'blockSplitter': self._blockSplitter
        }
        # for both para 1 and para 2
        paragraph = {}
        paragraph['header'] = ''
        paragraph['content'] = NarrativesUtils.get_template_output(
            self._base_dir, 'main_card.html', data_dict)
        self.narratives['main_card']['paragraphs'] = [paragraph]
        self.narratives['cards'] = []
        chart = {
            'header': 'Strength of association between ' + target_dimension + ' and other dimensions'
        }
        chart['data'] = effect_size_dict
        chart['label_text'] = {
            'x': 'Dimensions',
            'y': 'Effect Size (Cramers-V)'
        }
        chart_data = []
        chartDataValues = []
        for k, v in effect_size_dict.items():
            chart_data.append({"key": k, "value": float(v)})
            chartDataValues.append(float(v))
        # Sort bars by descending effect size for display.
        chart_data = sorted(chart_data, key=lambda x: x["value"], reverse=True)
        chart_json = ChartJson()
        chart_json.set_data(chart_data)
        chart_json.set_chart_type("bar")
        # chart_json.set_label_text({'x':'Dimensions','y':'Effect Size (Cramers-V)'})
        chart_json.set_label_text({
            'x': ' ',
            'y': 'Effect Size (Cramers-V)'
        })
        chart_json.set_axis_rotation(True)
        chart_json.set_axes({"x": "key", "y": "value"})
        # chart_json.set_yaxis_number_format(".4f")
        chart_json.set_yaxis_number_format(
            NarrativesUtils.select_y_axis_format(chartDataValues))
        self.narratives['main_card']['chart'] = chart
        main_card = NormalCard()
        header = "<h3>Strength of association between " + target_dimension + " and other dimensions</h3>"
        main_card_data = [HtmlData(data=header)]
        main_card_narrative = NarrativesUtils.get_template_output(
            self._base_dir, 'main_card.html', data_dict)
        main_card_narrative = NarrativesUtils.block_splitter(
            main_card_narrative, self._blockSplitter)
        main_card_data += main_card_narrative
        # st_info = ["Test : Chi Square", "Threshold for p-value : 0.05", "Effect Size : Cramer's V"]
        # print "chartdata",chart_data
        if len(chart_data) > 0:
            # chart_data is sorted descending, so [0] is the max and [-1] the min.
            statistical_info_array = [
                ("Test Type", "Chi-Square"),
                ("Effect Size", "Cramer's V"),
                ("Max Effect Size", chart_data[0]["key"]),
                ("Min Effect Size", chart_data[-1]["key"]),
            ]
            # NOTE(review): "statistical_inferenc" (missing final 'e') is a dead
            # variable — every branch below assigns "statistical_inference", so
            # this initializer is never read. Looks like a typo to clean up.
            statistical_inferenc = ""
            if len(chart_data) == 1:
                statistical_inference = "{} is the only variable that have significant association with the {} (Target) having an \
Effect size of {}".format(
                    chart_data[0]["key"],
                    self._dataframe_context.get_result_column(),
                    round(chart_data[0]["value"], 4))
            elif len(chart_data) == 2:
                statistical_inference = "There are two variables ({} and {}) that have significant association with the {} (Target) and the \
Effect size ranges are {} and {} respectively".format(
                    chart_data[0]["key"], chart_data[1]["key"],
                    self._dataframe_context.get_result_column(),
                    round(chart_data[0]["value"], 4),
                    round(chart_data[1]["value"], 4))
            else:
                statistical_inference = "There are {} variables that have significant association with the {} (Target) and the \
Effect size ranges from {} to {}".format(
                    len(chart_data),
                    self._dataframe_context.get_result_column(),
                    round(chart_data[0]["value"], 4),
                    round(chart_data[-1]["value"], 4))
            if statistical_inference != "":
                statistical_info_array.append(
                    ("Inference", statistical_inference))
            statistical_info_array = NarrativesUtils.statistical_info_array_formatter(
                statistical_info_array)
        else:
            statistical_info_array = []
        main_card_data.append(
            C3ChartData(data=chart_json, info=statistical_info_array))
        main_card.set_card_data(main_card_data)
        main_card.set_card_name("Key Influencers")
        # Only attach the main card to the story node for non-scored-data runs.
        if self._storyOnScoredData != True:
            self._chiSquareNode.add_a_card(main_card)
        # NOTE(review): statement order here is reconstructed from a collapsed
        # source line — confirm whether this call belongs inside the if above.
        self._result_setter.add_a_score_chi_card(main_card)
        print "target_dimension", target_dimension
        # Cap how many significant variables get their own analysis card.
        if self._appid == '2' and num_significant_variables > 5:
            significant_variables = significant_variables[:5]
        else:
            if self._nColsToUse != None:
                significant_variables = significant_variables[:self._nColsToUse]
        CommonUtils.create_update_and_save_progress_message(
            self._dataframe_context,
            self._scriptWeightDict,
            self._scriptStages,
            self._analysisName,
            "custom",
            "info",
            display=True,
            customMsg="Analyzing key drivers",
            weightKey="narratives")
        # One ChiSquareAnalysis card per retained significant dimension.
        for analysed_dimension in significant_variables[:self._noOfSigDimsToShow]:
            chisquare_result = self._df_chisquare.get_chisquare_result(
                target_dimension, analysed_dimension)
            if self._appid == '2':
                print "APPID 2 is used"
                card = ChiSquareAnalysis(
                    self._dataframe_context, self._dataframe_helper,
                    chisquare_result, target_dimension, analysed_dimension,
                    significant_variables, num_analysed_variables,
                    self._data_frame, self._measure_columns, self._base_dir,
                    None, target_chisquare_result)
                # self.narratives['cards'].append(card)
                self._result_setter.add_a_score_chi_card(
                    json.loads(
                        CommonUtils.convert_python_object_to_json(
                            card.get_dimension_card1())))
            elif self._appid == '1':
                print "APPID 1 is used"
                card = ChiSquareAnalysis(
                    self._dataframe_context, self._dataframe_helper,
                    chisquare_result, target_dimension, analysed_dimension,
                    significant_variables, num_analysed_variables,
                    self._data_frame, self._measure_columns, self._base_dir,
                    None, target_chisquare_result)
                # self.narratives['cards'].append(card)
                self._result_setter.add_a_score_chi_card(
                    json.loads(
                        CommonUtils.convert_python_object_to_json(
                            card.get_dimension_card1())))
            else:
                # Default app: full dimension node added to the story tree.
                target_dimension_card = ChiSquareAnalysis(
                    self._dataframe_context, self._dataframe_helper,
                    chisquare_result, target_dimension, analysed_dimension,
                    significant_variables, num_analysed_variables,
                    self._data_frame, self._measure_columns, self._base_dir,
                    None, target_chisquare_result)
                self.narratives['cards'].append(target_dimension_card)
                self._chiSquareNode.add_a_node(
                    target_dimension_card.get_dimension_node())
        # NOTE(review): placement relative to the outer target_dimension loop is
        # reconstructed — confirm these run once per target, not once overall.
        self._story_narrative.add_a_node(self._chiSquareNode)
        self._result_setter.set_chisquare_node(self._chiSquareNode)
def _generate_narratives(self):
    """
    Build the narrative cards for one (target dimension, analysed dimension) pair.

    Derives summary statistics from the chi-square contingency table (level
    totals, top/bottom contributors, percentage shares for the two most
    frequent target levels), then renders:
      * CARD1 - relationship overview (heat-map table toggle + templated text),
      * CARD2 - per-target-level distribution cards (only when appid is None).

    Returns a dict {'data_dict': ..., 'target_dict': ...} with the template
    inputs, and appends cards to self._card1 / self._targetCards as a side
    effect.  (Python 2 code: uses print statements.)
    """
    chisquare_result = self._chisquare_result
    target_dimension = self._target_dimension
    analysed_dimension = self._analysed_dimension
    significant_variables = self._significant_variables
    num_analysed_variables = self._num_analysed_variables
    table = self._chiSquareTable
    total = self._chiSquareTable.get_total()

    # ---- Analysed-dimension level statistics -------------------------------
    levels = self._chiSquareTable.get_column_two_levels()
    level_counts = self._chiSquareTable.get_column_total()
    levels_count_sum = sum(level_counts)
    levels_percentages = [
        i * 100.0 / levels_count_sum for i in level_counts
    ]
    sorted_levels = sorted(zip(level_counts, levels), reverse=True)
    # Gaps between consecutive sorted counts; the largest gap marks where the
    # "top" group of levels ends.
    level_differences = [0.0] + [
        sorted_levels[i][0] - sorted_levels[i + 1][0]
        for i in range(len(sorted_levels) - 1)
    ]
    top_dims = [
        j for i, j in
        sorted_levels[:level_differences.index(max(level_differences))]
    ]
    top_dims_contribution = sum([
        i for i, j in
        sorted_levels[:level_differences.index(max(level_differences))]
    ])
    bottom_dim = sorted_levels[-1][1]
    bottom_dim_contribution = sorted_levels[-1][0]
    # All levels tied with the smallest count.
    bottom_dims = [
        y for x, y in sorted_levels if x == bottom_dim_contribution
    ]

    # ---- Target-dimension level statistics ---------------------------------
    target_levels = self._chiSquareTable.get_column_one_levels()
    target_counts = self._chiSquareTable.get_row_total()
    sorted_target_levels = sorted(zip(target_counts, target_levels),
                                  reverse=True)
    # Assumes at least two target levels exist — TODO confirm upstream guarantee.
    top_target_count, top_target = sorted_target_levels[0]
    second_target_count, second_target = sorted_target_levels[1]

    # ---- Stats for the most frequent target level --------------------------
    top_target_contributions = [
        table.get_value(top_target, i) for i in levels
    ]
    sum_top_target = sum(top_target_contributions)
    sorted_levels = sorted(zip(top_target_contributions, levels),
                           reverse=True)
    level_differences = [0.0] + [
        sorted_levels[i][0] - sorted_levels[i + 1][0]
        for i in range(len(sorted_levels) - 1)
    ]
    top_target_top_dims = [
        j for i, j in
        sorted_levels[:level_differences.index(max(level_differences))]
    ]
    top_target_top_dims_contribution = sum([
        i for i, j in
        sorted_levels[:level_differences.index(max(level_differences))]
    ])
    top_target_bottom_dim = sorted_levels[-1][1]
    top_target_bottom_dim_contribution = sorted_levels[-1][0]
    top_target_percentages = [
        i * 100.0 / sum_top_target for i in top_target_contributions
    ]
    best_top_target_index = top_target_contributions.index(
        max(top_target_contributions))
    worst_top_target_index = top_target_contributions.index(
        min(top_target_contributions))
    # Over/under-representation of the top target level vs the overall mix.
    top_target_differences = [
        x - y for x, y in zip(levels_percentages, top_target_percentages)
    ]
    # How many best/worst differences to report, scaled by level count.
    if len(top_target_differences) > 6:
        tops = 2
        bottoms = -2
    elif len(top_target_differences) > 4:
        tops = 2
        bottoms = -1
    else:
        tops = 1
        bottoms = -1
    sorted_ = sorted(enumerate(top_target_differences),
                     key=lambda x: x[1],
                     reverse=True)
    best_top_difference_indices = [x for x, y in sorted_[:tops]]
    worst_top_difference_indices = [x for x, y in sorted_[bottoms:]]
    top_target_shares = [
        x * 100.0 / y
        for x, y in zip(top_target_contributions, level_counts)
    ]
    max_top_target_shares = max(top_target_shares)
    best_top_target_share_index = [
        idx for idx, val in enumerate(top_target_shares)
        if val == max_top_target_shares
    ]
    # Ignore levels contributing under ~5% of an average level when picking the worst share.
    level_counts_threshold = sum(level_counts) * 0.05 / len(level_counts)
    min_top_target_shares = min([
        x for x, y in zip(top_target_shares, level_counts)
        if y >= level_counts_threshold
    ])
    worst_top_target_share_index = [
        idx for idx, val in enumerate(top_target_shares)
        if val == min_top_target_shares
    ]
    overall_top_percentage = sum_top_target * 100.0 / total

    # ---- Same statistics for the second most frequent target level ---------
    second_target_contributions = [
        table.get_value(second_target, i) for i in levels
    ]
    sum_second_target = sum(second_target_contributions)
    sorted_levels = sorted(zip(second_target_contributions, levels),
                           reverse=True)
    level_differences = [0.0] + [
        sorted_levels[i][0] - sorted_levels[i + 1][0]
        for i in range(len(sorted_levels) - 1)
    ]
    second_target_top_dims = [
        j for i, j in
        sorted_levels[:level_differences.index(max(level_differences))]
    ]
    second_target_top_dims_contribution = sum([
        i for i, j in
        sorted_levels[:level_differences.index(max(level_differences))]
    ])
    second_target_bottom_dim = sorted_levels[-1][1]
    second_target_bottom_dim_contribution = sorted_levels[-1][0]
    second_target_percentages = [
        i * 100.0 / sum_second_target for i in second_target_contributions
    ]
    best_second_target_index = second_target_contributions.index(
        max(second_target_contributions))
    worst_second_target_index = second_target_contributions.index(
        min(second_target_contributions))
    second_target_differences = [
        x - y
        for x, y in zip(levels_percentages, second_target_percentages)
    ]
    if len(second_target_differences) > 6:
        tops = 2
        bottoms = -2
    elif len(second_target_differences) > 4:
        tops = 2
        bottoms = -1
    else:
        tops = 1
        bottoms = -1
    sorted_ = sorted(enumerate(second_target_differences),
                     key=lambda x: x[1],
                     reverse=True)
    best_second_difference_indices = [x for x, y in sorted_[:tops]]
    worst_second_difference_indices = [x for x, y in sorted_[bottoms:]]
    second_target_shares = [
        x * 100.0 / y
        for x, y in zip(second_target_contributions, level_counts)
    ]
    max_second_target_shares = max(second_target_shares)
    best_second_target_share_index = [
        idx for idx, val in enumerate(second_target_shares)
        if val == max_second_target_shares
    ]
    level_counts_threshold = sum(level_counts) * 0.05 / len(level_counts)
    min_second_target_shares = min([
        x for x, y in zip(second_target_shares, level_counts)
        if y >= level_counts_threshold
    ])
    # worst_second_target_share_index = second_target_shares.index(min_second_target_shares)
    worst_second_target_share_index = [
        idx for idx, val in enumerate(second_target_shares)
        if val == min_second_target_shares
    ]
    overall_second_percentage = sum_second_target * 100.0 / total

    # ---- Template input dicts ----------------------------------------------
    targetCardDataDict = {}
    targetCardDataDict['target'] = target_dimension
    targetCardDataDict['colname'] = analysed_dimension
    targetCardDataDict['num_significant'] = len(significant_variables)
    targetCardDataDict['plural_colname'] = NarrativesUtils.pluralize(
        analysed_dimension)
    targetCardDataDict["blockSplitter"] = self._blockSplitter
    targetCardDataDict["binTargetCol"] = self._binTargetCol
    targetCardDataDict["binAnalyzedCol"] = self._binAnalyzedCol
    targetCardDataDict['highlightFlag'] = self._highlightFlag
    targetCardDataDict['levels'] = levels
    data_dict = {}
    data_dict['best_second_difference'] = best_second_difference_indices  ##these changed
    data_dict['worst_second_difference'] = worst_second_difference_indices
    data_dict['best_top_difference'] = best_top_difference_indices
    data_dict['worst_top_difference'] = worst_top_difference_indices
    data_dict['levels_percentages'] = levels_percentages
    data_dict['top_target_percentages'] = top_target_percentages
    data_dict['second_target_percentages'] = second_target_percentages
    data_dict['levels'] = levels
    data_dict['best_top_share'] = best_top_target_share_index
    data_dict['worst_top_share'] = worst_top_target_share_index
    data_dict['best_second_share'] = best_second_target_share_index
    data_dict['worst_second_share'] = worst_second_target_share_index
    data_dict['top_target_shares'] = top_target_shares
    data_dict['second_target_shares'] = second_target_shares
    data_dict['overall_second'] = overall_second_percentage
    data_dict['overall_top'] = overall_top_percentage
    data_dict['num_significant'] = len(significant_variables)
    data_dict['colname'] = analysed_dimension
    data_dict['plural_colname'] = NarrativesUtils.pluralize(
        analysed_dimension)
    data_dict['target'] = target_dimension
    data_dict['top_levels'] = top_dims
    data_dict['top_levels_percent'] = round(
        top_dims_contribution * 100.0 / total, 1)
    data_dict['bottom_level'] = bottom_dim
    data_dict['bottom_levels'] = bottom_dims
    # NOTE(review): "* 100 /" uses integer literals — under Python 2 this is
    # floor division when both operands are ints, unlike the "* 100.0 /"
    # pattern used everywhere else here. Possible precision bug; confirm.
    data_dict['bottom_level_percent'] = round(
        bottom_dim_contribution * 100 / sum(level_counts), 2)
    data_dict['second_target'] = second_target
    data_dict['second_target_top_dims'] = second_target_top_dims
    data_dict['second_target_top_dims_contribution'] = second_target_top_dims_contribution * 100.0 / sum(
        second_target_contributions)
    data_dict['second_target_bottom_dim'] = second_target_bottom_dim
    data_dict['second_target_bottom_dim_contribution'] = second_target_bottom_dim_contribution
    data_dict['best_second_target'] = levels[best_second_target_index]
    data_dict['best_second_target_count'] = second_target_contributions[
        best_second_target_index]
    data_dict['best_second_target_percent'] = round(
        second_target_contributions[best_second_target_index] * 100.0 /
        sum(second_target_contributions), 2)
    data_dict['worst_second_target'] = levels[worst_second_target_index]
    data_dict['worst_second_target_percent'] = round(
        second_target_contributions[worst_second_target_index] * 100.0 /
        sum(second_target_contributions), 2)
    data_dict['top_target'] = top_target
    data_dict['top_target_top_dims'] = top_target_top_dims
    data_dict['top_target_top_dims_contribution'] = top_target_top_dims_contribution * 100.0 / sum(
        top_target_contributions)
    data_dict['top_target_bottom_dim'] = top_target_bottom_dim
    data_dict['top_target_bottom_dim_contribution'] = top_target_bottom_dim_contribution
    data_dict['best_top_target'] = levels[best_top_target_index]
    data_dict['best_top_target_count'] = top_target_contributions[
        best_top_target_index]
    data_dict['best_top_target_percent'] = round(
        top_target_contributions[best_top_target_index] * 100.0 /
        sum(top_target_contributions), 2)
    data_dict['worst_top_target'] = levels[worst_top_target_index]
    data_dict['worst_top_target_percent'] = round(
        top_target_contributions[worst_top_target_index] * 100.0 /
        sum(top_target_contributions), 2)
    data_dict["blockSplitter"] = self._blockSplitter
    data_dict["binTargetCol"] = self._binTargetCol
    data_dict["binAnalyzedCol"] = self._binAnalyzedCol
    data_dict['highlightFlag'] = self._highlightFlag

    ###############
    #    CARD1    #
    ###############
    print "self._binTargetCol & self._binAnalyzedCol : ", self._binTargetCol, self._binAnalyzedCol
    # NOTE(review): BUG — "&" binds tighter than "==", so this parses as
    # `self._binTargetCol == (True & self._binAnalyzedCol) == False`, which is
    # true when binTargetCol is False (the opposite of the print's claim).
    # Intended test is almost certainly
    # `self._binTargetCol == True and self._binAnalyzedCol == False`.
    if (self._binTargetCol == True & self._binAnalyzedCol == False):
        print "Only Target Column is Binned, : ", self._binTargetCol
        output = NarrativesUtils.block_splitter(
            NarrativesUtils.get_template_output(self._base_dir,
                                                'card1_binned_target.html',
                                                data_dict),
            self._blockSplitter,
            highlightFlag=self._highlightFlag)
    # NOTE(review): same precedence issue as above, although for the
    # both-True case the mis-parse happens to give the intended result.
    elif (self._binTargetCol == True & self._binAnalyzedCol == True):
        print "Target Column and IV is Binned : ", self._binTargetCol, self._binAnalyzedCol
        output = NarrativesUtils.block_splitter(
            NarrativesUtils.get_template_output(
                self._base_dir, 'card1_binned_target_and_IV.html',
                data_dict),
            self._blockSplitter,
            highlightFlag=self._highlightFlag)
    else:
        output = NarrativesUtils.block_splitter(
            NarrativesUtils.get_template_output(self._base_dir,
                                                'card1.html', data_dict),
            self._blockSplitter,
            highlightFlag=self._highlightFlag)
    targetDimCard1Data = []
    targetDimcard1Heading = '<h3>Relationship between ' + self._target_dimension + ' and ' + self._analysed_dimension + "</h3>"
    # Heat-map table (toggle on) vs plain table (toggle off) of the same data.
    toggledata = ToggleData()
    targetDimTable1Data = self.generate_card1_table1()
    targetDimCard1Table1 = TableData()
    targetDimCard1Table1.set_table_type("heatMap")
    targetDimCard1Table1.set_table_data(targetDimTable1Data)
    toggledata.set_toggleon_data({
        "data": {
            "tableData": targetDimTable1Data,
            "tableType": "heatMap"
        },
        "dataType": "table"
    })
    targetDimTable2Data = self.generate_card1_table2()
    targetDimCard1Table2 = TableData()
    targetDimCard1Table2.set_table_type("normal")
    table2Data = targetDimTable2Data["data1"]
    # Drop rows with a blank first cell, then strip that label column.
    table2Data = [
        innerList[1:] for innerList in table2Data
        if innerList[0].strip() != ""
    ]
    targetDimCard1Table2.set_table_data(table2Data)
    toggledata.set_toggleoff_data({
        "data": {
            "tableData": table2Data,
            "tableType": "heatMap"
        },
        "dataType": "table"
    })
    targetDimCard1Data.append(HtmlData(data=targetDimcard1Heading))
    targetDimCard1Data.append(toggledata)
    targetDimCard1Data += output
    self._card1.set_card_data(targetDimCard1Data)
    self._card1.set_card_name("{}: Relationship with {}".format(
        self._analysed_dimension, self._target_dimension))

    ###############
    #    CARD2    #
    ###############
    # Distribution cards are built only for the default app flavour.
    if self._appid == None:
        # Human-readable "a, b and c" list of the second-level drill dimensions.
        key_factors = ''
        num_key_factors = len(self._second_level_dimensions)
        if len(self._second_level_dimensions) == 5:
            key_factors = ', '.join(
                self._second_level_dimensions[:4]
            ) + ' and ' + self._second_level_dimensions[4]
        elif len(self._second_level_dimensions) == 4:
            key_factors = ', '.join(
                self._second_level_dimensions[:3]
            ) + ' and ' + self._second_level_dimensions[3]
        elif len(self._second_level_dimensions) == 3:
            key_factors = ', '.join(
                self._second_level_dimensions[:2]
            ) + ' and ' + self._second_level_dimensions[2]
        elif len(self._second_level_dimensions) == 2:
            key_factors = ' and '.join(self._second_level_dimensions)
        elif len(self._second_level_dimensions) == 1:
            key_factors = self._second_level_dimensions[0]
        targetCardDataDict['num_key_factors'] = num_key_factors
        targetCardDataDict['key_factors'] = key_factors
        dict_for_test = {}
        # One distribution card per target level (capped by the level limit).
        # NOTE(review): targetCardDataDict is mutated in place each iteration
        # and the same object stored under every dict_for_test key.
        for tupleObj in sorted_target_levels[:self._chiSquareLevelLimit]:
            targetLevel = tupleObj[1]
            targetCardDataDict['random_card2'] = random.randint(1, 100)
            targetCardDataDict['random_card4'] = random.randint(1, 100)
            # Recompute the "second target" statistics for this level.
            second_target_contributions = [
                table.get_value(targetLevel, i) for i in levels
            ]
            sum_second_target = sum(second_target_contributions)
            sorted_levels = sorted(zip(second_target_contributions,
                                       levels),
                                   reverse=True)
            level_differences = [0.0] + [
                sorted_levels[i][0] - sorted_levels[i + 1][0]
                for i in range(len(sorted_levels) - 1)
            ]
            second_target_top_dims = [
                j for i, j in sorted_levels[:level_differences.index(
                    max(level_differences))]
            ]
            second_target_top_dims_contribution = sum([
                i for i, j in sorted_levels[:level_differences.index(
                    max(level_differences))]
            ])
            second_target_bottom_dim = sorted_levels[-1][1]
            second_target_bottom_dim_contribution = sorted_levels[-1][0]
            second_target_percentages = [
                i * 100.0 / sum_second_target
                for i in second_target_contributions
            ]
            best_second_target_index = second_target_contributions.index(
                max(second_target_contributions))
            worst_second_target_index = second_target_contributions.index(
                min(second_target_contributions))
            second_target_differences = [
                x - y for x, y in zip(levels_percentages,
                                      second_target_percentages)
            ]
            if len(second_target_differences) > 6:
                tops = 2
                bottoms = -2
            elif len(second_target_differences) > 4:
                tops = 2
                bottoms = -1
            else:
                tops = 1
                bottoms = -1
            sorted_ = sorted(enumerate(second_target_differences),
                             key=lambda x: x[1],
                             reverse=True)
            best_second_difference_indices = [x for x, y in sorted_[:tops]]
            worst_second_difference_indices = [
                x for x, y in sorted_[bottoms:]
            ]
            second_target_shares = [
                x * 100.0 / y
                for x, y in zip(second_target_contributions, level_counts)
            ]
            max_second_target_shares = max(second_target_shares)
            best_second_target_share_index = [
                idx for idx, val in enumerate(second_target_shares)
                if val == max_second_target_shares
            ]
            level_counts_threshold = sum(level_counts) * 0.05 / len(
                level_counts)
            min_second_target_shares = min([
                x for x, y in zip(second_target_shares, level_counts)
                if y >= level_counts_threshold
            ])
            worst_second_target_share_index = [
                idx for idx, val in enumerate(second_target_shares)
                if val == min_second_target_shares
            ]
            overall_second_percentage = sum_second_target * 100.0 / total
            # DataFrame for contribution calculation
            # Spark -> pandas: rows for this target level within the top
            # analysed-dimension level, and all rows for that level.
            df_second_target = self._data_frame.filter(col(self._target_dimension)==targetLevel).\
                filter(col(self._analysed_dimension)==second_target_top_dims[0]).\
                select(self._second_level_dimensions).toPandas()
            df_second_dim = self._data_frame.filter(col(self._analysed_dimension)==second_target_top_dims[0]).\
                select(self._second_level_dimensions).toPandas()
            # if self._chisquare_result.get_splits():
            #     splits = self._chisquare_result.get_splits()
            #     idx = self._chiSquareTable.get_bin_names(splits).index(second_target_top_dims[0])
            #     idx1 = self._chiSquareTable.get_bin_names(splits).index(top_target_top_dims[0])
            #     splits[len(splits)-1] = splits[len(splits)-1]+1
            #     df_second_target = self._data_frame.filter(col(self._target_dimension)==targetLevel).\
            #         filter(col(self._analysed_dimension)>=splits[idx]).filter(col(self._analysed_dimension)<splits[idx+1]).\
            #         select(self._second_level_dimensions).toPandas()
            #     df_second_dim = self._data_frame.filter(col(self._analysed_dimension)>=splits[idx]).\
            #         filter(col(self._analysed_dimension)<splits[idx+1]).\
            #         select(self._second_level_dimensions).toPandas()
            # else:
            #     df_second_target = self._data_frame.filter(col(self._target_dimension)==targetLevel).\
            #         filter(col(self._analysed_dimension)==second_target_top_dims[0]).\
            #         select(self._second_level_dimensions).toPandas()
            #     df_second_dim = self._data_frame.filter(col(self._analysed_dimension)==second_target_top_dims[0]).\
            #         select(self._second_level_dimensions).toPandas()
            # print self._data_frame.select('Sales').show()

            # Per second-level dimension: level frequencies within the target
            # slice and each level's percent contribution vs the full slice.
            distribution_second = []
            for d in self._second_level_dimensions:
                grouped = df_second_target.groupby(d).agg({
                    d: 'count'
                }).sort_values(d, ascending=False)
                contributions = df_second_dim.groupby(d).agg({d: 'count'})
                contribution_index = list(contributions.index)
                contributions_val = contributions[d].tolist()
                contributions_list = dict(
                    zip(contribution_index, contributions_val))
                index_list = list(grouped.index)
                grouped_list = grouped[d].tolist()
                contributions_percent_list = [
                    round(y * 100.0 / contributions_list[x], 2)
                    for x, y in zip(index_list, grouped_list)
                ]
                sum_ = grouped[d].sum()
                # Largest drop between consecutive counts = size of "top" group.
                diffs = [0] + [
                    grouped_list[i] - grouped_list[i + 1]
                    for i in range(len(grouped_list) - 1)
                ]
                max_diff = diffs.index(max(diffs))
                index_txt = ''
                if max_diff == 1:
                    index_txt = index_list[0]
                elif max_diff == 2:
                    index_txt = index_list[0] + '(' + str(
                        round(grouped_list[0] * 100.0 / sum_, 1)
                    ) + '%)' + ' and ' + index_list[1] + '(' + str(
                        round(grouped_list[1] * 100.0 / sum_, 1)) + '%)'
                elif max_diff > 2:
                    index_txt = 'including ' + index_list[0] + '(' + str(
                        round(grouped_list[0] * 100.0 / sum_, 1)
                    ) + '%)' + ' and ' + index_list[1] + '(' + str(
                        round(grouped_list[1] * 100.0 / sum_, 1)) + '%)'
                distribution_second.append({'contributions':[round(i*100.0/sum_,2) for i in grouped_list[:max_diff]],\
                    'levels': index_list[:max_diff],'variation':random.randint(1,100),\
                    'index_txt': index_txt, 'd':d,'contributions_percent':contributions_percent_list})
            targetCardDataDict['distribution_second'] = distribution_second
            targetCardDataDict['second_target'] = targetLevel
            targetCardDataDict['second_target_top_dims'] = second_target_top_dims
            targetCardDataDict['second_target_top_dims_contribution'] = second_target_top_dims_contribution * 100.0 / sum(
                second_target_contributions)
            targetCardDataDict['second_target_bottom_dim'] = second_target_bottom_dim
            targetCardDataDict['second_target_bottom_dim_contribution'] = second_target_bottom_dim_contribution
            targetCardDataDict['best_second_target'] = levels[
                best_second_target_index]
            targetCardDataDict['best_second_target_count'] = second_target_contributions[
                best_second_target_index]
            targetCardDataDict['best_second_target_percent'] = round(
                second_target_contributions[best_second_target_index] *
                100.0 / sum(second_target_contributions), 2)
            targetCardDataDict['worst_second_target'] = levels[
                worst_second_target_index]
            targetCardDataDict['worst_second_target_percent'] = round(
                second_target_contributions[worst_second_target_index] *
                100.0 / sum(second_target_contributions), 2)

            # ---- Assemble CARD2 for this target level ----------------------
            card2Data = []
            targetLevelContributions = [
                table.get_value(targetLevel, i) for i in levels
            ]
            card2Heading = '<h3>Distribution of ' + self._target_dimension + ' (' + targetLevel + ') across ' + self._analysed_dimension + "</h3>"
            chart, bubble = self.generate_distribution_card_chart(
                targetLevel, targetLevelContributions, levels, level_counts,
                total)
            card2ChartData = NormalChartData(data=chart["data"])
            card2ChartJson = ChartJson()
            card2ChartJson.set_data(card2ChartData.get_data())
            card2ChartJson.set_chart_type("combination")
            card2ChartJson.set_types({
                "total": "bar",
                "percentage": "line"
            })
            card2ChartJson.set_legend({
                "total": "# of " + targetLevel,
                "percentage": "% of " + targetLevel
            })
            card2ChartJson.set_axes({
                "x": "key",
                "y": "total",
                "y2": "percentage"
            })
            card2ChartJson.set_label_text({
                "x": " ",
                "y": "Count",
                "y2": "Percentage"
            })
            print "self._binTargetCol & self._binAnalyzedCol : ", self._binTargetCol, self._binAnalyzedCol
            # NOTE(review): same "&" vs "==" precedence bug as in CARD1 above;
            # this branch actually fires when binTargetCol is False.
            if (self._binTargetCol == True & self._binAnalyzedCol == False):
                print "Only Target Column is Binned"
                output2 = NarrativesUtils.block_splitter(
                    NarrativesUtils.get_template_output(
                        self._base_dir, 'card2_binned_target.html',
                        targetCardDataDict), self._blockSplitter)
            elif (self._binTargetCol == True & self._binAnalyzedCol == True):
                print "Target Column and IV is Binned"
                output2 = NarrativesUtils.block_splitter(
                    NarrativesUtils.get_template_output(
                        self._base_dir, 'card2_binned_target_and_IV.html',
                        targetCardDataDict), self._blockSplitter)
            else:
                print "In Else, self._binTargetCol should be False : ", self._binTargetCol
                output2 = NarrativesUtils.block_splitter(
                    NarrativesUtils.get_template_output(
                        self._base_dir, 'card2.html', targetCardDataDict),
                    self._blockSplitter)
            card2Data.append(HtmlData(data=card2Heading))
            statistical_info_array = [
                ("Test Type", "Chi-Square"),
                ("Chi-Square statistic",
                 str(round(self._chisquare_result.get_stat(), 3))),
                ("P-Value",
                 str(round(self._chisquare_result.get_pvalue(), 3))),
                ("Inference",
                 "Chi-squared analysis shows a significant association between {} (target) and {}."
                 .format(self._target_dimension, self._analysed_dimension))
            ]
            statistical_info_array = NarrativesUtils.statistical_info_array_formatter(
                statistical_info_array)
            card2Data.append(
                C3ChartData(data=card2ChartJson,
                            info=statistical_info_array))
            card2Data += output2
            card2BubbleData = "<div class='col-md-6 col-xs-12'><h2 class='text-center'><span>{}</span><br /><small>{}</small></h2></div><div class='col-md-6 col-xs-12'><h2 class='text-center'><span>{}</span><br /><small>{}</small></h2></div>".format(
                bubble[0]["value"], bubble[0]["text"], bubble[1]["value"],
                bubble[1]["text"])
            card2Data.append(HtmlData(data=card2BubbleData))
            targetCard = NormalCard()
            targetCard.set_card_data(card2Data)
            targetCard.set_card_name("{} : Distribution of {}".format(
                self._analysed_dimension, targetLevel))
            self._targetCards.append(targetCard)
            dict_for_test[targetLevel] = targetCardDataDict
        # NOTE(review): dict_for_test only exists on the appid == None path, so
        # the return is placed inside that branch (reconstructed indentation);
        # other appid values return None implicitly — confirm against callers.
        out = {'data_dict': data_dict, 'target_dict': dict_for_test}
        return out
class ChiSquareAnalysis:
    """Builds narrative cards for the chi-square association between a target
    dimension and one analysed dimension.

    Produces one "relationship" card (card1: heat-map/contingency tables plus
    templated narrative text) and, when ``appid`` is None, one distribution
    card per target level (card2 family).  All cards are attached to a
    ``NarrativesTree`` node retrievable via :meth:`get_dimension_node`.

    Parameters mirror the project pipeline objects (context, helper,
    chi-square result, raw Spark dataframe, …); ``appid`` selects which cards
    are attached ("1"/"2" attach only card1).
    """

    def __init__(self, df_context, df_helper, chisquare_result, target_dimension,
                 analysed_dimension, significant_variables, num_analysed_variables,
                 data_frame, measure_columns, base_dir, appid=None,
                 target_chisquare_result=None):
        self._blockSplitter = "|~NEWBLOCK~|"
        self._highlightFlag = "|~HIGHLIGHT~|"
        self._dimensionNode = NarrativesTree()
        self._dimensionNode.set_name(target_dimension)
        self._data_frame = data_frame
        self._dataframe_context = df_context
        self._dataframe_helper = df_helper
        self._chisquare_result = chisquare_result
        self._target_dimension = target_dimension
        self._analysed_dimension = analysed_dimension
        self._significant_variables = significant_variables
        self._target_chisquare_result = target_chisquare_result
        self._measure_columns = self._dataframe_helper.get_numeric_columns()
        self._chiSquareLevelLimit = GLOBALSETTINGS.CHISQUARELEVELLIMIT
        self._num_analysed_variables = num_analysed_variables
        self._chiSquareTable = chisquare_result.get_contingency_table()
        # The analysed dimension must not drill down into itself.
        significant_variables = list(set(significant_variables) - {analysed_dimension})
        # Keep at most 3 second-level dimensions normally, 5 when many are significant.
        if len(significant_variables) <= 20:
            if len(significant_variables) <= 3:
                self._second_level_dimensions = list(significant_variables)
            else:
                self._second_level_dimensions = list(significant_variables)[:3]
        else:
            self._second_level_dimensions = list(significant_variables)[:5]
        print(self._second_level_dimensions)
        self._appid = appid
        self._card1 = NormalCard()
        self._targetCards = []
        self._base_dir = base_dir
        self._binTargetCol = False
        self._binAnalyzedCol = False
        print("--------Chi-Square Narratives for ", analysed_dimension, "---------")
        if self._dataframe_context.get_custom_analysis_details() != None:
            binnedColObj = [
                x["colName"]
                for x in self._dataframe_context.get_custom_analysis_details()
            ]
            print("analysed_dimension : ", self._analysed_dimension)
            if binnedColObj != None and self._target_dimension in binnedColObj:
                self._binTargetCol = True
            if binnedColObj != None and (
                    self._analysed_dimension in binnedColObj
                    or self._analysed_dimension in self._measure_columns):
                self._binAnalyzedCol = True
        # appid None attaches every card; "1"/"2" attach only card1.
        # (The original had three identical branches for None/"2"/"1".)
        if self._appid == None:
            self._generate_narratives()
            self._dimensionNode.add_cards([self._card1] + self._targetCards)
            self._dimensionNode.set_name("{}".format(analysed_dimension))
        elif self._appid in ("1", "2"):
            self._generate_narratives()
            self._dimensionNode.add_cards([self._card1])
            self._dimensionNode.set_name("{}".format(analysed_dimension))

    def get_dimension_node(self):
        """Return the populated narratives node as a plain JSON-compatible dict."""
        return json.loads(
            CommonUtils.convert_python_object_to_json(self._dimensionNode))

    def get_dimension_card1(self):
        """Return the relationship card (card1) object."""
        return self._card1

    def _level_stats(self, contributions, levels, level_counts, levels_percentages, total):
        """Summarise one row of the contingency table against the dimension levels.

        ``contributions`` are per-level counts for a single target level (or the
        dimension's own level counts).  Returns a dict of the statistics that
        the narrative templates consume: dominant ("top") levels up to the
        largest gap in the sorted counts, the weakest level and its ties,
        per-level percentages/shares, best/worst indices, and the indices of
        the biggest over/under-representation differences.
        """
        contribution_sum = sum(contributions)
        ranked = sorted(zip(contributions, levels), reverse=True)
        # Gaps between consecutive sorted counts; the largest gap marks where
        # the "top" group of levels ends.
        gaps = [0.0] + [
            ranked[i][0] - ranked[i + 1][0] for i in range(len(ranked) - 1)
        ]
        cut = gaps.index(max(gaps))
        percentages = [i * 100.0 / contribution_sum for i in contributions]
        # Positive difference: the level is under-represented for this target
        # level relative to its overall footprint.
        differences = [x - y for x, y in zip(levels_percentages, percentages)]
        if len(differences) > 6:
            tops, bottoms = 2, -2
        elif len(differences) > 4:
            tops, bottoms = 2, -1
        else:
            tops, bottoms = 1, -1
        ranked_diffs = sorted(enumerate(differences), key=lambda x: x[1], reverse=True)
        shares = [x * 100.0 / y for x, y in zip(contributions, level_counts)]
        max_share = max(shares)
        # Ignore levels whose population is below 5% of the mean level count
        # when hunting for the worst share.
        threshold = sum(level_counts) * 0.05 / len(level_counts)
        min_share = min([x for x, y in zip(shares, level_counts) if y >= threshold])
        return {
            "sum": contribution_sum,
            "top_dims": [j for i, j in ranked[:cut]],
            "top_dims_contribution": sum([i for i, j in ranked[:cut]]),
            "bottom_dim": ranked[-1][1],
            "bottom_dim_contribution": ranked[-1][0],
            "bottom_dims": [j for i, j in ranked if i == ranked[-1][0]],
            "percentages": percentages,
            "best_index": contributions.index(max(contributions)),
            "worst_index": contributions.index(min(contributions)),
            "best_difference_indices": [x for x, y in ranked_diffs[:tops]],
            "worst_difference_indices": [x for x, y in ranked_diffs[bottoms:]],
            "shares": shares,
            "best_share_index": [idx for idx, val in enumerate(shares) if val == max_share],
            "worst_share_index": [idx for idx, val in enumerate(shares) if val == min_share],
            "overall_percentage": contribution_sum * 100.0 / total,
        }

    def _second_level_distribution(self, df_second_target, df_second_dim):
        """Describe how the second-level dimensions distribute within the
        dominant analysed-dimension level.

        ``df_second_target``/``df_second_dim`` are pandas frames already
        filtered to that level (the former additionally to one target level),
        projected onto ``self._second_level_dimensions``.
        """
        distribution_second = []
        for d in self._second_level_dimensions:
            grouped = df_second_target.groupby(d).agg({
                d: 'count'
            }).sort_values(d, ascending=False)
            contributions = df_second_dim.groupby(d).agg({d: 'count'})
            contributions_list = dict(
                zip(list(contributions.index), contributions[d].tolist()))
            index_list = list(grouped.index)
            grouped_list = grouped[d].tolist()
            contributions_percent_list = [
                round(y * 100.0 / contributions_list[x], 2)
                for x, y in zip(index_list, grouped_list)
            ]
            sum_ = grouped[d].sum()
            # Largest gap between consecutive counts bounds the "top" group.
            diffs = [0] + [
                grouped_list[i] - grouped_list[i + 1]
                for i in range(len(grouped_list) - 1)
            ]
            max_diff = diffs.index(max(diffs))
            index_txt = ''
            if max_diff == 1:
                index_txt = index_list[0]
            elif max_diff == 2:
                index_txt = index_list[0] + '(' + str(
                    round(grouped_list[0] * 100.0 / sum_, 1)
                ) + '%)' + ' and ' + index_list[1] + '(' + str(
                    round(grouped_list[1] * 100.0 / sum_, 1)) + '%)'
            elif max_diff > 2:
                index_txt = 'including ' + index_list[0] + '(' + str(
                    round(grouped_list[0] * 100.0 / sum_, 1)
                ) + '%)' + ' and ' + index_list[1] + '(' + str(
                    round(grouped_list[1] * 100.0 / sum_, 1)) + '%)'
            distribution_second.append({
                'contributions': [
                    round(i * 100.0 / sum_, 2) for i in grouped_list[:max_diff]
                ],
                'levels': index_list[:max_diff],
                'variation': random.randint(1, 100),
                'index_txt': index_txt,
                'd': d,
                'contributions_percent': contributions_percent_list,
            })
        return distribution_second

    def _generate_narratives(self):
        """Populate card1 and (for appid None) the per-target-level card2 set.

        Returns ``{'data_dict': ..., 'target_dict': ...}`` with the template
        contexts, which the surrounding test harness inspects.
        """
        table = self._chiSquareTable
        target_dimension = self._target_dimension
        analysed_dimension = self._analysed_dimension
        significant_variables = self._significant_variables
        total = table.get_total()
        levels = table.get_column_two_levels()
        level_counts = table.get_column_total()
        levels_count_sum = sum(level_counts)
        levels_percentages = [
            i * 100.0 / levels_count_sum for i in level_counts
        ]

        # Stats of the analysed dimension's own level distribution.
        dim_stats = self._level_stats(level_counts, levels, level_counts,
                                      levels_percentages, total)
        top_dims = dim_stats["top_dims"]
        top_dims_contribution = dim_stats["top_dims_contribution"]
        bottom_dim = dim_stats["bottom_dim"]
        bottom_dim_contribution = dim_stats["bottom_dim_contribution"]
        bottom_dims = dim_stats["bottom_dims"]

        target_levels = table.get_column_one_levels()
        target_counts = table.get_row_total()
        sorted_target_levels = sorted(zip(target_counts, target_levels), reverse=True)
        top_target = sorted_target_levels[0][1]
        second_target = sorted_target_levels[1][1]

        top_target_contributions = [table.get_value(top_target, i) for i in levels]
        top_stats = self._level_stats(top_target_contributions, levels,
                                      level_counts, levels_percentages, total)
        second_target_contributions = [table.get_value(second_target, i) for i in levels]
        second_stats = self._level_stats(second_target_contributions, levels,
                                         level_counts, levels_percentages, total)

        targetCardDataDict = {
            'target': target_dimension,
            'colname': analysed_dimension,
            'num_significant': len(significant_variables),
            'plural_colname': NarrativesUtils.pluralize(analysed_dimension),
            "blockSplitter": self._blockSplitter,
            "binTargetCol": self._binTargetCol,
            "binAnalyzedCol": self._binAnalyzedCol,
            'highlightFlag': self._highlightFlag,
            'levels': levels,
        }

        data_dict = {
            'best_second_difference': second_stats["best_difference_indices"],
            'worst_second_difference': second_stats["worst_difference_indices"],
            'best_top_difference': top_stats["best_difference_indices"],
            'worst_top_difference': top_stats["worst_difference_indices"],
            'levels_percentages': levels_percentages,
            'top_target_percentages': top_stats["percentages"],
            'second_target_percentages': second_stats["percentages"],
            'levels': levels,
            'best_top_share': top_stats["best_share_index"],
            'worst_top_share': top_stats["worst_share_index"],
            'best_second_share': second_stats["best_share_index"],
            'worst_second_share': second_stats["worst_share_index"],
            'top_target_shares': top_stats["shares"],
            'second_target_shares': second_stats["shares"],
            'overall_second': second_stats["overall_percentage"],
            'overall_top': top_stats["overall_percentage"],
            'num_significant': len(significant_variables),
            'colname': analysed_dimension,
            'plural_colname': NarrativesUtils.pluralize(analysed_dimension),
            'target': target_dimension,
            'top_levels': top_dims,
            'top_levels_percent': round(top_dims_contribution * 100.0 / total, 1),
            'bottom_level': bottom_dim,
            'bottom_levels': bottom_dims,
            # FIX: original used integer `100`, which floor-divides under
            # Python 2 unlike every sibling percentage here.
            'bottom_level_percent': round(
                bottom_dim_contribution * 100.0 / sum(level_counts), 2),
            'second_target': second_target,
            'second_target_top_dims': second_stats["top_dims"],
            'second_target_top_dims_contribution':
                second_stats["top_dims_contribution"] * 100.0 / second_stats["sum"],
            'second_target_bottom_dim': second_stats["bottom_dim"],
            'second_target_bottom_dim_contribution': second_stats["bottom_dim_contribution"],
            'best_second_target': levels[second_stats["best_index"]],
            'best_second_target_count':
                second_target_contributions[second_stats["best_index"]],
            'best_second_target_percent': round(
                second_target_contributions[second_stats["best_index"]] * 100.0 /
                second_stats["sum"], 2),
            'worst_second_target': levels[second_stats["worst_index"]],
            'worst_second_target_percent': round(
                second_target_contributions[second_stats["worst_index"]] * 100.0 /
                second_stats["sum"], 2),
            'top_target': top_target,
            'top_target_top_dims': top_stats["top_dims"],
            'top_target_top_dims_contribution':
                top_stats["top_dims_contribution"] * 100.0 / top_stats["sum"],
            'top_target_bottom_dim': top_stats["bottom_dim"],
            'top_target_bottom_dim_contribution': top_stats["bottom_dim_contribution"],
            'best_top_target': levels[top_stats["best_index"]],
            'best_top_target_count':
                top_target_contributions[top_stats["best_index"]],
            'best_top_target_percent': round(
                top_target_contributions[top_stats["best_index"]] * 100.0 /
                top_stats["sum"], 2),
            'worst_top_target': levels[top_stats["worst_index"]],
            'worst_top_target_percent': round(
                top_target_contributions[top_stats["worst_index"]] * 100.0 /
                top_stats["sum"], 2),
            "blockSplitter": self._blockSplitter,
            "binTargetCol": self._binTargetCol,
            "binAnalyzedCol": self._binAnalyzedCol,
            'highlightFlag': self._highlightFlag,
        }

        ###############
        #    CARD1    #
        ###############
        print("self._binTargetCol & self._binAnalyzedCol : ",
              self._binTargetCol, self._binAnalyzedCol)
        # FIX: the original `self._binTargetCol == True & self._binAnalyzedCol
        # == False` is a chained comparison around bitwise `&` (which binds
        # tighter than `==`), so the binned-target-only branch never fired.
        if self._binTargetCol and not self._binAnalyzedCol:
            print("Only Target Column is Binned, : ", self._binTargetCol)
            card1_template = 'card1_binned_target.html'
        elif self._binTargetCol and self._binAnalyzedCol:
            print("Target Column and IV is Binned : ",
                  self._binTargetCol, self._binAnalyzedCol)
            card1_template = 'card1_binned_target_and_IV.html'
        else:
            card1_template = 'card1.html'
        output = NarrativesUtils.block_splitter(
            NarrativesUtils.get_template_output(self._base_dir, card1_template,
                                                data_dict),
            self._blockSplitter,
            highlightFlag=self._highlightFlag)

        targetDimCard1Data = []
        targetDimcard1Heading = '<h3>Relationship between ' + \
            self._target_dimension + ' and ' + self._analysed_dimension + "</h3>"
        toggledata = ToggleData()
        targetDimTable1Data = self.generate_card1_table1()
        targetDimCard1Table1 = TableData()
        targetDimCard1Table1.set_table_type("heatMap")
        targetDimCard1Table1.set_table_data(targetDimTable1Data)
        toggledata.set_toggleon_data({
            "data": {
                "tableData": targetDimTable1Data,
                "tableType": "heatMap"
            },
            "dataType": "table"
        })
        targetDimTable2Data = self.generate_card1_table2()
        targetDimCard1Table2 = TableData()
        targetDimCard1Table2.set_table_type("normal")
        table2Data = targetDimTable2Data["data1"]
        # Drop sub-rows (blank tag) and the leading tag column itself.
        table2Data = [
            innerList[1:] for innerList in table2Data
            if innerList[0].strip() != ""
        ]
        targetDimCard1Table2.set_table_data(table2Data)
        toggledata.set_toggleoff_data({
            "data": {
                "tableData": table2Data,
                "tableType": "heatMap"
            },
            "dataType": "table"
        })
        targetDimCard1Data.append(HtmlData(data=targetDimcard1Heading))
        targetDimCard1Data.append(toggledata)
        targetDimCard1Data += output
        self._card1.set_card_data(targetDimCard1Data)
        self._card1.set_card_name("{}: Relationship with {}".format(
            self._analysed_dimension, self._target_dimension))

        ###############
        #    CARD2    #
        ###############
        # FIX: defined before the appid check so the return below never hits a
        # NameError when appid is "1"/"2".
        dict_for_test = {}
        if self._appid == None:
            num_key_factors = len(self._second_level_dimensions)
            # "a, b and c" phrasing; generalises the original 5-way if/elif.
            if num_key_factors >= 2:
                key_factors = ', '.join(
                    self._second_level_dimensions[:-1]
                ) + ' and ' + self._second_level_dimensions[-1]
            elif num_key_factors == 1:
                key_factors = self._second_level_dimensions[0]
            else:
                key_factors = ''
            targetCardDataDict['num_key_factors'] = num_key_factors
            targetCardDataDict['key_factors'] = key_factors
            for tupleObj in sorted_target_levels[:self._chiSquareLevelLimit]:
                targetLevel = tupleObj[1]
                targetCardDataDict['random_card2'] = random.randint(1, 100)
                targetCardDataDict['random_card4'] = random.randint(1, 100)

                level_contributions = [
                    table.get_value(targetLevel, i) for i in levels
                ]
                level_stats = self._level_stats(level_contributions, levels,
                                                level_counts,
                                                levels_percentages, total)

                # Raw rows restricted to this target level and the analysed
                # dimension's dominant level, projected onto the second-level
                # dimensions (Spark -> pandas for the groupby work below).
                df_second_target = self._data_frame.filter(col(self._target_dimension)==targetLevel).\
                    filter(col(self._analysed_dimension)==level_stats["top_dims"][0]).\
                    select(self._second_level_dimensions).toPandas()
                df_second_dim = self._data_frame.filter(col(self._analysed_dimension)==level_stats["top_dims"][0]).\
                    select(self._second_level_dimensions).toPandas()

                targetCardDataDict['distribution_second'] = \
                    self._second_level_distribution(df_second_target, df_second_dim)
                targetCardDataDict['second_target'] = targetLevel
                targetCardDataDict['second_target_top_dims'] = level_stats["top_dims"]
                targetCardDataDict['second_target_top_dims_contribution'] = \
                    level_stats["top_dims_contribution"] * 100.0 / level_stats["sum"]
                targetCardDataDict['second_target_bottom_dim'] = level_stats["bottom_dim"]
                targetCardDataDict['second_target_bottom_dim_contribution'] = \
                    level_stats["bottom_dim_contribution"]
                targetCardDataDict['best_second_target'] = levels[level_stats["best_index"]]
                targetCardDataDict['best_second_target_count'] = \
                    level_contributions[level_stats["best_index"]]
                targetCardDataDict['best_second_target_percent'] = round(
                    level_contributions[level_stats["best_index"]] * 100.0 /
                    level_stats["sum"], 2)
                targetCardDataDict['worst_second_target'] = levels[level_stats["worst_index"]]
                targetCardDataDict['worst_second_target_percent'] = round(
                    level_contributions[level_stats["worst_index"]] * 100.0 /
                    level_stats["sum"], 2)

                card2Data = []
                card2Heading = '<h3>Distribution of ' + self._target_dimension + \
                    ' (' + targetLevel + ') across ' + self._analysed_dimension + "</h3>"
                chart, bubble = self.generate_distribution_card_chart(
                    targetLevel, level_contributions, levels, level_counts, total)
                card2ChartData = NormalChartData(data=chart["data"])
                card2ChartJson = ChartJson()
                card2ChartJson.set_data(card2ChartData.get_data())
                card2ChartJson.set_chart_type("combination")
                card2ChartJson.set_types({"total": "bar", "percentage": "line"})
                card2ChartJson.set_legend({
                    "total": "# of " + targetLevel,
                    "percentage": "% of " + targetLevel
                })
                card2ChartJson.set_axes({
                    "x": "key",
                    "y": "total",
                    "y2": "percentage"
                })
                card2ChartJson.set_label_text({
                    "x": " ",
                    "y": "Count",
                    "y2": "Percentage"
                })
                print("self._binTargetCol & self._binAnalyzedCol : ",
                      self._binTargetCol, self._binAnalyzedCol)
                # FIX: same `&`-precedence repair as in the card1 section.
                if self._binTargetCol and not self._binAnalyzedCol:
                    print("Only Target Column is Binned")
                    card2_template = 'card2_binned_target.html'
                elif self._binTargetCol and self._binAnalyzedCol:
                    print("Target Column and IV is Binned")
                    card2_template = 'card2_binned_target_and_IV.html'
                else:
                    print("In Else, self._binTargetCol should be False : ",
                          self._binTargetCol)
                    card2_template = 'card2.html'
                output2 = NarrativesUtils.block_splitter(
                    NarrativesUtils.get_template_output(
                        self._base_dir, card2_template, targetCardDataDict),
                    self._blockSplitter)

                card2Data.append(HtmlData(data=card2Heading))
                statistical_info_array = [
                    ("Test Type", "Chi-Square"),
                    ("Chi-Square statistic",
                     str(round(self._chisquare_result.get_stat(), 3))),
                    ("P-Value",
                     str(round(self._chisquare_result.get_pvalue(), 3))),
                    ("Inference",
                     "Chi-squared analysis shows a significant association between {} (target) and {}."
                     .format(self._target_dimension, self._analysed_dimension))
                ]
                statistical_info_array = NarrativesUtils.statistical_info_array_formatter(
                    statistical_info_array)
                card2Data.append(
                    C3ChartData(data=card2ChartJson, info=statistical_info_array))
                card2Data += output2
                card2BubbleData = "<div class='col-md-6 col-xs-12'><h2 class='text-center'><span>{}</span><br /><small>{}</small></h2></div><div class='col-md-6 col-xs-12'><h2 class='text-center'><span>{}</span><br /><small>{}</small></h2></div>".format(
                    bubble[0]["value"], bubble[0]["text"], bubble[1]["value"],
                    bubble[1]["text"])
                card2Data.append(HtmlData(data=card2BubbleData))
                targetCard = NormalCard()
                targetCard.set_card_data(card2Data)
                targetCard.set_card_name("{} : Distribution of {}".format(
                    self._analysed_dimension, targetLevel))
                self._targetCards.append(targetCard)
                # FIX: snapshot a copy — the original stored the same mutated
                # dict, so every level aliased the final iteration's values.
                dict_for_test[targetLevel] = dict(targetCardDataDict)
        out = {'data_dict': data_dict, 'target_dict': dict_for_test}
        return out

    def generate_distribution_card_chart(self, __target, __target_contributions,
                                         levels, levels_count, total):
        """Build the card2 combination-chart payload and the two bubble blurbs.

        Returns ``(chart_data, bubble_data)`` where ``chart_data['data']`` is a
        list of ``{key,total,percentage}`` points per level.
        """
        label = {'total': '# of ' + __target, 'percentage': '% of ' + __target}
        __target_percentages = [
            x * 100.0 / y for x, y in zip(__target_contributions, levels_count)
        ]
        chartData = []
        for lvl, cnt, pct in zip(levels, __target_contributions, __target_percentages):
            chartData.append({"key": lvl, "total": cnt, "percentage": pct})
        chart_data = {'label': label, 'data': chartData}
        bubble_data1 = {}
        bubble_data2 = {}
        bubble_data1['value'] = str(
            round(
                max(__target_contributions) * 100.0 /
                sum(__target_contributions), 1)) + '%'
        m_index = __target_contributions.index(max(__target_contributions))
        bubble_data1['text'] = 'Overall ' + __target + ' comes from ' + levels[m_index]
        bubble_data2['value'] = str(round(max(__target_percentages), 1)) + '%'
        m_index = __target_percentages.index(max(__target_percentages))
        bubble_data2['text'] = levels[m_index] + ' has the highest rate of ' + __target
        bubble_data = [bubble_data1, bubble_data2]
        return chart_data, bubble_data

    def generate_card1_table1(self):
        """Rows for the card1 heat-map: header plus one row per dimension level
        with its column-percentage for each target level."""
        table_percent_by_column = self._chiSquareTable.table_percent_by_column
        column_two_values = self._chiSquareTable.column_two_values
        header_row = [self._analysed_dimension
                      ] + self._chiSquareTable.get_column_one_levels()
        all_columns = [column_two_values] + table_percent_by_column
        other_rows = [list(tup) for tup in zip(*all_columns)]
        return [header_row] + other_rows

    def generate_card1_table2(self):
        """Detailed contingency breakdown: for each dimension level, raw counts
        plus %-within-dimension, %-within-target and %-of-total rows.

        Returns both a dict-per-row form (``data``) and a flat list form
        (``data1``) keyed by a leading "Tag" column.
        """
        table = self._chiSquareTable.table
        table_percent = self._chiSquareTable.table_percent
        table_percent_by_row = self._chiSquareTable.table_percent_by_row
        table_percent_by_column = self._chiSquareTable.table_percent_by_column
        target_levels = self._chiSquareTable.get_column_one_levels()
        dim_levels = self._chiSquareTable.get_column_two_levels()
        header1 = [self._analysed_dimension] + target_levels + ['Total']
        header = ['State', 'Active', 'Churn', 'Total']  # TODO remove
        data = []
        data1 = [['Tag'] + header1]
        for idx, lvl in enumerate(dim_levels):
            first_row = ['Tag'] + header
            # list(zip(...)) keeps this working on Python 3, where zip()
            # returns a non-subscriptable iterator.
            col_2_vals = list(zip(*table))[idx]
            data2 = ['bold'] + [lvl] + list(col_2_vals) + [sum(col_2_vals)]
            dict_ = dict(zip(first_row, data2))
            data.append(dict_)
            data1.append(data2)
            col_2_vals = list(zip(*table_percent_by_column))[idx]
            data2 = [''] + ['As % within ' + self._analysed_dimension
                            ] + list(col_2_vals) + [100.0]
            dict_ = dict(zip(first_row, data2))
            data.append(dict_)
            data1.append(data2)
            col_2_vals = list(zip(*table_percent_by_row))[idx]
            col_2_vals1 = list(zip(*table_percent))[idx]
            data2 = [''] + ['As % within ' + self._target_dimension
                            ] + list(col_2_vals) + [round(sum(col_2_vals1), 2)]
            dict_ = dict(zip(first_row, data2))
            data.append(dict_)
            data1.append(data2)
            data2 = [''] + ['As % of Total'] + list(col_2_vals1) + [
                round(sum(col_2_vals1), 2)
            ]
            dict_ = dict(zip(first_row, data2))
            data.append(dict_)
            data1.append(data2)
        return {
            'header': header,
            'header1': header1,
            'data': data,
            'label': self._analysed_dimension,
            'data1': data1
        }
class ChiSquareAnalysis(object): def __init__(self, df_context, df_helper, chisquare_result, target_dimension, analysed_dimension, significant_variables, num_analysed_variables, data_frame, measure_columns, base_dir, appid=None, target_chisquare_result=None): self._blockSplitter = "|~NEWBLOCK~|" self._highlightFlag = "|~HIGHLIGHT~|" self._dimensionNode = NarrativesTree() self._dimensionNode.set_name(target_dimension) self._data_frame = data_frame self._dataframe_context = df_context self._pandas_flag = df_context._pandas_flag self._dataframe_helper = df_helper self._chisquare_result = chisquare_result self._target_dimension = target_dimension self._analysed_dimension = analysed_dimension self._significant_variables = significant_variables self._target_chisquare_result = target_chisquare_result self._measure_columns = self._dataframe_helper.get_numeric_columns() self._chiSquareLevelLimit = GLOBALSETTINGS.CHISQUARELEVELLIMIT self._num_analysed_variables = num_analysed_variables self._chiSquareTable = chisquare_result.get_contingency_table() significant_variables = list( set(significant_variables) - {analysed_dimension}) if len(significant_variables) <= 20: if len(significant_variables) <= 3: self._second_level_dimensions = list(significant_variables) else: self._second_level_dimensions = list(significant_variables)[:3] else: self._second_level_dimensions = list(significant_variables)[:5] print(self._second_level_dimensions) self._appid = appid self._card1 = NormalCard() self._targetCards = [] self._base_dir = base_dir self._binTargetCol = False self._binAnalyzedCol = False print("--------Chi-Square Narratives for ", analysed_dimension, "---------") if self._dataframe_context.get_custom_analysis_details() != None: binnedColObj = [ x["colName"] for x in self._dataframe_context.get_custom_analysis_details() ] print("analysed_dimension : ", self._analysed_dimension) if binnedColObj != None and self._target_dimension in binnedColObj: self._binTargetCol = True if 
binnedColObj != None and ( self._analysed_dimension in binnedColObj or self._analysed_dimension in self._measure_columns): self._binAnalyzedCol = True if self._appid == None: self._generate_narratives() self._dimensionNode.add_cards([self._card1] + self._targetCards) self._dimensionNode.set_name("{}".format(analysed_dimension)) elif self._appid == "2": self._generate_narratives() self._dimensionNode.add_cards([self._card1]) self._dimensionNode.set_name("{}".format(analysed_dimension)) elif self._appid == "1": self._generate_narratives() self._dimensionNode.add_cards([self._card1]) self._dimensionNode.set_name("{}".format(analysed_dimension)) def get_dimension_node(self): return json.loads( CommonUtils.convert_python_object_to_json(self._dimensionNode)) def get_dimension_card1(self): return self._card1 def _generate_narratives(self): chisquare_result = self._chisquare_result target_dimension = self._target_dimension analysed_dimension = self._analysed_dimension significant_variables = self._significant_variables num_analysed_variables = self._num_analysed_variables table = self._chiSquareTable total = self._chiSquareTable.get_total() levels = self._chiSquareTable.get_column_two_levels() level_counts = self._chiSquareTable.get_column_total() levels_count_sum = sum(level_counts) levels_percentages = [ old_div(i * 100.0, levels_count_sum) for i in level_counts ] sorted_levels = sorted(zip(level_counts, levels), reverse=True) level_differences = [0.0] + [ sorted_levels[i][0] - sorted_levels[i + 1][0] for i in range(len(sorted_levels) - 1) ] top_dims = [ j for i, j in sorted_levels[:level_differences.index(max(level_differences))] ] top_dims_contribution = sum([ i for i, j in sorted_levels[:level_differences.index(max(level_differences))] ]) bottom_dim = sorted_levels[-1][1] bottom_dim_contribution = sorted_levels[-1][0] bottom_dims = [ y for x, y in sorted_levels if x == bottom_dim_contribution ] target_levels = self._chiSquareTable.get_column_one_levels() target_counts 
= self._chiSquareTable.get_row_total() sorted_target_levels = sorted(zip(target_counts, target_levels), reverse=True) top_target_count, top_target = sorted_target_levels[0] second_target_count, second_target = sorted_target_levels[1] top_target_contributions = [ table.get_value(top_target, i) for i in levels ] sum_top_target = sum(top_target_contributions) sorted_levels = sorted(zip(top_target_contributions, levels), reverse=True) level_differences = [0.0] + [ sorted_levels[i][0] - sorted_levels[i + 1][0] for i in range(len(sorted_levels) - 1) ] top_target_top_dims = [ j for i, j in sorted_levels[:level_differences.index(max(level_differences))] ] top_target_top_dims_contribution = sum([ i for i, j in sorted_levels[:level_differences.index(max(level_differences))] ]) top_target_bottom_dim = sorted_levels[-1][1] top_target_bottom_dim_contribution = sorted_levels[-1][0] top_target_percentages = [ old_div(i * 100.0, sum_top_target) for i in top_target_contributions ] best_top_target_index = top_target_contributions.index( max(top_target_contributions)) worst_top_target_index = top_target_contributions.index( min(top_target_contributions)) top_target_differences = [ x - y for x, y in zip(levels_percentages, top_target_percentages) ] if len(top_target_differences) > 6: tops = 2 bottoms = -2 elif len(top_target_differences) > 4: tops = 2 bottoms = -1 else: tops = 1 bottoms = -1 sorted_ = sorted(enumerate(top_target_differences), key=lambda x: x[1], reverse=True) best_top_difference_indices = [x for x, y in sorted_[:tops]] worst_top_difference_indices = [x for x, y in sorted_[bottoms:]] top_target_shares = [ old_div(x * 100.0, y) for x, y in zip(top_target_contributions, level_counts) ] max_top_target_shares = max(top_target_shares) best_top_target_share_index = [ idx for idx, val in enumerate(top_target_shares) if val == max_top_target_shares ] level_counts_threshold = old_div( sum(level_counts) * 0.05, len(level_counts)) min_top_target_shares = min([ x for x, y in 
zip(top_target_shares, level_counts) if y >= level_counts_threshold ]) if max_top_target_shares == min_top_target_shares: worst_top_target_share_index = [] else: worst_top_target_share_index = [ idx for idx, val in enumerate(top_target_shares) if val == min_top_target_shares ] overall_top_percentage = old_div(sum_top_target * 100.0, total) second_target_contributions = [ table.get_value(second_target, i) for i in levels ] sum_second_target = sum(second_target_contributions) sorted_levels = sorted(zip(second_target_contributions, levels), reverse=True) level_differences = [0.0] + [ sorted_levels[i][0] - sorted_levels[i + 1][0] for i in range(len(sorted_levels) - 1) ] second_target_top_dims = [ j for i, j in sorted_levels[:level_differences.index(max(level_differences))] ] second_target_top_dims_contribution = sum([ i for i, j in sorted_levels[:level_differences.index(max(level_differences))] ]) second_target_bottom_dim = sorted_levels[-1][1] second_target_bottom_dim_contribution = sorted_levels[-1][0] second_target_percentages = [ old_div(i * 100.0, sum_second_target) for i in second_target_contributions ] best_second_target_index = second_target_contributions.index( max(second_target_contributions)) worst_second_target_index = second_target_contributions.index( min(second_target_contributions)) second_target_differences = [ x - y for x, y in zip(levels_percentages, second_target_percentages) ] if len(second_target_differences) > 6: tops = 2 bottoms = -2 elif len(second_target_differences) > 4: tops = 2 bottoms = -1 else: tops = 1 bottoms = -1 sorted_ = sorted(enumerate(second_target_differences), key=lambda x: x[1], reverse=True) best_second_difference_indices = [x for x, y in sorted_[:tops]] worst_second_difference_indices = [x for x, y in sorted_[bottoms:]] second_target_shares = [ old_div(x * 100.0, y) for x, y in zip(second_target_contributions, level_counts) ] max_second_target_shares = max(second_target_shares) best_second_target_share_index = [ idx for idx, 
val in enumerate(second_target_shares) if val == max_second_target_shares ] level_counts_threshold = old_div( sum(level_counts) * 0.05, len(level_counts)) if min(second_target_shares) == 0: min_second_target_shares = min([ x for x, y in zip(second_target_shares, level_counts) if x != 0 ]) else: min_second_target_shares = min([ x for x, y in zip(second_target_shares, level_counts) if y >= level_counts_threshold ]) # worst_second_target_share_index = second_target_shares.index(min_second_target_shares) if max_second_target_shares == min_second_target_shares: worst_second_target_share_index = [] else: worst_second_target_share_index = [ idx for idx, val in enumerate(second_target_shares) if val == min_second_target_shares ] overall_second_percentage = old_div(sum_second_target * 100.0, total) targetCardDataDict = {} targetCardDataDict['target'] = target_dimension targetCardDataDict['colname'] = analysed_dimension targetCardDataDict['num_significant'] = len(significant_variables) targetCardDataDict['plural_colname'] = NarrativesUtils.pluralize( analysed_dimension) targetCardDataDict["blockSplitter"] = self._blockSplitter targetCardDataDict["binTargetCol"] = self._binTargetCol targetCardDataDict["binAnalyzedCol"] = self._binAnalyzedCol targetCardDataDict['highlightFlag'] = self._highlightFlag targetCardDataDict['levels'] = levels data_dict = {} data_dict[ 'best_second_difference'] = best_second_difference_indices ##these changed data_dict['worst_second_difference'] = worst_second_difference_indices data_dict['best_top_difference'] = best_top_difference_indices data_dict['worst_top_difference'] = worst_top_difference_indices data_dict['levels_percentages'] = levels_percentages data_dict['top_target_percentages'] = top_target_percentages data_dict['second_target_percentages'] = second_target_percentages data_dict['levels'] = levels data_dict['best_top_share'] = best_top_target_share_index data_dict['worst_top_share'] = worst_top_target_share_index 
data_dict['best_second_share'] = best_second_target_share_index data_dict['worst_second_share'] = worst_second_target_share_index data_dict['top_target_shares'] = top_target_shares data_dict['second_target_shares'] = second_target_shares data_dict['overall_second'] = overall_second_percentage data_dict['overall_top'] = overall_top_percentage data_dict['num_significant'] = len(significant_variables) data_dict['colname'] = analysed_dimension data_dict['plural_colname'] = NarrativesUtils.pluralize( analysed_dimension) data_dict['target'] = target_dimension data_dict['top_levels'] = top_dims data_dict['top_levels_percent'] = round( old_div(top_dims_contribution * 100.0, total), 1) data_dict['bottom_level'] = bottom_dim data_dict['bottom_levels'] = bottom_dims data_dict['bottom_level_percent'] = round( old_div(bottom_dim_contribution * 100, sum(level_counts)), 2) data_dict['second_target'] = second_target data_dict['second_target_top_dims'] = second_target_top_dims data_dict['second_target_top_dims_contribution'] = old_div( second_target_top_dims_contribution * 100.0, sum(second_target_contributions)) data_dict['second_target_bottom_dim'] = second_target_bottom_dim data_dict[ 'second_target_bottom_dim_contribution'] = second_target_bottom_dim_contribution data_dict['best_second_target'] = levels[best_second_target_index] data_dict['best_second_target_count'] = second_target_contributions[ best_second_target_index] data_dict['best_second_target_percent'] = round( old_div( second_target_contributions[best_second_target_index] * 100.0, sum(second_target_contributions)), 2) data_dict['worst_second_target'] = levels[worst_second_target_index] data_dict['worst_second_target_percent'] = round( old_div( second_target_contributions[worst_second_target_index] * 100.0, sum(second_target_contributions)), 2) data_dict['top_target'] = top_target data_dict['top_target_top_dims'] = top_target_top_dims data_dict['top_target_top_dims_contribution'] = old_div( 
top_target_top_dims_contribution * 100.0, sum(top_target_contributions)) data_dict['top_target_bottom_dim'] = top_target_bottom_dim data_dict[ 'top_target_bottom_dim_contribution'] = top_target_bottom_dim_contribution data_dict['best_top_target'] = levels[best_top_target_index] data_dict['best_top_target_count'] = top_target_contributions[ best_top_target_index] data_dict['best_top_target_percent'] = round( old_div(top_target_contributions[best_top_target_index] * 100.0, sum(top_target_contributions)), 2) data_dict['worst_top_target'] = levels[worst_top_target_index] data_dict['worst_top_target_percent'] = round( old_div(top_target_contributions[worst_top_target_index] * 100.0, sum(top_target_contributions)), 2) data_dict["blockSplitter"] = self._blockSplitter data_dict["binTargetCol"] = self._binTargetCol data_dict["binAnalyzedCol"] = self._binAnalyzedCol data_dict['highlightFlag'] = self._highlightFlag # print "_"*60 # print "DATA DICT - ", data_dict # print "_"*60 ############### # CARD1 # ############### print("self._binTargetCol & self._binAnalyzedCol : ", self._binTargetCol, self._binAnalyzedCol) if len(data_dict['worst_second_share']) == 0: output = NarrativesUtils.block_splitter( NarrativesUtils.get_template_output( self._base_dir, 'card1_binned_target_worst_second.html', data_dict), self._blockSplitter, highlightFlag=self._highlightFlag) else: if (self._binTargetCol == True & self._binAnalyzedCol == False): print("Only Target Column is Binned, : ", self._binTargetCol) output = NarrativesUtils.block_splitter( NarrativesUtils.get_template_output( self._base_dir, 'card1_binned_target.html', data_dict), self._blockSplitter, highlightFlag=self._highlightFlag) elif (self._binTargetCol == True & self._binAnalyzedCol == True): print("Target Column and IV is Binned : ", self._binTargetCol, self._binAnalyzedCol) output = NarrativesUtils.block_splitter( NarrativesUtils.get_template_output( self._base_dir, 'card1_binned_target_and_IV.html', data_dict), 
self._blockSplitter, highlightFlag=self._highlightFlag) else: output = NarrativesUtils.block_splitter( NarrativesUtils.get_template_output( self._base_dir, 'card1.html', data_dict), self._blockSplitter, highlightFlag=self._highlightFlag) targetDimCard1Data = [] targetDimcard1Heading = '<h3>Impact of ' + self._analysed_dimension + ' on ' + self._target_dimension + "</h3>" toggledata = ToggleData() targetDimTable1Data = self.generate_card1_table1() targetDimCard1Table1 = TableData() targetDimCard1Table1.set_table_type("heatMap") targetDimCard1Table1.set_table_data(targetDimTable1Data) toggledata.set_toggleon_data({ "data": { "tableData": targetDimTable1Data, "tableType": "heatMap" }, "dataType": "table" }) targetDimTable2Data = self.generate_card1_table2() targetDimCard1Table2 = TableData() targetDimCard1Table2.set_table_type("normal") table2Data = targetDimTable2Data["data1"] table2Data = [ innerList[1:] for innerList in table2Data if innerList[0].strip() != "" ] targetDimCard1Table2.set_table_data(table2Data) toggledata.set_toggleoff_data({ "data": { "tableData": table2Data, "tableType": "heatMap" }, "dataType": "table" }) targetDimCard1Data.append(HtmlData(data=targetDimcard1Heading)) targetDimCard1Data.append(toggledata) targetDimCard1Data += output self._card1.set_card_data(targetDimCard1Data) self._card1.set_card_name("{}: Relationship with {}".format( self._analysed_dimension, self._target_dimension)) ############### # CARD2 # ############### if self._appid == None: key_factors = '' num_key_factors = len(self._second_level_dimensions) if len(self._second_level_dimensions) == 5: key_factors = ', '.join( self._second_level_dimensions[:4] ) + ' and ' + self._second_level_dimensions[4] elif len(self._second_level_dimensions) == 4: key_factors = ', '.join( self._second_level_dimensions[:3] ) + ' and ' + self._second_level_dimensions[3] elif len(self._second_level_dimensions) == 3: key_factors = ', '.join( self._second_level_dimensions[:2] ) + ' and ' + 
self._second_level_dimensions[2] elif len(self._second_level_dimensions) == 2: key_factors = ' and '.join(self._second_level_dimensions) elif len(self._second_level_dimensions) == 1: key_factors = self._second_level_dimensions[0] targetCardDataDict['num_key_factors'] = num_key_factors targetCardDataDict['key_factors'] = key_factors dict_for_test = {} for tupleObj in sorted_target_levels[:self._chiSquareLevelLimit]: targetLevel = tupleObj[1] targetCardDataDict['random_card2'] = random.randint(1, 100) targetCardDataDict['random_card4'] = random.randint(1, 100) second_target_contributions = [ table.get_value(targetLevel, i) for i in levels ] sum_second_target = sum(second_target_contributions) sorted_levels = sorted(zip(second_target_contributions, levels), reverse=True) level_differences = [0.0] + [ sorted_levels[i][0] - sorted_levels[i + 1][0] for i in range(len(sorted_levels) - 1) ] level_diff_index = level_differences.index( max(level_differences)) if level_differences.index( max(level_differences)) > 0 else len( level_differences ) ##added for pipeline keyerror issue second_target_top_dims = [ j for i, j in sorted_levels[:level_diff_index] ] second_target_top_dims_contribution = sum([ i for i, j in sorted_levels[:level_differences. 
index(max(level_differences))] ]) second_target_bottom_dim = sorted_levels[-1][1] second_target_bottom_dim_contribution = sorted_levels[-1][0] second_target_percentages = [ old_div(i * 100.0, sum_second_target) for i in second_target_contributions ] best_second_target_index = second_target_contributions.index( max(second_target_contributions)) worst_second_target_index = second_target_contributions.index( min(second_target_contributions)) second_target_differences = [ x - y for x, y in zip(levels_percentages, second_target_percentages) ] if len(second_target_differences) > 6: tops = 2 bottoms = -2 elif len(second_target_differences) > 4: tops = 2 bottoms = -1 else: tops = 1 bottoms = -1 sorted_ = sorted(enumerate(second_target_differences), key=lambda x: x[1], reverse=True) best_second_difference_indices = [x for x, y in sorted_[:tops]] worst_second_difference_indices = [ x for x, y in sorted_[bottoms:] ] second_target_shares = [ old_div(x * 100.0, y) for x, y in zip(second_target_contributions, level_counts) ] max_second_target_shares = max(second_target_shares) best_second_target_share_index = [ idx for idx, val in enumerate(second_target_shares) if val == max_second_target_shares ] level_counts_threshold = old_div( sum(level_counts) * 0.05, len(level_counts)) min_second_target_shares = min([ x for x, y in zip(second_target_shares, level_counts) if y >= level_counts_threshold ]) worst_second_target_share_index = [ idx for idx, val in enumerate(second_target_shares) if val == min_second_target_shares ] overall_second_percentage = old_div(sum_second_target * 100.0, total) # DataFrame for contribution calculation if self._pandas_flag: df_second_target = self._data_frame[( self._data_frame[self._target_dimension] == targetLevel ) & (self._data_frame[self._analysed_dimension] == second_target_top_dims[0])][ self._second_level_dimensions] df_second_dim = self._data_frame[( self._data_frame[self._analysed_dimension] == second_target_top_dims[0] 
)][self._second_level_dimensions] else: df_second_target = self._data_frame.filter(col(self._target_dimension)==targetLevel).\ filter(col(self._analysed_dimension)==second_target_top_dims[0]).\ select(self._second_level_dimensions).toPandas() df_second_dim = self._data_frame.filter(col(self._analysed_dimension)==second_target_top_dims[0]).\ select(self._second_level_dimensions).toPandas() # if self._chisquare_result.get_splits(): # splits = self._chisquare_result.get_splits() # idx = self._chiSquareTable.get_bin_names(splits).index(second_target_top_dims[0]) # idx1 = self._chiSquareTable.get_bin_names(splits).index(top_target_top_dims[0]) # splits[len(splits)-1] = splits[len(splits)-1]+1 # df_second_target = self._data_frame.filter(col(self._target_dimension)==targetLevel).\ # filter(col(self._analysed_dimension)>=splits[idx]).filter(col(self._analysed_dimension)<splits[idx+1]).\ # select(self._second_level_dimensions).toPandas() # df_second_dim = self._data_frame.filter(col(self._analysed_dimension)>=splits[idx]).\ # filter(col(self._analysed_dimension)<splits[idx+1]).\ # select(self._second_level_dimensions).toPandas() # else: # df_second_target = self._data_frame.filter(col(self._target_dimension)==targetLevel).\ # filter(col(self._analysed_dimension)==second_target_top_dims[0]).\ # select(self._second_level_dimensions).toPandas() # df_second_dim = self._data_frame.filter(col(self._analysed_dimension)==second_target_top_dims[0]).\ # select(self._second_level_dimensions).toPandas() # print self._data_frame.select('Sales').show() distribution_second = [] d_l = [] for d in self._second_level_dimensions: grouped = df_second_target.groupby(d).agg({d: 'count'}) contributions = df_second_dim.groupby(d).agg({d: 'count'}) contribution_index = list(contributions.index) contributions_val = contributions[d].tolist() contributions_list = dict( list(zip(contribution_index, contributions_val))) index_list = list(grouped.index) grouped_list = grouped[d].tolist() 
contributions_percent_list = [ round(old_div(y * 100.0, contributions_list[x]), 2) for x, y in zip(index_list, grouped_list) ] sum_ = grouped[d].sum() diffs = [0] + [ grouped_list[i] - grouped_list[i + 1] for i in range(len(grouped_list) - 1) ] max_diff = diffs.index(max(diffs)) grouped_dict = dict(list(zip(index_list, grouped_list))) for val in contribution_index: if val not in list(grouped_dict.keys()): grouped_dict[val] = 0 else: pass index_list = [] grouped_list = [] contributions_val = [] for key in list(grouped_dict.keys()): index_list.append(str(key)) grouped_list.append(grouped_dict[key]) contributions_val.append(contributions_list[key]) ''' print "="*70 print "GROUPED - ", grouped print "INDEX LIST - ", index_list print "GROUPED LIST - ", grouped_list print "GROUPED DICT - ", grouped_dict print "CONTRIBUTIONS - ", contributions print "CONTRIBUTION INDEX - ", contribution_index print "CONTRIBUTIONS VAL - ", contributions_val print "CONTRIBUTIONS LIST - ", contributions_list print "CONTRIBUTIONS PERCENT LIST - ", contributions_percent_list print "SUM - ", sum_ print "DIFFS - ", diffs print "MAX DIFF - ", max_diff print "="*70 ''' informative_dict = { "levels": index_list, "positive_class_contribution": grouped_list, "positive_plus_others": contributions_val } informative_df = pd.DataFrame(informative_dict) informative_df["percentage_horizontal"] = old_div( informative_df["positive_class_contribution"] * 100, informative_df["positive_plus_others"]) informative_df["percentage_vertical"] = old_div( informative_df["positive_class_contribution"] * 100, sum_) informative_df.sort_values(["percentage_vertical"], inplace=True, ascending=False) informative_df = informative_df.reset_index(drop=True) percentage_vertical_sorted = list( informative_df["percentage_vertical"]) percentage_horizontal_sorted = list( informative_df["percentage_horizontal"]) levels_sorted = list(informative_df["levels"]) differences_list = [] for i in range(1, len(percentage_vertical_sorted)): 
difference = percentage_vertical_sorted[ i - 1] - percentage_vertical_sorted[i] differences_list.append(round(difference, 2)) ''' print "-"*70 print "DIFFERENCES LIST - ", differences_list print "-"*70 ''' index_txt = '' if differences_list: if differences_list[0] >= 30: print("showing 1st case") index_txt = levels_sorted[0] max_diff_equivalent = 1 else: if len(differences_list) >= 2: if differences_list[1] >= 10: print("showing 1st and 2nd case") index_txt = levels_sorted[0] + '(' + str( round(percentage_vertical_sorted[0], 1) ) + '%)' + ' and ' + levels_sorted[ 1] + '(' + str( round( percentage_vertical_sorted[1], 1)) + '%)' max_diff_equivalent = 2 else: print("showing 3rd case") index_txt = 'including ' + levels_sorted[ 0] + '(' + str( round( percentage_vertical_sorted[0], 1) ) + '%)' + ' and ' + levels_sorted[ 1] + '(' + str( round( percentage_vertical_sorted[ 1], 1)) + '%)' max_diff_equivalent = 3 else: print("showing 3rd case") index_txt = 'including ' + levels_sorted[ 0] + '(' + str( round(percentage_vertical_sorted[0], 1) ) + '%)' + ' and ' + levels_sorted[ 1] + '(' + str( round( percentage_vertical_sorted[1], 1)) + '%)' max_diff_equivalent = 3 else: max_diff_equivalent = 0 ''' print "-"*70 print informative_df.head(25) print "-"*70 ''' distribution_second.append({ 'contributions': [ round(i, 2) for i in percentage_vertical_sorted[:max_diff_equivalent] ], 'levels': levels_sorted[:max_diff_equivalent], 'variation': random.randint(1, 100), 'index_txt': index_txt, 'd': d, 'contributions_percent': percentage_horizontal_sorted }) ''' print "DISTRIBUTION SECOND - ", distribution_second print "<>"*50 ''' targetCardDataDict['distribution_second'] = distribution_second targetCardDataDict['second_target'] = targetLevel targetCardDataDict[ 'second_target_top_dims'] = second_target_top_dims targetCardDataDict[ 'second_target_top_dims_contribution'] = old_div( second_target_top_dims_contribution * 100.0, sum(second_target_contributions)) targetCardDataDict[ 
'second_target_bottom_dim'] = second_target_bottom_dim targetCardDataDict[ 'second_target_bottom_dim_contribution'] = second_target_bottom_dim_contribution targetCardDataDict['best_second_target'] = levels[ best_second_target_index] targetCardDataDict[ 'best_second_target_count'] = second_target_contributions[ best_second_target_index] targetCardDataDict['best_second_target_percent'] = round( old_div( second_target_contributions[best_second_target_index] * 100.0, sum(second_target_contributions)), 2) targetCardDataDict['worst_second_target'] = levels[ worst_second_target_index] targetCardDataDict['worst_second_target_percent'] = round( old_div( second_target_contributions[worst_second_target_index] * 100.0, sum(second_target_contributions)), 2) card2Data = [] targetLevelContributions = [ table.get_value(targetLevel, i) for i in levels ] impact_target_thershold = old_div( sum(targetLevelContributions) * 0.02, len(targetLevelContributions)) card2Heading = '<h3>Key Drivers of ' + self._target_dimension + ' (' + targetLevel + ')' + "</h3>" chart, bubble = self.generate_distribution_card_chart( targetLevel, targetLevelContributions, levels, level_counts, total, impact_target_thershold) card2ChartData = NormalChartData(data=chart["data"]) "rounding the chartdata values for key drivers tab inside table percentage(table data)" for d in card2ChartData.get_data(): d['percentage'] = round(d['percentage'], 2) d_l.append(d) card2ChartJson = ChartJson() card2ChartJson.set_data(d_l) card2ChartJson.set_chart_type("combination") card2ChartJson.set_types({ "total": "bar", "percentage": "line" }) card2ChartJson.set_legend({ "total": "# of " + targetLevel, "percentage": "% of " + targetLevel }) card2ChartJson.set_axes({ "x": "key", "y": "total", "y2": "percentage" }) card2ChartJson.set_label_text({ "x": " ", "y": "Count", "y2": "Percentage" }) print("self._binTargetCol & self._binAnalyzedCol : ", self._binTargetCol, self._binAnalyzedCol) if (self._binTargetCol == True & 
self._binAnalyzedCol == False): print("Only Target Column is Binned") output2 = NarrativesUtils.block_splitter( NarrativesUtils.get_template_output( self._base_dir, 'card2_binned_target.html', targetCardDataDict), self._blockSplitter) elif (self._binTargetCol == True & self._binAnalyzedCol == True): print("Target Column and IV is Binned") output2 = NarrativesUtils.block_splitter( NarrativesUtils.get_template_output( self._base_dir, 'card2_binned_target_and_IV.html', targetCardDataDict), self._blockSplitter) else: print("In Else, self._binTargetCol should be False : ", self._binTargetCol) output2 = NarrativesUtils.block_splitter( NarrativesUtils.get_template_output( self._base_dir, 'card2.html', targetCardDataDict), self._blockSplitter) card2Data.append(HtmlData(data=card2Heading)) statistical_info_array = [ ("Test Type", "Chi-Square"), ("Chi-Square statistic", str(round(self._chisquare_result.get_stat(), 3))), ("P-Value", str(round(self._chisquare_result.get_pvalue(), 3))), ("Inference", "Chi-squared analysis shows a significant association between {} (target) and {}." 
.format(self._target_dimension, self._analysed_dimension)) ] statistical_info_array = NarrativesUtils.statistical_info_array_formatter( statistical_info_array) card2Data.append( C3ChartData(data=card2ChartJson, info=statistical_info_array)) card2Data += output2 card2BubbleData = "<div class='col-md-6 col-xs-12'><h2 class='text-center'><span>{}</span><br /><small>{}</small></h2></div><div class='col-md-6 col-xs-12'><h2 class='text-center'><span>{}</span><br /><small>{}</small></h2></div>".format( bubble[0]["value"], bubble[0]["text"], bubble[1]["value"], bubble[1]["text"]) card2Data.append(HtmlData(data=card2BubbleData)) targetCard = NormalCard() targetCard.set_card_data(card2Data) targetCard.set_card_name("{} : Distribution of {}".format( self._analysed_dimension, targetLevel)) self._targetCards.append(targetCard) dict_for_test[targetLevel] = targetCardDataDict out = {'data_dict': data_dict, 'target_dict': dict_for_test} return out # def generate_card2_narratives(self): def generate_distribution_card_chart(self, __target, __target_contributions, levels, levels_count, total, thershold): chart = {} label = {'total': '# of ' + __target, 'percentage': '% of ' + __target} label_text = { 'x': self._analysed_dimension, 'y': '# of ' + __target, 'y2': '% of ' + __target, } data = {} data['total'] = dict(list(zip(levels, __target_contributions))) __target_percentages = [ old_div(x * 100.0, y) for x, y in zip(__target_contributions, levels_count) ] data['percentage'] = dict(list(zip(levels, __target_percentages))) chartData = [] for val in zip(levels, __target_contributions, __target_percentages): chartData.append({ "key": val[0], "total": val[1], "percentage": val[2] }) # c3_data = [levels,__target_contributions,__target_percentages] chart_data = {'label': label, 'data': chartData} bubble_data1 = {} bubble_data2 = {} bubble_data1['value'] = str( round( old_div( max(__target_contributions) * 100.0, sum(__target_contributions)), 1)) + '%' m_index = 
__target_contributions.index(max(__target_contributions)) bubble_data1[ 'text'] = 'Overall ' + __target + ' comes from ' + levels[m_index] intial = -1 for k, v, i in zip(__target_contributions, __target_percentages, list(range(len(__target_contributions)))): if k > thershold: if intial < v: intial = v bubble_data2['value'] = str(round(intial)) + '%' #m_index = __target_percentages.index(i) bubble_data2['text'] = levels[ i] + ' has the highest rate of ' + __target bubble_data = [bubble_data1, bubble_data2] return chart_data, bubble_data def generate_card1_table1(self): table_percent_by_column = self._chiSquareTable.table_percent_by_column column_two_values = self._chiSquareTable.column_two_values header_row = [self._analysed_dimension ] + self._chiSquareTable.get_column_one_levels() all_columns = [column_two_values] + table_percent_by_column other_rows = list(zip(*all_columns)) other_rows = [list(tup) for tup in other_rows] table_data = [header_row] + other_rows return table_data def generate_card1_table2(self): table = self._chiSquareTable.table table_percent = self._chiSquareTable.table_percent table_percent_by_row = self._chiSquareTable.table_percent_by_row table_percent_by_column = self._chiSquareTable.table_percent_by_column target_levels = self._chiSquareTable.get_column_one_levels() dim_levels = self._chiSquareTable.get_column_two_levels() header1 = [self._analysed_dimension] + target_levels + ['Total'] header = ['State', 'Active', 'Churn', 'Total'] #TODO remove data = [] data1 = [['Tag'] + header1] for idx, lvl in enumerate(dim_levels): first_row = ['Tag'] + header col_2_vals = list(zip(*table))[idx] data2 = ['bold'] + [lvl] + list(col_2_vals) + [sum(col_2_vals)] dict_ = dict(list(zip(first_row, data2))) data.append(dict_) data1.append(data2) col_2_vals = list(zip(*table_percent_by_column))[idx] data2 = [''] + ['As % within ' + self._analysed_dimension ] + list(col_2_vals) + [100.0] dict_ = dict(list(zip(first_row, data2))) data.append(dict_) 
data1.append(data2) col_2_vals = list(zip(*table_percent_by_row))[idx] col_2_vals1 = list(zip(*table_percent))[idx] data2 = [''] + [ 'As % within ' + self._target_dimension ] + list(col_2_vals) + [round(sum(col_2_vals1), 2)] dict_ = dict(list(zip(first_row, data2))) data.append(dict_) data1.append(data2) # col_2_vals = zip(*table_percent)[idx] data2 = [''] + ['As % of Total'] + list(col_2_vals1) + [ round(sum(col_2_vals1), 2) ] dict_ = dict(list(zip(first_row, data2))) data.append(dict_) data1.append(data2) out = { 'header': header, 'header1': header1, 'data': data, 'label': self._analysed_dimension, 'data1': data1 } return out
    def Predict(self):
        """Score the input data with a saved PyTorch regression model.

        Loads the serialized ``.pt`` model, one-hot encodes / aligns the input
        columns to the training features, runs a forward pass, writes the scored
        CSV, publishes a KPI card, and finally runs descriptive-stats and ANOVA
        analyses on the scored dataframe. Progress messages are emitted at the
        initialization / predictionStart / predictionFinished stages.
        """
        self._scriptWeightDict = self._dataframe_context.get_ml_model_prediction_weight()
        self._scriptStages = {
            "initialization": {
                "summary": "Initialized The Neural Network (PyTorch) Scripts",
                "weight": 2
            },
            "predictionStart": {
                "summary": "Neural Network (PyTorch) Prediction Started",
                "weight": 2
            },
            "predictionFinished": {
                "summary": "Neural Network (PyTorch) Prediction Finished",
                "weight": 6
            }
        }
        CommonUtils.create_update_and_save_progress_message(
            self._dataframe_context,
            self._scriptWeightDict,
            self._scriptStages,
            self._slug,
            "initialization",
            "info",
            display=True,
            emptyBin=False,
            customMsg=None,
            weightKey="total")
        # SQLContext is needed later to lift the scored pandas frame back to Spark.
        SQLctx = SQLContext(sparkContext=self._spark.sparkContext,
                            sparkSession=self._spark)
        dataSanity = True  # NOTE(review): assigned but never read in this method
        categorical_columns = self._dataframe_helper.get_string_columns()
        uid_col = self._dataframe_context.get_uid_column()
        # Drop the UID column from the categorical set when it is marked ignorable.
        if self._metaParser.check_column_isin_ignored_suggestion(uid_col):
            categorical_columns = list(set(categorical_columns) - {uid_col})
        allDateCols = self._dataframe_context.get_date_columns()
        categorical_columns = list(set(categorical_columns) - set(allDateCols))
        numerical_columns = self._dataframe_helper.get_numeric_columns()
        result_column = self._dataframe_context.get_result_column()
        test_data_path = self._dataframe_context.get_input_file()
        if self._mlEnv == "spark":
            # No Spark-native scoring path is implemented for this model type.
            pass
        elif self._mlEnv == "sklearn":
            CommonUtils.create_update_and_save_progress_message(
                self._dataframe_context,
                self._scriptWeightDict,
                self._scriptStages,
                self._slug,
                "predictionStart",
                "info",
                display=True,
                emptyBin=False,
                customMsg=None,
                weightKey="total")
            # NOTE(review): unlike trained_model_path below, score_data_path is
            # never stripped of a possible "file://" prefix before to_csv —
            # confirm get_score_path() returns a plain filesystem path.
            score_data_path = self._dataframe_context.get_score_path() + "/data.csv"
            trained_model_path = "file://" + self._dataframe_context.get_model_path()
            trained_model_path += "/" + self._dataframe_context.get_model_for_scoring() + ".pt"
            print("trained_model_path", trained_model_path)
            print("score_data_path", score_data_path)
            # Strip the "file://" scheme so torch.load gets a local path.
            if trained_model_path.startswith("file"):
                trained_model_path = trained_model_path[7:]
            #trained_model = joblib.load(trained_model_path)
            trained_model = torch.load(trained_model_path,
                                       map_location=torch.device('cpu'))
            model_columns = self._dataframe_context.get_model_features()
            print("model_columns", model_columns)
            # The input may already be a pandas DataFrame; fall back if toPandas
            # is unavailable. NOTE(review): bare except also swallows unrelated errors.
            try:
                df = self._data_frame.toPandas()
            except:
                df = self._data_frame
            # pandas_df = MLUtils.factorize_columns(df,[x for x in categorical_columns if x != result_column])
            # One-hot encode categoricals, then align columns to the training features.
            pandas_df = MLUtils.create_dummy_columns(
                df, [x for x in categorical_columns if x != result_column])
            pandas_df = MLUtils.fill_missing_columns(pandas_df, model_columns,
                                                     result_column)
            if uid_col:
                pandas_df = pandas_df[[
                    x for x in pandas_df.columns if x != uid_col
                ]]
            # Stack columns into an (n_rows, n_features) array for the forward pass.
            test_df = np.stack(
                [pandas_df[col].values for col in pandas_df.columns], 1)
            tensored_test_df = torch.tensor(test_df, dtype=torch.float)
            outputs_test_df_tensored = trained_model(tensored_test_df.float())
            y_score_mid = outputs_test_df_tensored.tolist()
            # Model emits one value per row; take the single output element.
            y_score = [x[0] for x in y_score_mid]
            scoreKpiArray = MLUtils.get_scored_data_summary(y_score)
            kpiCard = NormalCard()
            kpiCardData = [KpiData(data=x) for x in scoreKpiArray]
            kpiCard.set_card_data(kpiCardData)
            # NOTE(review): "cente" spelling is the actual project API method name.
            kpiCard.set_cente_alignment(True)
            print(CommonUtils.convert_python_object_to_json(kpiCard))
            self._result_setter.set_kpi_card_regression_score(kpiCard)
            pandas_df[result_column] = y_score
            df[result_column] = y_score
            df.to_csv(score_data_path, header=True, index=False)
            CommonUtils.create_update_and_save_progress_message(
                self._dataframe_context,
                self._scriptWeightDict,
                self._scriptStages,
                self._slug,
                "predictionFinished",
                "info",
                display=True,
                emptyBin=False,
                customMsg=None,
                weightKey="total")
        # ---- downstream analysis on the scored data ----
        # NOTE(review): `df` is only bound in the sklearn branch; this section is
        # reconstructed at method level and would raise NameError under
        # self._mlEnv == "spark" — confirm intended nesting against the full file.
        print("STARTING Measure ANALYSIS ...")
        columns_to_keep = []
        columns_to_drop = []
        columns_to_keep = self._dataframe_context.get_score_consider_columns()
        if len(columns_to_keep) > 0:
            columns_to_drop = list(set(df.columns) - set(columns_to_keep))
        else:
            columns_to_drop += ["predicted_probability"]
        # Never drop the prediction column itself.
        columns_to_drop = [
            x for x in columns_to_drop
            if x in df.columns and x != result_column
        ]
        print("columns_to_drop", columns_to_drop)
        pandas_scored_df = df[list(set(columns_to_keep + [result_column]))]
        spark_scored_df = SQLctx.createDataFrame(pandas_scored_df)
        # spark_scored_df.write.csv(score_data_path+"/data",mode="overwrite",header=True)
        # TODO update metadata for the newly created dataframe
        self._dataframe_context.update_consider_columns(columns_to_keep)
        # NOTE(review): printSchema() prints itself and returns None, so this
        # prints the schema followed by "None".
        print(spark_scored_df.printSchema())
        df_helper = DataFrameHelper(spark_scored_df, self._dataframe_context,
                                    self._metaParser)
        df_helper.set_params()
        df = df_helper.get_data_frame()
        # self._dataframe_context.set_dont_send_message(True)
        # Best-effort descriptive statistics; failures are logged, not raised.
        try:
            fs = time.time()
            descr_stats_obj = DescriptiveStatsScript(
                df,
                df_helper,
                self._dataframe_context,
                self._result_setter,
                self._spark,
                self._prediction_narrative,
                scriptWeight=self._scriptWeightDict,
                analysisName="Descriptive analysis")
            descr_stats_obj.Run()
            print("DescriptiveStats Analysis Done in ", time.time() - fs,
                  " seconds.")
        except:
            print("Frequency Analysis Failed ")
        # try:
        #     fs = time.time()
        #     df_helper.fill_na_dimension_nulls()
        #     df = df_helper.get_data_frame()
        #     dt_reg = DecisionTreeRegressionScript(df, df_helper, self._dataframe_context, self._result_setter, self._spark,self._prediction_narrative,self._metaParser,scriptWeight=self._scriptWeightDict,analysisName="Predictive modeling")
        #     dt_reg.Run()
        #     print "DecisionTrees Analysis Done in ", time.time() - fs, " seconds."
        # except:
        #     print "DTREE FAILED"
        # Best-effort two-way ANOVA; failures are logged, not raised.
        try:
            fs = time.time()
            two_way_obj = TwoWayAnovaScript(
                df,
                df_helper,
                self._dataframe_context,
                self._result_setter,
                self._spark,
                self._prediction_narrative,
                self._metaParser,
                scriptWeight=self._scriptWeightDict,
                analysisName="Measure vs. Dimension")
            two_way_obj.Run()
            print("OneWayAnova Analysis Done in ", time.time() - fs,
                  " seconds.")
        except:
            print("Anova Analysis Failed")
def _generate_narratives(self): chisquare_result = self._chisquare_result target_dimension = self._target_dimension analysed_dimension = self._analysed_dimension significant_variables = self._significant_variables num_analysed_variables = self._num_analysed_variables table = self._chiSquareTable total = self._chiSquareTable.get_total() levels = self._chiSquareTable.get_column_two_levels() level_counts = self._chiSquareTable.get_column_total() levels_count_sum = sum(level_counts) levels_percentages = [ old_div(i * 100.0, levels_count_sum) for i in level_counts ] sorted_levels = sorted(zip(level_counts, levels), reverse=True) level_differences = [0.0] + [ sorted_levels[i][0] - sorted_levels[i + 1][0] for i in range(len(sorted_levels) - 1) ] top_dims = [ j for i, j in sorted_levels[:level_differences.index(max(level_differences))] ] top_dims_contribution = sum([ i for i, j in sorted_levels[:level_differences.index(max(level_differences))] ]) bottom_dim = sorted_levels[-1][1] bottom_dim_contribution = sorted_levels[-1][0] bottom_dims = [ y for x, y in sorted_levels if x == bottom_dim_contribution ] target_levels = self._chiSquareTable.get_column_one_levels() target_counts = self._chiSquareTable.get_row_total() sorted_target_levels = sorted(zip(target_counts, target_levels), reverse=True) top_target_count, top_target = sorted_target_levels[0] second_target_count, second_target = sorted_target_levels[1] top_target_contributions = [ table.get_value(top_target, i) for i in levels ] sum_top_target = sum(top_target_contributions) sorted_levels = sorted(zip(top_target_contributions, levels), reverse=True) level_differences = [0.0] + [ sorted_levels[i][0] - sorted_levels[i + 1][0] for i in range(len(sorted_levels) - 1) ] top_target_top_dims = [ j for i, j in sorted_levels[:level_differences.index(max(level_differences))] ] top_target_top_dims_contribution = sum([ i for i, j in sorted_levels[:level_differences.index(max(level_differences))] ]) top_target_bottom_dim = 
sorted_levels[-1][1] top_target_bottom_dim_contribution = sorted_levels[-1][0] top_target_percentages = [ old_div(i * 100.0, sum_top_target) for i in top_target_contributions ] best_top_target_index = top_target_contributions.index( max(top_target_contributions)) worst_top_target_index = top_target_contributions.index( min(top_target_contributions)) top_target_differences = [ x - y for x, y in zip(levels_percentages, top_target_percentages) ] if len(top_target_differences) > 6: tops = 2 bottoms = -2 elif len(top_target_differences) > 4: tops = 2 bottoms = -1 else: tops = 1 bottoms = -1 sorted_ = sorted(enumerate(top_target_differences), key=lambda x: x[1], reverse=True) best_top_difference_indices = [x for x, y in sorted_[:tops]] worst_top_difference_indices = [x for x, y in sorted_[bottoms:]] top_target_shares = [ old_div(x * 100.0, y) for x, y in zip(top_target_contributions, level_counts) ] max_top_target_shares = max(top_target_shares) best_top_target_share_index = [ idx for idx, val in enumerate(top_target_shares) if val == max_top_target_shares ] level_counts_threshold = old_div( sum(level_counts) * 0.05, len(level_counts)) min_top_target_shares = min([ x for x, y in zip(top_target_shares, level_counts) if y >= level_counts_threshold ]) if max_top_target_shares == min_top_target_shares: worst_top_target_share_index = [] else: worst_top_target_share_index = [ idx for idx, val in enumerate(top_target_shares) if val == min_top_target_shares ] overall_top_percentage = old_div(sum_top_target * 100.0, total) second_target_contributions = [ table.get_value(second_target, i) for i in levels ] sum_second_target = sum(second_target_contributions) sorted_levels = sorted(zip(second_target_contributions, levels), reverse=True) level_differences = [0.0] + [ sorted_levels[i][0] - sorted_levels[i + 1][0] for i in range(len(sorted_levels) - 1) ] second_target_top_dims = [ j for i, j in sorted_levels[:level_differences.index(max(level_differences))] ] 
second_target_top_dims_contribution = sum([ i for i, j in sorted_levels[:level_differences.index(max(level_differences))] ]) second_target_bottom_dim = sorted_levels[-1][1] second_target_bottom_dim_contribution = sorted_levels[-1][0] second_target_percentages = [ old_div(i * 100.0, sum_second_target) for i in second_target_contributions ] best_second_target_index = second_target_contributions.index( max(second_target_contributions)) worst_second_target_index = second_target_contributions.index( min(second_target_contributions)) second_target_differences = [ x - y for x, y in zip(levels_percentages, second_target_percentages) ] if len(second_target_differences) > 6: tops = 2 bottoms = -2 elif len(second_target_differences) > 4: tops = 2 bottoms = -1 else: tops = 1 bottoms = -1 sorted_ = sorted(enumerate(second_target_differences), key=lambda x: x[1], reverse=True) best_second_difference_indices = [x for x, y in sorted_[:tops]] worst_second_difference_indices = [x for x, y in sorted_[bottoms:]] second_target_shares = [ old_div(x * 100.0, y) for x, y in zip(second_target_contributions, level_counts) ] max_second_target_shares = max(second_target_shares) best_second_target_share_index = [ idx for idx, val in enumerate(second_target_shares) if val == max_second_target_shares ] level_counts_threshold = old_div( sum(level_counts) * 0.05, len(level_counts)) if min(second_target_shares) == 0: min_second_target_shares = min([ x for x, y in zip(second_target_shares, level_counts) if x != 0 ]) else: min_second_target_shares = min([ x for x, y in zip(second_target_shares, level_counts) if y >= level_counts_threshold ]) # worst_second_target_share_index = second_target_shares.index(min_second_target_shares) if max_second_target_shares == min_second_target_shares: worst_second_target_share_index = [] else: worst_second_target_share_index = [ idx for idx, val in enumerate(second_target_shares) if val == min_second_target_shares ] overall_second_percentage = 
old_div(sum_second_target * 100.0, total) targetCardDataDict = {} targetCardDataDict['target'] = target_dimension targetCardDataDict['colname'] = analysed_dimension targetCardDataDict['num_significant'] = len(significant_variables) targetCardDataDict['plural_colname'] = NarrativesUtils.pluralize( analysed_dimension) targetCardDataDict["blockSplitter"] = self._blockSplitter targetCardDataDict["binTargetCol"] = self._binTargetCol targetCardDataDict["binAnalyzedCol"] = self._binAnalyzedCol targetCardDataDict['highlightFlag'] = self._highlightFlag targetCardDataDict['levels'] = levels data_dict = {} data_dict[ 'best_second_difference'] = best_second_difference_indices ##these changed data_dict['worst_second_difference'] = worst_second_difference_indices data_dict['best_top_difference'] = best_top_difference_indices data_dict['worst_top_difference'] = worst_top_difference_indices data_dict['levels_percentages'] = levels_percentages data_dict['top_target_percentages'] = top_target_percentages data_dict['second_target_percentages'] = second_target_percentages data_dict['levels'] = levels data_dict['best_top_share'] = best_top_target_share_index data_dict['worst_top_share'] = worst_top_target_share_index data_dict['best_second_share'] = best_second_target_share_index data_dict['worst_second_share'] = worst_second_target_share_index data_dict['top_target_shares'] = top_target_shares data_dict['second_target_shares'] = second_target_shares data_dict['overall_second'] = overall_second_percentage data_dict['overall_top'] = overall_top_percentage data_dict['num_significant'] = len(significant_variables) data_dict['colname'] = analysed_dimension data_dict['plural_colname'] = NarrativesUtils.pluralize( analysed_dimension) data_dict['target'] = target_dimension data_dict['top_levels'] = top_dims data_dict['top_levels_percent'] = round( old_div(top_dims_contribution * 100.0, total), 1) data_dict['bottom_level'] = bottom_dim data_dict['bottom_levels'] = bottom_dims 
data_dict['bottom_level_percent'] = round( old_div(bottom_dim_contribution * 100, sum(level_counts)), 2) data_dict['second_target'] = second_target data_dict['second_target_top_dims'] = second_target_top_dims data_dict['second_target_top_dims_contribution'] = old_div( second_target_top_dims_contribution * 100.0, sum(second_target_contributions)) data_dict['second_target_bottom_dim'] = second_target_bottom_dim data_dict[ 'second_target_bottom_dim_contribution'] = second_target_bottom_dim_contribution data_dict['best_second_target'] = levels[best_second_target_index] data_dict['best_second_target_count'] = second_target_contributions[ best_second_target_index] data_dict['best_second_target_percent'] = round( old_div( second_target_contributions[best_second_target_index] * 100.0, sum(second_target_contributions)), 2) data_dict['worst_second_target'] = levels[worst_second_target_index] data_dict['worst_second_target_percent'] = round( old_div( second_target_contributions[worst_second_target_index] * 100.0, sum(second_target_contributions)), 2) data_dict['top_target'] = top_target data_dict['top_target_top_dims'] = top_target_top_dims data_dict['top_target_top_dims_contribution'] = old_div( top_target_top_dims_contribution * 100.0, sum(top_target_contributions)) data_dict['top_target_bottom_dim'] = top_target_bottom_dim data_dict[ 'top_target_bottom_dim_contribution'] = top_target_bottom_dim_contribution data_dict['best_top_target'] = levels[best_top_target_index] data_dict['best_top_target_count'] = top_target_contributions[ best_top_target_index] data_dict['best_top_target_percent'] = round( old_div(top_target_contributions[best_top_target_index] * 100.0, sum(top_target_contributions)), 2) data_dict['worst_top_target'] = levels[worst_top_target_index] data_dict['worst_top_target_percent'] = round( old_div(top_target_contributions[worst_top_target_index] * 100.0, sum(top_target_contributions)), 2) data_dict["blockSplitter"] = self._blockSplitter 
data_dict["binTargetCol"] = self._binTargetCol data_dict["binAnalyzedCol"] = self._binAnalyzedCol data_dict['highlightFlag'] = self._highlightFlag # print "_"*60 # print "DATA DICT - ", data_dict # print "_"*60 ############### # CARD1 # ############### print("self._binTargetCol & self._binAnalyzedCol : ", self._binTargetCol, self._binAnalyzedCol) if len(data_dict['worst_second_share']) == 0: output = NarrativesUtils.block_splitter( NarrativesUtils.get_template_output( self._base_dir, 'card1_binned_target_worst_second.html', data_dict), self._blockSplitter, highlightFlag=self._highlightFlag) else: if (self._binTargetCol == True & self._binAnalyzedCol == False): print("Only Target Column is Binned, : ", self._binTargetCol) output = NarrativesUtils.block_splitter( NarrativesUtils.get_template_output( self._base_dir, 'card1_binned_target.html', data_dict), self._blockSplitter, highlightFlag=self._highlightFlag) elif (self._binTargetCol == True & self._binAnalyzedCol == True): print("Target Column and IV is Binned : ", self._binTargetCol, self._binAnalyzedCol) output = NarrativesUtils.block_splitter( NarrativesUtils.get_template_output( self._base_dir, 'card1_binned_target_and_IV.html', data_dict), self._blockSplitter, highlightFlag=self._highlightFlag) else: output = NarrativesUtils.block_splitter( NarrativesUtils.get_template_output( self._base_dir, 'card1.html', data_dict), self._blockSplitter, highlightFlag=self._highlightFlag) targetDimCard1Data = [] targetDimcard1Heading = '<h3>Impact of ' + self._analysed_dimension + ' on ' + self._target_dimension + "</h3>" toggledata = ToggleData() targetDimTable1Data = self.generate_card1_table1() targetDimCard1Table1 = TableData() targetDimCard1Table1.set_table_type("heatMap") targetDimCard1Table1.set_table_data(targetDimTable1Data) toggledata.set_toggleon_data({ "data": { "tableData": targetDimTable1Data, "tableType": "heatMap" }, "dataType": "table" }) targetDimTable2Data = self.generate_card1_table2() targetDimCard1Table2 
= TableData() targetDimCard1Table2.set_table_type("normal") table2Data = targetDimTable2Data["data1"] table2Data = [ innerList[1:] for innerList in table2Data if innerList[0].strip() != "" ] targetDimCard1Table2.set_table_data(table2Data) toggledata.set_toggleoff_data({ "data": { "tableData": table2Data, "tableType": "heatMap" }, "dataType": "table" }) targetDimCard1Data.append(HtmlData(data=targetDimcard1Heading)) targetDimCard1Data.append(toggledata) targetDimCard1Data += output self._card1.set_card_data(targetDimCard1Data) self._card1.set_card_name("{}: Relationship with {}".format( self._analysed_dimension, self._target_dimension)) ############### # CARD2 # ############### if self._appid == None: key_factors = '' num_key_factors = len(self._second_level_dimensions) if len(self._second_level_dimensions) == 5: key_factors = ', '.join( self._second_level_dimensions[:4] ) + ' and ' + self._second_level_dimensions[4] elif len(self._second_level_dimensions) == 4: key_factors = ', '.join( self._second_level_dimensions[:3] ) + ' and ' + self._second_level_dimensions[3] elif len(self._second_level_dimensions) == 3: key_factors = ', '.join( self._second_level_dimensions[:2] ) + ' and ' + self._second_level_dimensions[2] elif len(self._second_level_dimensions) == 2: key_factors = ' and '.join(self._second_level_dimensions) elif len(self._second_level_dimensions) == 1: key_factors = self._second_level_dimensions[0] targetCardDataDict['num_key_factors'] = num_key_factors targetCardDataDict['key_factors'] = key_factors dict_for_test = {} for tupleObj in sorted_target_levels[:self._chiSquareLevelLimit]: targetLevel = tupleObj[1] targetCardDataDict['random_card2'] = random.randint(1, 100) targetCardDataDict['random_card4'] = random.randint(1, 100) second_target_contributions = [ table.get_value(targetLevel, i) for i in levels ] sum_second_target = sum(second_target_contributions) sorted_levels = sorted(zip(second_target_contributions, levels), reverse=True) level_differences = 
[0.0] + [ sorted_levels[i][0] - sorted_levels[i + 1][0] for i in range(len(sorted_levels) - 1) ] level_diff_index = level_differences.index( max(level_differences)) if level_differences.index( max(level_differences)) > 0 else len( level_differences ) ##added for pipeline keyerror issue second_target_top_dims = [ j for i, j in sorted_levels[:level_diff_index] ] second_target_top_dims_contribution = sum([ i for i, j in sorted_levels[:level_differences. index(max(level_differences))] ]) second_target_bottom_dim = sorted_levels[-1][1] second_target_bottom_dim_contribution = sorted_levels[-1][0] second_target_percentages = [ old_div(i * 100.0, sum_second_target) for i in second_target_contributions ] best_second_target_index = second_target_contributions.index( max(second_target_contributions)) worst_second_target_index = second_target_contributions.index( min(second_target_contributions)) second_target_differences = [ x - y for x, y in zip(levels_percentages, second_target_percentages) ] if len(second_target_differences) > 6: tops = 2 bottoms = -2 elif len(second_target_differences) > 4: tops = 2 bottoms = -1 else: tops = 1 bottoms = -1 sorted_ = sorted(enumerate(second_target_differences), key=lambda x: x[1], reverse=True) best_second_difference_indices = [x for x, y in sorted_[:tops]] worst_second_difference_indices = [ x for x, y in sorted_[bottoms:] ] second_target_shares = [ old_div(x * 100.0, y) for x, y in zip(second_target_contributions, level_counts) ] max_second_target_shares = max(second_target_shares) best_second_target_share_index = [ idx for idx, val in enumerate(second_target_shares) if val == max_second_target_shares ] level_counts_threshold = old_div( sum(level_counts) * 0.05, len(level_counts)) min_second_target_shares = min([ x for x, y in zip(second_target_shares, level_counts) if y >= level_counts_threshold ]) worst_second_target_share_index = [ idx for idx, val in enumerate(second_target_shares) if val == min_second_target_shares ] 
overall_second_percentage = old_div(sum_second_target * 100.0, total) # DataFrame for contribution calculation if self._pandas_flag: df_second_target = self._data_frame[( self._data_frame[self._target_dimension] == targetLevel ) & (self._data_frame[self._analysed_dimension] == second_target_top_dims[0])][ self._second_level_dimensions] df_second_dim = self._data_frame[( self._data_frame[self._analysed_dimension] == second_target_top_dims[0] )][self._second_level_dimensions] else: df_second_target = self._data_frame.filter(col(self._target_dimension)==targetLevel).\ filter(col(self._analysed_dimension)==second_target_top_dims[0]).\ select(self._second_level_dimensions).toPandas() df_second_dim = self._data_frame.filter(col(self._analysed_dimension)==second_target_top_dims[0]).\ select(self._second_level_dimensions).toPandas() # if self._chisquare_result.get_splits(): # splits = self._chisquare_result.get_splits() # idx = self._chiSquareTable.get_bin_names(splits).index(second_target_top_dims[0]) # idx1 = self._chiSquareTable.get_bin_names(splits).index(top_target_top_dims[0]) # splits[len(splits)-1] = splits[len(splits)-1]+1 # df_second_target = self._data_frame.filter(col(self._target_dimension)==targetLevel).\ # filter(col(self._analysed_dimension)>=splits[idx]).filter(col(self._analysed_dimension)<splits[idx+1]).\ # select(self._second_level_dimensions).toPandas() # df_second_dim = self._data_frame.filter(col(self._analysed_dimension)>=splits[idx]).\ # filter(col(self._analysed_dimension)<splits[idx+1]).\ # select(self._second_level_dimensions).toPandas() # else: # df_second_target = self._data_frame.filter(col(self._target_dimension)==targetLevel).\ # filter(col(self._analysed_dimension)==second_target_top_dims[0]).\ # select(self._second_level_dimensions).toPandas() # df_second_dim = self._data_frame.filter(col(self._analysed_dimension)==second_target_top_dims[0]).\ # select(self._second_level_dimensions).toPandas() # print 
self._data_frame.select('Sales').show() distribution_second = [] d_l = [] for d in self._second_level_dimensions: grouped = df_second_target.groupby(d).agg({d: 'count'}) contributions = df_second_dim.groupby(d).agg({d: 'count'}) contribution_index = list(contributions.index) contributions_val = contributions[d].tolist() contributions_list = dict( list(zip(contribution_index, contributions_val))) index_list = list(grouped.index) grouped_list = grouped[d].tolist() contributions_percent_list = [ round(old_div(y * 100.0, contributions_list[x]), 2) for x, y in zip(index_list, grouped_list) ] sum_ = grouped[d].sum() diffs = [0] + [ grouped_list[i] - grouped_list[i + 1] for i in range(len(grouped_list) - 1) ] max_diff = diffs.index(max(diffs)) grouped_dict = dict(list(zip(index_list, grouped_list))) for val in contribution_index: if val not in list(grouped_dict.keys()): grouped_dict[val] = 0 else: pass index_list = [] grouped_list = [] contributions_val = [] for key in list(grouped_dict.keys()): index_list.append(str(key)) grouped_list.append(grouped_dict[key]) contributions_val.append(contributions_list[key]) ''' print "="*70 print "GROUPED - ", grouped print "INDEX LIST - ", index_list print "GROUPED LIST - ", grouped_list print "GROUPED DICT - ", grouped_dict print "CONTRIBUTIONS - ", contributions print "CONTRIBUTION INDEX - ", contribution_index print "CONTRIBUTIONS VAL - ", contributions_val print "CONTRIBUTIONS LIST - ", contributions_list print "CONTRIBUTIONS PERCENT LIST - ", contributions_percent_list print "SUM - ", sum_ print "DIFFS - ", diffs print "MAX DIFF - ", max_diff print "="*70 ''' informative_dict = { "levels": index_list, "positive_class_contribution": grouped_list, "positive_plus_others": contributions_val } informative_df = pd.DataFrame(informative_dict) informative_df["percentage_horizontal"] = old_div( informative_df["positive_class_contribution"] * 100, informative_df["positive_plus_others"]) informative_df["percentage_vertical"] = old_div( 
informative_df["positive_class_contribution"] * 100, sum_) informative_df.sort_values(["percentage_vertical"], inplace=True, ascending=False) informative_df = informative_df.reset_index(drop=True) percentage_vertical_sorted = list( informative_df["percentage_vertical"]) percentage_horizontal_sorted = list( informative_df["percentage_horizontal"]) levels_sorted = list(informative_df["levels"]) differences_list = [] for i in range(1, len(percentage_vertical_sorted)): difference = percentage_vertical_sorted[ i - 1] - percentage_vertical_sorted[i] differences_list.append(round(difference, 2)) ''' print "-"*70 print "DIFFERENCES LIST - ", differences_list print "-"*70 ''' index_txt = '' if differences_list: if differences_list[0] >= 30: print("showing 1st case") index_txt = levels_sorted[0] max_diff_equivalent = 1 else: if len(differences_list) >= 2: if differences_list[1] >= 10: print("showing 1st and 2nd case") index_txt = levels_sorted[0] + '(' + str( round(percentage_vertical_sorted[0], 1) ) + '%)' + ' and ' + levels_sorted[ 1] + '(' + str( round( percentage_vertical_sorted[1], 1)) + '%)' max_diff_equivalent = 2 else: print("showing 3rd case") index_txt = 'including ' + levels_sorted[ 0] + '(' + str( round( percentage_vertical_sorted[0], 1) ) + '%)' + ' and ' + levels_sorted[ 1] + '(' + str( round( percentage_vertical_sorted[ 1], 1)) + '%)' max_diff_equivalent = 3 else: print("showing 3rd case") index_txt = 'including ' + levels_sorted[ 0] + '(' + str( round(percentage_vertical_sorted[0], 1) ) + '%)' + ' and ' + levels_sorted[ 1] + '(' + str( round( percentage_vertical_sorted[1], 1)) + '%)' max_diff_equivalent = 3 else: max_diff_equivalent = 0 ''' print "-"*70 print informative_df.head(25) print "-"*70 ''' distribution_second.append({ 'contributions': [ round(i, 2) for i in percentage_vertical_sorted[:max_diff_equivalent] ], 'levels': levels_sorted[:max_diff_equivalent], 'variation': random.randint(1, 100), 'index_txt': index_txt, 'd': d, 'contributions_percent': 
percentage_horizontal_sorted }) ''' print "DISTRIBUTION SECOND - ", distribution_second print "<>"*50 ''' targetCardDataDict['distribution_second'] = distribution_second targetCardDataDict['second_target'] = targetLevel targetCardDataDict[ 'second_target_top_dims'] = second_target_top_dims targetCardDataDict[ 'second_target_top_dims_contribution'] = old_div( second_target_top_dims_contribution * 100.0, sum(second_target_contributions)) targetCardDataDict[ 'second_target_bottom_dim'] = second_target_bottom_dim targetCardDataDict[ 'second_target_bottom_dim_contribution'] = second_target_bottom_dim_contribution targetCardDataDict['best_second_target'] = levels[ best_second_target_index] targetCardDataDict[ 'best_second_target_count'] = second_target_contributions[ best_second_target_index] targetCardDataDict['best_second_target_percent'] = round( old_div( second_target_contributions[best_second_target_index] * 100.0, sum(second_target_contributions)), 2) targetCardDataDict['worst_second_target'] = levels[ worst_second_target_index] targetCardDataDict['worst_second_target_percent'] = round( old_div( second_target_contributions[worst_second_target_index] * 100.0, sum(second_target_contributions)), 2) card2Data = [] targetLevelContributions = [ table.get_value(targetLevel, i) for i in levels ] impact_target_thershold = old_div( sum(targetLevelContributions) * 0.02, len(targetLevelContributions)) card2Heading = '<h3>Key Drivers of ' + self._target_dimension + ' (' + targetLevel + ')' + "</h3>" chart, bubble = self.generate_distribution_card_chart( targetLevel, targetLevelContributions, levels, level_counts, total, impact_target_thershold) card2ChartData = NormalChartData(data=chart["data"]) "rounding the chartdata values for key drivers tab inside table percentage(table data)" for d in card2ChartData.get_data(): d['percentage'] = round(d['percentage'], 2) d_l.append(d) card2ChartJson = ChartJson() card2ChartJson.set_data(d_l) card2ChartJson.set_chart_type("combination") 
card2ChartJson.set_types({ "total": "bar", "percentage": "line" }) card2ChartJson.set_legend({ "total": "# of " + targetLevel, "percentage": "% of " + targetLevel }) card2ChartJson.set_axes({ "x": "key", "y": "total", "y2": "percentage" }) card2ChartJson.set_label_text({ "x": " ", "y": "Count", "y2": "Percentage" }) print("self._binTargetCol & self._binAnalyzedCol : ", self._binTargetCol, self._binAnalyzedCol) if (self._binTargetCol == True & self._binAnalyzedCol == False): print("Only Target Column is Binned") output2 = NarrativesUtils.block_splitter( NarrativesUtils.get_template_output( self._base_dir, 'card2_binned_target.html', targetCardDataDict), self._blockSplitter) elif (self._binTargetCol == True & self._binAnalyzedCol == True): print("Target Column and IV is Binned") output2 = NarrativesUtils.block_splitter( NarrativesUtils.get_template_output( self._base_dir, 'card2_binned_target_and_IV.html', targetCardDataDict), self._blockSplitter) else: print("In Else, self._binTargetCol should be False : ", self._binTargetCol) output2 = NarrativesUtils.block_splitter( NarrativesUtils.get_template_output( self._base_dir, 'card2.html', targetCardDataDict), self._blockSplitter) card2Data.append(HtmlData(data=card2Heading)) statistical_info_array = [ ("Test Type", "Chi-Square"), ("Chi-Square statistic", str(round(self._chisquare_result.get_stat(), 3))), ("P-Value", str(round(self._chisquare_result.get_pvalue(), 3))), ("Inference", "Chi-squared analysis shows a significant association between {} (target) and {}." 
.format(self._target_dimension, self._analysed_dimension)) ] statistical_info_array = NarrativesUtils.statistical_info_array_formatter( statistical_info_array) card2Data.append( C3ChartData(data=card2ChartJson, info=statistical_info_array)) card2Data += output2 card2BubbleData = "<div class='col-md-6 col-xs-12'><h2 class='text-center'><span>{}</span><br /><small>{}</small></h2></div><div class='col-md-6 col-xs-12'><h2 class='text-center'><span>{}</span><br /><small>{}</small></h2></div>".format( bubble[0]["value"], bubble[0]["text"], bubble[1]["value"], bubble[1]["text"]) card2Data.append(HtmlData(data=card2BubbleData)) targetCard = NormalCard() targetCard.set_card_data(card2Data) targetCard.set_card_name("{} : Distribution of {}".format( self._analysed_dimension, targetLevel)) self._targetCards.append(targetCard) dict_for_test[targetLevel] = targetCardDataDict out = {'data_dict': data_dict, 'target_dict': dict_for_test} return out
def _generate_summary(self):
    """
    Build the "Predicting Key Drivers of <target>" card for the decision
    tree narrative: a per-level prediction-rule table, a probability-group
    summary, a donut chart of prediction counts, and a dropdown to switch
    target levels. The finished card is attached to self._decisionTreeNode.

    NOTE(review): Python 2 module (print statements; map() returns a list).
    Formatting reconstructed from a collapsed source — indentation of the
    table-width / uidTable branch near the end inferred; confirm against VCS.
    """
    data_dict = {}
    rules_dict = self._table  # presumably {target level -> list of crude rule strings} — confirm against caller
    data_dict["blockSplitter"] = self._blockSplitter
    data_dict["targetcol"] = self._colname
    groups = rules_dict.keys()  # NOTE(review): unused local
    # Rules with success probability >= 75% are grouped as "strong".
    probabilityCutoff = 75
    probabilityGroups = [{
        "probability": probabilityCutoff,
        "count": 0,
        "range": [probabilityCutoff, 100]
    }, {
        "probability": probabilityCutoff - 1,
        "count": 0,
        "range": [0, probabilityCutoff - 1]
    }]
    # First row acts as the table header.
    tableArray = [[
        "Prediction Rule", "Probability", "Prediction", "Freq", "group",
        "richRules"
    ]]
    dropdownData = []
    chartDict = {}
    targetLevel = self._dataframe_context.get_target_level_for_model()
    probabilityArrayAll = []
    self._completionStatus = self._dataframe_context.get_completion_status()
    progressMessage = CommonUtils.create_progress_message_object(
        self._analysisName,
        "custom",
        "info",
        "Generating Prediction rules",
        self._completionStatus,
        self._completionStatus,
        display=True)
    CommonUtils.save_progress_message(self._messageURL, progressMessage,
                                      ignore=False)
    self._dataframe_context.update_completion_status(self._completionStatus)
    # Show the modelled target level first, then the remaining levels.
    targetValues = [x for x in rules_dict.keys() if x == targetLevel
                    ] + [x for x in rules_dict.keys() if x != targetLevel]
    for idx, target in enumerate(targetValues):
        # Dropdown entry: the first (modelled) level is pre-selected;
        # scored-data stories prefix the target column name in the label.
        if idx == 0:
            if self._dataframe_context.get_story_on_scored_data() != True:
                dropdownData.append({
                    "displayName": target,
                    "name": target,
                    "selected": True,
                    "id": idx + 1
                })
            else:
                dropdownData.append({
                    "displayName": "{} : {}".format(self._colname, target),
                    "name": target,
                    "selected": True,
                    "id": idx + 1
                })
        else:
            if self._dataframe_context.get_story_on_scored_data() != True:
                dropdownData.append({
                    "displayName": target,
                    "name": target,
                    "selected": False,
                    "id": idx + 1
                })
            else:
                dropdownData.append({
                    "displayName": "{} : {}".format(self._colname, target),
                    "name": target,
                    "selected": False,
                    "id": idx + 1
                })
        rulesArray = rules_dict[target]
        probabilityArray = [
            round(x, 2) for x in self.success_percent[target]
        ]
        probabilityArrayAll += probabilityArray
        groupArray = [
            "strong" if x >= probabilityCutoff else "mixed"
            for x in probabilityArray
        ]
        # Accumulate how many rules fall into each probability bucket.
        for idx2, obj in enumerate(probabilityGroups):
            grpCount = len([
                x for x in probabilityArray
                if x >= obj["range"][0] and x <= obj["range"][1]
            ])
            obj["count"] += grpCount
            probabilityGroups[idx2] = obj
        predictionArray = [target] * len(rulesArray)
        freqArray = self.total_predictions[target]
        chartDict[target] = sum(freqArray)
        success = self.successful_predictions[target]
        success_percent = self.success_percent[target]
        richRulesArray = []
        crudeRuleArray = []
        analysisType = self._dataframe_context.get_analysis_type()
        targetCol = self._dataframe_context.get_result_column()
        # binFlag marks a target column that was binned via custom analysis.
        binFlag = False
        if self._dataframe_context.get_custom_analysis_details() != None:
            binnedColObj = [
                x["colName"]
                for x in self._dataframe_context.get_custom_analysis_details()
            ]
            if binnedColObj != None and targetCol in binnedColObj:
                binFlag = True
        # Render each crude rule into its rich (display) and cleaned forms.
        for idx2, crudeRule in enumerate(rulesArray):
            richRule, crudeRule = NarrativesUtils.generate_rules(
                self._colname,
                target,
                crudeRule,
                freqArray[idx2],
                success[idx2],
                success_percent[idx2],
                analysisType,
                binFlag=binFlag)
            richRulesArray.append(richRule)
            crudeRuleArray.append(crudeRule)
        # Humanize the probabilities (words for 0-9 via apnumber, digits
        # otherwise); Py2 map returns a list here.
        probabilityArray = map(
            lambda x: humanize.apnumber(x) + "%"
            if x >= 10 else str(int(x)) + "%", probabilityArray)
        targetArray = zip(crudeRuleArray, probabilityArray, predictionArray,
                          freqArray, groupArray, richRulesArray)
        targetArray = [list(x) for x in targetArray]
        tableArray += targetArray
    donutChartMaxLevel = 10
    if self._dataframe_context.get_story_on_scored_data() == True:
        # Scored data: donut counts predictions per probability range bucket.
        chartDict = {}
        probabilityRangeForChart = GLOBALSETTINGS.PROBABILITY_RANGE_FOR_DONUT_CHART
        chartDict = dict(
            zip(probabilityRangeForChart.keys(),
                [0] * len(probabilityRangeForChart)))
        for val in probabilityArrayAll:
            for grps, grpRange in probabilityRangeForChart.items():
                if val > grpRange[0] and val <= grpRange[1]:
                    chartDict[grps] = chartDict[grps] + 1
        chartDict = {k: v for k, v in chartDict.items() if v != 0}
    else:
        # Model story: donut shows total prediction count per target level.
        chartDict = dict([(k, sum(v))
                          for k, v in self.total_predictions.items()])
        chartDict = {k: v for k, v in chartDict.items() if v != 0}
    if len(chartDict) > donutChartMaxLevel:
        chartDict = NarrativesUtils.restructure_donut_chart_data(
            chartDict, nLevels=donutChartMaxLevel)
    chartData = NormalChartData([chartDict]).get_data()
    chartJson = ChartJson(data=chartData)
    chartJson.set_title(self._colname)
    chartJson.set_chart_type("donut")
    mainCardChart = C3ChartData(data=chartJson)
    mainCardChart.set_width_percent(45)
    dropdownDict = {
        "dataType": "dropdown",
        "label": "Showing prediction rules for",
        "data": dropdownData
    }
    data_dict["probabilityGroups"] = probabilityGroups
    if self._dataframe_context.get_story_on_scored_data() != True:
        maincardSummary = NarrativesUtils.get_template_output(
            self._base_dir, 'decisiontreesummary.html', data_dict)
    else:
        # Scored story: aggregate predicted level counts/percentages for
        # the score template ((prediction, freq) pairs from the table rows).
        predictedLevelcountArray = [(x[2], x[3]) for x in tableArray[1:]]
        predictedLevelCountDict = {}
        for val in predictedLevelcountArray:
            predictedLevelCountDict.setdefault(val[0], []).append(val[1])
        levelCountDict = {}
        for k, v in predictedLevelCountDict.items():
            levelCountDict[k] = sum(v)
        total = float(
            sum([x for x in levelCountDict.values() if x != None]))
        levelCountTuple = [{
            "name": k,
            "count": v,
            "percentage": round(v * 100 / total, 2)
        } for k, v in levelCountDict.items() if v != None]
        percentageArray = [x["percentage"] for x in levelCountTuple]
        percentageArray = NarrativesUtils.ret_smart_round(percentageArray)
        levelCountTuple = [{
            "name": obj["name"],
            "count": obj["count"],
            "percentage": str(percentageArray[idx]) + "%"
        } for idx, obj in enumerate(levelCountTuple)]
        data_dict["nlevel"] = len(levelCountDict)
        print "levelCountTuple", levelCountTuple
        print "levelCountDict", levelCountDict
        # topLevel is the modelled target level when it was predicted at
        # least once, otherwise the first predicted level.
        if targetLevel in levelCountDict:
            data_dict["topLevel"] = [
                x for x in levelCountTuple if x["name"] == targetLevel
            ][0]
            if len(levelCountTuple) > 1:
                data_dict["secondLevel"] = max(
                    [x for x in levelCountTuple if x["name"] != targetLevel],
                    key=lambda x: x["count"])
            else:
                data_dict["secondLevel"] = None
        else:
            data_dict["topLevel"] = levelCountTuple[0]
            if len(levelCountTuple) > 1:
                data_dict["secondLevel"] = levelCountTuple[1]
            else:
                data_dict["secondLevel"] = None
        print data_dict
        maincardSummary = NarrativesUtils.get_template_output(
            self._base_dir, 'decisiontreescore.html', data_dict)
    main_card = NormalCard()
    main_card_data = []
    main_card_narrative = NarrativesUtils.block_splitter(
        maincardSummary, self._blockSplitter)
    main_card_data += main_card_narrative
    main_card_data.append(mainCardChart)
    main_card_data.append(dropdownDict)
    main_card_table = TableData()
    if self._dataframe_context.get_story_on_scored_data() == True:
        main_card_table.set_table_width(75)
    main_card_table.set_table_data(tableArray)
    main_card_table.set_table_type("popupDecisionTreeTable")
    main_card_data.append(main_card_table)
    uidTable = self._result_setter.get_unique_identifier_table()
    if uidTable != None:
        main_card_data.append(uidTable)
    else:
        # No UID table alongside: let the rules table use the full width.
        main_card_table.set_table_width(100)
    main_card.set_card_data(main_card_data)
    main_card.set_card_name("Predicting Key Drivers of {}".format(
        self._colname))
    self._decisionTreeNode.add_a_card(main_card)
def _generate_narratives(self):
    """
    Build the ANOVA ("Performance") narrative tree.

    For every measure column in the ANOVA result, rank the significantly
    associated dimensions by effect size, render an overview card (template
    text + effect-size bar chart + statistical inference) and delegate the
    per-dimension cards to self._generate_dimension_narratives. Measures
    with no significant dimension get a simple "no influence" card.

    Fixes applied in review:
      * dead-store typo: `statistical_inferenc = ""` never initialised the
        variable actually read later; renamed to `statistical_inference`.
      * bare `except:` narrowed to `except Exception:`.
      * removed unused local `num_dimensions`.
    """
    try:
        nColsToUse = self._analysisDict[
            self._analysisName]["noOfColumnsToUse"]
    except Exception:
        # Analysis config may be absent; fall back to "use all columns".
        nColsToUse = None
    self._anovaNodes = NarrativesTree()
    self._anovaNodes.set_name("Performance")
    for measure_column in self._df_anova_result.get_measure_columns():
        measure_anova_result = self._df_anova_result.get_measure_result(
            measure_column)
        significant_dimensions_dict, insignificant_dimensions = \
            measure_anova_result.get_OneWayAnovaSignificantDimensions()
        # Significant dimensions sorted by decreasing effect size.
        significant_dimensions = [
            k for k, v in sorted(list(significant_dimensions_dict.items()),
                                 key=lambda x: -x[1])
        ]
        if nColsToUse != None:
            significant_dimensions = significant_dimensions[:nColsToUse]
        num_significant_dimensions = len(significant_dimensions)
        num_insignificant_dimensions = len(insignificant_dimensions)
        print("num_significant_dimensions", num_significant_dimensions)
        if num_significant_dimensions > 0:
            mainCard = NormalCard(name="Overview of Key Factors")
            data_c3 = []
            for sig_dim in significant_dimensions:
                data_c3.append({
                    'dimension': sig_dim,
                    'effect_size': float(significant_dimensions_dict[sig_dim])
                })
            self.narratives = {}
            self.narratives[AnovaNarratives.
                            KEY_HEADING] = "%s Performance Analysis" % (
                                measure_column, )
            self.narratives['main_card'] = {}
            self.narratives['cards'] = []
            self.narratives['main_card'][
                AnovaNarratives.
                KEY_SUBHEADING] = "Relationship between %s and other Dimensions" % (
                    measure_column)
            self.narratives['main_card'][AnovaNarratives.KEY_PARAGRAPH] = []
            data_dict = {
                'significant_dimensions': significant_dimensions,
                'insignificant_dimensions': insignificant_dimensions,
                'num_significant_dimensions': num_significant_dimensions,
                'num_insignificant_dimensions': num_insignificant_dimensions,
                'num_dimensions':
                num_significant_dimensions + num_insignificant_dimensions,
                'target': measure_column
            }
            output = {'header': ''}
            output['content'] = NarrativesUtils.get_template_output(
                self._base_dir, 'anova_template_1.html', data_dict)
            self.narratives['main_card'][
                AnovaNarratives.KEY_PARAGRAPH].append(output)
            output1 = {'header': ''}
            output1['content'] = NarrativesUtils.get_template_output(
                self._base_dir, 'anova_template_2.html', data_dict)
            lines = []
            lines += NarrativesUtils.block_splitter(output['content'],
                                                    self._blockSplitter)
            data_c3 = NormalChartData(data_c3)
            chart_data = data_c3.get_data()
            # Very small effect sizes are passed as strings so the axis
            # format helper can choose a suitable representation.
            chartDataValues = []
            effect_size_values = []
            for obj in chart_data:
                effect_size_values.append(obj["effect_size"])
            chart_data_min = min(effect_size_values)
            if chart_data_min < 0.00001:
                for obj in chart_data:
                    chartDataValues.append(str(obj["effect_size"]))
            else:
                for obj in chart_data:
                    chartDataValues.append(obj["effect_size"])
            chart_json = ChartJson(data=chart_data,
                                   axes={
                                       'x': 'dimension',
                                       'y': 'effect_size'
                                   },
                                   label_text={
                                       'x': '',
                                       'y': 'Effect Size (scaled exp values)'
                                   },
                                   chart_type='bar')
            chart_json.set_axis_rotation(True)
            chart_json.set_yaxis_number_format(
                NarrativesUtils.select_y_axis_format(chartDataValues))
            statistical_info_array = [
                ("Test Type", "ANOVA"),
                ("Effect Size", "ETA squared"),
                ("Max Effect Size", chart_data[0]["dimension"]),
                ("Min Effect Size", chart_data[-1]["dimension"]),
            ]
            # FIX: was `statistical_inferenc` (typo) — the misspelled name
            # was never read, leaving the real variable uninitialised in
            # intent (the if/elif/else below happens to cover all cases).
            statistical_inference = ""
            if len(chart_data) == 1:
                statistical_inference = "{} is the only variable that have significant association with the {} (Target) having an \
Effect size of {}".format(
                    chart_data[0]["dimension"],
                    self._dataframe_context.get_result_column(),
                    round(chart_data[0]["effect_size"], 4))
            elif len(chart_data) == 2:
                statistical_inference = "There are two variables ({} and {}) that have significant association with the {} (Target) and the \
Effect size ranges are {} and {} respectively".format(
                    chart_data[0]["dimension"], chart_data[1]["dimension"],
                    self._dataframe_context.get_result_column(),
                    round(chart_data[0]["effect_size"], 4),
                    round(chart_data[1]["effect_size"], 4))
            else:
                statistical_inference = "There are {} variables that have significant association with the {} (Target) and the \
Effect size ranges from {} to {}".format(
                    len(chart_data),
                    self._dataframe_context.get_result_column(),
                    round(chart_data[0]["effect_size"], 4),
                    round(chart_data[-1]["effect_size"], 4))
            if statistical_inference != "":
                statistical_info_array.append(
                    ("Inference", statistical_inference))
            statistical_info_array = NarrativesUtils.statistical_info_array_formatter(
                statistical_info_array)
            lines += [
                C3ChartData(data=chart_json, info=statistical_info_array)
            ]
            lines += NarrativesUtils.block_splitter(output1['content'],
                                                    self._blockSplitter)
            mainCard.set_card_data(lines)
            self._anovaNodes.add_a_card(mainCard)
            self.narratives['main_card'][
                AnovaNarratives.KEY_PARAGRAPH].append(output1)
            self.narratives['main_card'][AnovaNarratives.KEY_CHART] = {}
            effect_size_chart = {
                'heading': '',
                'labels': {
                    'Dimension': 'Effect Size'
                },
                'data': significant_dimensions_dict
            }
            print(significant_dimensions_dict)
            self.narratives['main_card'][AnovaNarratives.KEY_CHART][
                'effect_size'] = effect_size_chart
            progressMessage = CommonUtils.create_progress_message_object(
                self._analysisName,
                "custom",
                "info",
                "Analyzing Key Drivers",
                self._completionStatus,
                self._completionStatus,
                display=True)
            CommonUtils.save_progress_message(self._messageURL,
                                              progressMessage,
                                              ignore=False)
            self._generate_dimension_narratives(significant_dimensions,
                                                measure_anova_result,
                                                measure_column)
        else:
            mainCard = NormalCard(name="Overview of Key Factors")
            cardText = HtmlData(
                "There are no dimensions in the dataset that have significant influence on {}"
                .format(measure_column))
            mainCard.set_card_data([cardText])
            self._anovaNodes.add_a_card(mainCard)
def _generate_summary(self):
    """
    Build the "Predicting Key Drivers of <target>" card for this decision
    tree narrative variant: a prediction-rule table per target level, a
    probability-group summary, a donut chart of prediction counts, and a
    dropdown to switch levels. The card is attached to
    self._decisionTreeNode.

    Unlike the scored-data variant elsewhere in the project, dropdown/table
    entries here display only the part of the level name before ':'.

    NOTE(review): formatting reconstructed from a collapsed source.
    """
    data_dict = {}
    rules_dict = self._table  # presumably {target level -> list of crude rule strings} — confirm against caller
    data_dict["blockSplitter"] = self._blockSplitter
    data_dict["targetcol"] = self._colname
    groups = rules_dict.keys()  # NOTE(review): unused local
    # Rules with success probability >= 75% are grouped as "strong".
    probabilityCutoff = 75
    probabilityGroups = [{
        "probability": probabilityCutoff,
        "count": 0,
        "range": [probabilityCutoff, 100]
    }, {
        "probability": probabilityCutoff - 1,
        "count": 0,
        "range": [0, probabilityCutoff - 1]
    }]
    # First row acts as the table header.
    tableArray = [[
        "Prediction Rule", "Probability", "Prediction", "Freq", "group",
        "richRules"
    ]]
    dropdownData = []
    chartDict = {}
    self._completionStatus = self._dataframe_context.get_completion_status()
    progressMessage = CommonUtils.create_progress_message_object(
        self._analysisName,
        "custom",
        "info",
        "Generating Prediction rules",
        self._completionStatus,
        self._completionStatus,
        display=True)
    CommonUtils.save_progress_message(self._messageURL, progressMessage,
                                      ignore=False)
    for idx, target in enumerate(rules_dict.keys()):
        # Table/dropdown display name: portion of the level before ':'.
        targetToDisplayInTable = target.split(":")[0].strip()
        # First level is pre-selected in the dropdown.
        if idx == 0:
            dropdownData.append({
                "displayName": target,
                "name": targetToDisplayInTable,
                "searchTerm": targetToDisplayInTable,
                "selected": True,
                "id": idx + 1
            })
        else:
            dropdownData.append({
                "displayName": target,
                "name": targetToDisplayInTable,
                "searchTerm": targetToDisplayInTable,
                "selected": False,
                "id": idx + 1
            })
        rulesArray = rules_dict[target]
        probabilityArray = [
            round(x, 2) for x in self.success_percent[target]
        ]
        groupArray = [
            "strong" if x >= probabilityCutoff else "mixed"
            for x in probabilityArray
        ]
        # Accumulate how many rules fall into each probability bucket.
        for idx2, obj in enumerate(probabilityGroups):
            grpCount = len([
                x for x in probabilityArray
                if x >= obj["range"][0] and x <= obj["range"][1]
            ])
            obj["count"] += grpCount
            probabilityGroups[idx2] = obj
        predictionArray = [targetToDisplayInTable] * len(rulesArray)
        freqArray = self.total_predictions[target]
        chartDict[target] = sum(freqArray)
        success = self.successful_predictions[target]
        success_percent = self.success_percent[target]
        richRulesArray = []
        crudeRuleArray = []
        analysisType = self._dataframe_context.get_analysis_type()
        targetCol = self._dataframe_context.get_result_column()
        # binFlag marks a target column that was binned via custom analysis.
        binFlag = False
        if self._dataframe_context.get_custom_analysis_details() != None:
            binnedColObj = [
                x["colName"]
                for x in self._dataframe_context.get_custom_analysis_details()
            ]
            if binnedColObj != None and targetCol in binnedColObj:
                binFlag = True
        # Render each crude rule into its rich (display) and cleaned forms.
        for idx2, crudeRule in enumerate(rulesArray):
            richRule, crudeRule = NarrativesUtils.generate_rules(
                self._colname,
                target,
                crudeRule,
                freqArray[idx2],
                success[idx2],
                success_percent[idx2],
                analysisType,
                binFlag=binFlag)
            richRulesArray.append(richRule)
            crudeRuleArray.append(crudeRule)
        # Humanize probabilities (words for 0-9 via apnumber, digits
        # otherwise); Py2 map returns a list here.
        probabilityArray = map(
            lambda x: humanize.apnumber(x) + "%"
            if x >= 10 else str(int(x)) + "%", probabilityArray)
        targetArray = zip(crudeRuleArray, probabilityArray, predictionArray,
                          freqArray, groupArray, richRulesArray)
        targetArray = [list(x) for x in targetArray]
        tableArray += targetArray
    # Collapse the donut chart to at most 10 slices.
    donutChartMaxLevel = 10
    if len(chartDict) > donutChartMaxLevel:
        chartDict = NarrativesUtils.restructure_donut_chart_data(
            chartDict, nLevels=donutChartMaxLevel)
    chartData = NormalChartData([chartDict]).get_data()
    chartJson = ChartJson(data=chartData)
    chartJson.set_title(self._colname)
    chartJson.set_chart_type("donut")
    mainCardChart = C3ChartData(data=chartJson)
    mainCardChart.set_width_percent(45)
    dropdownDict = {
        "dataType": "dropdown",
        "label": "Showing prediction rules for",
        "data": dropdownData
    }
    data_dict["probabilityGroups"] = probabilityGroups
    maincardSummary = NarrativesUtils.get_template_output(
        self._base_dir, 'decisiontreesummary.html', data_dict)
    main_card = NormalCard()
    main_card_data = []
    main_card_narrative = NarrativesUtils.block_splitter(
        maincardSummary, self._blockSplitter)
    main_card_data += main_card_narrative
    main_card_data.append(mainCardChart)
    main_card_data.append(dropdownDict)
    main_card_table = TableData()
    main_card_table.set_table_data(tableArray)
    main_card_table.set_table_type("popupDecisionTreeTable")
    main_card_data.append(main_card_table)
    main_card.set_card_data(main_card_data)
    main_card.set_card_name("Predicting Key Drivers of {}".format(
        self._colname))
    self._decisionTreeNode.add_a_card(main_card)
class BusinessCard(object):
    """
    Build the "Business Impact" node of a story: estimates how many data
    points, queries, analyses and pages mAdvisor produced and how much
    analyst time it saved, then renders an overview card.

    Fixes applied in review: locals named `sum` no longer shadow the
    builtin; bare `except:` clauses narrowed to `except Exception:`;
    unused locals in Run() removed. Interface unchanged.

    Call order matters: set_params() must run before the summary getters;
    in particular get_number_analysis() populates number_analysis_dict,
    which get_number_queries()/get_time_analyst() read.
    """

    def __init__(self, story_result, meta_parser, result_setter,
                 dataframe_context, dataframe_helper, start_time,
                 analysis_type):
        self._story_result = story_result
        self._meta_parser = meta_parser
        self._result_setter = result_setter
        self._dataframe_context = dataframe_context
        self._dataframe_helper = dataframe_helper
        self.subheader = "Impact"
        self.business_card1 = NormalCard()
        self.business_card1.set_card_name("Overview")
        self.businessCardData = []
        self.start_time = start_time
        self.analysis_type = analysis_type

    def set_params(self):
        """Compute all impact metrics; must be called before rendering."""
        self.target_levels = self._dataframe_helper.get_num_unique_values(
            self._dataframe_context.get_result_column())
        self.number_variables = self.get_number_variables()
        self.number_measures = self.get_number_measures()
        self.number_dimensions = self.get_number_dimensions()
        if self.analysis_type == 'dimension':
            self.analysis_list = [
                "overview_rules", "association_summary", "association_rules",
                "prediction_rules"
            ]
        elif self.analysis_type == 'measure':
            self.analysis_list = [
                "overview_rules", "performance_summary", "performance_rules",
                "influencers_summary", "influencers_rules", "prediction_rules"
            ]
        self.data_points = self.get_number_data_points()
        self.number_charts = self.get_number_charts()
        self.number_prediction_rules = self.get_number_prediction_rules()
        self.number_pages = self.get_number_pages()
        self.number_analysis = self.get_number_analysis()
        self.number_queries = self.get_number_queries()
        self.time_mAdvisor = time.time() - self.start_time
        self.time_analyst = self.get_time_analyst()
        self.time_saved = self.get_time_saved()
        self.impact_on_productivity = self.get_impact_on_productivity()

    def get_number_charts(self):
        """Count charts by occurrences of "c3Chart" in the serialized story."""
        return json.dumps(self._story_result, indent=2).count("c3Chart")

    def get_number_analysis(self):
        """
        Estimate the number of analyses from the story tree and populate
        self.number_analysis_dict (read later by get_number_queries and
        get_time_analyst). Returns the total across analysis kinds.
        """
        if self.analysis_type == 'dimension':
            significant_variables_levels = {"None": 0}
            for each in self._story_result['listOfNodes']:
                try:
                    if each['name'] == 'Key Drivers':
                        for node in each['listOfNodes']:
                            # Dimensions contribute their level count,
                            # measures a flat 5.
                            significant_variables_levels[node['name']] = [
                                self._meta_parser.get_num_unique_values(
                                    node['name']) if node['name'] in
                                self._dataframe_helper.get_string_columns()
                                else 5
                            ][0]
                except Exception:
                    # Depth-keyed story layout: same lookup, skipping the
                    # 'maxdepth*' wrapper keys.
                    for key in each.keys():
                        if not key.startswith('maxdepth'):
                            if each['name'] == 'Key Drivers':
                                for node in each['listOfNodes']:
                                    significant_variables_levels[
                                        node['name']] = [
                                            self._meta_parser.
                                            get_num_unique_values(
                                                node['name'])
                                            if node['name'] in self.
                                            _dataframe_helper.
                                            get_string_columns() else 5
                                        ][0]
            self.number_analysis_dict = {}
            self.number_analysis_dict[
                "overview_rules"] = self.target_levels * 2
            self.number_analysis_dict['association_summary'] = (
                self.number_dimensions + self.number_measures) * 2
            self.number_analysis_dict["association_rules"] = sum(
                significant_variables_levels.values()) * 6
            self.number_analysis_dict[
                "prediction_rules"] = self.number_prediction_rules * 5
            return sum(self.number_analysis_dict.values())
        elif self.analysis_type == 'measure':
            significant_variables_levels = {"None": 0}
            for each in self._story_result['listOfNodes']:
                if each['name'] == 'Performance':
                    for node in each['listOfNodes']:
                        significant_variables_levels[node['name']] = [
                            self._dataframe_helper.get_num_unique_values(
                                node['name']) if node['name'] in
                            self._dataframe_helper.get_string_columns() else 5
                        ][0]
            self.number_analysis_dict = {}
            self.number_analysis_dict[
                "overview_rules"] = self.target_levels * 2
            self.number_analysis_dict["performance_summary"] = (
                self.number_dimensions + self.number_measures) * 2
            self.number_analysis_dict["performance_rules"] = sum(
                significant_variables_levels.values()) * 6
            self.number_analysis_dict[
                "prediction_rules"] = self.number_prediction_rules * 5
            self.number_analysis_dict[
                "influencers_summary"] = self.number_measures * 2
            self.number_analysis_dict["influencers_rules"] = 8
            return sum(self.number_analysis_dict.values())

    def get_number_queries(self):
        """Weighted query estimate; requires number_analysis_dict to be set."""
        if self.analysis_type == 'dimension':
            queries_per_analysis_dict = {
                "overview_rules": 15,
                "association_summary": 120,
                "association_rules": 600,
                "prediction_rules": 200
            }
        elif self.analysis_type == 'measure':
            queries_per_analysis_dict = {
                "overview_rules": 15,
                "performance_summary": 120,
                "performance_rules": 600,
                "influencers_summary": 100,
                "influencers_rules": 80,
                "prediction_rules": 200
            }
        total = 0  # FIX: was `sum`, shadowing the builtin
        for analysis in self.analysis_list:
            total += self.number_analysis_dict[
                analysis] * queries_per_analysis_dict[analysis]
        return total

    def get_number_prediction_rules(self):
        """Count rows of the Prediction node's rule table(s) in the story."""
        num_prediction_rules = 0
        for each_node in self._story_result['listOfNodes']:
            try:
                if each_node['name'] == 'Prediction':
                    for card in each_node['listOfCards'][0]['cardData']:
                        if card['dataType'] == 'table':
                            num_prediction_rules = len(
                                card['data']['tableData'])
            except Exception:
                # Depth-keyed story layout: sum tables across maxdepth3-5.
                for key in each_node.keys():
                    if key.startswith('maxdepth'):
                        if each_node['maxdepth3'][
                                'name'] == 'Prediction' or each_node[
                                    'maxdepth4'][
                                        'name'] == 'Prediction' or each_node[
                                            'maxdepth5'][
                                                'name'] == 'Prediction':
                            for Depth in range(3, 6):
                                for card in each_node['maxdepth' + str(
                                        Depth)]['listOfCards'][0]['cardData']:
                                    if card['dataType'] == 'table':
                                        num_prediction_rules += len(
                                            card['data']['tableData'])
        return num_prediction_rules

    def get_number_pages(self):
        """Count story pages (cards), including nested node cards."""
        total = 0  # FIX: was `sum`, shadowing the builtin
        for each in self._story_result['listOfNodes']:
            try:
                if each['listOfNodes']:
                    for items in each['listOfNodes']:
                        total += len(items['listOfCards'])
                    total += len(each['listOfCards'])
                else:
                    total += len(each['listOfCards'])
            except Exception:
                # Depth-keyed story layout.
                for key in each.keys():
                    if key.startswith('maxdepth'):
                        if each['maxdepth3']['listOfNodes'] or each[
                                'maxdepth4']['listOfNodes'] or each[
                                    'maxdepth5']['listOfNodes']:
                            for Depth in range(3, 6):
                                for items in each['maxdepth' +
                                                  str(Depth)]['listOfNodes']:
                                    total += len(items['maxdepth' +
                                                       str(Depth)]
                                                 ['listOfCards'])
                                total += len(each['maxdepth' +
                                                  str(Depth)]['listOfCards'])
                        else:
                            for Depth in range(3, 6):
                                total += len(each['maxdepth' +
                                                  str(Depth)]['listOfCards'])
        return total

    def get_number_data_points(self):
        """Rows x columns of the analysed dataset."""
        return self._meta_parser.get_num_rows(
        ) * self._meta_parser.get_num_columns()

    def get_number_variables(self):
        return self._meta_parser.get_num_columns()

    def get_number_dimensions(self):
        self.number_dimensions = len(
            self._dataframe_helper.get_string_columns())
        return self.number_dimensions

    def get_number_measures(self):
        self.number_measures = len(
            self._dataframe_helper.get_numeric_columns())
        return self.number_measures

    def get_time_analyst(self):
        """Estimated analyst seconds; requires number_analysis_dict."""
        if self.analysis_type == 'dimension':
            time_per_analysis_dict = {
                "overview_rules": 10,
                "association_summary": 120,
                "association_rules": 180,
                "prediction_rules": 300
            }
        elif self.analysis_type == 'measure':
            time_per_analysis_dict = {
                "overview_rules": 10,
                "performance_summary": 120,
                "performance_rules": 180,
                "influencers_summary": 120,
                "influencers_rules": 180,
                "prediction_rules": 300
            }
        total = 0  # FIX: was `sum`, shadowing the builtin
        for analysis in self.analysis_list:
            total += self.number_analysis_dict[
                analysis] * time_per_analysis_dict[analysis]
        return total

    def get_time_saved(self):
        '''
        Total Time Saved - 21 Hrs
        ( Productitvity Gain = Time taken by data scientist - time taken by mAdvisor)
        '''
        return self.time_analyst - self.time_mAdvisor

    def get_impact_on_productivity(self):
        '''
        Impact on Productivity - 3.5 X
        ( Impact on Productivity = Time taken by data scientist / time taken by mAdvisor)
        '''
        productivity = str(
            round(old_div(self.time_analyst, self.time_mAdvisor), 1)) + "X"
        return productivity

    def get_summary_data(self):
        """Append the key-metric DataBox to the card data."""
        summaryData = [{
            "name": "Total Data Points",
            "value": str(self.data_points)
        }, {
            "name": "Number of Queries",
            "value": str(self.number_queries)
        }, {
            "name": "Number of Analysis",
            "value": str(self.number_analysis)
        }, {
            "name": "Total Pages",
            "value": str(self.number_pages)
        }, {
            "name": "Total Time Saved",
            "value": CommonUtils.humanize_time(self.time_saved)
        }, {
            "name": "Impact on Productivity",
            "value": str(self.impact_on_productivity)
        }]
        summaryDataClass = DataBox(data=summaryData)
        self.businessCardData.append(summaryDataClass)

    def get_summary_para(self):
        """Append the narrative HTML paragraph comparing analyst vs mAdvisor time."""
        para_normal = """<blockquote><p>
        <b>Great Job !!!</b> You have analysed the dataset that contains {} variables after executing
        about <b>{}</b> analytics queries and <b>{}</b> Statistical and ML analysis in parallel.
        Using mAdvisor, you have completed the analysis within <b>{}</b> which would have required
        around <b>{}</b>.
        </p></blockquote>
        """.format(self.number_variables, self.number_queries,
                   self.number_analysis,
                   CommonUtils.humanize_time(self.time_mAdvisor),
                   CommonUtils.humanize_time(self.time_analyst))
        para_images = """<div class="col-md-6">
        <div class="d_analyst_block">
        <span class="d_analyst_img"></span>
        <h1 class="pull-left xs-mt-40 xs-ml-10">
        <small>Data Analyst <span class="bImpact_time_icon xs-ml-10"></span></small>
        <br>
        <small>{}</small>
        </h1>
        </div>
        </div>
        <div class="col-md-6">
        <div class="d_m_block">
        <span class="d_m_img"></span>
        <h1 class="pull-left xs-mt-40 xs-ml-10"><span class="bImpact_time_icon"></span><br>
        <small>{}</small>
        </h1>
        </div>
        </div>
        <div class="clearfix xs-m-50"></div>
        """.format(CommonUtils.humanize_time(self.time_analyst),
                   CommonUtils.humanize_time(self.time_mAdvisor))
        para_concatinated = """
        <div class="row">
        <div class="col-md-8 col-md-offset-2 xs-mt-20">
        {}{}
        </div>
        </div>
        """.format(para_images, para_normal)
        paraDataClass = HtmlData(data=para_concatinated)
        self.businessCardData.append(paraDataClass)

    def Run(self):
        """Compute metrics, build the Impact node and hand it to the result setter."""
        print("In Run of BusinessCard")
        self._businessImpactNode = NarrativesTree()
        self._businessImpactNode.set_name("Impact")
        self.set_params()
        self.get_summary_data()
        self.get_summary_para()
        self.business_card1.set_card_data(self.businessCardData)
        self._businessImpactNode.add_a_card(self.business_card1)
        self._result_setter.set_business_impact_node(self._businessImpactNode)