Пример #1
0
def submit_job_through_yarn():
    """Parse the job configuration from ``sys.argv[1]`` and run ``main``.

    Expects the first CLI argument to be a JSON document whose
    ``job_config.job_config`` section carries the job type/name and the
    job / message / kill URLs. If ``main`` raises, a kill command is
    posted to ``killURL`` (retried a bounded number of times until the
    service acknowledges) and a "Failed" progress message is recorded.
    """
    json_config = json.loads(sys.argv[1])
    configJson = json_config["job_config"]
    config = configJson["config"]
    jobConfig = configJson["job_config"]
    jobType = jobConfig["job_type"]
    jobName = jobConfig["job_name"]
    jobURL = jobConfig["job_url"]
    messageURL = jobConfig["message_url"]
    killURL = jobConfig["kill_url"]
    try:
        main(json_config["job_config"])
    except Exception as e:
        data = {"status": "killed", "jobURL": jobURL}
        resp = send_kill_command(killURL, data)
        # Retry until the service confirms the kill, but cap the attempts:
        # the original loop retried forever, so an endpoint that never
        # returns the success payload would spin this process indefinitely.
        attempts = 1
        max_attempts = 10
        while str(resp.text) != '{"result": "success"}' and attempts < max_attempts:
            resp = send_kill_command(killURL, data)
            attempts += 1
        print('Main Method Did Not End ....., ', str(e))
        progressMessage = CommonUtils.create_progress_message_object(
            "Main Method Did Not End .....", "Main Method Did Not End .....",
            "Error", str(e), "Failed", 100)
        CommonUtils.save_progress_message(messageURL,
                                          progressMessage,
                                          emptyBin=True)
Пример #2
0
    def __init__(self, data_frame, df_helper, df_context, meta_parser, spark):
        """Capture shared state from the analysis context, announce the
        regression-training start stage, and impute missing values."""
        self._data_frame = data_frame
        self._dataframe_helper = df_helper
        self._dataframe_context = df_context
        self._metaParser = meta_parser
        self._spark = spark

        # Pull everything this narrative needs out of the shared context.
        context = self._dataframe_context
        self._ignoreRegressionElasticityMessages = \
            context.get_ignore_msg_regression_elasticity()
        self._completionStatus = context.get_completion_status()
        self._analysisName = context.get_analysis_name()
        self._analysisDict = context.get_analysis_dict()
        self._messageURL = context.get_message_url()
        self._scriptWeightDict = context.get_measure_analysis_weight()

        # Stage name -> progress summary and relative weight for reporting.
        self._scriptStages = {
            "regressionTrainingStart": {
                "summary": "Started the Regression Script",
                "weight": 0,
            },
            "regressionTrainingEnd": {
                "summary": "Regression coefficients calculated",
                "weight": 10,
            },
        }

        startStage = self._scriptStages["regressionTrainingStart"]
        startMessage = CommonUtils.create_progress_message_object(
            self._analysisName,
            "regressionTrainingStart",
            "info",
            startStage["summary"],
            self._completionStatus,
            self._completionStatus)
        # Only publish when elasticity messages are not suppressed.
        if self._ignoreRegressionElasticityMessages != True:
            CommonUtils.save_progress_message(
                self._messageURL,
                startMessage,
                ignore=self._ignoreRegressionElasticityMessages)
            self._dataframe_context.update_completion_status(
                self._completionStatus)

        # Impute missing values up front so downstream steps see clean data.
        self._data_frame = self._dataframe_helper.fill_missing_values(
            self._data_frame)
Пример #3
0
    def __init__(self,
                 data_frame,
                 df_helper,
                 df_context,
                 scriptWeight=None,
                 analysisName=None):
        """Initialize descriptive-stats state and report the start stage.

        Parameters
        ----------
        data_frame : the data frame to analyse
        df_helper : helper object wrapping the data frame
        df_context : shared analysis context (completion status, message
            URL, analysis weights)
        scriptWeight : optional weight dict; falls back to the context's
            measure-analysis weights when omitted
        analysisName : optional analysis name; falls back to the context's
            current analysis name when omitted
        """
        self._data_frame = data_frame
        self._dataframe_helper = df_helper
        self._dataframe_context = df_context

        self._completionStatus = self._dataframe_context.get_completion_status(
        )
        self._messageURL = self._dataframe_context.get_message_url()
        # PEP 8: compare to None with `is`, not `==`.
        if analysisName is None:
            self._analysisName = self._dataframe_context.get_analysis_name()
        else:
            self._analysisName = analysisName
        if scriptWeight is None:
            self._scriptWeightDict = self._dataframe_context.get_measure_analysis_weight(
            )
        else:
            self._scriptWeightDict = scriptWeight
        # Stage name -> progress summary and relative weight for reporting.
        self._scriptStages = {
            "statCalculationStart": {
                "summary": "Initialized the Descriptive Stats Scripts",
                "weight": 0
            },
            "statCalculationEnd": {
                "summary": "Descriptive Stats Calculated",
                "weight": 10
            },
        }
        progressMessage = CommonUtils.create_progress_message_object(
            self._analysisName,
            "statCalculationStart",
            "info",
            self._scriptStages["statCalculationStart"]["summary"],
            self._completionStatus,
            self._completionStatus)
        CommonUtils.save_progress_message(self._messageURL, progressMessage)
        self._dataframe_context.update_completion_status(
            self._completionStatus)
Пример #4
0
    def Predict(self):
        """Score the input data with the trained Random Forest model.

        Loads the persisted sklearn model, one-hot encodes and aligns the
        features with the training columns, predicts class and probability
        per row, writes the scored CSV, publishes progress messages, and
        finally runs the decision-tree narrative — or builds a fallback
        summary card when fewer than two classes were predicted.
        """
        self._scriptWeightDict = self._dataframe_context.get_ml_model_prediction_weight(
        )
        # Stage name -> progress summary and relative weight for reporting.
        self._scriptStages = {
            "initialization": {
                "summary": "Initialized the Random Forest Scripts",
                "weight": 2
            },
            "prediction": {
                "summary": "Random Forest Model Prediction Finished",
                "weight": 2
            },
            "frequency": {
                "summary": "descriptive analysis finished",
                "weight": 2
            },
            "chisquare": {
                "summary": "chi Square analysis finished",
                "weight": 4
            },
            "completion": {
                "summary": "all analysis finished",
                "weight": 4
            },
        }

        # Publish the "initialization" progress stage.
        self._completionStatus += old_div(
            self._scriptWeightDict[self._analysisName]["total"] *
            self._scriptStages["initialization"]["weight"], 10)
        progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\
                                    "initialization",\
                                    "info",\
                                    self._scriptStages["initialization"]["summary"],\
                                    self._completionStatus,\
                                    self._completionStatus)
        CommonUtils.save_progress_message(self._messageURL,
                                          progressMessage,
                                          ignore=self._ignoreMsg)
        self._dataframe_context.update_completion_status(
            self._completionStatus)
        # Match with the level_counts and then clean the data
        dataSanity = True
        level_counts_train = self._dataframe_context.get_level_count_dict()
        cat_cols = self._dataframe_helper.get_string_columns()
        # level_counts_score = CommonUtils.get_level_count_dict(self._data_frame,cat_cols,self._dataframe_context.get_column_separator(),output_type="dict")
        # if level_counts_train != {}:
        #     for key in level_counts_train:
        #         if key in level_counts_score:
        #             if level_counts_train[key] != level_counts_score[key]:
        #                 dataSanity = False
        #         else:
        #             dataSanity = False
        # Drop the uid column (when it is an ignored suggestion) and all
        # date columns from the categorical feature set.
        categorical_columns = self._dataframe_helper.get_string_columns()
        uid_col = self._dataframe_context.get_uid_column()
        if self._metaParser.check_column_isin_ignored_suggestion(uid_col):
            categorical_columns = list(set(categorical_columns) - {uid_col})
        allDateCols = self._dataframe_context.get_date_columns()
        categorical_columns = list(set(categorical_columns) - set(allDateCols))
        numerical_columns = self._dataframe_helper.get_numeric_columns()
        result_column = self._dataframe_context.get_result_column()
        test_data_path = self._dataframe_context.get_input_file()

        # NOTE(review): only the sklearn branch binds `df` and `score`; if
        # self._mlEnv == "spark" the code after this if/elif raises
        # NameError — confirm the spark path is unreachable here.
        if self._mlEnv == "spark":
            pass
        elif self._mlEnv == "sklearn":

            # Resolve local filesystem paths, stripping a leading
            # "file://" scheme (7 characters) when present.
            score_data_path = self._dataframe_context.get_score_path(
            ) + "/data.csv"
            if score_data_path.startswith("file"):
                score_data_path = score_data_path[7:]
            trained_model_path = self._dataframe_context.get_model_path()
            trained_model_path += "/" + self._dataframe_context.get_model_for_scoring(
            ) + ".pkl"
            if trained_model_path.startswith("file"):
                trained_model_path = trained_model_path[7:]
            score_summary_path = self._dataframe_context.get_score_path(
            ) + "/Summary/summary.json"
            if score_summary_path.startswith("file"):
                score_summary_path = score_summary_path[7:]
            trained_model = joblib.load(trained_model_path)
            # pandas_df = self._data_frame.toPandas()
            df = self._data_frame.toPandas()
            model_columns = self._dataframe_context.get_model_features()
            # One-hot encode categoricals and align the frame with the
            # exact feature columns the model was trained on.
            pandas_df = MLUtils.create_dummy_columns(
                df, [x for x in categorical_columns if x != result_column])
            pandas_df = MLUtils.fill_missing_columns(pandas_df, model_columns,
                                                     result_column)
            if uid_col:
                pandas_df = pandas_df[[
                    x for x in pandas_df.columns if x != uid_col
                ]]
            y_score = trained_model.predict(pandas_df)
            y_prob = trained_model.predict_proba(pandas_df)
            y_prob = MLUtils.calculate_predicted_probability(y_prob)
            y_prob = list([round(x, 2) for x in y_prob])
            score = {
                "predicted_class": y_score,
                "predicted_probability": y_prob
            }

        # Map predicted labels back through the label map, attach
        # probabilities, and write the scored frame out as CSV with the
        # prediction stored under the original result column name.
        df["predicted_class"] = score["predicted_class"]
        labelMappingDict = self._dataframe_context.get_label_map()
        df["predicted_class"] = df["predicted_class"].apply(
            lambda x: labelMappingDict[x] if x != None else "NA")
        df["predicted_probability"] = score["predicted_probability"]
        self._score_summary[
            "prediction_split"] = MLUtils.calculate_scored_probability_stats(
                df)
        self._score_summary["result_column"] = result_column
        if result_column in df.columns:
            df.drop(result_column, axis=1, inplace=True)
        df = df.rename(index=str, columns={"predicted_class": result_column})
        df.to_csv(score_data_path, header=True, index=False)
        # Build a per-class "top 5 by probability" table keyed on the uid
        # column (falling back to the first suggested uid column).
        uidCol = self._dataframe_context.get_uid_column()
        if uidCol == None:
            uidCols = self._metaParser.get_suggested_uid_columns()
            if len(uidCols) > 0:
                uidCol = uidCols[0]
        uidTableData = []
        predictedClasses = list(df[result_column].unique())
        if uidCol:
            if uidCol in df.columns:
                for level in predictedClasses:
                    levelDf = df[df[result_column] == level]
                    levelDf = levelDf[[
                        uidCol, "predicted_probability", result_column
                    ]]
                    levelDf.sort_values(by="predicted_probability",
                                        ascending=False,
                                        inplace=True)
                    # Format probability as a percentage; spell out values
                    # >= 10% with humanize, keep smaller ones numeric.
                    levelDf["predicted_probability"] = levelDf[
                        "predicted_probability"].apply(
                            lambda x: humanize.apnumber(x * 100) + "%"
                            if x * 100 >= 10 else str(int(x * 100)) + "%")
                    uidTableData.append(levelDf[:5])
                uidTableData = pd.concat(uidTableData)
                uidTableData = [list(arr) for arr in list(uidTableData.values)]
                uidTableData = [[uidCol, "Probability", result_column]
                                ] + uidTableData
                uidTable = TableData()
                uidTable.set_table_width(25)
                uidTable.set_table_data(uidTableData)
                uidTable.set_table_type("normalHideColumn")
                self._result_setter.set_unique_identifier_table(
                    json.loads(
                        CommonUtils.convert_python_object_to_json(uidTable)))

        # Publish the "prediction" progress stage.
        self._completionStatus += old_div(
            self._scriptWeightDict[self._analysisName]["total"] *
            self._scriptStages["prediction"]["weight"], 10)
        progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\
                                    "prediction",\
                                    "info",\
                                    self._scriptStages["prediction"]["summary"],\
                                    self._completionStatus,\
                                    self._completionStatus)
        CommonUtils.save_progress_message(self._messageURL,
                                          progressMessage,
                                          ignore=self._ignoreMsg)
        self._dataframe_context.update_completion_status(
            self._completionStatus)
        # CommonUtils.write_to_file(score_summary_path,json.dumps({"scoreSummary":self._score_summary}))

        print("STARTING DIMENSION ANALYSIS ...")
        columns_to_keep = []
        columns_to_drop = []

        # considercolumnstype = self._dataframe_context.get_score_consider_columns_type()
        # considercolumns = self._dataframe_context.get_score_consider_columns()
        # if considercolumnstype != None:
        #     if considercolumns != None:
        #         if considercolumnstype == ["excluding"]:
        #             columns_to_drop = considercolumns
        #         elif considercolumnstype == ["including"]:
        #             columns_to_keep = considercolumns

        # Restrict the frame to the configured "consider" columns (never
        # dropping the result column itself).
        columns_to_keep = self._dataframe_context.get_score_consider_columns()
        if len(columns_to_keep) > 0:
            columns_to_drop = list(set(df.columns) - set(columns_to_keep))
        else:
            columns_to_drop += ["predicted_probability"]
        columns_to_drop = [
            x for x in columns_to_drop
            if x in df.columns and x != result_column
        ]
        print("columns_to_drop", columns_to_drop)
        df.drop(columns_to_drop, axis=1, inplace=True)

        # Refresh level-count metadata for the now-predicted result column.
        resultColLevelCount = dict(df[result_column].value_counts())
        # self._metaParser.update_level_counts(result_column,resultColLevelCount)
        self._metaParser.update_column_dict(
            result_column, {
                "LevelCount": resultColLevelCount,
                "numberOfUniqueValues": len(list(resultColLevelCount.keys()))
            })
        self._dataframe_context.set_story_on_scored_data(True)
        # Move the scored pandas frame back into Spark for the narrative
        # steps below.
        SQLctx = SQLContext(sparkContext=self._spark.sparkContext,
                            sparkSession=self._spark)
        spark_scored_df = SQLctx.createDataFrame(df)
        # spark_scored_df.write.csv(score_data_path+"/data",mode="overwrite",header=True)
        # TODO update metadata for the newly created dataframe
        self._dataframe_context.update_consider_columns(columns_to_keep)
        df_helper = DataFrameHelper(spark_scored_df, self._dataframe_context,
                                    self._metaParser)
        df_helper.set_params()
        spark_scored_df = df_helper.get_data_frame()
        # try:
        #     fs = time.time()
        #     narratives_file = self._dataframe_context.get_score_path()+"/narratives/FreqDimension/data.json"
        #     if narratives_file.startswith("file"):
        #         narratives_file = narratives_file[7:]
        #     result_file = self._dataframe_context.get_score_path()+"/results/FreqDimension/data.json"
        #     if result_file.startswith("file"):
        #         result_file = result_file[7:]
        #     init_freq_dim = FreqDimensions(df, df_helper, self._dataframe_context,scriptWeight=self._scriptWeightDict,analysisName=self._analysisName)
        #     df_freq_dimension_obj = init_freq_dim.test_all(dimension_columns=[result_column])
        #     df_freq_dimension_result = CommonUtils.as_dict(df_freq_dimension_obj)
        #     narratives_obj = DimensionColumnNarrative(result_column, df_helper, self._dataframe_context, df_freq_dimension_obj,self._result_setter,self._prediction_narrative,scriptWeight=self._scriptWeightDict,analysisName=self._analysisName)
        #     narratives = CommonUtils.as_dict(narratives_obj)
        #
        #     print "Frequency Analysis Done in ", time.time() - fs,  " seconds."
        #     self._completionStatus += self._scriptWeightDict[self._analysisName]["total"]*self._scriptStages["frequency"]["weight"]/10
        #     progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\
        #                                 "frequency",\
        #                                 "info",\
        #                                 self._scriptStages["frequency"]["summary"],\
        #                                 self._completionStatus,\
        #                                 self._completionStatus)
        #     CommonUtils.save_progress_message(self._messageURL,progressMessage,ignore=self._ignoreMsg)
        #     self._dataframe_context.update_completion_status(self._completionStatus)
        #     print "Frequency ",self._completionStatus
        # except:
        #     print "Frequency Analysis Failed "
        #
        # try:
        #     fs = time.time()
        #     narratives_file = self._dataframe_context.get_score_path()+"/narratives/ChiSquare/data.json"
        #     if narratives_file.startswith("file"):
        #         narratives_file = narratives_file[7:]
        #     result_file = self._dataframe_context.get_score_path()+"/results/ChiSquare/data.json"
        #     if result_file.startswith("file"):
        #         result_file = result_file[7:]
        #     init_chisquare_obj = ChiSquare(df, df_helper, self._dataframe_context,scriptWeight=self._scriptWeightDict,analysisName=self._analysisName)
        #     df_chisquare_obj = init_chisquare_obj.test_all(dimension_columns= [result_column])
        #     df_chisquare_result = CommonUtils.as_dict(df_chisquare_obj)
        #     chisquare_narratives = CommonUtils.as_dict(ChiSquareNarratives(df_helper, df_chisquare_obj, self._dataframe_context,df,self._prediction_narrative,self._result_setter,scriptWeight=self._scriptWeightDict,analysisName=self._analysisName))
        # except:
        #     print "ChiSquare Analysis Failed "
        # With >= 2 predicted classes run the decision-tree narrative;
        # otherwise build a fallback donut/summary card instead.
        if len(predictedClasses) >= 2:
            try:
                fs = time.time()
                df_decision_tree_obj = DecisionTrees(
                    spark_scored_df,
                    df_helper,
                    self._dataframe_context,
                    self._spark,
                    self._metaParser,
                    scriptWeight=self._scriptWeightDict,
                    analysisName=self._analysisName).test_all(
                        dimension_columns=[result_column])
                narratives_obj = CommonUtils.as_dict(
                    DecisionTreeNarrative(result_column,
                                          df_decision_tree_obj,
                                          self._dataframe_helper,
                                          self._dataframe_context,
                                          self._metaParser,
                                          self._result_setter,
                                          story_narrative=None,
                                          analysisName=self._analysisName,
                                          scriptWeight=self._scriptWeightDict))
                print(narratives_obj)
            except:
                print("DecisionTree Analysis Failed ")
        else:
            data_dict = {
                "npred": len(predictedClasses),
                "nactual": len(list(labelMappingDict.values()))
            }
            # NOTE(review): `levelCountDict` is read before any assignment in
            # this branch when nactual > 2 (NameError) — it likely needs to
            # be initialized to {} first; confirm intent.
            if data_dict["nactual"] > 2:
                levelCountDict[predictedClasses[0]] = resultColLevelCount[
                    predictedClasses[0]]
                levelCountDict["Others"] = sum([
                    v for k, v in list(resultColLevelCount.items())
                    if k != predictedClasses[0]
                ])
            else:
                levelCountDict = resultColLevelCount
                # Register the never-predicted class with a zero count so
                # the chart still shows both labels.
                otherClass = list(
                    set(labelMappingDict.values()) - set(predictedClasses))[0]
                levelCountDict[otherClass] = 0

                print(levelCountDict)

            total = float(
                sum([x for x in list(levelCountDict.values()) if x != None]))
            # Per-class name/count/percentage triples, sorted by count desc.
            levelCountTuple = [({
                "name":
                k,
                "count":
                v,
                "percentage":
                humanize.apnumber(old_div(v * 100, total)) +
                "%" if old_div(v * 100, total) >= 10 else
                str(int(old_div(v * 100, total))) + "%"
            }) for k, v in list(levelCountDict.items()) if v != None]
            levelCountTuple = sorted(levelCountTuple,
                                     key=lambda x: x["count"],
                                     reverse=True)
            data_dict["blockSplitter"] = "|~NEWBLOCK~|"
            data_dict["targetcol"] = result_column
            data_dict["nlevel"] = len(list(levelCountDict.keys()))
            data_dict["topLevel"] = levelCountTuple[0]
            data_dict["secondLevel"] = levelCountTuple[1]
            maincardSummary = NarrativesUtils.get_template_output(
                "/apps/", 'scorewithoutdtree.html', data_dict)

            # Assemble the fallback card: narrative blocks + donut chart
            # (+ uid table when one was built above).
            main_card = NormalCard()
            main_card_data = []
            main_card_narrative = NarrativesUtils.block_splitter(
                maincardSummary, "|~NEWBLOCK~|")
            main_card_data += main_card_narrative

            chartData = NormalChartData([levelCountDict]).get_data()
            chartJson = ChartJson(data=chartData)
            chartJson.set_title(result_column)
            chartJson.set_chart_type("donut")
            mainCardChart = C3ChartData(data=chartJson)
            mainCardChart.set_width_percent(33)
            main_card_data.append(mainCardChart)

            uidTable = self._result_setter.get_unique_identifier_table()
            if uidTable != None:
                main_card_data.append(uidTable)
            main_card.set_card_data(main_card_data)
            main_card.set_card_name(
                "Predicting Key Drivers of {}".format(result_column))
            self._result_setter.set_score_dtree_cards([main_card], {})
Пример #5
0
    def __init__(self, df_helper, df_context, result_setter, spark,
                 story_narrative, meta_parser):
        self._story_narrative = story_narrative
        self._result_setter = result_setter
        self._spark = spark
        self._dataframe_helper = df_helper
        self._dataframe_context = df_context
        self._pandas_flag = df_context._pandas_flag
        self._data_frame = df_helper.get_data_frame()
        self._num_significant_digits = NarrativesUtils.get_significant_digit_settings(
            "trend")
        self._metaParser = meta_parser

        self._result_column = self._dataframe_context.get_result_column()
        self._string_columns = self._dataframe_helper.get_string_columns()
        self._timestamp_columns = self._dataframe_helper.get_timestamp_columns(
        )

        # self._selected_date_columns = None
        self._selected_date_columns = self._dataframe_context.get_selected_date_columns(
        )
        self._all_date_columns = self._dataframe_context.get_date_columns()
        self._string_columns = list(
            set(self._string_columns) - set(self._all_date_columns))

        self._dateFormatDetected = False
        self._existingDateFormat = None
        self._dateFormatConversionDict = NarrativesUtils.date_formats_mapping_dict(
        )
        self._dateColumnFormatDict = df_context.get_date_format_dict()
        if self._dataframe_context.get_requested_date_format() != None:
            self._requestedDateFormat = df_context.get_requested_date_format()
        else:
            self._requestedDateFormat = None

        self._analysistype = self._dataframe_context.get_analysis_type()
        self._trendSettings = self._dataframe_context.get_trend_settings()
        self._trendSpecificMeasure = False
        if self._trendSettings != None:
            if self._analysistype == "dimension" and self._trendSettings[
                    "name"] != "Count":
                self._trendSpecificMeasure = True
                self._analysistype = "measure"
                self._result_column = self._trendSettings["selectedMeasure"]
            elif self._analysistype == "measure" and self._trendSettings[
                    "name"] != "Count":
                self._result_column = self._trendSettings["selectedMeasure"]

        self._trend_subsection = self._result_setter.get_trend_section_name()
        self._regression_trend_card = None
        self._blockSplitter = GLOBALSETTINGS.BLOCKSPLITTER
        self._highlightFlag = "|~HIGHLIGHT~|"
        self._trend_on_td_column = False
        self._number_of_dimensions_to_consider = 10

        self._completionStatus = self._dataframe_context.get_completion_status(
        )
        self._analysisName = self._dataframe_context.get_analysis_name()
        self._messageURL = self._dataframe_context.get_message_url()
        if self._analysistype == "dimension":
            self._scriptWeightDict = self._dataframe_context.get_dimension_analysis_weight(
            )
            self._scriptStages = {
                "initialization": {
                    "summary": "Initialized The Frequency Narratives",
                    "weight": 0
                },
                "summarygeneration": {
                    "summary": "Summary Generation Finished",
                    "weight": 4
                },
                "completion": {
                    "summary": "Frequency Stats Narratives Done",
                    "weight": 0
                },
            }
        elif self._analysistype == "measure":
            if self._trendSpecificMeasure:
                self._scriptWeightDict = self._dataframe_context.get_dimension_analysis_weight(
                )
            else:
                self._scriptWeightDict = self._dataframe_context.get_measure_analysis_weight(
                )
            self._scriptStages = {
                "trendNarrativeStart": {
                    "summary": "Started The Descriptive Stats Narratives",
                    "weight": 1
                },
                "trendNarrativeEnd": {
                    "summary": "Narratives For Descriptive Stats Finished",
                    "weight": 0
                },
            }

        self._base_dir = "/trend/"
        if self._pandas_flag and self._selected_date_columns and not self._dateColumnFormatDict and not self._timestamp_columns:
            for column in self._selected_date_columns:
                uniqueVals = self._data_frame[column].astype(
                    str).unique().tolist()
                metaHelperInstance = MetaDataHelper(self._data_frame,
                                                    self._data_frame.shape[0])
                if len(uniqueVals
                       ) > 0 and metaHelperInstance.get_datetime_format_pandas(
                           [
                               self._data_frame.sort_values(
                                   by=column, ascending=False)[column][0]
                           ]) != None:
                    dateColumnFormat = metaHelperInstance.get_datetime_format_pandas(
                        uniqueVals)
                    self._dateColumnFormatDict.update(
                        {column: dateColumnFormat})
        dateColCheck = NarrativesUtils.check_date_column_formats(self._selected_date_columns,\
                                                    self._timestamp_columns,\
                                                    self._dateColumnFormatDict,\
                                                    self._dateFormatConversionDict,
                                                    self._requestedDateFormat)
        print(dateColCheck)

        self._dateFormatDetected = dateColCheck["dateFormatDetected"]
        self._trend_on_td_column = dateColCheck["trendOnTdCol"]
        if self._dateFormatDetected:
            self._requestedDateFormat = dateColCheck["requestedDateFormat"]
            self._existingDateFormat = dateColCheck["existingDateFormat"]
            # self._date_column_suggested is the column used for trend
            self._date_column_suggested = dateColCheck["suggestedDateColumn"]
        if self._existingDateFormat:
            self._data_frame, dataRangeStats = NarrativesUtils.calculate_data_range_stats(
                self._data_frame, self._existingDateFormat,
                self._date_column_suggested, self._trend_on_td_column,
                self._pandas_flag)
            print(dataRangeStats)
            self._durationString = dataRangeStats["durationString"]
            self._duration = dataRangeStats["duration"]
            self._dataLevel = dataRangeStats["dataLevel"]
            first_date = dataRangeStats["firstDate"]
            last_date = dataRangeStats["lastDate"]

            if self._timestamp_columns != None:
                if self._selected_date_columns == None:
                    self._selected_date_columns = self._timestamp_columns
                else:
                    self._selected_date_columns += self._timestamp_columns
        if self._pandas_flag:
            pass
        else:
            if self._trend_subsection == "regression":
                # --- Regression trend card -------------------------------------
                # Builds the "changed over time" card for a regression target:
                # aggregates the measure and result columns per time bucket and
                # hands the grouped pandas frame to TrendNarrative.
                if self._selected_date_columns != None:
                    if self._dateFormatDetected:
                        trend_subsection_data = self._result_setter.get_trend_section_data(
                        )
                        measure_column = trend_subsection_data[
                            "measure_column"]
                        result_column = trend_subsection_data["result_column"]
                        base_dir = trend_subsection_data["base_dir"]

                        card3heading = 'How ' + result_column + ' and ' + measure_column + ' changed over time'
                        if self._dataLevel == "day":
                            # Daily grain: one row per suggestedDate with the
                            # summed measure/result values.
                            grouped_data = self._data_frame.groupBy(
                                "suggestedDate").agg({
                                    measure_column: 'sum',
                                    result_column: 'sum'
                                })
                            # NOTE(review): the positional renames below assume
                            # the agg output column order matches the dict
                            # insertion order (last = result, second-last =
                            # measure) — TODO confirm for the Spark version in use.
                            grouped_data = grouped_data.withColumnRenamed(
                                grouped_data.columns[-1], result_column)
                            grouped_data = grouped_data.withColumnRenamed(
                                grouped_data.columns[-2], measure_column)
                            # Human-readable month label, e.g. "Jan-19".
                            grouped_data = grouped_data.withColumn(
                                "year_month",
                                udf(lambda x: x.strftime("%b-%y"))(
                                    "suggestedDate"))
                            grouped_data = grouped_data.orderBy(
                                "suggestedDate", ascending=True)
                            grouped_data = grouped_data.withColumnRenamed(
                                grouped_data.columns[0], "key")
                            grouped_data = grouped_data.toPandas()
                        elif self._dataLevel == "month":
                            # Monthly grain: group on the pre-computed
                            # "year_month" label and rebuild a sortable date.
                            grouped_data = self._data_frame.groupBy(
                                "year_month").agg({
                                    measure_column: 'sum',
                                    result_column: 'sum'
                                })
                            grouped_data = grouped_data.withColumnRenamed(
                                grouped_data.columns[-1], result_column)
                            grouped_data = grouped_data.withColumnRenamed(
                                grouped_data.columns[-2], measure_column)
                            # NOTE(review): udf() without an explicit returnType
                            # yields StringType, so "suggestedDate" holds the
                            # string repr of the parsed datetime; the orderBy
                            # below appears to rely on that repr sorting
                            # chronologically — TODO confirm.
                            grouped_data = grouped_data.withColumn(
                                "suggestedDate",
                                udf(lambda x: datetime.strptime(x, "%b-%y"))(
                                    "year_month"))
                            grouped_data = grouped_data.orderBy(
                                "suggestedDate", ascending=True)
                            grouped_data = grouped_data.withColumnRenamed(
                                "suggestedDate", "key")
                            grouped_data = grouped_data.select([
                                "key", measure_column, result_column,
                                "year_month"
                            ]).toPandas()
                            # Replace the string key with a real date object for
                            # downstream chronological handling.
                            grouped_data["key"] = grouped_data[
                                "year_month"].apply(
                                    lambda x: datetime.strptime(x, "%b-%y"
                                                                ).date())

                        # NOTE(review): if self._dataLevel is neither "day" nor
                        # "month", grouped_data is never bound and the call
                        # below raises NameError — presumably unreachable here;
                        # TODO confirm upstream guarantees.
                        trend_narrative_obj = TrendNarrative(
                            self._result_column, self._date_column_suggested,
                            grouped_data, self._existingDateFormat,
                            self._requestedDateFormat, self._base_dir,
                            self._metaParser)

                        card3data = trend_narrative_obj.generate_regression_trend_data(
                            grouped_data, measure_column, result_column,
                            self._dataLevel, self._durationString)

                        card3narrative = NarrativesUtils.get_template_output(base_dir,\
                                                                        'regression_card3.html',card3data)

                        card3chart = trend_narrative_obj.generate_regression_trend_chart(
                            grouped_data, self._dataLevel)
                        card3paragraphs = NarrativesUtils.paragraph_splitter(
                            card3narrative)
                        # Despite the "card3" naming above, the assembled dict is
                        # stored via set_regression_trend_card_data as card2.
                        card2 = {
                            'charts': card3chart,
                            'paragraphs': card3paragraphs,
                            'heading': card3heading
                        }
                        self.set_regression_trend_card_data(card2)
                    else:
                        print("NO DATE FORMAT DETECTED")
                else:
                    print("NO DATE COLUMNS PRESENT")

        # --- Measure trend ---------------------------------------------------
        if self._analysistype == "measure":
            # Report the "trendNarrativeStart" stage: each stage contributes
            # (script weight * stage weight) / 10 to the completion percentage.
            self._completionStatus += old_div(
                self._scriptWeightDict[self._analysisName]["total"] *
                self._scriptStages["trendNarrativeStart"]["weight"], 10)
            progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\
                                        "trendNarrativeStart",\
                                        "info",\
                                        self._scriptStages["trendNarrativeStart"]["summary"],\
                                        self._completionStatus,\
                                        self._completionStatus)
            CommonUtils.save_progress_message(self._messageURL,
                                              progressMessage)
            self._dataframe_context.update_completion_status(
                self._completionStatus)
            # self._startMeasureTrend = self._result_setter.get_trend_section_completion_status()
            # NOTE(review): the completion-status check above is bypassed — the
            # measure trend is unconditionally started.
            self._startMeasureTrend = True

            if self._startMeasureTrend == True:
                # Skeleton for the narrative payload filled in below.
                self.narratives = {
                    "SectionHeading": "",
                    "card1": {},
                    "card2": {},
                    "card3": {}
                }
                if self._selected_date_columns != None:
                    if self._dateFormatDetected:
                        # Aggregate the result column per time bucket
                        # (day/month) into a pandas frame with key/value cols.
                        grouped_data = NarrativesUtils.get_grouped_data_for_trend(
                            self._data_frame, self._dataLevel,
                            self._result_column, self._analysistype,
                            self._pandas_flag)
                        # The suggested date column has served its purpose —
                        # drop it (pandas and Spark spell "drop" differently).
                        if self._pandas_flag:
                            self._data_frame = self._data_frame.drop(
                                self._date_column_suggested, axis=1)
                        else:
                            self._data_frame = self._data_frame.drop(
                                self._date_column_suggested)
                        # self._data_frame = self._data_frame.withColumnRenamed("year_month", self._date_column_suggested)

                        # Pick the dimensions to drill into: the top-N most
                        # significant ones if the helper has scores, otherwise
                        # just the first N string columns.
                        # NOTE(review): `df_helper` is a bare name rather than
                        # self._dataframe_helper — presumably a parameter of the
                        # enclosing method; TODO confirm it is in scope.
                        significant_dimensions = []
                        significant_dimension_dict = df_helper.get_significant_dimension(
                        )
                        if significant_dimension_dict != {} and significant_dimension_dict != None:
                            significant_dimension_tuple = tuple(
                                significant_dimension_dict.items())
                            significant_dimension_tuple = sorted(
                                significant_dimension_tuple,
                                key=lambda x: x[1],
                                reverse=True)
                            significant_dimensions = [
                                x[0] for x in
                                significant_dimension_tuple[:self.
                                                            _number_of_dimensions_to_consider]
                            ]
                        else:
                            significant_dimensions = self._string_columns[:self
                                                                          .
                                                                          _number_of_dimensions_to_consider]
                        print("significant_dimensions", significant_dimensions)
                        trend_narrative_obj = TrendNarrative(
                            self._result_column, self._date_column_suggested,
                            grouped_data, self._existingDateFormat,
                            self._requestedDateFormat, self._base_dir,
                            self._metaParser)
                        # grouped_data.to_csv("/home/gulshan/marlabs/datasets/trend_grouped_pandas.csv",index=False)
                        dataDict = trend_narrative_obj.generateDataDict(
                            grouped_data, self._dataLevel,
                            self._durationString)
                        # # update reference time with max value
                        reference_time = dataDict["reference_time"]
                        dataDict["duration"] = self._duration
                        dataDict["dataLevel"] = self._dataLevel
                        dataDict["durationString"] = self._durationString
                        dataDict[
                            "significant_dimensions"] = significant_dimensions
                        if len(significant_dimensions) > 0:
                            # Date-parsing format for the drill-down: the raw
                            # detected format at day grain, "%b-%y" labels at
                            # month grain.
                            if self._dataLevel == "day":
                                datetimeformat = self._existingDateFormat
                            elif self._dataLevel == "month":
                                datetimeformat = "%b-%y"
                            # xtraData = trend_narrative_obj.get_xtra_calculations(self._data_frame,grouped_data,significant_dimensions,self._date_column_suggested,self._result_column,self._existingDateFormat,reference_time,self._dataLevel, self._pandas_flag)
                            xtraData = trend_narrative_obj.get_xtra_calculations(
                                self._data_frame, grouped_data,
                                significant_dimensions,
                                self._date_column_suggested,
                                self._result_column, datetimeformat,
                                reference_time, self._dataLevel,
                                self._pandas_flag)
                            if xtraData != None:
                                dataDict.update(xtraData)
                        # print 'Trend dataDict:  %s' %(json.dumps(dataDict, indent=2))
                        self._result_setter.update_executive_summary_data(
                            dataDict)
                        # Template-rendering inputs shared by both cards.
                        dataDict.update({
                            "blockSplitter": self._blockSplitter,
                            "highlightFlag": self._highlightFlag
                        })

                        # Render the two measure-trend card templates and split
                        # them into UI blocks.
                        summary1 = NarrativesUtils.get_template_output(self._base_dir,\
                                                                        'measure_trend_card1.html',dataDict)
                        summary2 = NarrativesUtils.get_template_output(self._base_dir,\
                                                                        'measure_trend_card2.html',dataDict)
                        measureTrendCard = NormalCard()
                        measureTrendcard1Data = NarrativesUtils.block_splitter(
                            summary1,
                            self._blockSplitter,
                            highlightFlag=self._highlightFlag)
                        measureTrendcard2Data = NarrativesUtils.block_splitter(
                            summary2, self._blockSplitter)
                        # print measureTrendcard1Data

                        # Two headline KPI bubbles rendered as raw HTML
                        # (assumes bubbleData holds at least two entries).
                        bubbledata = dataDict["bubbleData"]
                        # print bubbledata
                        card1BubbleData = "<div class='col-md-6 col-xs-12 xs-p-20'><h2 class='text-center'><span>{}</span><br/><small>{}</small></h2></div><div class='col-md-6 col-xs-12 xs-p-20'><h2 class='text-center'><span>{}</span><br/><small>{}</small></h2></div>".format(
                            bubbledata[0]["value"], bubbledata[0]["text"],
                            bubbledata[1]["value"], bubbledata[1]["text"])
                        # print card1BubbleData

                        # Observed series for the chart, sorted chronologically.
                        trend_chart_data = list(
                            grouped_data[["key",
                                          "value"]].T.to_dict().values())
                        trend_chart_data = sorted(trend_chart_data,
                                                  key=lambda x: x["key"])
                        card1chartdata = {"actual": [], "predicted": []}

                        if self._dataLevel == "day":
                            card1chartdata["actual"] = [{
                                "key": str(val["key"]),
                                "value": val["value"]
                            } for val in trend_chart_data]
                        elif self._dataLevel == "month":
                            card1chartdata["actual"] = [{
                                "key":
                                val["key"].strftime("%b-%y"),
                                "value":
                                val["value"]
                            } for val in trend_chart_data]

                        # Forecast horizon: 3 periods when under a year of data,
                        # 6 otherwise.
                        if self._duration < 365:
                            prediction_window = 3
                        else:
                            prediction_window = 6
                        # Keep only the forecasted tail of the fitted series.
                        predicted_values = trend_narrative_obj.get_forecast_values(
                            grouped_data["value"],
                            prediction_window)[len(grouped_data["value"]):]
                        predicted_values = [
                            round(x, self._num_significant_digits)
                            for x in predicted_values
                        ]

                        # The predicted line starts from the last observed point
                        # so the two chart series join up visually.
                        forecasted_data = []
                        forecasted_data.append(card1chartdata["actual"][-1])
                        forecasted_dates = []
                        # forecast_start_time = datetime.strptime(card1chartdata["actual"][-1]["key"],"%b-%y")
                        if self._dataLevel == "month":
                            forecast_start_time = datetime.strptime(
                                card1chartdata["actual"][-1]["key"], "%b-%y")
                        elif self._dataLevel == "day":
                            # Day keys may or may not carry a time component.
                            try:
                                forecast_start_time = datetime.strptime(
                                    card1chartdata["actual"][-1]["key"],
                                    "%Y-%m-%d")
                            except:
                                forecast_start_time = datetime.strptime(
                                    card1chartdata["actual"][-1]["key"],
                                    '%Y-%m-%d %H:%M:%S')
                        # One future date per forecast step, at the data grain.
                        for val in range(prediction_window):
                            if self._dataLevel == "month":
                                key = forecast_start_time + relativedelta(
                                    months=1 + val)
                                forecasted_dates.append(key)
                            elif self._dataLevel == "day":
                                key = forecast_start_time + relativedelta(
                                    days=1 + val)
                                forecasted_dates.append(key)
                        forecasted_list = list(
                            zip(forecasted_dates, predicted_values))
                        if self._dataLevel == "month":
                            forecasted_list = [{
                                "key": val[0].strftime("%b-%y"),
                                "value": val[1]
                            } for val in forecasted_list]
                        elif self._dataLevel == "day":
                            forecasted_list = [{
                                "key":
                                val[0].strftime("%Y-%m-%d"),
                                "value":
                                val[1]
                            } for val in forecasted_list]
                        forecasted_data += forecasted_list
                        card1chartdata["predicted"] = forecasted_data
                        # print json.dumps(card1chartdata,indent=2)
                        # Assemble the C3 scatter-line chart spec.
                        card1chartdata = ScatterChartData(data=card1chartdata)
                        chartJson = ChartJson()
                        chartJson.set_data(card1chartdata.get_data())
                        chartJson.set_label_text({
                            'x': ' ',
                            'y': 'No. of Observations'
                        })
                        chartJson.set_legend({
                            "actual": "Observed",
                            "predicted": "Forecast"
                        })
                        chartJson.set_chart_type("scatter_line")
                        chartJson.set_axes({"x": "key", "y": "value"})
                        chartJson.set_yaxis_number_format(".2f")
                        st_info = [
                            "Trend Analysis",
                            "Forecast Method : Holt Winters Method"
                        ]
                        # Chart slots in right after the first text block; the
                        # KPI bubbles close out card1.
                        measureTrendcard1Data.insert(
                            1, C3ChartData(data=chartJson, info=st_info))
                        measureTrendcard1Data.append(
                            HtmlData(data=card1BubbleData))
                        cardData = measureTrendcard1Data + measureTrendcard2Data
                        measureTrendCard.set_card_data(cardData)
                        measureTrendCard.set_card_name("Trend Analysis")
                        # Publish the finished card into the story tree.
                        trendStoryNode = NarrativesTree(
                            "Trend", None, [], [measureTrendCard])
                        self._story_narrative.add_a_node(trendStoryNode)
                        self._result_setter.set_trend_node(trendStoryNode)

                        # NOTE(review): the block below is dead legacy forecast-
                        # card code kept for reference; it is not executed.
                        # prediction_data = [{"key":x["key"],"value":x["value"]} for x in trend_chart_data]
                        # last_val = prediction_data[-1]
                        # last_val.update({"predicted_value":last_val["value"]})
                        # prediction_data[-1] = last_val
                        #
                        # for val in range(prediction_window):
                        #     dataLevel = dataDict["dataLevel"]
                        #     if self._dataLevel == "month":
                        #         last_key = prediction_data[-1]["key"]
                        #         key = last_key+relativedelta(months=1)
                        #         prediction_data.append({"key":key,"predicted_value":predicted_values[val]})
                        #         forecasted_data.append({"key":key,"value":predicted_values[val]})
                        #     elif self._dataLevel == "day":
                        #         last_key = prediction_data[-1]["key"]
                        #         key = last_key+relativedelta(days=1)
                        #         prediction_data.append({"key":key,"predicted_value":predicted_values[val]})
                        # prediction_data_copy = prediction_data
                        # prediction_data = []
                        # for val in prediction_data_copy:
                        #     val["key"] = val["key"].strftime("%b-%y")
                        #     prediction_data.append(val)

                        # forecastDataDict = {"startForecast":predicted_values[0],
                        #                     "endForecast":predicted_values[prediction_window-1],
                        #                     "measure":dataDict["measure"],
                        #                     "forecast":True,
                        #                     "forecast_percentage": round((predicted_values[prediction_window-1]-predicted_values[0])/predicted_values[0],self._num_significant_digits),
                        #                     "prediction_window_text": str(prediction_window) + " months"
                        #                     }
                        #
                        # self._result_setter.update_executive_summary_data(forecastDataDict)
                        # summary3 = NarrativesUtils.get_template_output(self._base_dir,\
                        # 'trend_narrative_card3.html',forecastDataDict)
                        # Report the "trendNarrativeEnd" stage and sync the
                        # completion status back to the context.
                        self._completionStatus += old_div(
                            self._scriptWeightDict[self._analysisName]["total"]
                            *
                            self._scriptStages["trendNarrativeEnd"]["weight"],
                            10)
                        progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\
                                                    "trendNarrativeEnd",\
                                                    "info",\
                                                    self._scriptStages["trendNarrativeEnd"]["summary"],\
                                                    self._completionStatus,\
                                                    self._completionStatus)
                        CommonUtils.save_progress_message(
                            self._messageURL, progressMessage)
                        self._dataframe_context.update_completion_status(
                            self._completionStatus)
                    else:
                        # self._result_setter.update_executive_summary_data({"trend_present":False})
                        print("Trend Analysis for Measure Failed")
                        print("#" * 20 + "Trend Analysis Error" + "#" * 20)
                        print(
                            "No date format for the date column %s was detected."
                            % (self._date_column_suggested))
                        print("#" * 60)
                        self._completionStatus += self._scriptWeightDict[
                            self._analysisName]["total"]
                        self._dataframe_context.update_completion_status(
                            completionStatus)
                        progressMessage = CommonUtils.create_progress_message_object("Trend","failedState","error",\
                                        "Trend Failed As "+"No Date Format For The Date Column %s Was Detected !!!" %(self._date_column_suggested),\
                                        completionStatus,completionStatus)
                        CommonUtils.save_progress_message(
                            messageURL, progressMessage)
                        self._dataframe_context.update_completion_status(
                            self._completionStatus)
                else:
                    # self._result_setter.update_executive_summary_data({"trend_present":False})
                    print("Trend Analysis for Measure Failed")
                    print("#" * 20 + "Trend Analysis Error" + "#" * 20)
                    print("No date column present for Trend Analysis.")
                    print("#" * 60)
                    self._completionStatus += self._scriptWeightDict[
                        self._analysisName]["total"]
                    self._dataframe_context.update_completion_status(
                        completionStatus)
                    progressMessage = CommonUtils.create_progress_message_object("Trend","failedState","error",\
                                    "No Date Column Present",\
                                    completionStatus,completionStatus)
                    CommonUtils.save_progress_message(messageURL,
                                                      progressMessage)
                    self._dataframe_context.update_completion_status(
                        self._completionStatus)
            else:
                # Unreachable while _startMeasureTrend is hard-coded True above.
                print("overall Trend not Started YET")

        # --- Dimension trend -------------------------------------------------
        elif self._analysistype == "dimension":
            print("Dimension Trend Started")
            # Report the "initialization" stage.
            self._completionStatus += old_div(
                self._scriptWeightDict[self._analysisName]["total"] *
                self._scriptStages["initialization"]["weight"], 10)
            progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\
                                        "initialization",\
                                        "info",\
                                        self._scriptStages["initialization"]["summary"],\
                                        self._completionStatus,\
                                        self._completionStatus)
            CommonUtils.save_progress_message(self._messageURL,
                                              progressMessage)
            self._dataframe_context.update_completion_status(
                self._completionStatus)

            self.narratives = {"card0": {}}
            if self._selected_date_columns != None:
                if self._dateFormatDetected:
                    # result_column_levels = [x[0] for x in self._data_frame.select(self._result_column).distinct().collect()]
                    # Distinct levels of the target column: prefer the cached
                    # metadata, fall back to scanning the frame.
                    # NOTE(review): bare except — any metaParser failure is
                    # silently swallowed and the frame is scanned instead.
                    try:
                        result_column_levels = self._metaParser.get_unique_level_names(
                            self._result_column)
                    except:
                        if self._pandas_flag:
                            result_column_levels = list(
                                self._data_frame[self._result_column].unique())
                        else:
                            result_column_levels = [
                                x[0] for x in self._data_frame.select(
                                    self._result_column).distinct().collect()
                            ]
                            # result_column_levels = self._data_frame.agg((F.collect_set(self._result_column).alias(self._result_column))).first().asDict()[self._result_column]

                    print("-" * 100)
                    # TODO Implement meta parser getter here
                    print(result_column_levels)
                    # The two most frequent target levels drive the two chart
                    # series built in the loop below (assumes >= 2 levels).
                    if self._pandas_flag:
                        level_count_df = self._data_frame[
                            self._result_column].value_counts()[0:2]
                        top2levels = list(level_count_df.index)
                    else:
                        level_count_df = self._data_frame.groupBy(
                            self._result_column).count().orderBy(
                                "count", ascending=False)
                        level_count_df_rows = level_count_df.collect()
                        top2levels = [
                            level_count_df_rows[0][0],
                            level_count_df_rows[1][0]
                        ]
                    cardData = []
                    chart_data = {}
                    cardData1 = []
                    c3_chart = {"dataType": "c3Chart", "data": {}}
                    print("#" * 40)
                    # Per-period totals across all levels, used below to turn
                    # level counts into percentage shares.
                    overall_count = NarrativesUtils.get_grouped_count_data_for_dimension_trend(
                        self._data_frame, self._dataLevel, self._result_column,
                        self._pandas_flag)
                    print("#" * 40)
                    # Build one narrative + one chart series per top level.
                    for idx, level in enumerate(top2levels):
                        print("calculations in progress for the level :- ",
                              level)
                        # Rows belonging to this level only.
                        if self._pandas_flag:
                            leveldf = self._data_frame[self._data_frame[
                                self._result_column] == level]
                        else:
                            leveldf = self._data_frame.filter(
                                col(self._result_column) == level)
                        grouped_data = NarrativesUtils.get_grouped_data_for_trend(
                            leveldf, self._dataLevel, self._result_column,
                            self._analysistype, self._pandas_flag)
                        grouped_data.rename(columns={"value": "value_count"},
                                            inplace=True)
                        # Join per-level counts with overall per-period totals
                        # and express this level as a percentage share.
                        grouped_data = pd.merge(grouped_data,
                                                overall_count,
                                                on='key',
                                                how='left')
                        # grouped_data["value"] = grouped_data["value_count"].apply(lambda x:round(x*100/float(self._data_frame.count()),self._num_significant_digits))
                        grouped_data["value"] = old_div(
                            grouped_data["value_count"],
                            grouped_data["totalCount"])
                        grouped_data["value"] = grouped_data["value"].apply(
                            lambda x: round(x * 100, self.
                                            _num_significant_digits))
                        # Re-point the suggested date column at year_month and
                        # add a constant value_col consumed by the drill-down
                        # calculations below.
                        if self._pandas_flag:
                            leveldf = leveldf.drop(self._date_column_suggested,
                                                   axis=1)
                            leveldf = leveldf.rename(
                                columns={
                                    "year_month": self._date_column_suggested
                                })
                            if "year_month" not in leveldf.columns:
                                leveldf["year_month"] = leveldf[
                                    self._date_column_suggested]
                            leveldf["value_col"] = 1
                        else:
                            leveldf = leveldf.drop(self._date_column_suggested)
                            leveldf = leveldf.withColumnRenamed(
                                "year_month", self._date_column_suggested)
                            if "year_month" not in leveldf.columns:
                                leveldf = leveldf.withColumn(
                                    "year_month",
                                    col(self._date_column_suggested))
                            leveldf = leveldf.withColumn('value_col', lit(1))

                        trend_narrative_obj = TrendNarrative(
                            self._result_column, self._date_column_suggested,
                            grouped_data, self._existingDateFormat,
                            self._requestedDateFormat, self._base_dir,
                            self._metaParser)
                        dataDict = trend_narrative_obj.generateDataDict(
                            grouped_data, self._dataLevel,
                            self._durationString)
                        # For dimension trends the "measure" slot carries the
                        # level name; the real target moves to "target_column".
                        dataDict["target_column"] = dataDict["measure"]
                        dataDict["measure"] = level
                        dataDict["duration"] = self._duration
                        dataDict["dataLevel"] = self._dataLevel
                        dataDict["durationString"] = self._durationString
                        # grouped_data.to_csv("/home/gulshan/marlabs/datasets/grouped_data"+str(idx))
                        # print json.dumps(dataDict,indent=2)
                        # Top-N dimensions by chi-square significance, falling
                        # back to the first N string columns.
                        # NOTE(review): `df_helper` is a bare name — presumably
                        # a parameter of the enclosing method; TODO confirm.
                        significant_dimensions = []
                        significant_dimension_dict = df_helper.get_chisquare_significant_dimension(
                        )
                        if significant_dimension_dict != {} and significant_dimension_dict != None:
                            significant_dimension_tuple = tuple(
                                significant_dimension_dict.items())
                            significant_dimension_tuple = sorted(
                                significant_dimension_tuple,
                                key=lambda x: x[1],
                                reverse=True)
                            significant_dimensions = [
                                x[0] for x in
                                significant_dimension_tuple[:self.
                                                            _number_of_dimensions_to_consider]
                            ]
                        else:
                            significant_dimensions = self._string_columns[:self
                                                                          .
                                                                          _number_of_dimensions_to_consider]
                        print("significant_dimensions", significant_dimensions)
                        reference_time = dataDict["reference_time"]
                        dataDict[
                            "significant_dimensions"] = significant_dimensions
                        if len(significant_dimensions) > 0:
                            st = time.time()
                            xtraData = trend_narrative_obj.get_xtra_calculations(
                                leveldf, grouped_data, significant_dimensions,
                                self._date_column_suggested, "value_col",
                                self._existingDateFormat, reference_time,
                                self._dataLevel, self._pandas_flag)
                            print("time for get_xtra_calculations",
                                  time.time() - st)
                            if xtraData != None:
                                dataDict.update(xtraData)
                        dimensionCount = trend_narrative_obj.generate_dimension_extra_narrative(
                            grouped_data, dataDict, self._dataLevel)
                        if dimensionCount != None:
                            dataDict.update(dimensionCount)

                        dataDict.update({
                            "level_index": idx,
                            "blockSplitter": self._blockSplitter,
                            "highlightFlag": self._highlightFlag
                        })

                        self._result_setter.update_executive_summary_data(
                            dataDict)
                        trendStory = NarrativesUtils.get_template_output(self._base_dir,\
                                                                        'dimension_trend.html',dataDict)
                        blocks = NarrativesUtils.block_splitter(
                            trendStory, self._blockSplitter)

                        # The first two blocks (shared heading/intro) are only
                        # kept for the first level.
                        if idx != 0:
                            cardData1 += blocks[2:]
                        else:
                            cardData1 += blocks

                        # Chart series for this level, chronological, dropping
                        # rows whose key is null.
                        trend_chart_data = [
                            x for x in list(grouped_data[
                                ["key", "value"]].T.to_dict().values())
                            if x['key'] != None
                        ]
                        trend_chart_data = sorted(trend_chart_data,
                                                  key=lambda x: x["key"])
                        card1chartdata = trend_chart_data
                        if self._dataLevel == "day":
                            card1chartdata = [{
                                "key": str(val["key"]),
                                "value": val["value"]
                            } for val in card1chartdata]
                        elif self._dataLevel == "month":
                            card1chartdata = [{
                                "key":
                                val["key"].strftime("%b-%y"),
                                "value":
                                val["value"]
                            } for val in card1chartdata]
                        chart_data[level] = card1chartdata

                    labels = {
                        "x": "key",
                        "y": list(chart_data.keys())[0],
                        "y2": list(chart_data.keys())[1]
                    }
                    c3Chart = {
                        "data": chart_data,
                        "format": "%b-%y",
                        "label": labels,
                        "label_text": {
                            "x": "Time",
                            "y": "Percentage of " + labels["y"],
                            "y2": "Percentage of " + labels["y2"]
                        }
                    }

                    c3_chart["data"] = c3Chart
                    multiLineData = []
                    for idx in range(len(chart_data[top2levels[0]])):
                        key = chart_data[top2levels[0]][idx]["key"]
                        value = chart_data[top2levels[0]][idx]["value"]
                        try:
                            value1 = chart_data[top2levels[1]][idx]["value"]
                        except:
                            value1 = 0
                        multiLineData.append({
                            "key": key,
                            top2levels[0]: value,
                            top2levels[1]: value1
                        })
                    chartData = NormalChartData(multiLineData)
                    chartJson = ChartJson()
                    chartJson.set_data(chartData.get_data())
                    chartJson.set_label_text(c3Chart["label_text"])
                    chartJson.set_legend(c3Chart["label"])
                    chartJson.set_chart_type("line")
                    chartJson.set_yaxis_number_format(".2f")
                    chartJson.set_axes(labels)
                    st_info = [
                        "Trend Analysis",
                        "Forecast Method : Holt Winters Method"
                    ]
                    cardData1.insert(1,
                                     C3ChartData(data=chartJson, info=st_info))
                    trendCard = NormalCard(name="Trend Analysis",
                                           slug=None,
                                           cardData=cardData1)
                    trendStoryNode = NarrativesTree("Trend", None, [],
                                                    [trendCard])
                    self._story_narrative.add_a_node(trendStoryNode)
                    self._result_setter.set_trend_node(trendStoryNode)
                    self._completionStatus += old_div(
                        self._scriptWeightDict[self._analysisName]["total"] *
                        self._scriptStages["summarygeneration"]["weight"], 10)
                    progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\
                                                "summarygeneration",\
                                                "info",\
                                                self._scriptStages["summarygeneration"]["summary"],\
                                                self._completionStatus,\
                                                self._completionStatus)
                    CommonUtils.save_progress_message(self._messageURL,
                                                      progressMessage)
                    self._dataframe_context.update_completion_status(
                        self._completionStatus)

                    self._completionStatus += old_div(
                        self._scriptWeightDict[self._analysisName]["total"] *
                        self._scriptStages["completion"]["weight"], 10)
                    progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\
                                                "completion",\
                                                "info",\
                                                self._scriptStages["completion"]["summary"],\
                                                self._completionStatus,\
                                                self._completionStatus)
                    CommonUtils.save_progress_message(self._messageURL,
                                                      progressMessage)
                    self._dataframe_context.update_completion_status(
                        self._completionStatus)
                else:
                    self._result_setter.update_executive_summary_data(
                        {"trend_present": False})
                    print("Trend Analysis for Dimension Failed")
                    print("#" * 20 + "Trend Analysis Error" + "#" * 20)
                    if self._date_column_suggested:
                        print(
                            "No date format for the date column %s was detected."
                            % (self._date_column_suggested))
                    print("#" * 60)
                    self._completionStatus += self._scriptWeightDict[
                        self._analysisName]["total"]
                    self._dataframe_context.update_completion_status(
                        self._completionStatus)
                    progressMessage = CommonUtils.create_progress_message_object("Trend","failedState","error",\
                                    "Trend Failed As "+"No Date Format For The Date Column %s Was Detected !!!" %(self._date_column_suggested),\
                                    self._completionStatus,self._completionStatus)
                    CommonUtils.save_progress_message(messageURL,
                                                      progressMessage)
                    self._dataframe_context.update_completion_status(
                        self._completionStatus)

            else:
                self._result_setter.update_executive_summary_data(
                    {"trend_present": False})
                print("Trend Analysis for Dimension Failed")
                print("#" * 20 + "Trend Analysis Error" + "#" * 20)
                print("No date column present for Trend Analysis.")
                print("#" * 60)
                self._completionStatus += self._scriptWeightDict[
                    self._analysisName]["total"]
                self._dataframe_context.update_completion_status(
                    self._completionStatus)
                progressMessage = CommonUtils.create_progress_message_object("Trend","failedState","error",\
                                "No Date Column Present",\
                                self._completionStatus,self._completionStatus)
                CommonUtils.save_progress_message(messageURL, progressMessage)
                self._dataframe_context.update_completion_status(
                    self._completionStatus)
Пример #6
0
    def generate_narratives(self):
        """Build all regression narrative cards and attach them to the story tree.

        Produces:
          * a main "Key Influencers" card — template narrative plus a bar chart
            of regression coefficients with a statistical-info panel, and
          * one node per significant measure, each holding an impact card
            (card1 + card4 sensitivity narrative/chart) and — only when
            ``self._run_dimension_level_regression`` is set — a second
            "Key Areas where it Matters" card built from dimension-level
            regressions.

        Results are registered on ``self._regressionNode``,
        ``self._result_setter`` and ``self._story_narrative``; the method has
        side effects only and returns ``None``.
        """
        regression_narrative_obj = LinearRegressionNarrative(
                                    self._df_regression_result,
                                    self._correlations,
                                    self._dataframe_helper,
                                    self._dataframe_context,
                                    self._metaParser,
                                    self._spark
                                    )
        main_card_data = regression_narrative_obj.generate_main_card_data()
        main_card_narrative = NarrativesUtils.get_template_output(self._base_dir,\
                                                        'regression_main_card.html',main_card_data)
        # Legacy dict-shaped narrative output (kept alongside the card objects).
        self.narratives['main_card'] = {}
        self.narratives["main_card"]['paragraphs'] = NarrativesUtils.paragraph_splitter(main_card_narrative)
        self.narratives["main_card"]['header'] = 'Key Measures that affect ' + self.result_column
        self.narratives["main_card"]['chart'] = {}
        self.narratives["main_card"]['chart']['heading'] = ''
        self.narratives["main_card"]['chart']['data'] = [[i for i,j in self._all_coeffs],
                                                         [j['coefficient'] for i,j in self._all_coeffs]]
        self.narratives["main_card"]['chart']['label'] = {'x':'Measure Name',
                                                            'y': 'Change in ' + self.result_column + ' per unit increase'}

        main_card = NormalCard()
        main_card_header = HtmlData(data = '<h3>Key Measures that affect ' + self.result_column+"</h3>")
        main_card_paragraphs = NarrativesUtils.block_splitter(main_card_narrative,self._blockSplitter)
        # One {key, value} point per coefficient, in the pre-sorted _all_coeffs order.
        main_card_chart_data = [{"key":val[0],"value":val[1]} for val in zip([i for i,j in self._all_coeffs],[j['coefficient'] for i,j in self._all_coeffs])]
        main_card_chart = NormalChartData(data=main_card_chart_data)
        mainCardChartJson = ChartJson()
        mainCardChartJson.set_data(main_card_chart.get_data())
        mainCardChartJson.set_label_text({'x':'Influencing Factors','y': 'Change in ' + self.result_column + ' per unit increase'})
        mainCardChartJson.set_chart_type("bar")
        mainCardChartJson.set_axes({"x":"key","y":"value"})
        mainCardChartJson.set_yaxis_number_format(".2f")
        # Sort descending by coefficient value so [0] is the max effect and [-1] the min.
        chart_data = sorted(main_card_chart_data,key=lambda x:x["value"],reverse=True)
        statistical_info_array=[
            ("Test Type","Regression"),
            ("Effect Size","Coefficients"),
            ("Max Effect Size",chart_data[0]["key"]),
            ("Min Effect Size",chart_data[-1]["key"]),
            ]
        # BUGFIX: was misspelled "statistical_inferenc", which left the
        # empty-string sentinel checked below permanently dead.
        statistical_inference = ""
        if len(chart_data) == 1:
            statistical_inference = "{} is the only variable that have significant influence over {} (Target) having an \
             Effect size of {}".format(chart_data[0]["key"],self._dataframe_context.get_result_column(),round(chart_data[0]["value"],4))
        elif len(chart_data) == 2:
            statistical_inference = "There are two variables ({} and {}) that have significant influence over {} (Target) and the \
             Effect size ranges are {} and {} respectively".format(chart_data[0]["key"],chart_data[1]["key"],self._dataframe_context.get_result_column(),round(chart_data[0]["value"],4),round(chart_data[1]["value"],4))
        else:
            statistical_inference = "There are {} variables that have significant influence over {} (Target) and the \
             Effect size ranges from {} to {}".format(len(chart_data),self._dataframe_context.get_result_column(),round(chart_data[0]["value"],4),round(chart_data[-1]["value"],4))
        if statistical_inference != "":
            statistical_info_array.append(("Inference",statistical_inference))
        statistical_info_array = NarrativesUtils.statistical_info_array_formatter(statistical_info_array)
        main_card.set_card_data(data = [main_card_header]+main_card_paragraphs+[C3ChartData(data=mainCardChartJson,info=statistical_info_array)])
        main_card.set_card_name("Key Influencers")
        self._regressionNode.add_a_card(main_card)


        count = 0
        for measure_column in self.significant_measures:
            sigMeasureNode = NarrativesTree()
            sigMeasureNode.set_name(measure_column)
            measureCard1 = NormalCard()
            measureCard1.set_card_name("{}: Impact on {}".format(measure_column,self.result_column))
            measureCard1Data = []
            if self._run_dimension_level_regression:
                measureCard2 = NormalCard()
                measureCard2.set_card_name("Key Areas where it Matters")
                measureCard2Data = []

            measure_column_cards = {}
            card0 = {}
            card1data = regression_narrative_obj.generate_card1_data(measure_column)
            card1heading = "<h3>Impact of "+measure_column+" on "+self.result_column+"</h3>"
            measureCard1Header = HtmlData(data=card1heading)
            card1data.update({"blockSplitter":self._blockSplitter})
            card1narrative = NarrativesUtils.get_template_output(self._base_dir,\
                                                            'regression_card1.html',card1data)

            card1paragraphs = NarrativesUtils.block_splitter(card1narrative,self._blockSplitter)
            card0 = {"paragraphs":card1paragraphs}
            card0["charts"] = {}
            card0['charts']['chart2']={}
            card0['charts']['chart1']={}
            card0["heading"] = card1heading
            measure_column_cards['card0'] = card0

            measureCard1Header = HtmlData(data=card1heading)
            measureCard1Data += [measureCard1Header]
            measureCard1para = card1paragraphs
            measureCard1Data += measureCard1para

            if self._run_dimension_level_regression:
                print("running narratives for key area dict")
                self._dim_regression = self.run_regression_for_dimension_levels()
                card2table, card2data=regression_narrative_obj.generate_card2_data(measure_column,self._dim_regression)
                card2data.update({"blockSplitter":self._blockSplitter})
                card2narrative = NarrativesUtils.get_template_output(self._base_dir,\
                                                            'regression_card2.html',card2data)
                card2paragraphs = NarrativesUtils.block_splitter(card2narrative,self._blockSplitter)

                card1 = {'tables': card2table, 'paragraphs' : card2paragraphs,
                        'heading' : 'Key Areas where ' + measure_column + ' matters'}
                measure_column_cards['card1'] = card1

                measureCard2Data += card2paragraphs
                if "table1" in card2table:
                    table1data = regression_narrative_obj.convert_table_data(card2table["table1"])
                    card2Table1 = TableData()
                    card2Table1.set_table_data(table1data)
                    card2Table1.set_table_type("heatMap")
                    card2Table1.set_table_top_header(card2table["table1"]["heading"])
                    # Serialized to a plain JSON dict before insertion into the card body.
                    card2Table1Json = json.loads(CommonUtils.convert_python_object_to_json(card2Table1))
                    measureCard2Data.insert(3,card2Table1Json)

                if "table2" in card2table:
                    table2data = regression_narrative_obj.convert_table_data(card2table["table2"])
                    card2Table2 = TableData()
                    card2Table2.set_table_data(table2data)
                    card2Table2.set_table_type("heatMap")
                    card2Table2.set_table_top_header(card2table["table2"]["heading"])
                    card2Table2Json = json.loads(CommonUtils.convert_python_object_to_json(card2Table2))
                    measureCard2Data.append(card2Table2Json)

            progressMessage = CommonUtils.create_progress_message_object(self._analysisName,"custom","info","Analyzing Key Influencers",self._completionStatus,self._completionStatus,display=True)
            CommonUtils.save_progress_message(self._messageURL,progressMessage,ignore=False)
            card4data = regression_narrative_obj.generate_card4_data(self.result_column,measure_column)
            card4data.update({"blockSplitter":self._blockSplitter})
            card4narrative = NarrativesUtils.get_template_output(self._base_dir,\
                                                                'regression_card4.html',card4data)
            card4paragraphs = NarrativesUtils.block_splitter(card4narrative,self._blockSplitter)
            card0['paragraphs'] = card1paragraphs+card4paragraphs
            card4Chart = card4data["charts"]
            statistical_info_array=[
                ("Test Type","Regression"),
                ("Coefficient",str(round(self._df_regression_result.get_coeff(measure_column),2))),
                ("P-Value","<= 0.05"),
                ("Intercept",str(round(self._df_regression_result.get_intercept(),2))),
                ("R Square ",str(round(self._df_regression_result.get_rsquare(),2))),
                ]
            inferenceTuple = ()
            coeff = self._df_regression_result.get_coeff(measure_column)
            # Positive coefficient -> increase wording; otherwise decrease wording.
            if coeff > 0:
                inferenceTuple = ("Inference","For every additional unit of increase in {} there will be an increase of {} units in {} (target).".format(measure_column,str(round(coeff,2)),self._dataframe_context.get_result_column()))
            else:
                inferenceTuple = ("Inference","For every additional unit of decrease in {} there will be an decrease of {} units in {} (target).".format(measure_column,str(round(coeff,2)),self._dataframe_context.get_result_column()))
            if len(inferenceTuple) > 0:
                statistical_info_array.append(inferenceTuple)
            statistical_info_array = NarrativesUtils.statistical_info_array_formatter(statistical_info_array)

            card4paragraphs.insert(2,C3ChartData(data=card4Chart,info=statistical_info_array))
            measureCard1Data += card4paragraphs

            self.narratives['cards'].append(measure_column_cards)

            # Only the first significant measure contributes to the executive summary.
            if count == 0:
                card4data.pop("charts")
                self._result_setter.update_executive_summary_data(card4data)
            count += 1
            measureCard1.set_card_data(measureCard1Data)
            # BUGFIX: measureCard1 was previously added again unconditionally
            # after the if-branch, so it appeared twice on the node whenever
            # dimension-level regression ran.
            if self._run_dimension_level_regression:
                measureCard2.set_card_data(measureCard2Data)
                sigMeasureNode.add_cards([measureCard1,measureCard2])
            else:
                sigMeasureNode.add_cards([measureCard1])
            self._regressionNode.add_a_node(sigMeasureNode)
        self._story_narrative.add_a_node(self._regressionNode)
Пример #7
0
    def __init__(self, df_helper, df_context, result_setter, spark, df_regression_result, correlations,story_narrative,meta_parser):
        """Set up the regression narrative builder and run it immediately.

        Collects column metadata from the helpers, ranks regression
        coefficients by absolute magnitude, selects measures with
        p-value <= 0.05 as significant, emits a progress message, then calls
        :meth:`generate_narratives` and registers the resulting node on the
        result setter.

        :param df_helper: dataframe helper exposing column lists and the result column
        :param df_context: dataframe context (completion status, URLs, weights)
        :param result_setter: sink for the finished regression node
        :param spark: spark session handle, passed through to narrative objects
        :param df_regression_result: fitted regression result (coefficients, p-values)
        :param correlations: correlation data passed to the narrative generator
        :param story_narrative: story tree the regression node is attached to
        :param meta_parser: metadata parser used for ignored-column checks
        """
        self._metaParser = meta_parser
        self._result_setter = result_setter
        self._story_narrative = story_narrative
        self._df_regression_result = df_regression_result
        self._correlations = correlations
        self._dataframe_helper = df_helper
        self._dataframe_context = df_context
        self._blockSplitter = GLOBALSETTINGS.BLOCKSPLITTER

        self._measure_columns = self._dataframe_helper.get_numeric_columns()
        self._dimension_columns = self._dataframe_helper.get_string_columns()
        self._date_columns = self._dataframe_context.get_date_columns()
        self._uid_col = self._dataframe_context.get_uid_column()
        # Drop the uid column and any date columns from the dimensions considered.
        if self._metaParser.check_column_isin_ignored_suggestion(self._uid_col):
            self._dimension_columns = list(set(self._dimension_columns) - {self._uid_col})
        if len(self._date_columns) >0 :
            self._dimension_columns = list(set(self._dimension_columns)-set(self._date_columns))
        self._spark = spark
        self.measures = []
        self.result_column = self._dataframe_helper.resultcolumn

        # Rank coefficients by absolute magnitude, largest effect first.
        self.all_coefficients = self._df_regression_result.get_all_coeff()
        all_coeff = [(x,self.all_coefficients[x]) for x in list(self.all_coefficients.keys())]
        all_coeff = sorted(all_coeff,key = lambda x:abs(x[1]["coefficient"]),reverse = True)
        self._all_coeffs = all_coeff
        self.significant_measures = [x[0] for x in all_coeff if x[1]['p_value']<=0.05]
        print(self.significant_measures)
        print("regression narratives started")
        # BUGFIX: heading previously read '<result>Performance Report' with no
        # separating space between the column name and the title.
        self.narratives = {"heading": self.result_column + " Performance Report",
                           "main_card":{},
                           "cards":[]
                        }
        self._base_dir = "/regression/"
        self._run_dimension_level_regression = False

        self._regressionNode = NarrativesTree()

        self._completionStatus = self._dataframe_context.get_completion_status()
        self._analysisName = self._dataframe_context.get_analysis_name()
        self._messageURL = self._dataframe_context.get_message_url()
        self._scriptWeightDict = self._dataframe_context.get_measure_analysis_weight()
        self._scriptStages = {
            "regressionNarrativeStart":{
                "summary":"Started The Regression Narratives",
                "weight":1
                },
            "regressionNarrativeEnd":{
                "summary":"Narratives For Regression Finished",
                "weight":0
                },
            }
        # Progress bookkeeping: weight contribution is scaled down by 10.
        self._completionStatus += old_div(self._scriptWeightDict[self._analysisName]["narratives"]*self._scriptStages["regressionNarrativeStart"]["weight"],10)
        progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\
                                    "regressionNarrativeStart",\
                                    "info",\
                                    self._scriptStages["regressionNarrativeStart"]["summary"],\
                                    self._completionStatus,\
                                    self._completionStatus)
        CommonUtils.save_progress_message(self._messageURL,progressMessage)
        self._dataframe_context.update_completion_status(self._completionStatus)

        self.generate_narratives()
        self._regressionNode.set_name("Influencers")
        self._result_setter.set_regression_node(self._regressionNode)

        self._completionStatus += old_div(self._scriptWeightDict[self._analysisName]["narratives"]*self._scriptStages["regressionNarrativeEnd"]["weight"],10)
        progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\
                                    "regressionNarrativeEnd",\
                                    "info",\
                                    self._scriptStages["regressionNarrativeEnd"]["summary"],\
                                    self._completionStatus,\
                                    self._completionStatus)
        CommonUtils.save_progress_message(self._messageURL,progressMessage)
        self._dataframe_context.update_completion_status(self._completionStatus)
Пример #8
0
    def run(self):
        self._start_time = time.time()
        metaHelperInstance = MetaDataHelper(self._data_frame, self._total_rows)
        sampleData = metaHelperInstance.get_sample_data()
        sampleData = sampleData.toPandas()
        sampleData = metaHelperInstance.format_sampledata_timestamp_columns(
            sampleData, self._timestamp_columns, self._stripTimestamp)
        time_taken_sampling = time.time() - self._start_time
        self._completionStatus += self._scriptStages["sampling"]["weight"]
        print "sampling takes", time_taken_sampling
        progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\
                                    "sampling",\
                                    "info",\
                                    self._scriptStages["sampling"]["summary"],\
                                    self._completionStatus,\
                                    self._completionStatus)
        CommonUtils.save_progress_message(self._messageURL,
                                          progressMessage,
                                          ignore=self._ignoreMsgFlag)

        metaData = []
        metaData.append(
            MetaData(name="noOfRows",
                     value=self._total_rows,
                     display=True,
                     displayName="Rows"))
        metaData.append(
            MetaData(name="noOfColumns",
                     value=self._total_columns,
                     display=True,
                     displayName="Columns"))
        self._percentage_columns = metaHelperInstance.get_percentage_columns(
            self._string_columns)
        if len(self._percentage_columns) > 0:
            self._data_frame = CommonUtils.convert_percentage_columns(
                self._data_frame, self._percentage_columns)
            self._numeric_columns = self._numeric_columns + self._percentage_columns
            self._string_columns = list(
                set(self._string_columns) - set(self._percentage_columns))
            self.update_column_type_dict()

        self._dollar_columns = metaHelperInstance.get_dollar_columns(
            self._string_columns)
        if len(self._dollar_columns) > 0:
            self._data_frame = CommonUtils.convert_dollar_columns(
                self._data_frame, self._dollar_columns)
            self._numeric_columns = self._numeric_columns + self._dollar_columns
            self._string_columns = list(
                set(self._string_columns) - set(self._dollar_columns))
            self.update_column_type_dict()

        if len(self._numeric_columns) > 1:
            # print "self._numeric_columns : ", self._numeric_columns
            metaData.append(
                MetaData(name="measures",
                         value=len(self._numeric_columns),
                         display=True,
                         displayName="Measures"))
        else:
            metaData.append(
                MetaData(name="measures",
                         value=len(self._numeric_columns),
                         display=True,
                         displayName="Measure"))
        if len(self._string_columns) > 1:
            metaData.append(
                MetaData(name="dimensions",
                         value=len(self._string_columns +
                                   self._boolean_columns),
                         display=True,
                         displayName="Dimensions"))
        else:
            metaData.append(
                MetaData(name="dimensions",
                         value=len(self._string_columns +
                                   self._boolean_columns),
                         display=True,
                         displayName="Dimension"))
        if len(self._timestamp_columns) > 1:
            metaData.append(
                MetaData(name="timeDimension",
                         value=len(self._timestamp_columns),
                         display=True,
                         displayName="Time Dimensions"))
        else:
            metaData.append(
                MetaData(name="timeDimension",
                         value=len(self._timestamp_columns),
                         display=True,
                         displayName="Time Dimension"))

        metaData.append(
            MetaData(name="measureColumns",
                     value=self._numeric_columns,
                     display=False))
        metaData.append(
            MetaData(name="dimensionColumns",
                     value=self._string_columns + self._boolean_columns,
                     display=False))
        metaData.append(
            MetaData(name="timeDimensionColumns",
                     value=self._timestamp_columns,
                     display=False))
        metaData.append(
            MetaData(name="percentageColumns",
                     value=self._percentage_columns,
                     display=False))
        metaData.append(
            MetaData(name="dollarColumns",
                     value=self._dollar_columns,
                     display=False))
        columnData = []
        headers = []

        self._start_time = time.time()
        print "Count of Numeric columns", len(self._numeric_columns)
        measureColumnStat, measureCharts = metaHelperInstance.calculate_measure_column_stats(
            self._data_frame,
            self._numeric_columns,
            binColumn=self._binned_stat_flag)
        time_taken_measurestats = time.time() - self._start_time
        self._completionStatus += self._scriptStages["measurestats"]["weight"]
        print "measure stats takes", time_taken_measurestats
        progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\
                                    "measurestats",\
                                    "info",\
                                    self._scriptStages["measurestats"]["summary"],\
                                    self._completionStatus,\
                                    self._completionStatus)
        CommonUtils.save_progress_message(self._messageURL,
                                          progressMessage,
                                          ignore=self._ignoreMsgFlag)

        self._start_time = time.time()
        dimensionColumnStat, dimensionCharts = metaHelperInstance.calculate_dimension_column_stats(
            self._data_frame,
            self._string_columns + self._boolean_columns,
            levelCount=self._level_count_flag)
        # print dimensionColumnStat
        self._dataSize["dimensionLevelCountDict"] = {
            k: filter(lambda x: x["name"] == "numberOfUniqueValues",
                      v)[0]["value"]
            for k, v in dimensionColumnStat.items()
        }
        self._dataSize["totalLevels"] = sum(
            self._dataSize["dimensionLevelCountDict"].values())

        time_taken_dimensionstats = time.time() - self._start_time
        self._completionStatus += self._scriptStages["dimensionstats"][
            "weight"]
        # print "dimension stats takes",time_taken_dimensionstats
        progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\
                                    "dimensionstats",\
                                    "info",\
                                    self._scriptStages["dimensionstats"]["summary"],\
                                    self._completionStatus,\
                                    self._completionStatus)
        CommonUtils.save_progress_message(self._messageURL,
                                          progressMessage,
                                          ignore=self._ignoreMsgFlag)

        self._start_time = time.time()
        timeDimensionColumnStat, timeDimensionCharts = metaHelperInstance.calculate_time_dimension_column_stats(
            self._data_frame,
            self._timestamp_columns,
            level_count_flag=self._level_count_flag)
        time_taken_tdstats = time.time() - self._start_time
        self._completionStatus += self._scriptStages["timedimensionstats"][
            "weight"]
        # print "time dimension stats takes",time_taken_tdstats
        progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\
                                    "timedimensionstats",\
                                    "info",\
                                    self._scriptStages["timedimensionstats"]["summary"],\
                                    self._completionStatus,\
                                    self._completionStatus)
        CommonUtils.save_progress_message(self._messageURL,
                                          progressMessage,
                                          ignore=self._ignoreMsgFlag)

        self._start_time = time.time()
        ignoreColumnSuggestions = []
        ignoreColumnReason = []
        utf8ColumnSuggestion = []
        dateTimeSuggestions = {}
        for column in self._data_frame.columns:
            random_slug = uuid.uuid4().hex
            headers.append(ColumnHeader(name=column, slug=random_slug))
            data = ColumnData()
            data.set_slug(random_slug)
            data.set_name(column)
            data.set_abstract_datatype(
                self._column_type_dict[column]["abstract"])

            columnStat = []
            columnChartData = None
            if self._column_type_dict[column]["abstract"] == "measure":
                data.set_column_stats(measureColumnStat[column])
                data.set_column_chart(measureCharts[column])
                data.set_actual_datatype(
                    self._column_type_dict[column]["actual"])
            elif self._column_type_dict[column]["abstract"] == "dimension":
                data.set_column_stats(dimensionColumnStat[column])
                data.set_column_chart(dimensionCharts[column])
                data.set_actual_datatype(
                    self._column_type_dict[column]["actual"])
            elif self._column_type_dict[column]["abstract"] == "datetime":
                data.set_column_stats(timeDimensionColumnStat[column])
                data.set_column_chart(timeDimensionCharts[column])
                data.set_actual_datatype(
                    self._column_type_dict[column]["actual"])

            if self._column_type_dict[column]["abstract"] == "measure":
                if column not in self._real_columns:
                    ignoreSuggestion, ignoreReason = metaHelperInstance.get_ignore_column_suggestions(
                        self._data_frame,
                        column,
                        "measure",
                        measureColumnStat[column],
                        max_levels=self._max_levels)
                    if ignoreSuggestion:
                        ignoreColumnSuggestions.append(column)
                        ignoreColumnReason.append(ignoreReason)
                        data.set_level_count_to_null()
                        data.set_chart_data_to_null()
                        data.set_ignore_suggestion_flag(True)
                        data.set_ignore_suggestion_message(ignoreReason)

            elif self._column_type_dict[column]["abstract"] == "dimension":
                if self._level_count_flag:
                    utf8Suggestion = metaHelperInstance.get_utf8_suggestions(
                        dimensionColumnStat[column])
                else:
                    utf8Suggestion = False
                if self._column_type_dict[column]["actual"] != "boolean":
                    uniqueVals = self._data_frame.select(
                        column).distinct().na.drop().collect()
                else:
                    uniqueVals = []
                if len(uniqueVals) > 0:
                    dateColumnFormat = metaHelperInstance.get_datetime_format(
                        uniqueVals)
                else:
                    dateColumnFormat = None
                if dateColumnFormat:
                    dateTimeSuggestions.update({column: dateColumnFormat})
                    data.set_level_count_to_null()
                    data.set_chart_data_to_null()
                    data.set_date_suggestion_flag(True)

                if utf8Suggestion:
                    utf8ColumnSuggestion.append(column)
                ignoreSuggestion, ignoreReason = metaHelperInstance.get_ignore_column_suggestions(
                    self._data_frame,
                    column,
                    "dimension",
                    dimensionColumnStat[column],
                    max_levels=self._max_levels)
                if ignoreSuggestion:
                    ignoreColumnSuggestions.append(column)
                    ignoreColumnReason.append(ignoreReason)
                    data.set_level_count_to_null()
                    data.set_chart_data_to_null()
                    data.set_ignore_suggestion_flag(True)
                    data.set_ignore_suggestion_message(ignoreReason)

            columnData.append(data)
        for dateColumn in dateTimeSuggestions.keys():
            if dateColumn in ignoreColumnSuggestions:
                ignoreColIdx = ignoreColumnSuggestions.index(dateColumn)
                ignoreColumnSuggestions.remove(dateColumn)
                del (ignoreColumnReason[ignoreColIdx])
        for utfCol in utf8ColumnSuggestion:
            ignoreColumnSuggestions.append(utfCol)
            ignoreColumnReason.append("utf8 values present")
        progressMessage = CommonUtils.create_progress_message_object(
            self._analysisName,
            "custom",
            "info",
            "Validating Metadata Information",
            self._completionStatus,
            self._completionStatus,
            display=True)
        CommonUtils.save_progress_message(self._messageURL,
                                          progressMessage,
                                          ignore=self._ignoreMsgFlag)
        metaData.append(
            MetaData(name="ignoreColumnSuggestions",
                     value=ignoreColumnSuggestions,
                     display=False))
        metaData.append(
            MetaData(name="ignoreColumnReason",
                     value=ignoreColumnReason,
                     display=False))
        metaData.append(
            MetaData(name="utf8ColumnSuggestion",
                     value=utf8ColumnSuggestion,
                     display=False))
        metaData.append(
            MetaData(name="dateTimeSuggestions",
                     value=dateTimeSuggestions,
                     display=False))
        metaData.append(
            MetaData(name="dataSizeSummary",
                     value=self._dataSize,
                     display=False))
        dfMetaData = DfMetaData()
        dfMetaData.set_column_data(columnData)
        dfMetaData.set_header(headers)
        dfMetaData.set_meta_data(metaData)
        dfMetaData.set_sample_data(sampleData)

        time_taken_suggestions = time.time() - self._start_time
        self._completionStatus += self._scriptStages["suggestions"]["weight"]
        # print "suggestions take",time_taken_suggestions
        progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\
                                    "suggestions",\
                                    "info",\
                                    self._scriptStages["suggestions"]["summary"],\
                                    self._completionStatus,\
                                    self._completionStatus)
        CommonUtils.save_progress_message(self._messageURL,
                                          progressMessage,
                                          ignore=self._ignoreMsgFlag)
        self._dataframe_context.update_completion_status(
            self._completionStatus)
        return dfMetaData
# Example #9
# 0
    def run(self):
        """Compute and return dataset-level metadata as a ``DfMetaData``.

        Pipeline (each stage pushes a progress message to ``self._messageURL``
        via ``CommonUtils.save_progress_message`` and advances
        ``self._completionStatus`` by the stage weight from
        ``self._scriptStages``):

        1. Sample the data; scan string columns for hidden datetime values and
           promote any matches to timestamp columns.
        2. Measure (numeric) column statistics and charts.
        3. Time-dimension column statistics; columns the helper cannot process
           are demoted back to the string/dimension pool.
        4. Dimension (string/boolean) column statistics.
        5. Duplicate-column detection, per-column ``ColumnData`` assembly with
           ignore/utf8 suggestions, then the final ``DfMetaData``.

        Works on both Spark (``self._pandas_flag`` False) and pandas (True)
        dataframes; several steps branch on that flag.

        Returns:
            DfMetaData: populated with column data, headers, metadata entries
            and the (pandas) sample data.
        """
        self._start_time = time.time()
        # Helper object that knows how to compute stats for both engines.
        metaHelperInstance = MetaDataHelper(self._data_frame, self._total_rows)
        sampleData = metaHelperInstance.get_sample_data()
        if not self._pandas_flag:
            # Spark sample is small enough to materialize; downstream code
            # treats sampleData as a pandas frame in both modes.
            sampleData = sampleData.toPandas()
        time_taken_sampling = time.time()-self._start_time
        self._completionStatus += self._scriptStages["sampling"]["weight"]
        progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\
                                    "sampling",\
                                    "info",\
                                    self._scriptStages["sampling"]["summary"],\
                                    self._completionStatus,\
                                    self._completionStatus)
        CommonUtils.save_progress_message(self._messageURL,progressMessage,ignore=self._ignoreMsgFlag)

        metaData = []
        metaData.append(MetaData(name="noOfRows",value=self._total_rows,display=True,displayName="Rows"))
        metaData.append(MetaData(name="noOfColumns",value=self._total_columns,display=True,displayName="Columns"))
        # self._percentage_columns = metaHelperInstance.get_percentage_columns(self._string_columns)
        separation_time=time.time()
        # String columns that actually contain datetimes are collected here
        # and promoted to timestamp columns after the scan.
        self._timestamp_string_columns=[]
        uniqueVals = []
        dateTimeSuggestions = {}
        if not self._pandas_flag:
            for column in self._string_columns:
                if self._column_type_dict[column]["actual"] != "boolean":
                    # uniqueVals = self._data_frame.select(column).na.drop().distinct().limit(10).collect()
                    # Unique values come from the sample, not the full frame.
                    uniqueVals = sampleData[column].unique().tolist()
                else:
                    uniqueVals = []
                ## TODO : remove pandas if not needed later
                # NOTE(review): this branch is unreachable — the enclosing
                # `if not self._pandas_flag:` guarantees the flag is False
                # here. Pandas string columns are never scanned for dates.
                if self._pandas_flag:
                    if len(uniqueVals) > 0 and metaHelperInstance.get_datetime_format_pandas([self._data_frame.sort_values(by=column,ascending=False)[column][0]])!=None:
                        dateColumnFormat = metaHelperInstance.get_datetime_format_pandas(uniqueVals)
                    else:
                        dateColumnFormat = None
                else:
                    # Cheap probe: try to parse only the max value of the
                    # column; only if that parses, infer the format from the
                    # sampled unique values.
                    if len(uniqueVals) > 0 and metaHelperInstance.get_datetime_format([self._data_frame.orderBy([column],ascending=[False]).select(column).first()[0]])!=None:
                        dateColumnFormat = metaHelperInstance.get_datetime_format(uniqueVals)
                    else:
                        dateColumnFormat = None

                if dateColumnFormat:
                    # Reclassify the column as datetime and convert it in the
                    # underlying dataframe itself.
                    dateTimeSuggestions.update({column:dateColumnFormat})
                    data=ColumnData()
                    data.set_level_count_to_null()
                    data.set_chart_data_to_null()
                    data.set_date_suggestion_flag(True)
                    data.set_abstract_datatype("datetime")
                    data.set_actual_datatype("datetime")
                    self._timestamp_string_columns.append(column)
                    ## TO DO : remove pandas if not needed later
                    if self._pandas_flag:
                        self._data_frame[column] = pd.to_datetime(self._data_frame[column],format=dateColumnFormat)
                    else:
                        self._data_frame = self._data_frame.withColumn(column, self.to_date_(column))
        sampleData = metaHelperInstance.format_sampledata_timestamp_columns(sampleData,self._timestamp_columns,self._stripTimestamp)
        print("sampling takes",time_taken_sampling)
        # Move the promoted columns out of the string pool and into the
        # timestamp pool.
        self._string_columns = list(set(self._string_columns)-set(self._timestamp_string_columns))

        self._timestamp_columns = self._timestamp_columns+self._timestamp_string_columns
        # self.update_column_type_dict()

        print("time taken for separating date columns from string is :", time.time()-separation_time)


        # if len(self._percentage_columns)>0:
        #     self._data_frame = CommonUtils.convert_percentage_columns(self._data_frame,self._percentage_columns)
        #     self._numeric_columns = self._numeric_columns + self._percentage_columns
        #     self._string_columns = list(set(self._string_columns)-set(self._percentage_columns))
        #     self.update_column_type_dict()

        # self._dollar_columns = metaHelperInstance.get_dollar_columns(self._string_columns)
        # if len(self._dollar_columns)>0:
        #     self._data_frame = CommonUtils.convert_dollar_columns(self._data_frame,self._dollar_columns)
        #     self._numeric_columns = self._numeric_columns + self._dollar_columns
        #     self._string_columns = list(set(self._string_columns)-set(self._dollar_columns))
        #     self.update_column_type_dict()


        columnData = []
        headers = []

        # ---- Stage: measure (numeric) column statistics ----
        self._start_time = time.time()
        print("Count of Numeric columns",len(self._numeric_columns))
        try:
            measureColumnStat,measureCharts = metaHelperInstance.calculate_measure_column_stats(self._data_frame,self._numeric_columns,binColumn=self._binned_stat_flag,pandas_flag=self._pandas_flag)
        except Exception as e:
            # NOTE(review): re-wrapping loses the original traceback; kept
            # as-is since callers may rely on seeing a plain Exception.
            raise Exception(e)
        time_taken_measurestats = time.time()-self._start_time
        self._completionStatus += self._scriptStages["measurestats"]["weight"]
        print("measure stats takes",time_taken_measurestats)
        progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\
                                    "measurestats",\
                                    "info",\
                                    self._scriptStages["measurestats"]["summary"],\
                                    self._completionStatus,\
                                    self._completionStatus)
        CommonUtils.save_progress_message(self._messageURL,progressMessage,ignore=self._ignoreMsgFlag)
        print("Count of DateTime columns",len(self._timestamp_columns))

        # ---- Stage: time-dimension column statistics ----
        self._start_time = time.time()
        # time_columns=self._timestamp_columns
        # time_string_columns=self._timestamp_string_columns
        # original_timestamp_columns=list(set(self._timestamp_columns)-set(self._timestamp_string_columns))
        timeDimensionColumnStat,timeDimensionCharts, unprocessed_columns = metaHelperInstance.calculate_time_dimension_column_stats(self._data_frame,self._timestamp_columns,level_count_flag=self._level_count_flag,pandas_flag=self._pandas_flag)
        # Columns the helper could not treat as datetime fall back to being
        # plain dimensions.
        self._string_columns = self._string_columns + unprocessed_columns
        self._timestamp_columns = list(set(self._timestamp_columns) - set(unprocessed_columns))
        self.update_column_type_dict()


        # Singular/plural display names depend on the final column counts.
        if len(self._numeric_columns) > 1:
            # print "self._numeric_columns : ", self._numeric_columns
            metaData.append(MetaData(name="measures",value=len(self._numeric_columns),display=True,displayName="Measures"))
        else:
            metaData.append(MetaData(name="measures",value=len(self._numeric_columns),display=True,displayName="Measure"))
        if len(self._string_columns) > 1:
            metaData.append(MetaData(name="dimensions",value=len(self._string_columns+self._boolean_columns),display=True,displayName="Dimensions"))
        else:
            metaData.append(MetaData(name="dimensions",value=len(self._string_columns+self._boolean_columns),display=True,displayName="Dimension"))
        if len(self._timestamp_columns) > 1:
            metaData.append(MetaData(name="timeDimension",value=len(self._timestamp_columns),display=True,displayName="Time Dimensions"))
        else:
            metaData.append(MetaData(name="timeDimension",value=len(self._timestamp_columns),display=True,displayName="Time Dimension"))

        metaData.append(MetaData(name="measureColumns",value = self._numeric_columns,display=False))
        metaData.append(MetaData(name="dimensionColumns",value = self._string_columns+self._boolean_columns,display=False))
        metaData.append(MetaData(name="timeDimensionColumns",value = self._timestamp_columns,display=False))
        # metaData.append(MetaData(name="percentageColumns",value = self._percentage_columns,display=False))
        # metaData.append(MetaData(name="dollarColumns",value = self._dollar_columns,display=False))

        # timeDimensionColumnStat2,timeDimensionCharts2,unprocessed_columns = metaHelperInstance.calculate_time_dimension_column_stats_from_string(self._data_frame,self._timestamp_string_columns,level_count_flag=self._level_count_flag)
        # gc.collect()
        # timeDimensionColumnStat.update(timeDimensionColumnStat2)
        # timeDimensionCharts.update(timeDimensionCharts2)
        time_taken_tdstats = time.time()-self._start_time
        self._completionStatus += self._scriptStages["timedimensionstats"]["weight"]
        print("time dimension stats takes",time_taken_tdstats)
        progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\
                                    "timedimensionstats",\
                                    "info",\
                                    self._scriptStages["timedimensionstats"]["summary"],\
                                    self._completionStatus,\
                                    self._completionStatus)
        CommonUtils.save_progress_message(self._messageURL,progressMessage,ignore=self._ignoreMsgFlag)

        # ---- Stage: dimension (string/boolean) column statistics ----
        self._start_time = time.time()
        try :
            dimensionColumnStat,dimensionCharts = metaHelperInstance.calculate_dimension_column_stats(self._data_frame,self._string_columns+self._boolean_columns,levelCount=self._level_count_flag,pandas_flag=self._pandas_flag)
        except Exception as e:
            raise Exception(e)
        # Per-dimension unique-value counts, extracted from each column's
        # stat list (the entry named "numberOfUniqueValues").
        self._dataSize["dimensionLevelCountDict"] = {k:[x for x in v if x["name"]=="numberOfUniqueValues"][0]["value"] for k,v in list(dimensionColumnStat.items())}
        self._dataSize["totalLevels"] = sum(self._dataSize["dimensionLevelCountDict"].values())

        time_taken_dimensionstats = time.time()-self._start_time
        self._completionStatus += self._scriptStages["dimensionstats"]["weight"]
        # print "dimension stats takes",time_taken_dimensionstats
        progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\
                                    "dimensionstats",\
                                    "info",\
                                    self._scriptStages["dimensionstats"]["summary"],\
                                    self._completionStatus,\
                                    self._completionStatus)
        CommonUtils.save_progress_message(self._messageURL,progressMessage,ignore=self._ignoreMsgFlag)

        # ---- Stage: suggestions + per-column ColumnData assembly ----
        self._start_time = time.time()
        ignoreColumnSuggestions = []
        ignoreColumnReason = []
        utf8ColumnSuggestion = []

        dup_cols = []
        #columns = self._data_frame.columns
        # checkDupColName groups columns whose stats match; for each group,
        # every column after the first gets a "Duplicate" marker pointing at
        # the group's first column, once the value-level check confirms it.
        measureDupCols=self.checkDupColName(measureColumnStat)
        dimensionDupCols=self.checkDupColName(dimensionColumnStat)
        timeDimensionDupCols=self.checkDupColName(timeDimensionColumnStat)
        if self._pandas_flag:
            for i in measureDupCols:
                if self.checkDuplicateCols_pandas(i[0],i[1]) == True:
                    for j in i[1:]:
                        if dict(name="Duplicate",value=True) not in measureColumnStat[j]:
                            measureColumnStat[j].append(dict(name="Duplicate",value=i[0]))
            for i in dimensionDupCols:
                if self.checkDuplicateCols_pandas(i[0],i[1]) == True:
                    for j in i[1:]:
                        if dict(name="Duplicate",value=True) not in dimensionColumnStat[j]:
                            dimensionColumnStat[j].append(dict(name="Duplicate",value=i[0]))
            for i in timeDimensionDupCols:
                if self.checkDuplicateCols_pandas(i[0],i[1]) == True:
                    for j in i[1:]:
                        if dict(name="Duplicate",value=True) not in timeDimensionColumnStat[j]:
                            timeDimensionColumnStat[j].append(dict(name="Duplicate",value=i[0]))
        else:
            for i in measureDupCols:
                if self.checkDuplicateCols(i[0],i[1]) == True:
                    for j in i[1:]:
                        if dict(name="Duplicate",value=True) not in measureColumnStat[j]:
                            measureColumnStat[j].append(dict(name="Duplicate",value=i[0]))
            for i in dimensionDupCols:
                # NOTE(review): only the Spark dimension check passes a third
                # argument (True) — presumably a string/na-handling switch in
                # checkDuplicateCols; confirm against its definition.
                if self.checkDuplicateCols(i[0],i[1],True) == True:
                    for j in i[1:]:
                        if dict(name="Duplicate",value=True) not in dimensionColumnStat[j]:
                            dimensionColumnStat[j].append(dict(name="Duplicate",value=i[0]))
            for i in timeDimensionDupCols:
                if self.checkDuplicateCols(i[0],i[1]) == True:
                    for j in i[1:]:
                        if dict(name="Duplicate",value=True) not in timeDimensionColumnStat[j]:
                            timeDimensionColumnStat[j].append(dict(name="Duplicate",value=i[0]))

        for column in self._data_frame.columns:
            # Each column gets a stable random slug used by header and data.
            random_slug = uuid.uuid4().hex
            headers.append(ColumnHeader(name=column,slug=random_slug))
            data = ColumnData()
            data.set_slug(random_slug)
            data.set_name(column)
            data.set_abstract_datatype(self._column_type_dict[column]["abstract"])
            data.set_checker(True)
            changeflage=False
            columnStat = []
            columnChartData = None
            # actual_col_datatype_update holds user-requested datatype
            # overrides as a list of single-key {column: newType} dicts;
            # changeflage (sic) records whether this column has one.
            check_datatype_change=self.actual_col_datatype_update
            if len(check_datatype_change)!=0:
                for i in check_datatype_change:
                    if list(i.keys())[0]==column:
                        changeflage=True
                        changeType=i[column]
                        break
                    else:
                        changeflage=False
            else:
                changeflage=False
            # Attach the stats/charts computed for this column's abstract
            # type; an override flips the reported actual datatype.
            if self._column_type_dict[column]["abstract"] == "measure":
                data.set_column_stats(measureColumnStat[column])
                data.set_column_chart(measureCharts[column])
                if changeflage:
                    data.set_actual_datatype("dimension")
                else:
                    data.set_actual_datatype(self._column_type_dict[column]["actual"])
            elif self._column_type_dict[column]["abstract"] == "dimension":
                data.set_column_stats(dimensionColumnStat[column])
                data.set_column_chart(dimensionCharts[column])
                if changeflage:
                    data.set_actual_datatype("measure")
                else:
                    data.set_actual_datatype(self._column_type_dict[column]["actual"])
            elif self._column_type_dict[column]["abstract"] == "datetime":
                data.set_column_stats(timeDimensionColumnStat[column])
                data.set_column_chart(timeDimensionCharts[column])
                if changeflage:
                    data.set_actual_datatype("dimension")
                else:
                    data.set_actual_datatype(self._column_type_dict[column]["actual"])
            # Ignore-column suggestions per abstract type; a hit flags the
            # column and records the reason on the ColumnData.
            if self._column_type_dict[column]["abstract"] == "measure":
                #if column not in self._real_columns:
                ignoreSuggestion,ignoreReason = metaHelperInstance.get_ignore_column_suggestions(self._data_frame,self._total_rows,column,"measure",measureColumnStat[column],max_levels=self._max_levels)
                if ignoreSuggestion:
                    ignoreColumnSuggestions.append(column)
                    ignoreColumnReason.append(ignoreReason)
                    #data.set_level_count_to_null()
                    #data.set_chart_data_to_null()
                    data.set_ignore_suggestion_flag(True)
                    data.set_ignore_suggestion_message(ignoreReason)
            elif self._column_type_dict[column]["abstract"] == "dimension":
                ignoreSuggestion,ignoreReason = metaHelperInstance.get_ignore_column_suggestions(self._data_frame,self._total_rows,column,"dimension",dimensionColumnStat[column],max_levels=self._max_levels)
                if ignoreSuggestion:
                    ignoreColumnSuggestions.append(column)
                    ignoreColumnReason.append(ignoreReason)
                    if ignoreReason=="Number of Levels are more than the defined thershold":
                        data.set_ignore_suggestion_preview_flag(False)
                    #data.set_level_count_to_null()
                    #data.set_chart_data_to_null()
                    data.set_ignore_suggestion_flag(True)
                    data.set_ignore_suggestion_message(ignoreReason)
                if self._level_count_flag:
                    utf8Suggestion = metaHelperInstance.get_utf8_suggestions(dimensionColumnStat[column])
                else:
                    utf8Suggestion = False
                if utf8Suggestion:
                    utf8ColumnSuggestion.append(column)
                    # NOTE(review): this re-runs the same ignore check made
                    # just above for the same column — possibly redundant.
                    ignoreSuggestion,ignoreReason = metaHelperInstance.get_ignore_column_suggestions(self._data_frame,self._total_rows,column,"dimension",dimensionColumnStat[column],max_levels=self._max_levels)
                    if ignoreSuggestion:
                        ignoreColumnSuggestions.append(column)
                        ignoreColumnReason.append(ignoreReason)
                        #data.set_level_count_to_null()
                        #data.set_chart_data_to_null()
                        data.set_ignore_suggestion_flag(True)
                        data.set_ignore_suggestion_message(ignoreReason)

            elif self._column_type_dict[column]["abstract"] == "datetime":
                ignoreSuggestion,ignoreReason = metaHelperInstance.get_ignore_column_suggestions(self._data_frame,self._total_rows,column,"datetime",timeDimensionColumnStat[column],max_levels=self._max_levels)
                if ignoreSuggestion:
                    ignoreColumnSuggestions.append(column)
                    ignoreColumnReason.append(ignoreReason)
                    #data.set_level_count_to_null()
                    #data.set_chart_data_to_null()
                    data.set_ignore_suggestion_flag(True)
                    data.set_ignore_suggestion_message(ignoreReason)
            columnData.append(data)
            # NOTE(review): uniqueVals here is whatever remained from the
            # string-column scan at the top of the method — it is NOT
            # recomputed for this column, so every iteration re-parses the
            # same leftover values. Looks like stale code; confirm intent.
            if len(uniqueVals) > 0:
                dateColumnFormat = metaHelperInstance.get_datetime_format(uniqueVals)
            else:
                dateColumnFormat = None
            if dateColumnFormat:
                dateTimeSuggestions.update({column:dateColumnFormat})
        # utf8 columns are also surfaced as ignore suggestions.
        for utfCol in utf8ColumnSuggestion:
            ignoreColumnSuggestions.append(utfCol)
            ignoreColumnReason.append("utf8 values present")
        progressMessage = CommonUtils.create_progress_message_object(self._analysisName,"custom","info","Validating Metadata Information",self._completionStatus,self._completionStatus,display=True)
        CommonUtils.save_progress_message(self._messageURL,progressMessage,ignore=self._ignoreMsgFlag)
        metaData.append(MetaData(name="ignoreColumnSuggestions",value = ignoreColumnSuggestions,display=False))
        metaData.append(MetaData(name="ignoreColumnReason",value = ignoreColumnReason,display=False))
        metaData.append(MetaData(name="utf8ColumnSuggestion",value = utf8ColumnSuggestion,display=False))
        metaData.append(MetaData(name="dateTimeSuggestions",value = dateTimeSuggestions,display=False))
        metaData.append(MetaData(name="dataSizeSummary",value = self._dataSize,display=False))
        # Final assembly of the result object.
        dfMetaData = DfMetaData()
        dfMetaData.set_column_data(columnData)
        dfMetaData.set_header(headers)
        dfMetaData.set_meta_data(metaData)
        dfMetaData.set_sample_data(sampleData)

        time_taken_suggestions = time.time()-self._start_time
        self._completionStatus += self._scriptStages["suggestions"]["weight"]
        # print "suggestions take",time_taken_suggestions
        progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\
                                    "suggestions",\
                                    "info",\
                                    self._scriptStages["suggestions"]["summary"],\
                                    self._completionStatus,\
                                    self._completionStatus)
        CommonUtils.save_progress_message(self._messageURL,progressMessage,ignore=self._ignoreMsgFlag)
        self._dataframe_context.update_completion_status(self._completionStatus)
        return dfMetaData
# Example #10
# 0
    def _generate_summary(self):
        """Build the decision-tree summary card and attach it to self._decisionTreeNode.

        From the rule data prepared by the decision-tree scripts
        (self._table, self.success_percent, self.total_predictions,
        self.successful_predictions) this assembles:
          * a popup table of prediction rules per target level,
          * counts of "strong"/"mixed" rules split at a 75% probability cutoff,
          * a donut chart of prediction frequencies (or of probability
            ranges when the story is built on scored data),
          * a level-selection dropdown and a templated narrative summary.
        Also emits one progress message to self._messageURL as a side effect.
        """
        data_dict = {}
        rules_dict = self._table
        data_dict["blockSplitter"] = self._blockSplitter
        data_dict["targetcol"] = self._colname
        groups = rules_dict.keys()
        # Rules with success probability >= 75 are labelled "strong", the
        # rest "mixed" (see groupArray below).
        probabilityCutoff = 75
        probabilityGroups = [{
            "probability": probabilityCutoff,
            "count": 0,
            "range": [probabilityCutoff, 100]
        }, {
            "probability": probabilityCutoff - 1,
            "count": 0,
            "range": [0, probabilityCutoff - 1]
        }]
        # First row is the table header; rule rows are appended per level.
        tableArray = [[
            "Prediction Rule", "Probability", "Prediction", "Freq", "group",
            "richRules"
        ]]
        dropdownData = []
        chartDict = {}
        targetLevel = self._dataframe_context.get_target_level_for_model()
        probabilityArrayAll = []

        self._completionStatus = self._dataframe_context.get_completion_status(
        )
        progressMessage = CommonUtils.create_progress_message_object(
            self._analysisName,
            "custom",
            "info",
            "Generating Prediction rules",
            self._completionStatus,
            self._completionStatus,
            display=True)
        CommonUtils.save_progress_message(self._messageURL,
                                          progressMessage,
                                          ignore=False)
        self._dataframe_context.update_completion_status(
            self._completionStatus)
        # Put the modelled target level first so it becomes the pre-selected
        # dropdown entry; remaining levels follow in dict iteration order.
        targetValues = [x for x in rules_dict.keys() if x == targetLevel
                        ] + [x for x in rules_dict.keys() if x != targetLevel]
        for idx, target in enumerate(targetValues):
            # On scored-data stories the display name is prefixed with the
            # target column name ("col : level"); otherwise just the level.
            if idx == 0:
                if self._dataframe_context.get_story_on_scored_data() != True:
                    dropdownData.append({
                        "displayName": target,
                        "name": target,
                        "selected": True,
                        "id": idx + 1
                    })
                else:
                    dropdownData.append({
                        "displayName":
                        "{} : {}".format(self._colname, target),
                        "name":
                        target,
                        "selected":
                        True,
                        "id":
                        idx + 1
                    })
            else:
                if self._dataframe_context.get_story_on_scored_data() != True:
                    dropdownData.append({
                        "displayName": target,
                        "name": target,
                        "selected": False,
                        "id": idx + 1
                    })
                else:
                    dropdownData.append({
                        "displayName":
                        "{} : {}".format(self._colname, target),
                        "name":
                        target,
                        "selected":
                        False,
                        "id":
                        idx + 1
                    })
            rulesArray = rules_dict[target]
            probabilityArray = [
                round(x, 2) for x in self.success_percent[target]
            ]
            probabilityArrayAll += probabilityArray
            groupArray = [
                "strong" if x >= probabilityCutoff else "mixed"
                for x in probabilityArray
            ]
            # Accumulate per-range rule counts across all target levels.
            for idx2, obj in enumerate(probabilityGroups):
                grpCount = len([
                    x for x in probabilityArray
                    if x >= obj["range"][0] and x <= obj["range"][1]
                ])
                obj["count"] += grpCount
                probabilityGroups[idx2] = obj
            predictionArray = [target] * len(rulesArray)
            freqArray = self.total_predictions[target]
            chartDict[target] = sum(freqArray)
            success = self.successful_predictions[target]
            success_percent = self.success_percent[target]
            richRulesArray = []
            crudeRuleArray = []
            analysisType = self._dataframe_context.get_analysis_type()
            targetCol = self._dataframe_context.get_result_column()
            # binFlag tells generate_rules whether the target column was
            # binned via custom analysis details.
            binFlag = False
            if self._dataframe_context.get_custom_analysis_details() != None:
                binnedColObj = [
                    x["colName"] for x in
                    self._dataframe_context.get_custom_analysis_details()
                ]
                if binnedColObj != None and targetCol in binnedColObj:
                    binFlag = True
            for idx2, crudeRule in enumerate(rulesArray):
                richRule, crudeRule = NarrativesUtils.generate_rules(
                    self._colname,
                    target,
                    crudeRule,
                    freqArray[idx2],
                    success[idx2],
                    success_percent[idx2],
                    analysisType,
                    binFlag=binFlag)
                richRulesArray.append(richRule)
                crudeRuleArray.append(crudeRule)
            # Format probabilities for display, e.g. spelled-out numbers for
            # values >= 10 (presumably via humanize.apnumber) — NOTE(review):
            # this file uses Python 2 print syntax, where map() returns a
            # list; on Python 3 the resulting iterator is consumed once by
            # the zip below, so behavior still holds.
            probabilityArray = map(
                lambda x: humanize.apnumber(x) + "%"
                if x >= 10 else str(int(x)) + "%", probabilityArray)
            # targetArray = zip(richRulesArray,probabilityArray,predictionArray,freqArray,groupArray)
            targetArray = zip(crudeRuleArray, probabilityArray,
                              predictionArray, freqArray, groupArray,
                              richRulesArray)
            targetArray = [list(x) for x in targetArray]
            tableArray += targetArray

        # Donut chart: on scored data, bucket probabilities into the globally
        # configured ranges; otherwise chart total predictions per level.
        donutChartMaxLevel = 10
        if self._dataframe_context.get_story_on_scored_data() == True:
            chartDict = {}
            probabilityRangeForChart = GLOBALSETTINGS.PROBABILITY_RANGE_FOR_DONUT_CHART
            chartDict = dict(
                zip(probabilityRangeForChart.keys(),
                    [0] * len(probabilityRangeForChart)))
            for val in probabilityArrayAll:
                for grps, grpRange in probabilityRangeForChart.items():
                    if val > grpRange[0] and val <= grpRange[1]:
                        chartDict[grps] = chartDict[grps] + 1
            chartDict = {k: v for k, v in chartDict.items() if v != 0}
        else:
            chartDict = dict([(k, sum(v))
                              for k, v in self.total_predictions.items()])
            chartDict = {k: v for k, v in chartDict.items() if v != 0}
        if len(chartDict) > donutChartMaxLevel:
            chartDict = NarrativesUtils.restructure_donut_chart_data(
                chartDict, nLevels=donutChartMaxLevel)
        chartData = NormalChartData([chartDict]).get_data()
        chartJson = ChartJson(data=chartData)
        chartJson.set_title(self._colname)
        chartJson.set_chart_type("donut")
        mainCardChart = C3ChartData(data=chartJson)
        mainCardChart.set_width_percent(45)
        # mainCardChart = {"dataType": "c3Chart","widthPercent":33 ,"data": {"data": [chartDict],"title":self._colname,"axes":{},"label_text":{},"legend":{},"yAxisNumberFormat": ".2s","types":None,"axisRotation":False, "chart_type": "donut"}}

        dropdownDict = {
            "dataType": "dropdown",
            "label": "Showing prediction rules for",
            "data": dropdownData
        }

        data_dict["probabilityGroups"] = probabilityGroups
        if self._dataframe_context.get_story_on_scored_data() != True:
            maincardSummary = NarrativesUtils.get_template_output(self._base_dir,\
                                                        'decisiontreesummary.html',data_dict)
        else:
            # Scored-data path: derive per-level predicted counts from the
            # table rows (row[2] = prediction level, row[3] = frequency).
            predictedLevelcountArray = [(x[2], x[3]) for x in tableArray[1:]]
            predictedLevelCountDict = {}
            # predictedLevelcountDict = defaultdict(predictedLevelcountArray)
            for val in predictedLevelcountArray:
                predictedLevelCountDict.setdefault(val[0], []).append(val[1])

            levelCountDict = {}
            for k, v in predictedLevelCountDict.items():
                levelCountDict[k] = sum(v)
            # levelCountDict = self._metaParser.get_unique_level_dict(self._colname)
            total = float(
                sum([x for x in levelCountDict.values() if x != None]))
            levelCountTuple = [{
                "name": k,
                "count": v,
                "percentage": round(v * 100 / total, 2)
            } for k, v in levelCountDict.items() if v != None]
            percentageArray = [x["percentage"] for x in levelCountTuple]
            percentageArray = NarrativesUtils.ret_smart_round(percentageArray)
            levelCountTuple = [{
                "name": obj["name"],
                "count": obj["count"],
                "percentage": str(percentageArray[idx]) + "%"
            } for idx, obj in enumerate(levelCountTuple)]
            data_dict["nlevel"] = len(levelCountDict)
            print "levelCountTuple", levelCountTuple
            print "levelCountDict", levelCountDict
            # topLevel is the modelled target level if present, else the
            # first level; secondLevel is the most frequent remaining level.
            if targetLevel in levelCountDict:
                data_dict["topLevel"] = [
                    x for x in levelCountTuple if x["name"] == targetLevel
                ][0]
                if len(levelCountTuple) > 1:
                    data_dict["secondLevel"] = max([
                        x for x in levelCountTuple if x["name"] != targetLevel
                    ],
                                                   key=lambda x: x["count"])
                else:
                    data_dict["secondLevel"] = None
            else:
                data_dict["topLevel"] = levelCountTuple[0]
                if len(levelCountTuple) > 1:
                    data_dict["secondLevel"] = levelCountTuple[1]
                else:
                    data_dict["secondLevel"] = None
            print data_dict
            maincardSummary = NarrativesUtils.get_template_output(
                self._base_dir, 'decisiontreescore.html', data_dict)
        # Assemble the card: narrative blocks, chart, dropdown, rules table,
        # and (if available) the unique-identifier table.
        main_card = NormalCard()
        main_card_data = []
        main_card_narrative = NarrativesUtils.block_splitter(
            maincardSummary, self._blockSplitter)
        main_card_data += main_card_narrative

        main_card_data.append(mainCardChart)
        main_card_data.append(dropdownDict)

        main_card_table = TableData()
        if self._dataframe_context.get_story_on_scored_data() == True:
            main_card_table.set_table_width(75)
        main_card_table.set_table_data(tableArray)
        main_card_table.set_table_type("popupDecisionTreeTable")
        main_card_data.append(main_card_table)
        uidTable = self._result_setter.get_unique_identifier_table()
        if uidTable != None:
            main_card_data.append(uidTable)
        else:
            main_card_table.set_table_width(100)
        main_card.set_card_data(main_card_data)
        main_card.set_card_name("Predicting Key Drivers of {}".format(
            self._colname))
        self._decisionTreeNode.add_a_card(main_card)
Пример #11
0
    def __init__(self,
                 column_name,
                 decision_tree_rules,
                 df_helper,
                 df_context,
                 meta_parser,
                 result_setter,
                 story_narrative=None,
                 analysisName=None,
                 scriptWeight=None):
        """Build decision-tree narratives for ``column_name``.

        Extracts rule/frequency data from ``decision_tree_rules``, runs
        ``self._generate_narratives()``, registers the resulting node and
        score cards on ``result_setter``, and reports start/end progress
        messages to the context's message URL as side effects.

        :param column_name: target dimension the decision tree was built for.
        :param decision_tree_rules: object exposing get_decision_rules(),
            get_table(), get_success(), get_total(), get_success_percent(),
            get_significant_vars() and get_target_contributions().
        :param df_helper: dataframe helper instance.
        :param df_context: dataframe context (config, progress and URL
            accessors).
        :param meta_parser: metadata parser instance.
        :param result_setter: collector that receives the generated node.
        :param story_narrative: optional story-level narrative container
            (currently only stored; the add_a_node call is commented out).
        :param analysisName: optional override for the analysis name taken
            from the context.
        :param scriptWeight: optional override for the dimension-analysis
            weight dict taken from the context.
        """
        self._story_narrative = story_narrative
        self._metaParser = meta_parser
        self._dataframe_context = df_context
        self._ignoreMsg = self._dataframe_context.get_message_ignore()
        self._result_setter = result_setter
        self._blockSplitter = GLOBALSETTINGS.BLOCKSPLITTER
        self._column_name = column_name.lower()
        self._colname = column_name

        # Capitalize only the first character, preserving the rest as-is.
        self._capitalized_column_name = "%s%s" % (column_name[0].upper(),
                                                  column_name[1:])
        self._decision_rules_dict = decision_tree_rules.get_decision_rules()
        self._decision_tree_json = CommonUtils.as_dict(decision_tree_rules)
        self._decision_tree_raw = self._decision_rules_dict
        # self._decision_tree_raw = {"tree":{"children":None}}
        # self._decision_tree_raw['tree']["children"] = self._decision_tree_json['tree']["children"]
        self._table = decision_tree_rules.get_table()
        self._new_table = {}
        self.successful_predictions = decision_tree_rules.get_success()
        self.total_predictions = decision_tree_rules.get_total()
        self.success_percent = decision_tree_rules.get_success_percent()
        self._important_vars = decision_tree_rules.get_significant_vars()
        self._target_distribution = decision_tree_rules.get_target_contributions(
        )
        self._get_new_table()
        self._df_helper = df_helper
        self.subheader = None
        #self.table = {}
        self.dropdownComment = None
        self.dropdownValues = None
        self._base_dir = "/decisiontree/"

        self._completionStatus = self._dataframe_context.get_completion_status(
        )
        if analysisName == None:
            self._analysisName = self._dataframe_context.get_analysis_name()
        else:
            self._analysisName = analysisName
        self._messageURL = self._dataframe_context.get_message_url()
        if scriptWeight == None:
            self._scriptWeightDict = self._dataframe_context.get_dimension_analysis_weight(
            )
        else:
            self._scriptWeightDict = scriptWeight
        # Stage weights used to advance the overall completion percentage.
        self._scriptStages = {
            "dtreeNarrativeStart": {
                "summary": "Started the Decision Tree Narratives",
                "weight": 0
            },
            "dtreeNarrativeEnd": {
                "summary": "Narratives for Decision Tree Finished",
                "weight": 10
            },
        }
        # Report the "start" stage before generating narratives.
        self._completionStatus += self._scriptWeightDict[
            self._analysisName]["narratives"] * self._scriptStages[
                "dtreeNarrativeStart"]["weight"] / 10
        progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\
                                    "dtreeNarrativeStart",\
                                    "info",\
                                    self._scriptStages["dtreeNarrativeStart"]["summary"],\
                                    self._completionStatus,\
                                    self._completionStatus)
        CommonUtils.save_progress_message(self._messageURL,
                                          progressMessage,
                                          ignore=self._ignoreMsg)
        self._dataframe_context.update_completion_status(
            self._completionStatus)

        self._decisionTreeNode = NarrativesTree()
        self._decisionTreeNode.set_name("Prediction")
        self._generate_narratives()
        # self._story_narrative.add_a_node(self._decisionTreeNode)
        self._result_setter.set_decision_tree_node(self._decisionTreeNode)
        self._result_setter.set_score_dtree_cards(
            json.loads(
                CommonUtils.convert_python_object_to_json(
                    self._decisionTreeNode.get_all_cards())))

        # Report the "end" stage after narratives are generated.
        self._completionStatus = self._dataframe_context.get_completion_status(
        )
        self._completionStatus += self._scriptWeightDict[
            self._analysisName]["narratives"] * self._scriptStages[
                "dtreeNarrativeEnd"]["weight"] / 10
        progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\
                                    "dtreeNarrativeEnd",\
                                    "info",\
                                    self._scriptStages["dtreeNarrativeEnd"]["summary"],\
                                    self._completionStatus,\
                                    self._completionStatus)
        CommonUtils.save_progress_message(self._messageURL,
                                          progressMessage,
                                          ignore=self._ignoreMsg)
        self._dataframe_context.update_completion_status(
            self._completionStatus)
Пример #12
0
 def _generate_narratives(self):
     """Generate ANOVA ("Performance") narratives for every measure column.

     For each measure column in ``self._df_anova_result``: splits dimensions
     into significant/insignificant, renders the overview templates, builds
     an effect-size bar chart with inference text, adds the resulting card
     to ``self._anovaNodes``, emits a progress message, and generates the
     per-dimension narratives. When no dimension is significant, adds a
     single "no significant influence" card instead.
     """
     try:
         nColsToUse = self._analysisDict[
             self._analysisName]["noOfColumnsToUse"]
     # FIX: was a bare `except:` which also swallowed SystemExit /
     # KeyboardInterrupt; the intent is only "key missing -> no limit".
     except Exception:
         nColsToUse = None
     self._anovaNodes = NarrativesTree()
     self._anovaNodes.set_name("Performance")
     for measure_column in self._df_anova_result.get_measure_columns():
         measure_anova_result = self._df_anova_result.get_measure_result(
             measure_column)
         significant_dimensions_dict, insignificant_dimensions = measure_anova_result.get_OneWayAnovaSignificantDimensions(
         )
         num_dimensions = len(list(significant_dimensions_dict.items())
                              ) + len(insignificant_dimensions)
         # Significant dimensions ordered by descending effect size.
         significant_dimensions = [
             k for k, v in sorted(list(significant_dimensions_dict.items()),
                                  key=lambda x: -x[1])
         ]
         if nColsToUse != None:
             significant_dimensions = significant_dimensions[:nColsToUse]
         num_significant_dimensions = len(significant_dimensions)
         num_insignificant_dimensions = len(insignificant_dimensions)
         print("num_significant_dimensions", num_significant_dimensions)
         if num_significant_dimensions > 0:
             mainCard = NormalCard(name="Overview of Key Factors")
             data_c3 = []
             for sig_dim in significant_dimensions:
                 data_c3.append({
                     'dimension':
                     sig_dim,
                     'effect_size':
                     float(significant_dimensions_dict[sig_dim])
                 })
             self.narratives = {}
             self.narratives[AnovaNarratives.
                             KEY_HEADING] = "%s Performance Analysis" % (
                                 measure_column, )
             self.narratives['main_card'] = {}
             self.narratives['cards'] = []
             self.narratives['main_card'][
                 AnovaNarratives.
                 KEY_SUBHEADING] = "Relationship between %s and other Dimensions" % (
                     measure_column)
             self.narratives['main_card'][
                 AnovaNarratives.KEY_PARAGRAPH] = []
             data_dict = { \
                             'significant_dimensions' : significant_dimensions,
                             'insignificant_dimensions' : insignificant_dimensions,
                             'num_significant_dimensions' : num_significant_dimensions,
                             'num_insignificant_dimensions' : num_insignificant_dimensions,
                             'num_dimensions' : num_significant_dimensions+num_insignificant_dimensions,
                             'target' : measure_column \
                         }
             output = {'header': ''}
             output['content'] = NarrativesUtils.get_template_output(
                 self._base_dir, 'anova_template_1.html', data_dict)
             self.narratives['main_card'][
                 AnovaNarratives.KEY_PARAGRAPH].append(output)
             output1 = {'header': ''}
             output1['content'] = NarrativesUtils.get_template_output(
                 self._base_dir, 'anova_template_2.html', data_dict)
             lines = []
             lines += NarrativesUtils.block_splitter(
                 output['content'], self._blockSplitter)
             data_c3 = NormalChartData(data_c3)
             chart_data = data_c3.get_data()
             chartDataValues = []
             effect_size_values = []
             for obj in chart_data:
                 effect_size_values.append(obj["effect_size"])
             chart_data_min = min(effect_size_values)
             # Keep tiny effect sizes as strings so the y-axis format
             # selection below can see their full precision.
             if chart_data_min < 0.00001:
                 for obj in chart_data:
                     chartDataValues.append(str(obj["effect_size"]))
             else:
                 for obj in chart_data:
                     chartDataValues.append(obj["effect_size"])
             chart_json = ChartJson(data=chart_data,
                                    axes={
                                        'x': 'dimension',
                                        'y': 'effect_size'
                                    },
                                    label_text={
                                        'x': '',
                                        'y':
                                        'Effect Size (scaled exp values)'
                                    },
                                    chart_type='bar')
             chart_json.set_axis_rotation(True)
             # chart_json.set_yaxis_number_format(".4f")
             chart_json.set_yaxis_number_format(
                 NarrativesUtils.select_y_axis_format(chartDataValues))
             # st_info = ["Test : ANOVA", "Threshold for p-value : 0.05", "Effect Size : Tukey's HSD"]
             statistical_info_array = [
                 ("Test Type", "ANOVA"),
                 ("Effect Size", "ETA squared"),
                 ("Max Effect Size", chart_data[0]["dimension"]),
                 ("Min Effect Size", chart_data[-1]["dimension"]),
             ]
             # BUG FIX: was misspelled `statistical_inferenc`, so this
             # initialization was dead code and the guard below only worked
             # because every branch happens to assign the variable.
             statistical_inference = ""
             if len(chart_data) == 1:
                 statistical_inference = "{} is the only variable that have significant association with the {} (Target) having an \
                  Effect size of {}".format(
                     chart_data[0]["dimension"],
                     self._dataframe_context.get_result_column(),
                     round(chart_data[0]["effect_size"], 4))
             elif len(chart_data) == 2:
                 statistical_inference = "There are two variables ({} and {}) that have significant association with the {} (Target) and the \
                  Effect size ranges are {} and {} respectively".format(
                     chart_data[0]["dimension"], chart_data[1]["dimension"],
                     self._dataframe_context.get_result_column(),
                     round(chart_data[0]["effect_size"], 4),
                     round(chart_data[1]["effect_size"], 4))
             else:
                 statistical_inference = "There are {} variables that have significant association with the {} (Target) and the \
                  Effect size ranges from {} to {}".format(
                     len(chart_data),
                     self._dataframe_context.get_result_column(),
                     round(chart_data[0]["effect_size"], 4),
                     round(chart_data[-1]["effect_size"], 4))
             if statistical_inference != "":
                 statistical_info_array.append(
                     ("Inference", statistical_inference))
             statistical_info_array = NarrativesUtils.statistical_info_array_formatter(
                 statistical_info_array)
             lines += [
                 C3ChartData(data=chart_json, info=statistical_info_array)
             ]
             lines += NarrativesUtils.block_splitter(
                 output1['content'], self._blockSplitter)
             mainCard.set_card_data(lines)
             self._anovaNodes.add_a_card(mainCard)
             self.narratives['main_card'][
                 AnovaNarratives.KEY_PARAGRAPH].append(output1)
             self.narratives['main_card'][AnovaNarratives.KEY_CHART] = {}
             effect_size_chart = {
                 'heading': '',
                 'labels': {
                     'Dimension': 'Effect Size'
                 },
                 'data': significant_dimensions_dict
             }
             print(significant_dimensions_dict)
             self.narratives['main_card'][AnovaNarratives.KEY_CHART][
                 'effect_size'] = effect_size_chart
             progressMessage = CommonUtils.create_progress_message_object(
                 self._analysisName,
                 "custom",
                 "info",
                 "Analyzing Key Drivers",
                 self._completionStatus,
                 self._completionStatus,
                 display=True)
             CommonUtils.save_progress_message(self._messageURL,
                                               progressMessage,
                                               ignore=False)
             self._generate_dimension_narratives(significant_dimensions,
                                                 measure_anova_result,
                                                 measure_column)
         else:
             # No significant dimensions: emit a single explanatory card.
             mainCard = NormalCard(name="Overview of Key Factors")
             cardText = HtmlData(
                 "There are no dimensions in the dataset that have significant influence on {}"
                 .format(measure_column))
             mainCard.set_card_data([cardText])
             self._anovaNodes.add_a_card(mainCard)
Пример #13
0
def main(configJson):
    LOGGER = {}
    deployEnv = False  # running the scripts from job-server env
    debugMode = True  # runnning the scripts for local testing and development
    cfgMode = False  # runnning the scripts by passing config.cfg path
    scriptStartTime = time.time()
    if isinstance(configJson, pyhocon.config_tree.ConfigTree) or isinstance(
            configJson, dict):
        deployEnv = True
        debugMode = False
        ignoreMsg = False
    elif isinstance(configJson, basestring):
        if configJson.endswith(".cfg"):
            print "######################## Running in cfgMode ########################"
            cfgMode = True
            debugMode = False
            ignoreMsg = False
        else:
            print "######################## Running in debugMode ######################"
            cfgMode = False
            debugMode = True
            ignoreMsg = True
            # Test Configs are defined in bi/settings/configs/localConfigs
            jobType = "stockAdvisor"
            if jobType == "testCase":
                configJson = get_test_configs(jobType, testFor="chisquare")
            else:
                configJson = get_test_configs(jobType)

    print "######################## Creating Spark Session ###########################"
    if debugMode:
        APP_NAME = "mAdvisor_running_in_debug_mode"
    else:
        if "job_config" in configJson.keys(
        ) and "job_name" in configJson["job_config"]:
            APP_NAME = configJson["job_config"]["job_name"]
        else:
            APP_NAME = "--missing--"
    if debugMode:
        spark = CommonUtils.get_spark_session(app_name=APP_NAME,
                                              hive_environment=False)
    else:
        spark = CommonUtils.get_spark_session(app_name=APP_NAME)

    spark.sparkContext.setLogLevel("ERROR")
    # applicationIDspark = spark.sparkContext.applicationId

    # spark.conf.set("spark.sql.execution.arrow.enabled", "true")

    print "######################### Parsing the configs #############################"

    config = configJson["config"]
    jobConfig = configJson["job_config"]
    jobType = jobConfig["job_type"]
    jobName = jobConfig["job_name"]
    jobURL = jobConfig["job_url"]
    messageURL = jobConfig["message_url"]

    try:
        errorURL = jobConfig["error_reporting_url"]
    except:
        errorURL = None
    if "app_id" in jobConfig:
        appid = jobConfig["app_id"]
    else:
        appid = None
    configJsonObj = configparser.ParserConfig(config)
    configJsonObj.set_json_params()

    dataframe_context = ContextSetter(configJsonObj)
    dataframe_context.set_job_type(
        jobType
    )  #jobType should be set before set_params call of dataframe_context
    dataframe_context.set_params()
    dataframe_context.set_message_url(messageURL)
    dataframe_context.set_app_id(appid)
    dataframe_context.set_debug_mode(debugMode)
    dataframe_context.set_job_url(jobURL)
    dataframe_context.set_app_name(APP_NAME)
    dataframe_context.set_error_url(errorURL)
    dataframe_context.set_logger(LOGGER)
    dataframe_context.set_xml_url(jobConfig["xml_url"])
    dataframe_context.set_job_name(jobName)
    if debugMode == True:
        dataframe_context.set_environment("debugMode")
        dataframe_context.set_message_ignore(True)

    analysistype = dataframe_context.get_analysis_type()
    result_setter = ResultSetter(dataframe_context)
    # scripts_to_run = dataframe_context.get_scripts_to_run()
    appid = dataframe_context.get_app_id()
    completionStatus = 0
    print "########################## Validate the Config ###############################"
    configValidator = ConfigValidator(dataframe_context)
    configValid = configValidator.get_sanity_check()

    if not configValid:
        progressMessage = CommonUtils.create_progress_message_object(
            "mAdvisor Job",
            "custom",
            "info",
            "Please Provide a Valid Configuration",
            completionStatus,
            completionStatus,
            display=True)
        CommonUtils.save_progress_message(messageURL,
                                          progressMessage,
                                          ignore=ignoreMsg)
        response = CommonUtils.save_result_json(
            dataframe_context.get_job_url(), json.dumps({}))
        CommonUtils.save_error_messages(errorURL,
                                        APP_NAME,
                                        "Invalid Config Provided",
                                        ignore=ignoreMsg)
    else:
        ########################## Initializing messages ##############################
        if jobType == "story":
            if analysistype == "measure":
                progressMessage = CommonUtils.create_progress_message_object(
                    "Measure analysis",
                    "custom",
                    "info",
                    "Analyzing Target Variable",
                    completionStatus,
                    completionStatus,
                    display=True)
            else:
                progressMessage = CommonUtils.create_progress_message_object(
                    "Dimension analysis",
                    "custom",
                    "info",
                    "Analyzing Target Variable",
                    completionStatus,
                    completionStatus,
                    display=True)
            CommonUtils.save_progress_message(messageURL,
                                              progressMessage,
                                              ignore=ignoreMsg,
                                              emptyBin=True)
            dataframe_context.update_completion_status(completionStatus)
        elif jobType == "metaData":
            progressMessage = CommonUtils.create_progress_message_object(
                "metaData",
                "custom",
                "info",
                "Preparing data for loading",
                completionStatus,
                completionStatus,
                display=True)
            CommonUtils.save_progress_message(messageURL,
                                              progressMessage,
                                              ignore=ignoreMsg,
                                              emptyBin=True)
            progressMessage = CommonUtils.create_progress_message_object(
                "metaData",
                "custom",
                "info",
                "Initializing the loading process",
                completionStatus,
                completionStatus,
                display=True)
            CommonUtils.save_progress_message(messageURL,
                                              progressMessage,
                                              ignore=ignoreMsg)
            progressMessage = CommonUtils.create_progress_message_object(
                "metaData",
                "custom",
                "info",
                "Data Upload in progress",
                completionStatus,
                completionStatus,
                display=True)
            CommonUtils.save_progress_message(messageURL,
                                              progressMessage,
                                              ignore=ignoreMsg)
            dataframe_context.update_completion_status(completionStatus)
        if jobType != "stockAdvisor":
            df = None
            data_loading_st = time.time()
            progressMessage = CommonUtils.create_progress_message_object(
                "scriptInitialization", "scriptInitialization", "info",
                "Loading the Dataset", completionStatus, completionStatus)
            if jobType != "story" and jobType != "metaData":
                CommonUtils.save_progress_message(messageURL,
                                                  progressMessage,
                                                  ignore=ignoreMsg,
                                                  emptyBin=True)
                dataframe_context.update_completion_status(completionStatus)
            ########################## Load the dataframe ##############################
            df = MasterHelper.load_dataset(spark, dataframe_context)
            df = df.persist()
            if jobType != "metaData":
                metaParserInstance = MasterHelper.get_metadata(
                    df, spark, dataframe_context)
                df, df_helper = MasterHelper.set_dataframe_helper(
                    df, dataframe_context, metaParserInstance)
                # updating metaData for binned Cols
                colsToBin = df_helper.get_cols_to_bin()
                levelCountDict = df_helper.get_level_counts(colsToBin)
                metaParserInstance.update_level_counts(colsToBin,
                                                       levelCountDict)

        ############################ MetaData Calculation ##########################

        if jobType == "metaData":
            MasterHelper.run_metadata(spark, df, dataframe_context)
        ############################################################################

        ################################ Data Sub Setting ##########################
        if jobType == "subSetting":
            MasterHelper.run_subsetting(spark, df, dataframe_context,
                                        df_helper, metaParserInstance)
        ############################################################################

        ################################ Story Creation ############################
        if jobType == "story":
            if analysistype == "dimension":
                MasterHelper.run_dimension_analysis(spark, df,
                                                    dataframe_context,
                                                    df_helper,
                                                    metaParserInstance)
            elif analysistype == "measure":
                MasterHelper.run_measure_analysis(spark, df, dataframe_context,
                                                  df_helper,
                                                  metaParserInstance)

            progressMessage = CommonUtils.create_progress_message_object(
                "final",
                "final",
                "info",
                "Job Finished",
                100,
                100,
                display=True)
            CommonUtils.save_progress_message(messageURL,
                                              progressMessage,
                                              ignore=ignoreMsg)
        ############################################################################

        ################################ Model Training ############################
        elif jobType == 'training':
            dataframe_context.set_ml_environment("sklearn")
            MasterHelper.train_models(spark, df, dataframe_context, df_helper,
                                      metaParserInstance)
        ############################################################################

        ############################## Model Prediction ############################
        elif jobType == 'prediction':
            dataframe_context.set_ml_environment("sklearn")
            MasterHelper.score_model(spark, df, dataframe_context, df_helper,
                                     metaParserInstance)

        ############################################################################
        ################################### Test Cases  ############################

        if jobType == "testCase":
            print "Running Test Case for Chi-square Analysis---------------"
            # TestChiSquare().setUp()
            unittest.TextTestRunner(verbosity=2).run(
                unittest.TestLoader().loadTestsFromTestCase(TestChiSquare))

            # TestChiSquare(df,df_helper,dataframe_context,metaParserInstance).run_chisquare_test()
            # TestChiSquare().setup()
            # TestChiSquare().run_chisquare_test()
            # TestChiSquare().test_upper()
            # test = test_chisquare.run_chisquare_test(df,df_helper,dataframe_context,metaParserInstance)
            # suit = unittest.TestLoader().loadTestsFromTestCase(TestChiSquare)

        ############################################################################

        ################################### Stock ADVISOR ##########################
        if jobType == 'stockAdvisor':
            # spark.conf.set("spark.sql.execution.arrow.enabled", "false")
            file_names = dataframe_context.get_stock_symbol_list()
            stockObj = StockAdvisor(spark, file_names, dataframe_context,
                                    result_setter)
            stockAdvisorData = stockObj.Run()
            stockAdvisorDataJson = CommonUtils.convert_python_object_to_json(
                stockAdvisorData)
            # stockAdvisorDataJson["name"] = jobName
            print "*" * 100
            print "Result : ", stockAdvisorDataJson
            response = CommonUtils.save_result_json(jobURL,
                                                    stockAdvisorDataJson)

        ############################################################################
        scriptEndTime = time.time()
        runtimeDict = {"startTime": scriptStartTime, "endTime": scriptEndTime}
        print runtimeDict
        CommonUtils.save_error_messages(errorURL,
                                        "jobRuntime",
                                        runtimeDict,
                                        ignore=ignoreMsg)
        print "Scripts Time : ", scriptEndTime - scriptStartTime, " seconds."
Пример #14
0
    def _generate_summary(self):
        """Build the decision-tree "prediction rules" summary card.

        Reads the per-target rule table from ``self._table`` (maps each
        target level to a list of crude rule strings, with parallel arrays
        in ``self.success_percent``, ``self.total_predictions`` and
        ``self.successful_predictions``), renders:
          * a narrative block from the 'decisiontreesummary.html' template,
          * a donut chart of prediction frequency per target level,
          * a dropdown to pick the target level,
          * a "popupDecisionTreeTable" listing every rule with its
            probability, prediction, frequency and strong/mixed grouping,
        and appends the finished card to ``self._decisionTreeNode``.
        Also posts a "Generating Prediction rules" progress message to
        ``self._messageURL``.
        """
        data_dict = {}
        rules_dict = self._table
        data_dict["blockSplitter"] = self._blockSplitter
        data_dict["targetcol"] = self._colname
        groups = rules_dict.keys()
        # Rules at or above this success percentage are labelled "strong";
        # below it, "mixed". Also defines the two probability buckets.
        probabilityCutoff = 75
        probabilityGroups = [{
            "probability": probabilityCutoff,
            "count": 0,
            "range": [probabilityCutoff, 100]
        }, {
            "probability": probabilityCutoff - 1,
            "count": 0,
            "range": [0, probabilityCutoff - 1]
        }]
        # First row is the table header; rule rows are appended per target.
        tableArray = [[
            "Prediction Rule", "Probability", "Prediction", "Freq", "group",
            "richRules"
        ]]
        dropdownData = []
        chartDict = {}
        self._completionStatus = self._dataframe_context.get_completion_status(
        )
        progressMessage = CommonUtils.create_progress_message_object(
            self._analysisName,
            "custom",
            "info",
            "Generating Prediction rules",
            self._completionStatus,
            self._completionStatus,
            display=True)
        CommonUtils.save_progress_message(self._messageURL,
                                          progressMessage,
                                          ignore=False)

        for idx, target in enumerate(rules_dict.keys()):
            # target looks like "<level>: ..." — only the part before the
            # colon is shown in the table and used as the search term.
            targetToDisplayInTable = target.split(":")[0].strip()
            # Only the first dropdown entry starts out selected.
            if idx == 0:
                dropdownData.append({
                    "displayName": target,
                    "name": targetToDisplayInTable,
                    "searchTerm": targetToDisplayInTable,
                    "selected": True,
                    "id": idx + 1
                })
            else:
                dropdownData.append({
                    "displayName": target,
                    "name": targetToDisplayInTable,
                    "searchTerm": targetToDisplayInTable,
                    "selected": False,
                    "id": idx + 1
                })
            rulesArray = rules_dict[target]
            probabilityArray = [
                round(x, 2) for x in self.success_percent[target]
            ]
            groupArray = [
                "strong" if x >= probabilityCutoff else "mixed"
                for x in probabilityArray
            ]
            # Accumulate how many rules fall into each probability bucket
            # across ALL targets.
            for idx2, obj in enumerate(probabilityGroups):
                grpCount = len([
                    x for x in probabilityArray
                    if x >= obj["range"][0] and x <= obj["range"][1]
                ])
                obj["count"] += grpCount
                probabilityGroups[idx2] = obj
            predictionArray = [targetToDisplayInTable] * len(rulesArray)
            freqArray = self.total_predictions[target]
            # Donut chart value: total predictions for this target level.
            chartDict[target] = sum(freqArray)
            success = self.successful_predictions[target]
            success_percent = self.success_percent[target]
            richRulesArray = []
            crudeRuleArray = []
            analysisType = self._dataframe_context.get_analysis_type()
            targetCol = self._dataframe_context.get_result_column()
            # binFlag marks whether the target column was user-binned
            # (present in the custom analysis details).
            binFlag = False
            if self._dataframe_context.get_custom_analysis_details() != None:
                binnedColObj = [
                    x["colName"] for x in
                    self._dataframe_context.get_custom_analysis_details()
                ]
                if binnedColObj != None and targetCol in binnedColObj:
                    binFlag = True
            # Turn each crude rule into a human-readable "rich" rule plus a
            # (possibly rewritten) crude rule string.
            for idx2, crudeRule in enumerate(rulesArray):
                richRule, crudeRule = NarrativesUtils.generate_rules(
                    self._colname,
                    target,
                    crudeRule,
                    freqArray[idx2],
                    success[idx2],
                    success_percent[idx2],
                    analysisType,
                    binFlag=binFlag)
                richRulesArray.append(richRule)
                crudeRuleArray.append(crudeRule)
            # Format probabilities for display, e.g. "82%"; values >= 10 go
            # through humanize.apnumber, smaller ones through str(int(x)).
            probabilityArray = map(
                lambda x: humanize.apnumber(x) + "%"
                if x >= 10 else str(int(x)) + "%", probabilityArray)
            # targetArray = zip(rulesArray,probabilityArray,predictionArray,freqArray,groupArray)
            targetArray = zip(crudeRuleArray, probabilityArray,
                              predictionArray, freqArray, groupArray,
                              richRulesArray)
            targetArray = [list(x) for x in targetArray]
            tableArray += targetArray

        # A donut chart is only readable up to ~10 slices; collapse extra
        # levels (helper presumably merges the smallest — TODO confirm).
        donutChartMaxLevel = 10
        if len(chartDict) > donutChartMaxLevel:
            chartDict = NarrativesUtils.restructure_donut_chart_data(
                chartDict, nLevels=donutChartMaxLevel)
        chartData = NormalChartData([chartDict]).get_data()
        chartJson = ChartJson(data=chartData)
        chartJson.set_title(self._colname)
        chartJson.set_chart_type("donut")
        mainCardChart = C3ChartData(data=chartJson)
        mainCardChart.set_width_percent(45)
        # mainCardChart = {"dataType": "c3Chart","widthPercent":33 ,"data": {"data": [chartDict],"title":self._colname,"axes":{},"label_text":{},"legend":{},"yAxisNumberFormat": ".2s","types":None,"axisRotation":False, "chart_type": "donut"}}

        dropdownDict = {
            "dataType": "dropdown",
            "label": "Showing prediction rules for",
            "data": dropdownData
        }

        data_dict["probabilityGroups"] = probabilityGroups

        # Render the narrative text, then assemble the card in display
        # order: narrative, chart, dropdown, rules table.
        maincardSummary = NarrativesUtils.get_template_output(self._base_dir,\
                                                    'decisiontreesummary.html',data_dict)
        main_card = NormalCard()
        main_card_data = []
        main_card_narrative = NarrativesUtils.block_splitter(
            maincardSummary, self._blockSplitter)
        main_card_data += main_card_narrative

        main_card_data.append(mainCardChart)
        main_card_data.append(dropdownDict)

        main_card_table = TableData()
        main_card_table.set_table_data(tableArray)
        main_card_table.set_table_type("popupDecisionTreeTable")
        main_card_data.append(main_card_table)
        main_card.set_card_data(main_card_data)
        main_card.set_card_name("Predicting Key Drivers of {}".format(
            self._colname))
        self._decisionTreeNode.add_a_card(main_card)
Пример #15
0
        # print resp.text
        print('Main Method Did Not End ....., ', str(e))
        progressMessage = CommonUtils.create_progress_message_object(
            "Main Method Did Not End .....", "Main Method Did Not End .....",
            "Error", str(e), "Failed", 100)
        CommonUtils.save_progress_message(messageURL,
                                          progressMessage,
                                          emptyBin=True)


if __name__ == '__main__':
    # Resolve the job / kill / message callback endpoints up-front so a
    # failure inside main() can still be reported.
    jobURL, killURL, messageURL = killer_setting(sys.argv[1])
    try:
        main(sys.argv[1])
        print('Main Method End .....')
    except Exception as e:
        print(jobURL, killURL)
        # Re-issue the kill command until the backend acknowledges success.
        kill_payload = {"status": "killed", "jobURL": jobURL}
        kill_response = send_kill_command(killURL, kill_payload)
        while str(kill_response.text) != '{"result": "success"}':
            kill_payload = {"status": "killed", "jobURL": jobURL}
            kill_response = send_kill_command(killURL, kill_payload)
        # Post a terminal "Failed" progress message carrying the exception
        # text so the UI shows why the job died.
        progressMessage = CommonUtils.create_progress_message_object(
            "Main Method Did Not End .....", "Main Method Did Not End .....",
            "Error", str(e), "Failed", 100)
        CommonUtils.save_progress_message(messageURL,
                                          progressMessage,
                                          emptyBin=True)

        print('Main Method Did Not End ....., ', str(e))
Пример #16
0
    def __init__(self,
                 data_frame,
                 df_helper,
                 df_context,
                 spark,
                 meta_parser,
                 max_depth=5,
                 scriptWeight=None,
                 analysisName=None):
        """Set up the decision-tree analysis state.

        Pulls column metadata from the helper/context objects, drops the
        UID and date columns from the dimension list, buckets all measure
        columns when running on Spark, initialises the rule bookkeeping
        dicts, and posts the "initialization" progress message.

        Fixes over the previous revision:
        * ``is None`` instead of ``== None`` for the optional-argument
          checks (identity comparison is the correct idiom for None).
        * the bare ``except:`` around ``copy()`` narrowed to
          ``except Exception`` so KeyboardInterrupt/SystemExit are not
          swallowed (pandas frames have ``.copy()``; Spark frames do not,
          in which case the frame is reused as-is).
        * removed a leftover debug print.
        """
        self._spark = spark
        self._maxDepth = max_depth
        self._metaParser = meta_parser
        self._dataframe_helper = df_helper
        self._dataframe_context = df_context
        self._pandas_flag = df_context._pandas_flag
        self._ignoreMsg = self._dataframe_context.get_message_ignore()
        self._analysisDict = self._dataframe_context.get_analysis_dict()
        self._measure_columns = self._dataframe_helper.get_numeric_columns()
        # if self._analysisDict:
        #     for m in self._measure_columns:
        #         if data_frame.select(F.countDistinct(m)).collect()[0][0]<self._analysisDict['Dimension vs. Dimension']['binSetting']['binCardinality']:
        #             self._measure_columns.remove(m)
        self._dimension_columns = self._dataframe_helper.get_string_columns()
        self._date_columns = self._dataframe_context.get_date_columns()
        self._uid_col = self._dataframe_context.get_uid_column()
        # The UID column carries no analytical signal; exclude it when the
        # metadata layer flags it as an ignored suggestion.
        if self._metaParser.check_column_isin_ignored_suggestion(
                self._uid_col):
            self._dimension_columns = list(
                set(self._dimension_columns) - {self._uid_col})
        # Date columns are handled separately, never as plain dimensions.
        if len(self._date_columns) > 0:
            self._dimension_columns = list(
                set(self._dimension_columns) - set(self._date_columns))
        # On Spark, continuous measures must be bucketed before tree
        # building; the pandas path consumes the frame unchanged.
        if not self._pandas_flag:
            self._data_frame = MLUtils.bucket_all_measures(
                data_frame,
                self._measure_columns,
                self._dimension_columns,
                pandas_flag=self._pandas_flag)
        else:
            self._data_frame = data_frame
        try:
            # pandas frames support copy(); Spark frames raise here and we
            # deliberately fall back to sharing the same frame.
            self._data_frame1 = self._data_frame.copy()
        except Exception:
            self._data_frame1 = self._data_frame
        # Per-target rule bookkeeping, filled in during tree traversal.
        self._mapping_dict = {}
        self._new_rules = {}
        self._total = {}
        self._success = {}
        self._fail = {}
        self._probability = {}
        self._alias_dict = {}
        self._important_vars = {}
        self._total_list = []
        self._row_count = []
        self._targetlevels = []
        self._new_list = []
        self._count_list = []
        self._rule_id = 0
        self._path_dict = {}

        self._completionStatus = self._dataframe_context.get_completion_status(
        )
        if analysisName is None:
            self._analysisName = self._dataframe_context.get_analysis_name()
        else:
            self._analysisName = analysisName
        self._messageURL = self._dataframe_context.get_message_url()
        if scriptWeight is None:
            self._scriptWeightDict = self._dataframe_context.get_dimension_analysis_weight(
            )
        else:
            self._scriptWeightDict = scriptWeight

        self._scriptStages = {
            "initialization": {
                "summary": "Initialized The Decision Tree Script",
                "weight": 0
            },
            "treegeneration": {
                "summary": "Decision Tree Generation Finished",
                "weight": 10
            }
        }
        # Advance overall progress by this script's share of the
        # initialization weight (old_div keeps Py2 integer-division
        # semantics under python-future).
        self._completionStatus += old_div(
            self._scriptWeightDict[self._analysisName]["script"] *
            self._scriptStages["initialization"]["weight"], 10)
        progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\
                                    "initialization",\
                                    "info",\
                                    self._scriptStages["initialization"]["summary"],\
                                    self._completionStatus,\
                                    self._completionStatus)
        CommonUtils.save_progress_message(self._messageURL,
                                          progressMessage,
                                          ignore=self._ignoreMsg)
        self._dataframe_context.update_completion_status(
            self._completionStatus)
Пример #17
0
def main(configJson):
    LOGGER = {}
    deployEnv = False  # running the scripts from job-server env
    debugMode = True  # runnning the scripts for local testing and development
    cfgMode = False  # runnning the scripts by passing config.cfg path
    scriptStartTime = time.time()
    if isinstance(configJson, pyhocon.config_tree.ConfigTree) or isinstance(
            configJson, dict):
        deployEnv = True
        debugMode = False
        ignoreMsg = False
    elif isinstance(configJson, basestring):
        if configJson.endswith(".cfg"):
            print(
                "||############################## Running in cfgMode ##############################||"
            )
            cfgMode = True
            debugMode = False
            ignoreMsg = False
        else:
            print(
                "||############################## Running in debugMode ##############################||"
            )
            cfgMode = False
            debugMode = True
            ignoreMsg = True
            # Test Configs are defined in bi/settings/configs/localConfigs
            jobType = "training"
            if jobType == "testCase":
                configJson = get_test_configs(jobType, testFor="chisquare")
            else:
                configJson = get_test_configs(jobType)

    print(
        "||############################## Creating Spark Session ##############################||"
    )
    if debugMode:
        APP_NAME = "mAdvisor_running_in_debug_mode"
    else:
        config = configJson["config"]
        if config is None:
            configJson = requests.get(configJson["job_config"]["config_url"])
            configJson = configJson.json()

        if "job_config" in list(
                configJson.keys()) and "job_name" in configJson["job_config"]:
            APP_NAME = configJson["job_config"]["job_name"]
        else:
            APP_NAME = "--missing--"
    if debugMode:
        spark = CommonUtils.get_spark_session(app_name=APP_NAME,
                                              hive_environment=False)
    else:
        spark = CommonUtils.get_spark_session(app_name=APP_NAME)

    spark.sparkContext.setLogLevel("ERROR")
    # applicationIDspark = spark.sparkContext.applicationId

    # spark.conf.set("spark.sql.execution.arrow.enabled", "true")

    print(
        "||############################## Parsing Config file ##############################||"
    )

    config = configJson["config"]
    if "TRAINER_MODE" in config and config["TRAINER_MODE"] == "autoML":
        if "app_type" in config["FILE_SETTINGS"] and config["FILE_SETTINGS"][
                "app_type"] == "classification":
            if config['FILE_SETTINGS']['inputfile'][0].startswith("https:"):
                config[
                    'ALGORITHM_SETTING'] = GLOBALSETTINGS.algorithm_settings_pandas
            else:
                config[
                    'ALGORITHM_SETTING'] = GLOBALSETTINGS.algorithm_settings_pyspark
    jobConfig = configJson["job_config"]
    jobType = jobConfig["job_type"]
    if jobType == "prediction":
        one_click = config["one_click"]
    jobName = jobConfig["job_name"]
    jobURL = jobConfig["job_url"]
    messageURL = jobConfig["message_url"]
    initialMessageURL = jobConfig["initial_messages"]

    messages = scriptStages.messages_list(config, jobConfig, jobType, jobName)
    messages_for_API = messages.send_messages()
    messages_for_API = json.dumps(messages_for_API)
    res = requests.put(url=initialMessageURL, data=messages_for_API)
    print(
        "---------------------Pipeline changes in SPARK container------------------"
    )
    try:
        errorURL = jobConfig["error_reporting_url"]
    except:
        errorURL = None
    if "app_id" in jobConfig:
        appid = jobConfig["app_id"]
    else:
        appid = None
    configJsonObj = configparser.ParserConfig(config)
    configJsonObj.set_json_params()

    dataframe_context = ContextSetter(configJsonObj)
    dataframe_context.set_job_type(
        jobType
    )  #jobType should be set before set_params call of dataframe_context
    dataframe_context.set_params()
    dataframe_context.set_message_url(messageURL)
    dataframe_context.set_app_id(appid)
    dataframe_context.set_debug_mode(debugMode)
    dataframe_context.set_job_url(jobURL)
    dataframe_context.set_app_name(APP_NAME)
    dataframe_context.set_error_url(errorURL)
    dataframe_context.set_logger(LOGGER)
    dataframe_context.set_xml_url(jobConfig["xml_url"])
    dataframe_context.set_job_name(jobName)

    if debugMode == True:
        dataframe_context.set_environment("debugMode")
        dataframe_context.set_message_ignore(True)

    analysistype = dataframe_context.get_analysis_type()
    result_setter = ResultSetter(dataframe_context)
    appid = dataframe_context.get_app_id()
    completionStatus = 0
    print(
        "||############################## Validating the Config ##############################||"
    )
    configValidator = ConfigValidator(dataframe_context)
    configValid = configValidator.get_sanity_check()

    if not configValid:
        progressMessage = CommonUtils.create_progress_message_object(
            "mAdvisor Job",
            "custom",
            "info",
            "Please Provide A Valid Configuration",
            completionStatus,
            completionStatus,
            display=True)
        CommonUtils.save_progress_message(messageURL,
                                          progressMessage,
                                          ignore=ignoreMsg)
        response = CommonUtils.save_result_json(
            dataframe_context.get_job_url(), json.dumps({}))
        CommonUtils.save_error_messages(errorURL,
                                        APP_NAME,
                                        "Invalid Config Provided",
                                        ignore=ignoreMsg)
    else:
        ########################## Initializing messages ##############################
        if jobType == "story":
            if analysistype == "measure":
                progressMessage = CommonUtils.create_progress_message_object(
                    "Measure analysis",
                    "custom",
                    "info",
                    "Analyzing Target Variable",
                    completionStatus,
                    completionStatus,
                    display=True)
            else:
                progressMessage = CommonUtils.create_progress_message_object(
                    "Dimension analysis",
                    "custom",
                    "info",
                    "Analyzing Target Variable",
                    completionStatus,
                    completionStatus,
                    display=True)
            CommonUtils.save_progress_message(messageURL,
                                              progressMessage,
                                              ignore=ignoreMsg,
                                              emptyBin=True)
            dataframe_context.update_completion_status(completionStatus)
        elif jobType == "metaData":
            progressMessage = CommonUtils.create_progress_message_object(
                "metaData",
                "custom",
                "info",
                "Preparing Data For Loading",
                completionStatus,
                completionStatus,
                display=True)
            CommonUtils.save_progress_message(messageURL,
                                              progressMessage,
                                              ignore=ignoreMsg,
                                              emptyBin=True)
            progressMessage = CommonUtils.create_progress_message_object(
                "metaData",
                "custom",
                "info",
                "Initializing The Loading Process",
                completionStatus,
                completionStatus,
                display=True)
            CommonUtils.save_progress_message(messageURL,
                                              progressMessage,
                                              ignore=ignoreMsg)
            progressMessage = CommonUtils.create_progress_message_object(
                "metaData",
                "custom",
                "info",
                "Uploading Data",
                completionStatus,
                completionStatus,
                display=True)
            CommonUtils.save_progress_message(messageURL,
                                              progressMessage,
                                              ignore=ignoreMsg)
            dataframe_context.update_completion_status(completionStatus)
        if jobType != "stockAdvisor":
            df = None
            data_loading_st = time.time()
            progressMessage = CommonUtils.create_progress_message_object(
                "scriptInitialization", "scriptInitialization", "info",
                "Loading The Dataset", completionStatus, completionStatus)
            if jobType != "story" and jobType != "metaData":
                CommonUtils.save_progress_message(messageURL,
                                                  progressMessage,
                                                  ignore=ignoreMsg,
                                                  emptyBin=True)
                dataframe_context.update_completion_status(completionStatus)
            ########################## Load the dataframe ##############################
            df = MasterHelper.load_dataset(spark, dataframe_context)
            ######  pandas Flag  ################
            #dataframe_context._pandas_flag = False
            try:
                df = df.persist()
            except:
                pass
            rowscols = (df.count(), len(df.columns))
            removed_col = []
            new_cols_added = None
            if jobType != "metaData":
                # df,df_helper = MasterHelper.set_dataframe_helper(df,dataframe_context,metaParserInstance)
                if jobType == "training" or jobType == "prediction":
                    automl_enable = False
                    if dataframe_context.get_trainerMode() == "autoML":
                        automl_enable = True
                    one_click_json = {}
                    if dataframe_context.get_trainerMode() == "autoML":
                        # dataframe_context._pandas_flag = True
                        if jobType == "training":
                            # if dataframe_context._pandas_flag :
                            #     df = df.toPandas()
                            fs = time.time()
                            autoML_obj = autoML.AutoMl(
                                df, dataframe_context,
                                GLOBALSETTINGS.APPS_ID_MAP[appid]["type"])

                            one_click_json, linear_df, tree_df = autoML_obj.run(
                            )
                            print("Automl Done in ",
                                  time.time() - fs, " seconds.")
                        elif jobType == "prediction":
                            #try:
                            #    df = df.toPandas()
                            #except:
                            #    pass
                            score_obj = autoMLScore.Scoring(
                                df, one_click, dataframe_context._pandas_flag)
                            linear_df, tree_df = score_obj.run()
                        # linear
                        print('No. of columns in Linear data :',
                              len(list(linear_df.columns)))
                        #linear_df = spark.createDataFrame(linear_df)
                        metaParserInstance_linear_df = MasterHelper.get_metadata(
                            linear_df, spark, dataframe_context,
                            new_cols_added)
                        linear_df, df_helper_linear_df = MasterHelper.set_dataframe_helper(
                            linear_df, dataframe_context,
                            metaParserInstance_linear_df)
                        dataTypeChangeCols_linear_df = dataframe_context.get_change_datatype_details(
                        )
                        colsToBin_linear_df = df_helper_linear_df.get_cols_to_bin(
                        )
                        updateLevelCountCols_linear_df = colsToBin_linear_df
                        try:
                            for i in dataTypeChangeCols_linear_df:
                                if i["columnType"] == "dimension" and i[
                                        'colName'] in list(linear_df.columns):
                                    updateLevelCountCols_linear_df.append(
                                        i["colName"])
                        except:
                            pass
                        levelCountDict_linear_df = df_helper_linear_df.get_level_counts(
                            updateLevelCountCols_linear_df)
                        metaParserInstance_linear_df.update_level_counts(
                            updateLevelCountCols_linear_df,
                            levelCountDict_linear_df)

                        # Tree
                        print('No. of columns in Tree data :',
                              len(list(tree_df.columns)))
                        #tree_df = spark.createDataFrame(tree_df)
                        metaParserInstance_tree_df = MasterHelper.get_metadata(
                            tree_df, spark, dataframe_context, new_cols_added)
                        tree_df, df_helper_tree_df = MasterHelper.set_dataframe_helper(
                            tree_df, dataframe_context,
                            metaParserInstance_tree_df)
                        dataTypeChangeCols_tree_df = dataframe_context.get_change_datatype_details(
                        )
                        colsToBin_tree_df = df_helper_tree_df.get_cols_to_bin()
                        updateLevelCountCols_tree_df = colsToBin_tree_df
                        try:
                            for i in dataTypeChangeCols_tree_df:
                                if i["columnType"] == "dimension" and i[
                                        'colName'] in list(tree_df.columns):
                                    updateLevelCountCols_tree_df.append(
                                        i["colName"])
                        except:
                            pass
                        levelCountDict_tree_df = df_helper_tree_df.get_level_counts(
                            updateLevelCountCols_tree_df)
                        metaParserInstance_tree_df.update_level_counts(
                            updateLevelCountCols_tree_df,
                            levelCountDict_tree_df)
                    else:
                        dataCleansingDict = dataframe_context.get_dataCleansing_info(
                        )
                        featureEngineeringDict = dataframe_context.get_featureEngginerring_info(
                        )
                        if dataCleansingDict[
                                'selected'] or featureEngineeringDict[
                                    'selected']:
                            old_cols_list = df.columns
                            completionStatus = 10
                            progressMessage = CommonUtils.create_progress_message_object(
                                "scriptInitialization", "scriptInitialization",
                                "info",
                                "Performing Required Data Preprocessing And Feature Transformation Tasks",
                                completionStatus, completionStatus)
                            CommonUtils.save_progress_message(messageURL,
                                                              progressMessage,
                                                              ignore=ignoreMsg,
                                                              emptyBin=True)
                            dataframe_context.update_completion_status(
                                completionStatus)
                            ## TO DO : Change flag later this is only for testing
                            pandas_flag = dataframe_context._pandas_flag
                            if pandas_flag:
                                try:
                                    df = df.toPandas()
                                except:
                                    pass
                            if dataCleansingDict['selected']:
                                data_preprocessing_obj = data_preprocessing.DataPreprocessing(
                                    spark, df, dataCleansingDict,
                                    dataframe_context)
                                df = data_preprocessing_obj.data_cleansing()
                                removed_col = data_preprocessing_obj.removed_col
                            dataframe_context.set_ignore_column_suggestions(
                                removed_col)

                            if featureEngineeringDict['selected']:
                                feature_engineering_obj = feature_engineering.FeatureEngineering(
                                    spark, df, featureEngineeringDict,
                                    dataframe_context)
                                feature_engineering_obj.consider_columns = dataframe_context.get_consider_columns(
                                )
                                df = feature_engineering_obj.feature_engineering(
                                )
                            new_cols_list = df.columns
                            old_cols_list = list(
                                set(old_cols_list) - set(removed_col))
                            if len(old_cols_list) < len(new_cols_list):
                                new_cols_added = list(
                                    set(new_cols_list) - set(old_cols_list))
                            else:
                                new_cols_added = None
                            # if pandas_flag:
                            #     ## TODO: has to be removed now that metadata and DFhelper are in pandas
                            #     df=spark.createDataFrame(df)
                            try:
                                print(df.printSchema())
                            except:
                                print(df.dtypes)

                        metaParserInstance = MasterHelper.get_metadata(
                            df, spark, dataframe_context, new_cols_added)
                        df, df_helper = MasterHelper.set_dataframe_helper(
                            df, dataframe_context, metaParserInstance)
                        # updating metaData for binned Cols
                        dataTypeChangeCols = dataframe_context.get_change_datatype_details(
                        )
                        colsToBin = df_helper.get_cols_to_bin()
                        updateLevelCountCols = colsToBin
                        try:
                            for i in dataTypeChangeCols:
                                if i["columnType"] == "dimension":
                                    if jobType != "prediction":
                                        updateLevelCountCols.append(
                                            i["colName"])
                                    elif i["colName"] != self.dataframe_context.get_result_column(
                                    ) and jobType == "prediction":  #in prediction we should not add target
                                        updateLevelCountCols.append(
                                            i["colName"])
                        except:
                            pass
                        levelCountDict = df_helper.get_level_counts(
                            updateLevelCountCols)
                        metaParserInstance.update_level_counts(
                            updateLevelCountCols, levelCountDict)

                else:
                    metaParserInstance = MasterHelper.get_metadata(
                        df, spark, dataframe_context, new_cols_added)
                    df, df_helper = MasterHelper.set_dataframe_helper(
                        df, dataframe_context, metaParserInstance)
                    # updating metaData for binned Cols
                    dataTypeChangeCols = dataframe_context.get_change_datatype_details(
                    )
                    colsToBin = df_helper.get_cols_to_bin()
                    updateLevelCountCols = colsToBin
                    try:
                        for i in dataTypeChangeCols:
                            if i["columnType"] == "dimension":
                                updateLevelCountCols.append(i["colName"])
                    except:
                        pass
                    levelCountDict = df_helper.get_level_counts(
                        updateLevelCountCols)
                    metaParserInstance.update_level_counts(
                        updateLevelCountCols, levelCountDict)
        ############################ MetaData Calculation ##########################

        if jobType == "metaData":
            MasterHelper.run_metadata(spark, df, dataframe_context)
        ############################################################################

        ################################ Data Sub Setting ##########################
        if jobType == "subSetting":
            MasterHelper.run_subsetting(spark, df, dataframe_context,
                                        df_helper, metaParserInstance)
        ############################################################################

        ################################ Story Creation ############################
        if jobType == "story":
            if analysistype == "dimension":
                MasterHelper.run_dimension_analysis(spark, df,
                                                    dataframe_context,
                                                    df_helper,
                                                    metaParserInstance)
            elif analysistype == "measure":
                MasterHelper.run_measure_analysis(spark, df, dataframe_context,
                                                  df_helper,
                                                  metaParserInstance)

            progressMessage = CommonUtils.create_progress_message_object(
                "final",
                "final",
                "info",
                "Job Finished",
                100,
                100,
                display=True)
            CommonUtils.save_progress_message(messageURL,
                                              progressMessage,
                                              ignore=ignoreMsg)
        ############################################################################

        ################################ Model Training ############################
        elif jobType == 'training':
            # dataframe_context.set_ml_environment("sklearn")
            if automl_enable is True:
                MasterHelper.train_models_automl(
                    spark, linear_df, tree_df, dataframe_context,
                    df_helper_linear_df, df_helper_tree_df,
                    metaParserInstance_linear_df, metaParserInstance_tree_df,
                    one_click_json)
            else:
                MasterHelper.train_models(spark, df, dataframe_context,
                                          df_helper, metaParserInstance,
                                          one_click_json)
        ############################################################################

        ############################## Model Prediction ############################
        elif jobType == 'prediction':
            if automl_enable is True:
                MasterHelper.score_model_autoML(spark, linear_df, tree_df,
                                                dataframe_context,
                                                df_helper_linear_df,
                                                df_helper_tree_df,
                                                metaParserInstance_linear_df,
                                                metaParserInstance_tree_df)
            else:
                # dataframe_context.set_ml_environment("sklearn")
                MasterHelper.score_model(spark, df, dataframe_context,
                                         df_helper, metaParserInstance)

        ############################################################################
        ################################### Test Cases  ############################

        if jobType == "testCase":
            print("Running Test Case for Chi-square Analysis---------------")
            # TestChiSquare().setUp()
            unittest.TextTestRunner(verbosity=2).run(
                unittest.TestLoader().loadTestsFromTestCase(TestChiSquare))

            # TestChiSquare(df,df_helper,dataframe_context,metaParserInstance).run_chisquare_test()
            # TestChiSquare().setup()
            # TestChiSquare().run_chisquare_test()
            # TestChiSquare().test_upper()
            # test = test_chisquare.run_chisquare_test(df,df_helper,dataframe_context,metaParserInstance)
            # suit = unittest.TestLoader().loadTestsFromTestCase(TestChiSquare)

        ############################################################################

        ################################### Stock ADVISOR ##########################
        if jobType == 'stockAdvisor':
            # spark.conf.set("spark.sql.execution.arrow.enabled", "false")
            file_names = dataframe_context.get_stock_symbol_list()
            stockObj = StockAdvisor(spark, file_names, dataframe_context,
                                    result_setter)
            stockAdvisorData = stockObj.Run()
            stockAdvisorDataJson = CommonUtils.convert_python_object_to_json(
                stockAdvisorData)
            # stockAdvisorDataJson["name"] = jobName
            print("*" * 100)
            print("Result : ", stockAdvisorDataJson)
            response = CommonUtils.save_result_json(jobURL,
                                                    stockAdvisorDataJson)

        ############################################################################
        scriptEndTime = time.time()
        runtimeDict = {"startTime": scriptStartTime, "endTime": scriptEndTime}
        print(runtimeDict)
        CommonUtils.save_error_messages(errorURL,
                                        "jobRuntime",
                                        runtimeDict,
                                        ignore=ignoreMsg)
        print("Scripts Time : ", scriptEndTime - scriptStartTime, " seconds.")
# Example #18
# 0
    def test_all(self, measure_columns=None, dimension_columns=None):
        """Train a decision tree on the target dimension and build the rule tree.

        Trains either a scikit-learn ``DecisionTreeClassifier`` (pandas path)
        or a Spark MLlib ``DecisionTree`` (Spark path), converts the fitted
        model into the project's rule-tree representation, and reports
        progress via ``CommonUtils``.

        :param measure_columns: measures to include; defaults to
            ``self._measure_columns`` when ``None``.
        :param dimension_columns: the first entry is used as the target
            dimension.
        :returns: a populated ``DecisionTreeResult``.
        """
        # FIX: the original computed `measures` and then ignored it,
        # silently discarding a caller-supplied `measure_columns`.
        measures = measure_columns
        if measure_columns is None:
            measures = self._measure_columns
        self._target_dimension = dimension_columns[0]
        dimension = self._target_dimension

        #####Look into it for Issue 947#################
        max_num_levels = GLOBALSETTINGS.DTREE_OTHER_DIMENSION_MAX_LEVEL
        # Keep only dimensions whose cardinality is small enough for the tree.
        all_dimensions = [
            dim for dim in self._dimension_columns
            if self._metaParser.get_num_unique_values(dim) <= max_num_levels
        ]
        all_measures = measures
        if self._pandas_flag:
            self._data_frame = self._data_frame[all_dimensions + all_measures]
        cat_feature_info = []
        columns_without_dimension = [
            x for x in all_dimensions if x != dimension
        ]
        mapping_dict = {}
        masterMappingDict = {}
        decision_tree_result = DecisionTreeResult()
        decision_tree_result.set_freq_distribution(
            self._metaParser.get_unique_level_dict(self._target_dimension),
            self._important_vars)
        if self._pandas_flag:
            # Drop the target before one-hot encoding the predictors
            # (membership test replaces the original bare try/except).
            if dimension in all_dimensions:
                all_dimensions.remove(dimension)
            actual_cols = list(self._data_frame.columns)
            print(actual_cols)
            self._data_frame = pd.get_dummies(self._data_frame,
                                              columns=all_dimensions)
            after_dummy_cols = list(self._data_frame.columns)

            def Diff(li1, li2):
                # Symmetric difference of two lists (order not guaranteed).
                return (list(
                    list(set(li1) - set(li2)) + list(set(li2) - set(li1))))

            decision_tree_result.dummy_cols = [
                Diff(after_dummy_cols, Diff(actual_cols, all_dimensions)),
                all_dimensions
            ]

        all_dimensions.append(dimension)  #this has been done for scoring error
        if self._pandas_flag:
            # pandas path: only the target needs string indexing
            # (predictors were one-hot encoded above).
            self._data_frame, mapping_dict = MLUtils.add_string_index(
                self._data_frame, [dimension], self._pandas_flag)
        else:
            self._data_frame, mapping_dict = MLUtils.add_string_index(
                self._data_frame, all_dimensions, self._pandas_flag)
        if self._pandas_flag:
            print(self._data_frame.head(1))
        else:
            # DataFrame.show() prints directly and returns None; the
            # original wrapped it in print(), emitting a spurious "None".
            self._data_frame.show(1)
        # Standard five-level index used for binned measures.
        standard_measure_index = {
            0.0: 'Low',
            1.0: 'Below Average',
            2.0: 'Average',
            3.0: 'Above Average',
            4.0: 'High'
        }
        for measure in all_measures:
            mapping_dict[measure] = standard_measure_index

        # Strip commas from level labels (they break downstream rule
        # parsing) while remembering the original label in _alias_dict.
        for k, v in list(mapping_dict.items()):
            temp = {}
            for k1, v1 in list(v.items()):
                self._alias_dict[v1.replace(",", "")] = v1
                temp[k1] = v1.replace(",", "")
            mapping_dict[k] = temp
        self._mapping_dict = mapping_dict
        if not self._pandas_flag:
            # Spark path: record the cardinality of every categorical
            # feature for MLlib's categoricalFeaturesInfo.
            for c in columns_without_dimension:
                cat_feature_info.append(
                    self._data_frame.select(c).distinct().count())
            for c in all_measures:
                # Binned measures always have the 5 standard levels above.
                cat_feature_info.append(5)
            columns_without_dimension = columns_without_dimension + all_measures
            all_measures = []
            if len(cat_feature_info) > 0:
                max_length = max(cat_feature_info)
            else:
                max_length = 32
        else:
            decision_tree_result.mappingdict = mapping_dict[dimension]
            max_length = 32
        cat_feature_info = dict(enumerate(cat_feature_info))
        if self._pandas_flag:
            dimension_classes = len(self._data_frame[dimension].unique())
        else:
            dimension_classes = self._data_frame.select(
                dimension).distinct().count()
        if not self._pandas_flag:
            # Put the target first: LabeledPoint below expects x[0] == label.
            self._data_frame = self._data_frame[[dimension] +
                                                columns_without_dimension +
                                                all_measures]
        print("=" * 200)
        print("numClasses", dimension_classes)
        print("maxDepth", self._maxDepth)
        decision_tree_result._maxDepth = self._maxDepth
        print("maxBins", max_length)
        print("=" * 200)
        if self._pandas_flag:
            # Sanitise column names (sklearn export / rule parsing chokes
            # on non-word characters). Raw string fixes the '\W' escape.
            self._data_frame.columns = [
                re.sub(r'\W+', '_', col.strip())
                for col in self._data_frame.columns
            ]
            x = self._data_frame.drop(dimension, axis=1)
            y = self._data_frame[dimension]
            for i in x.columns:
                # Impute missing predictor values with the column mode.
                x[i] = x[i].fillna(x[i].mode()[0])
            model = DecisionTreeClassifier(criterion='gini',
                                           max_depth=self._maxDepth,
                                           random_state=42)
            model = model.fit(x, y)
            output_result = self.tree_to_code(model, list(x.columns))
            output_result = list(map(lambda x: x.strip(), output_result))
        else:
            data = self._data_frame.rdd.map(
                lambda x: LabeledPoint(x[0], x[1:]))
            (trainingData, testData) = data.randomSplit([1.0, 0.0])
            # TO DO : set maxBins at least equal to the max level of categories in dimension column
            # Passing categoricalFeaturesInfo keeps string levels categorical;
            # treating them as continuous produced wrong prediction rules
            # (e.g. "yes"/"no" interpreted as the float 0.5).
            model = DecisionTree.trainClassifier(
                trainingData,
                numClasses=dimension_classes,
                categoricalFeaturesInfo=cat_feature_info,
                impurity='gini',
                maxDepth=self._maxDepth,
                maxBins=max_length)
            output_result = model.toDebugString()
        # Convert the raw model dump into the nested rule-tree structure.
        decision_tree = self.tree_json(output_result, self._data_frame,
                                       self._pandas_flag)
        self._new_tree = self.generate_new_tree(decision_tree)
        node_list = self.node_name_extractor(self._new_tree)
        node_list = list(self.flatten(node_list))
        correct_count_list = [i[0] for i in self._count_list]
        tree_dict = dict(list(zip(node_list, correct_count_list)))
        self._new_tree = self.wrap_tree(self._new_tree, tree_dict)
        self._path_dict = self.path_dict_creator(node_list, self._new_tree)
        print("===" * 40)
        decision_tree_result.set_params(self._new_tree, self._new_rules,
                                        self._total, self._success,
                                        self._probability, self._path_dict)
        # Report stage completion to the message service.
        self._completionStatus += old_div(
            self._scriptWeightDict[self._analysisName]["script"] *
            self._scriptStages["treegeneration"]["weight"], 10)
        progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\
                                    "treegeneration",\
                                    "info",\
                                    self._scriptStages["treegeneration"]["summary"],\
                                    self._completionStatus,\
                                    self._completionStatus)
        CommonUtils.save_progress_message(self._messageURL,
                                          progressMessage,
                                          ignore=self._ignoreMsg)
        self._dataframe_context.update_completion_status(
            self._completionStatus)

        return decision_tree_result
# Example #19
# 0
    def applyFilter(self):
        """Apply dimension, measure and time-dimension filters to the data frame.

        All filter settings are read from ``self._dataframe_context``. After
        each filter stage a progress message is saved via ``CommonUtils`` and
        the elapsed time since ``self._start_time`` is printed.

        :returns: the filtered data frame (``self._data_frame``).
        """
        dimension_filters = self._dataframe_context.get_dimension_filters()
        measure_filters = self._dataframe_context.get_measure_filters()
        time_dimension_filters = self._dataframe_context.get_time_dimension_filters(
        )

        self._completionStatus += self._scriptStages["initialization"][
            "weight"]
        progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\
                                    "initialization",\
                                    "info",\
                                    self._scriptStages["initialization"]["summary"],\
                                    self._completionStatus,\
                                    self._completionStatus)
        CommonUtils.save_progress_message(self._messageURL, progressMessage)

        # Dimension filters: keep only rows whose column value is in the list.
        if len(dimension_filters) > 0:
            for filter_dict in dimension_filters:
                if filter_dict["filterType"] == "valueIn":
                    self.values_in(filter_dict["colname"],
                                   filter_dict["values"])

        time_taken_dimensionfilters = time.time() - self._start_time
        self._completionStatus += self._scriptStages["dimensionfilters"][
            "weight"]
        # FIX: Python 2 print statements converted to print() calls --
        # the old form is a SyntaxError on Python 3, which the rest of
        # the codebase targets.
        print("dimensionfilters takes", time_taken_dimensionfilters)
        progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\
                                    "dimensionfilters",\
                                    "info",\
                                    self._scriptStages["dimensionfilters"]["summary"],\
                                    self._completionStatus,\
                                    self._completionStatus)
        CommonUtils.save_progress_message(self._messageURL, progressMessage)

        # Measure filters: inclusive [lowerBound, upperBound] range filter.
        if len(measure_filters) > 0:
            for filter_dict in measure_filters:
                if filter_dict["filterType"] == "valueRange":
                    self.values_between(filter_dict["colname"],\
                                                           filter_dict["lowerBound"],\
                                                           filter_dict["upperBound"],\
                                                           greater_than_equal=1,\
                                                           less_than_equal =1)
        time_taken_measurefilters = time.time() - self._start_time
        self._completionStatus += self._scriptStages["measurefilters"][
            "weight"]
        print("measurefilters takes", time_taken_measurefilters)
        progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\
                                    "measurefilters",\
                                    "info",\
                                    self._scriptStages["measurefilters"]["summary"],\
                                    self._completionStatus,\
                                    self._completionStatus)
        CommonUtils.save_progress_message(self._messageURL, progressMessage)

        # Time-dimension filters: same inclusive range semantics as measures.
        if len(time_dimension_filters) > 0:
            for filter_dict in time_dimension_filters:
                if filter_dict["filterType"] == "valueRange":
                    self.values_between(filter_dict["colname"],\
                                                           filter_dict["lowerBound"],\
                                                           filter_dict["upperBound"],\
                                                           greater_than_equal=1,\
                                                           less_than_equal =1)
        time_taken_datetimefilters = time.time() - self._start_time
        self._completionStatus += self._scriptStages["datetimefilters"][
            "weight"]
        print("datetimefilters takes", time_taken_datetimefilters)
        progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\
                                    "datetimefilters",\
                                    "info",\
                                    self._scriptStages["datetimefilters"]["summary"],\
                                    self._completionStatus,\
                                    self._completionStatus)
        CommonUtils.save_progress_message(self._messageURL, progressMessage)
        return self._data_frame
    def Predict(self):
        self._scriptWeightDict = self._dataframe_context.get_ml_model_prediction_weight(
        )
        self._scriptStages = {
            "initialization": {
                "summary": "Initialized the Naive Bayes Scripts",
                "weight": 2
            },
            "prediction": {
                "summary": "Spark ML Naive Bayes Model Prediction Finished",
                "weight": 2
            },
            "frequency": {
                "summary": "descriptive analysis finished",
                "weight": 2
            },
            "chisquare": {
                "summary": "chi Square analysis finished",
                "weight": 4
            },
            "completion": {
                "summary": "all analysis finished",
                "weight": 4
            },
        }

        self._completionStatus += self._scriptWeightDict[self._analysisName][
            "total"] * self._scriptStages["initialization"]["weight"] / 10
        progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\
                                    "initialization",\
                                    "info",\
                                    self._scriptStages["initialization"]["summary"],\
                                    self._completionStatus,\
                                    self._completionStatus)
        CommonUtils.save_progress_message(self._messageURL, progressMessage)
        self._dataframe_context.update_completion_status(
            self._completionStatus)

        SQLctx = SQLContext(sparkContext=self._spark.sparkContext,
                            sparkSession=self._spark)
        dataSanity = True
        level_counts_train = self._dataframe_context.get_level_count_dict()
        categorical_columns = self._dataframe_helper.get_string_columns()
        numerical_columns = self._dataframe_helper.get_numeric_columns()
        time_dimension_columns = self._dataframe_helper.get_timestamp_columns()
        result_column = self._dataframe_context.get_result_column()
        categorical_columns = [
            x for x in categorical_columns if x != result_column
        ]

        level_counts_score = CommonUtils.get_level_count_dict(
            self._data_frame,
            categorical_columns,
            self._dataframe_context.get_column_separator(),
            output_type="dict",
            dataType="spark")
        for key in level_counts_train:
            if key in level_counts_score:
                if level_counts_train[key] != level_counts_score[key]:
                    dataSanity = False
            else:
                dataSanity = False

        test_data_path = self._dataframe_context.get_input_file()
        score_data_path = self._dataframe_context.get_score_path(
        ) + "/data.csv"
        trained_model_path = self._dataframe_context.get_model_path()
        trained_model_path = "/".join(
            trained_model_path.split("/")[:-1]
        ) + "/" + self._slug + "/" + self._dataframe_context.get_model_for_scoring(
        )
        # score_summary_path = self._dataframe_context.get_score_path()+"/Summary/summary.json"

        pipelineModel = MLUtils.load_pipeline(trained_model_path)

        df = self._data_frame
        transformed = pipelineModel.transform(df)
        label_indexer_dict = MLUtils.read_string_indexer_mapping(
            trained_model_path, SQLctx)
        prediction_to_levels = udf(lambda x: label_indexer_dict[x],
                                   StringType())
        transformed = transformed.withColumn(
            result_column, prediction_to_levels(transformed.prediction))

        if "probability" in transformed.columns:
            probability_dataframe = transformed.select(
                [result_column, "probability"]).toPandas()
            probability_dataframe = probability_dataframe.rename(
                index=str, columns={result_column: "predicted_class"})
            probability_dataframe[
                "predicted_probability"] = probability_dataframe[
                    "probability"].apply(lambda x: max(x))
            self._score_summary[
                "prediction_split"] = MLUtils.calculate_scored_probability_stats(
                    probability_dataframe)
            self._score_summary["result_column"] = result_column
            scored_dataframe = transformed.select(
                categorical_columns + time_dimension_columns +
                numerical_columns + [result_column, "probability"]).toPandas()
            scored_dataframe['predicted_probability'] = probability_dataframe[
                "predicted_probability"].values
            # scored_dataframe = scored_dataframe.rename(index=str, columns={"predicted_probability": "probability"})
        else:
            self._score_summary["prediction_split"] = []
            self._score_summary["result_column"] = result_column
            scored_dataframe = transformed.select(categorical_columns +
                                                  time_dimension_columns +
                                                  numerical_columns +
                                                  [result_column]).toPandas()

        labelMappingDict = self._dataframe_context.get_label_map()
        if score_data_path.startswith("file"):
            score_data_path = score_data_path[7:]
        scored_dataframe.to_csv(score_data_path, header=True, index=False)

        uidCol = self._dataframe_context.get_uid_column()
        if uidCol == None:
            uidCols = self._metaParser.get_suggested_uid_columns()
            if len(uidCols) > 0:
                uidCol = uidCols[0]
        uidTableData = []
        predictedClasses = list(scored_dataframe[result_column].unique())
        if uidCol:
            if uidCol in df.columns:
                for level in predictedClasses:
                    levelDf = scored_dataframe[scored_dataframe[result_column]
                                               == level]
                    levelDf = levelDf[[
                        uidCol, "predicted_probability", result_column
                    ]]
                    levelDf.sort_values(by="predicted_probability",
                                        ascending=False,
                                        inplace=True)
                    levelDf["predicted_probability"] = levelDf[
                        "predicted_probability"].apply(
                            lambda x: humanize.apnumber(x * 100) + "%"
                            if x * 100 >= 10 else str(int(x * 100)) + "%")
                    uidTableData.append(levelDf[:5])
                uidTableData = pd.concat(uidTableData)
                uidTableData = [list(arr) for arr in list(uidTableData.values)]
                uidTableData = [[uidCol, "Probability", result_column]
                                ] + uidTableData
                uidTable = TableData()
                uidTable.set_table_width(25)
                uidTable.set_table_data(uidTableData)
                uidTable.set_table_type("normalHideColumn")
                self._result_setter.set_unique_identifier_table(
                    json.loads(
                        CommonUtils.convert_python_object_to_json(uidTable)))

        self._completionStatus += self._scriptWeightDict[self._analysisName][
            "total"] * self._scriptStages["prediction"]["weight"] / 10
        progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\
                                    "prediction",\
                                    "info",\
                                    self._scriptStages["prediction"]["summary"],\
                                    self._completionStatus,\
                                    self._completionStatus)
        CommonUtils.save_progress_message(self._messageURL, progressMessage)
        self._dataframe_context.update_completion_status(
            self._completionStatus)

        print("STARTING DIMENSION ANALYSIS ...")
        columns_to_keep = []
        columns_to_drop = []

        columns_to_keep = self._dataframe_context.get_score_consider_columns()

        if len(columns_to_keep) > 0:
            columns_to_drop = list(set(df.columns) - set(columns_to_keep))
        else:
            columns_to_drop += ["predicted_probability"]

        scored_df = transformed.select(categorical_columns +
                                       time_dimension_columns +
                                       numerical_columns + [result_column])
        columns_to_drop = [
            x for x in columns_to_drop if x in scored_df.columns
        ]
        modified_df = scored_df.select(
            [x for x in scored_df.columns if x not in columns_to_drop])
        resultColLevelCount = dict(
            modified_df.groupby(result_column).count().collect())
        self._metaParser.update_column_dict(
            result_column, {
                "LevelCount": resultColLevelCount,
                "numberOfUniqueValues": len(resultColLevelCount.keys())
            })
        self._dataframe_context.set_story_on_scored_data(True)

        self._dataframe_context.update_consider_columns(columns_to_keep)
        df_helper = DataFrameHelper(modified_df, self._dataframe_context,
                                    self._metaParser)
        df_helper.set_params()
        spark_scored_df = df_helper.get_data_frame()

        if len(predictedClasses) >= 2:
            try:
                fs = time.time()
                df_decision_tree_obj = DecisionTrees(
                    spark_scored_df,
                    df_helper,
                    self._dataframe_context,
                    self._spark,
                    self._metaParser,
                    scriptWeight=self._scriptWeightDict,
                    analysisName=self._analysisName).test_all(
                        dimension_columns=[result_column])
                narratives_obj = CommonUtils.as_dict(
                    DecisionTreeNarrative(result_column,
                                          df_decision_tree_obj,
                                          self._dataframe_helper,
                                          self._dataframe_context,
                                          self._metaParser,
                                          self._result_setter,
                                          story_narrative=None,
                                          analysisName=self._analysisName,
                                          scriptWeight=self._scriptWeightDict))
                print(narratives_obj)
            except Exception as e:
                print("DecisionTree Analysis Failed ", str(e))
        else:
            data_dict = {
                "npred": len(predictedClasses),
                "nactual": len(labelMappingDict.values())
            }

            if data_dict["nactual"] > 2:
                levelCountDict[predictedClasses[0]] = resultColLevelCount[
                    predictedClasses[0]]
                levelCountDict["Others"] = sum([
                    v for k, v in resultColLevelCount.items()
                    if k != predictedClasses[0]
                ])
            else:
                levelCountDict = resultColLevelCount
                otherClass = list(
                    set(labelMappingDict.values()) - set(predictedClasses))[0]
                levelCountDict[otherClass] = 0

                print(levelCountDict)

            total = float(
                sum([x for x in levelCountDict.values() if x != None]))
            levelCountTuple = [({
                "name":
                k,
                "count":
                v,
                "percentage":
                humanize.apnumber(v * 100 / total) + "%"
            }) for k, v in levelCountDict.items() if v != None]
            levelCountTuple = sorted(levelCountTuple,
                                     key=lambda x: x["count"],
                                     reverse=True)
            data_dict["blockSplitter"] = "|~NEWBLOCK~|"
            data_dict["targetcol"] = result_column
            data_dict["nlevel"] = len(levelCountDict.keys())
            data_dict["topLevel"] = levelCountTuple[0]
            data_dict["secondLevel"] = levelCountTuple[1]
            maincardSummary = NarrativesUtils.get_template_output(
                "/apps/", 'scorewithoutdtree.html', data_dict)

            main_card = NormalCard()
            main_card_data = []
            main_card_narrative = NarrativesUtils.block_splitter(
                maincardSummary, "|~NEWBLOCK~|")
            main_card_data += main_card_narrative

            chartData = NormalChartData([levelCountDict]).get_data()
            chartJson = ChartJson(data=chartData)
            chartJson.set_title(result_column)
            chartJson.set_chart_type("donut")
            mainCardChart = C3ChartData(data=chartJson)
            mainCardChart.set_width_percent(33)
            main_card_data.append(mainCardChart)

            uidTable = self._result_setter.get_unique_identifier_table()
            if uidTable != None:
                main_card_data.append(uidTable)
            main_card.set_card_data(main_card_data)
            main_card.set_card_name(
                "Predicting Key Drivers of {}".format(result_column))
            self._result_setter.set_score_dtree_cards([main_card], {})
# Example #21
    def fit(self, output_column, input_columns=None):
        """Fit an elastic-net linear regression (Spark ML) predicting
        ``output_column`` from ``input_columns``.

        Parameters
        ----------
        output_column : str
            Target column; must be one of the helper's numeric (measure)
            columns.
        input_columns : list[str] or None
            Predictor columns; defaults to every numeric column except the
            target. May be truncated by the analysis config key
            ``"noOfColumnsToUse"``.

        Returns
        -------
        RegressionResult
            Populated with intercept, coefficients, p-values, RMSE, R^2 and
            an (empty-valued) sample-data dict keyed by input column.

        Raises
        ------
        BIException
            If the output column or any input column is not a measure column.
        """
        print("linear regression fit started")
        if output_column not in self._dataframe_helper.get_numeric_columns():
            raise BIException('Output column: %s is not a measure column' %
                              (output_column, ))

        if input_columns is None:
            input_columns = list(
                set(self._dataframe_helper.get_numeric_columns()) -
                {output_column})

        # Optionally cap the number of predictors per analysis settings.
        nColsToUse = self._analysisDict[self._analysisName]["noOfColumnsToUse"]
        if nColsToUse is not None:
            input_columns = input_columns[:nColsToUse]
        if len(
                set(input_columns) -
                set(self._dataframe_helper.get_numeric_columns())) != 0:
            raise BIException(
                'At least one of the input columns %r is not a measure column'
                % (input_columns, ))

        all_measures = input_columns + [output_column]
        print(all_measures)
        measureDf = self._data_frame.select(all_measures)
        # elasticNetParam=1.0 => pure L1 (lasso) penalty with project-wide
        # defaults for iterations / regularization strength.
        lr = LR(maxIter=LinearRegression.MAX_ITERATIONS,
                regParam=LinearRegression.REGULARIZATION_PARAM,
                elasticNetParam=1.0,
                labelCol=LinearRegression.LABEL_COLUMN_NAME,
                featuresCol=LinearRegression.FEATURES_COLUMN_NAME)

        st = time.time()
        pipeline = MLUtils.create_pyspark_ml_pipeline(input_columns, [],
                                                      output_column)
        pipelineModel = pipeline.fit(measureDf)
        training_df = pipelineModel.transform(measureDf)
        training_df = training_df.withColumn("label",
                                             training_df[output_column])
        print("time taken to create training_df", time.time() - st)
        st = time.time()
        lr_model = lr.fit(training_df)
        lr_summary = lr_model.evaluate(training_df)
        print("lr model summary", time.time() - st)
        sample_data_dict = {}
        for input_col in input_columns:
            sample_data_dict[input_col] = None

        coefficients = [
            float(val) if val is not None else None
            for val in lr_model.coefficients.values
        ]
        # Spark does not expose p-values when a regularization penalty is
        # active; fall back to a None placeholder per coefficient.
        try:
            p_values = [
                float(val) if val is not None else None
                for val in lr_model.summary.pValues
            ]
        except Exception:
            p_values = [None] * len(coefficients)
        regression_result = RegressionResult(output_column,
                                             list(set(input_columns)))
        regression_result.set_params(intercept=float(lr_model.intercept),
                                     coefficients=coefficients,
                                     p_values=p_values,
                                     rmse=float(lr_summary.rootMeanSquaredError),
                                     r2=float(lr_summary.r2),
                                     sample_data_dict=sample_data_dict)

        # Progress bookkeeping: bump completion status and notify the message
        # service, unless elasticity runs asked for silence.
        self._completionStatus = self._dataframe_context.get_completion_status()
        self._completionStatus += self._scriptWeightDict[
            self._analysisName]["script"]
        progressMessage = CommonUtils.create_progress_message_object(
            self._analysisName,
            "regressionTrainingEnd",
            "info",
            self._scriptStages["regressionTrainingEnd"]["summary"],
            self._completionStatus,
            self._completionStatus)
        if self._ignoreRegressionElasticityMessages != True:
            CommonUtils.save_progress_message(
                self._messageURL,
                progressMessage,
                ignore=self._ignoreRegressionElasticityMessages)
            self._dataframe_context.update_completion_status(
                self._completionStatus)

        return regression_result
# Example #22
    def __init__(self, data_frame, spark, dataframe_context):
        """Initialize the metadata analyzer: classify the dataframe's columns
        by abstract type, record dataset size, and report the "schema" stage
        progress to the message service.

        Parameters
        ----------
        data_frame : pyspark.sql.DataFrame
            The dataset to profile.
        spark : SparkSession
            Active Spark session used by downstream stats computations.
        dataframe_context : project context object
            Supplies completion status, message URL and ignore-message flag.
        """
        self._dataframe_context = dataframe_context
        self._completionStatus = self._dataframe_context.get_completion_status(
        )
        self._start_time = time.time()
        self._analysisName = "metadata"
        self._messageURL = self._dataframe_context.get_message_url()
        self._ignoreMsgFlag = self._dataframe_context.get_metadata_ignore_msg_flag(
        )
        # Per-stage progress weights; summaries are user-facing messages.
        self._scriptStages = {
            "schema": {
                "summary": "Loaded the data and Schema is Run",
                "weight": 12
            },
            "sampling": {
                "summary": "Sampling the dataframe",
                "weight": 5
            },
            "measurestats": {
                "summary": "calculating stats for measure columns",
                "weight": 25
            },
            "dimensionstats": {
                "summary": "calculating stats for dimension columns",
                "weight": 25
            },
            "timedimensionstats": {
                "summary": "calculating stats for time dimension columns",
                "weight": 5
            },
            "suggestions": {
                "summary": "Ignore and Date Suggestions",
                "weight": 25
            },
        }

        self._binned_stat_flag = True
        self._level_count_flag = True
        self._stripTimestamp = True
        self._data_frame = data_frame
        self._spark = spark
        self._total_columns = len(
            [field.name for field in self._data_frame.schema.fields])
        # NOTE: triggers a Spark action (full count of the dataframe).
        self._total_rows = self._data_frame.count()
        # Cap dimension cardinality at sqrt(nRows), never more than 200.
        self._max_levels = min(200, round(self._total_rows**0.5))

        self._percentage_columns = []
        # Bucket columns by the abstract data type derived from their Spark
        # schema field type.
        self._numeric_columns = [
            field.name for field in self._data_frame.schema.fields
            if ColumnType(type(field.dataType)).get_abstract_data_type() ==
            ColumnType.MEASURE
        ]
        self._string_columns = [
            field.name for field in self._data_frame.schema.fields
            if ColumnType(type(field.dataType)).get_abstract_data_type() ==
            ColumnType.DIMENSION
        ]
        self._timestamp_columns = [
            field.name for field in self._data_frame.schema.fields
            if ColumnType(type(field.dataType)).get_abstract_data_type() ==
            ColumnType.TIME_DIMENSION
        ]
        self._boolean_columns = [
            field.name for field in self._data_frame.schema.fields
            if ColumnType(type(field.dataType)).get_abstract_data_type() ==
            ColumnType.BOOLEAN
        ]
        self._real_columns = [
            field.name
            for field in self._data_frame.schema.fields if ColumnType(
                type(field.dataType)).get_actual_data_type() == ColumnType.REAL
        ]
        self._column_type_dict = {}
        # Aggregate size summary; None entries are filled in later stages.
        self._dataSize = {
            "nRows": self._total_rows,
            "nCols": self._total_columns,
            "nBooleans": None,
            "nMeasures": None,
            "nDimensions": None,
            "nTimeDimensions": None,
            "dimensionLevelCountDict": {},
            "totalLevels": None
        }
        self.update_column_type_dict()

        time_taken_schema = time.time() - self._start_time
        print("schema rendering takes", time_taken_schema)

        # Report completion of the "schema" stage to the message service.
        self._completionStatus += self._scriptStages["schema"]["weight"]
        progressMessage = CommonUtils.create_progress_message_object(
            self._analysisName,
            "schema",
            "info",
            self._scriptStages["schema"]["summary"],
            self._completionStatus,
            self._completionStatus)
        CommonUtils.save_progress_message(self._messageURL,
                                          progressMessage,
                                          ignore=self._ignoreMsgFlag)
        self._dataframe_context.update_completion_status(
            self._completionStatus)