# Example 1
 def test_extra_fields_pd(self):
     """Write a table through a wide schema, then read it back through a
     narrower schema that omits one data field."""
     # Wide schema: primary key "a", data fields "b" and "c".
     wide_pdf = PanDatFactory(boger=[["a"], ["b", "c"]])
     wide_dat = wide_pdf.PanDat(boger=pd.DataFrame({
         "a": [1, 2, 3],
         "b": [4, 5, 6],
         "c": ['a', 'b', 'c']
     }))
     schema = "test_pd_extra_fields"
     forced_types = {("boger", "c"): "text", ("boger", "a"): "float"}
     wide_pdf.pgsql.write_schema(self.engine, schema,
                                 forced_field_types=forced_types)
     wide_pdf.pgsql.write_data(wide_dat, self.engine, schema)
     # Narrow schema drops field "c" entirely; reading must still work.
     narrow_pdf = PanDatFactory(boger=[["a"], ["b"]])
     narrow_dat = narrow_pdf.pgsql.create_pan_dat(self.engine, schema)
     self.assertTrue(
         list(narrow_dat.boger["a"]) == [1.0, 2.0, 3.0]
         and list(narrow_dat.boger["b"]) == [4.0, 5.0, 6.0])
     # Overwrite through the narrow schema, then re-read with the wide one.
     replacement = narrow_pdf.PanDat(boger=pd.DataFrame({
         "a": [10, 300],
         "b": [40, 60]
     }))
     narrow_pdf.pgsql.write_data(replacement, self.engine, schema)
     wide_dat = wide_pdf.pgsql.create_pan_dat(self.engine, schema)
     self.assertTrue(
         list(wide_dat.boger["a"]) == [10, 300]
         and list(wide_dat.boger["b"]) == [40, 60])
     # Column "c" still exists in the db; every row got the same filler.
     self.assertTrue(len(set(wide_dat.boger["c"])) == 1)
# Example 2
    def testNullsPd(self):
        """Round-trip None values through postgres, then exercise the
        infinity-io-flag handling that maps db NULLs to +/-inf on read."""
        schema = test_schema + "_bool_defaults_pd"
        factory = PanDatFactory(table=[[], ["field one", "field two"]])
        for field in ("field one", "field two"):
            factory.set_data_type("table", field, nullable=True)
        dat = factory.PanDat(table={
            "field one": [None, 200, 0, 300, 400],
            "field two": [100, 109, 300, None, 0]
        })
        factory.pgsql.write_schema(self.engine,
                                   schema,
                                   include_ancillary_info=False)
        factory.pgsql.write_data(dat, self.engine, schema)

        round_trip = factory.pgsql.create_pan_dat(self.engine, schema)
        self.assertTrue(
            factory._same_data(dat, round_trip,
                               nans_are_same_for_data_rows=True))

        # With infinity_io_flag None and a max=inf data type, the NULLs
        # written above appear to read back as +inf (confirmed by the
        # equality check below).
        factory = PanDatFactory(table=[["field one"], ["field two"]])
        for field in ("field one", "field two"):
            factory.set_data_type("table", field, max=float("inf"),
                                  inclusive_max=True)
        factory.set_infinity_io_flag(None)
        inf_dat = factory.PanDat(table={
            "field one": [float("inf"), 200, 0, 300, 400],
            "field two": [100, 109, 300, float("inf"), 0]
        })
        round_trip = factory.pgsql.create_pan_dat(self.engine, schema)

        self.assertTrue(factory._same_data(inf_dat, round_trip))
        factory.pgsql.write_data(inf_dat, self.engine, schema)
        round_trip = factory.pgsql.create_pan_dat(self.engine, schema)
        self.assertTrue(factory._same_data(inf_dat, round_trip))

        # When the data type instead allows min=-inf, the same NULLs
        # read back as -inf, so the +inf PanDat no longer matches.
        factory = PanDatFactory(table=[["field one"], ["field two"]])
        for field in ("field one", "field two"):
            factory.set_data_type("table",
                                  field,
                                  min=-float("inf"),
                                  inclusive_min=True)
        factory.set_infinity_io_flag(None)
        round_trip = factory.pgsql.create_pan_dat(self.engine, schema)
        self.assertFalse(factory._same_data(inf_dat, round_trip))
        inf_dat = factory.PanDat(table={
            "field one": [-float("inf"), 200, 0, 300, 400],
            "field two": [100, 109, 300, -float("inf"), 0]
        })
        self.assertTrue(factory._same_data(inf_dat, round_trip))
class Outlier(Action):
    """Flags outliers in a column with numerical values (not categorical).

    Class not meant for multi-variate outlier detection; can be extended
    in the future.

    Supported 'Method' values in the ``action_settings`` config table:

    * ``zScore`` - flag rows whose \\|z-score\\| exceeds ``float(Value)``
    * ``IQR``    - flag rows outside ``[q25 - m*iqr, q75 + m*iqr]`` where
      ``m = float(Value)``
    * ``range``  - flag rows outside the ``"lb,ub"`` bounds in ``Value``

    Flags are OR-ed into 'Flag Column', so flags set by earlier config
    rows on the same table are preserved.
    """

    def __init__(self):
        # One config row per (table, column) outlier check.
        self.config_schema = PanDatFactory(action_settings=[
            [], ['Table', 'Column', 'Method', 'Value', 'Flag Column']
        ])

        self.config_defaults = self.config_schema.PanDat(
            action_settings=[{
                'Table': 'data',
                'Column': 'column1',
                'Method': 'zScore',
                'Value': '3',
                'Flag Column': 'flag'
            }])

    @staticmethod
    def _or_into_flag(table_df, flag_column, outlier_mask):
        """OR *outlier_mask* into *flag_column* in place, preserving any
        flags already set by earlier config rows on the same table."""
        table_df[flag_column] = table_df[flag_column].astype(bool) | outlier_mask

    def execute_action(self):
        """
        Performs outlier/anomaly detection on fields/columns based on zScore,
        inter-quartile range or bounds, writing the boolean result into each
        row's flag column.
        """

        config_dfs = self.read_data('config_schema')

        # parsing action config table into a dataframe
        action_config_df = config_dfs.action_settings

        # Collect, per table, the set of columns to analyze and the set of
        # flag columns to populate.
        # BUG FIX: the original only recorded a table's flag column the
        # FIRST time the table appeared, so a later config row for the same
        # table with a different 'Flag Column' was silently dropped from
        # the data schema.  str() is now applied consistently to both
        # branches as well (the original coerced only on the update path).
        table_col_dict = {}
        table_flag_col_dict = {}
        for _, series in action_config_df.iterrows():
            table = series['Table']
            table_col_dict.setdefault(table, set()).add(str(series['Column']))
            table_flag_col_dict.setdefault(table, set()).add(
                str(series['Flag Column']))

        # Sorted for a deterministic field order; subtract the analyzed
        # columns so a flag column that doubles as an analyzed column is
        # not listed twice in the schema.
        self.data_schema = PanDatFactory(
            **{
                table: [[],
                        sorted(cols) +
                        sorted(table_flag_col_dict[table] - cols)]
                for table, cols in table_col_dict.items()
            })

        table_dfs = self.read_data('data_schema')

        # processing data, one config row at a time
        for _, action in action_config_df.iterrows():
            method = action['Method']
            if method not in ('zScore', 'IQR', 'range'):
                print("Error: Please enter a valid value in Method to use ("
                      "zScore, IQR, range).")
                continue

            table_name = action['Table']
            col_to_analyze = action['Column']
            flag_column = action['Flag Column']
            table_df = getattr(table_dfs, table_name)
            # One vectorized conversion replaces the per-item float() loops.
            values = table_df[col_to_analyze].astype(float)

            if method == 'zScore':
                z_threshold = float(action['Value'])
                # np.std/np.mean keep the original ddof=0 semantics
                # (pandas Series.std defaults to ddof=1).
                std_dev = np.std(values.to_numpy())
                mean = np.mean(values.to_numpy())
                if std_dev == 0:
                    # All values equal the mean -> no outliers.  The
                    # original divided by zero here, relying on nan
                    # comparisons (with warnings) to flag nothing.
                    self._or_into_flag(table_df, flag_column, False)
                else:
                    self._or_into_flag(
                        table_df, flag_column,
                        ((values - mean) / std_dev).abs() > z_threshold)

            elif method == 'IQR':
                iqr_multiplier = float(action['Value'])
                q75, q25 = np.percentile(values.to_numpy(), [75, 25])
                iqr = q75 - q25
                self._or_into_flag(
                    table_df, flag_column,
                    (values > q75 + iqr_multiplier * iqr)
                    | (values < q25 - iqr_multiplier * iqr))

            else:  # method == 'range'
                lb, ub = (float(bound)
                          for bound in action['Value'].split(','))
                self._or_into_flag(table_df, flag_column,
                                   (values > ub) | (values < lb))

        # writing data
        self.write_data(table_dfs)

        # NOTE(review): exit() raises SystemExit and terminates the hosting
        # process; presumably intentional for this action framework, but
        # confirm -- a library method normally should not call exit().
        exit()