Example #1
    def test_select_dtypes_exclude_using_scalars(self):
        df = DataFrame({'a': list('abc'),
                        'b': list(range(1, 4)),
                        'c': np.arange(3, 6).astype('u1'),
                        'd': np.arange(4.0, 7.0, dtype='float64'),
                        'e': [True, False, True],
                        'f': pd.Categorical(list('abc')),
                        'g': pd.date_range('20130101', periods=3),
                        'h': pd.date_range('20130101', periods=3,
                                           tz='US/Eastern'),
                        'i': pd.date_range('20130101', periods=3,
                                           tz='CET'),
                        'j': pd.period_range('2013-01', periods=3,
                                             freq='M'),
                        'k': pd.timedelta_range('1 day', periods=3)})

        ri = df.select_dtypes(exclude=np.number)
        ei = df[['a', 'e', 'f', 'g', 'h', 'i', 'j']]
        assert_frame_equal(ri, ei)

        ri = df.select_dtypes(exclude='category')
        ei = df[['a', 'b', 'c', 'd', 'e', 'g', 'h', 'i', 'j', 'k']]
        assert_frame_equal(ri, ei)

        pytest.raises(NotImplementedError,
                      lambda: df.select_dtypes(exclude='period'))
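A minimal standalone sketch (not from the pandas test suite) of the same exclude semantics; it assumes a recent pandas, where select_dtypes accepts scalar selectors as well as lists:

import numpy as np
import pandas as pd

df = pd.DataFrame({'num': [1, 2, 3],
                   'txt': list('abc'),
                   'cat': pd.Categorical(list('abc'))})

# A scalar selector behaves like a one-element list of selectors.
print(df.select_dtypes(exclude=np.number).columns.tolist())   # ['txt', 'cat']
print(df.select_dtypes(exclude='category').columns.tolist())  # ['num', 'txt']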
Example #2
    def test_select_dtypes_include_exclude_mixed_scalars_lists(self):
        df = DataFrame({'a': list('abc'),
                        'b': list(range(1, 4)),
                        'c': np.arange(3, 6).astype('u1'),
                        'd': np.arange(4.0, 7.0, dtype='float64'),
                        'e': [True, False, True],
                        'f': pd.Categorical(list('abc')),
                        'g': pd.date_range('20130101', periods=3),
                        'h': pd.date_range('20130101', periods=3,
                                           tz='US/Eastern'),
                        'i': pd.date_range('20130101', periods=3,
                                           tz='CET'),
                        'j': pd.period_range('2013-01', periods=3,
                                             freq='M'),
                        'k': pd.timedelta_range('1 day', periods=3)})

        ri = df.select_dtypes(include=np.number,
                              exclude=['floating', 'timedelta'])
        ei = df[['b', 'c']]
        assert_frame_equal(ri, ei)

        ri = df.select_dtypes(include=[np.number, 'category'],
                              exclude='floating')
        ei = df[['b', 'c', 'f', 'k']]
        assert_frame_equal(ri, ei)
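When mixing include and exclude as above, the two selector sets must stay disjoint; a hedged sketch of what overlapping selectors do (the exact message may vary by pandas version):

import pandas as pd

df = pd.DataFrame({'n': [1, 2], 's': ['a', 'b']})
try:
    df.select_dtypes(include='int64', exclude='int64')
except ValueError as err:
    print(err)  # e.g. "include and exclude overlap on frozenset({dtype('int64')})"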
Example #3
 def test_select_dtypes_bad_arg_raises(self):
     df = DataFrame({'a': list('abc'),
                     'g': list(u('abc')),
                     'b': list(range(1, 4)),
                     'c': np.arange(3, 6).astype('u1'),
                     'd': np.arange(4.0, 7.0, dtype='float64'),
                     'e': [True, False, True],
                     'f': pd.date_range('now', periods=3).values})
     with tm.assertRaisesRegexp(TypeError, 'data type.*not understood'):
         df.select_dtypes(['blargy, blarg, blarg'])
Example #4
    def test_select_dtypes_bad_datetime64(self):
        df = DataFrame({'a': list('abc'),
                        'b': list(range(1, 4)),
                        'c': np.arange(3, 6).astype('u1'),
                        'd': np.arange(4.0, 7.0, dtype='float64'),
                        'e': [True, False, True],
                        'f': pd.date_range('now', periods=3).values})
        with tm.assert_raises_regex(ValueError, '.+ is too specific'):
            df.select_dtypes(include=['datetime64[D]'])

        with tm.assert_raises_regex(ValueError, '.+ is too specific'):
            df.select_dtypes(exclude=['datetime64[as]'])
Example #5
    def test_select_dtypes_str_raises(self, dtype, arg):
        df = DataFrame({"a": list("abc"),
                        "g": list(u("abc")),
                        "b": list(range(1, 4)),
                        "c": np.arange(3, 6).astype("u1"),
                        "d": np.arange(4.0, 7.0, dtype="float64"),
                        "e": [True, False, True],
                        "f": pd.date_range("now", periods=3).values})
        msg = "string dtypes are not allowed"
        kwargs = {arg: [dtype]}

        with tm.assert_raises_regex(TypeError, msg):
            df.select_dtypes(**kwargs)
Example #6
    def test_select_dtypes_include(self):
        df = DataFrame({'a': list('abc'),
                        'b': list(range(1, 4)),
                        'c': np.arange(3, 6).astype('u1'),
                        'd': np.arange(4.0, 7.0, dtype='float64'),
                        'e': [True, False, True],
                        'f': pd.Categorical(list('abc'))})
        ri = df.select_dtypes(include=[np.number])
        ei = df[['b', 'c', 'd']]
        assert_frame_equal(ri, ei)

        ri = df.select_dtypes(include=[np.number, 'category'])
        ei = df[['b', 'c', 'd', 'f']]
        assert_frame_equal(ri, ei)
Example #7
    def test_select_dtypes_not_an_attr_but_still_valid_dtype(self):
        df = DataFrame({'a': list('abc'),
                        'b': list(range(1, 4)),
                        'c': np.arange(3, 6).astype('u1'),
                        'd': np.arange(4.0, 7.0, dtype='float64'),
                        'e': [True, False, True],
                        'f': pd.date_range('now', periods=3).values})
        df['g'] = df.f.diff()
        assert not hasattr(np, 'u8')
        r = df.select_dtypes(include=['i8', 'O'], exclude=['timedelta'])
        e = df[['a', 'b']]
        assert_frame_equal(r, e)

        r = df.select_dtypes(include=['i8', 'O', 'timedelta64[ns]'])
        e = df[['a', 'b', 'g']]
        assert_frame_equal(r, e)
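The shorthand selectors used above ('i8', 'O') are plain NumPy dtype strings, which is the point of the hasattr check; a small sketch, assuming default platform int64:

import numpy as np
import pandas as pd

df = pd.DataFrame({'a': list('abc'), 'b': np.arange(3, dtype='i8')})
# Dtype shorthand works even when numpy has no attribute of that name (cf. 'u8' above).
print(df.select_dtypes(include=['i8', 'O']).columns.tolist())  # ['a', 'b']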
Example #8
 def test_select_dtypes_exclude_using_list_like(self):
     df = DataFrame({'a': list('abc'),
                     'b': list(range(1, 4)),
                     'c': np.arange(3, 6).astype('u1'),
                     'd': np.arange(4.0, 7.0, dtype='float64'),
                     'e': [True, False, True]})
     re = df.select_dtypes(exclude=[np.number])
     ee = df[['a', 'e']]
     assert_frame_equal(re, ee)
Example #9
    def test_select_dtypes_exclude_include_using_list_like(self):
        df = DataFrame({'a': list('abc'),
                        'b': list(range(1, 4)),
                        'c': np.arange(3, 6).astype('u1'),
                        'd': np.arange(4.0, 7.0, dtype='float64'),
                        'e': [True, False, True],
                        'f': pd.date_range('now', periods=3).values})
        exclude = np.datetime64,
        include = np.bool_, 'integer'
        r = df.select_dtypes(include=include, exclude=exclude)
        e = df[['b', 'c', 'e']]
        assert_frame_equal(r, e)

        exclude = 'datetime',
        include = 'bool', 'int64', 'int32'
        r = df.select_dtypes(include=include, exclude=exclude)
        e = df[['b', 'e']]
        assert_frame_equal(r, e)
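As the trailing commas above show, include/exclude accept any non-string sequence, tuples included; a minimal sketch:

import numpy as np
import pandas as pd

df = pd.DataFrame({'b': [1, 2],
                   'e': [True, False],
                   'f': pd.to_datetime(['2013-01-01', '2013-01-02'])})
r = df.select_dtypes(include=(np.bool_, 'integer'), exclude=(np.datetime64,))
print(r.columns.tolist())  # ['b', 'e']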
Example #10
 def test_select_dtypes_str_raises(self):
     df = DataFrame({'a': list('abc'),
                     'g': list(u('abc')),
                     'b': list(range(1, 4)),
                     'c': np.arange(3, 6).astype('u1'),
                     'd': np.arange(4.0, 7.0, dtype='float64'),
                     'e': [True, False, True],
                     'f': pd.date_range('now', periods=3).values})
     string_dtypes = set((str, 'str', np.string_, 'S1',
                          'unicode', np.unicode_, 'U1'))
     try:
         string_dtypes.add(unicode)
     except NameError:
         pass
     for dt in string_dtypes:
         with tm.assert_raises_regex(TypeError,
                                     'string dtypes are not allowed'):
             df.select_dtypes(include=[dt])
         with tm.assert_raises_regex(TypeError,
                                     'string dtypes are not allowed'):
             df.select_dtypes(exclude=[dt])
Example #11
 def test_select_dtypes_raises_on_string(self):
     df = DataFrame({'a': list('abc'), 'b': list(range(1, 4))})
     with tm.assertRaisesRegexp(TypeError, 'include and exclude .+ non-'):
         df.select_dtypes(include='object')
     with tm.assertRaisesRegexp(TypeError, 'include and exclude .+ non-'):
         df.select_dtypes(exclude='object')
     with tm.assertRaisesRegexp(TypeError, 'include and exclude .+ non-'):
         df.select_dtypes(include=int, exclude='object')
Example #12
    def deserialize(self, item, force_bytes_to_unicode=False):
        index = self._index_from_records(item)
        column_fields = [x for x in item.dtype.names if x not in item.dtype.metadata['index']]
        multi_column = item.dtype.metadata.get('multi_column')
        if len(item) == 0:
            rdata = item[column_fields] if len(column_fields) > 0 else None
            if multi_column is not None:
                columns = MultiIndex.from_arrays(multi_column["values"], names=multi_column["names"])
                return DataFrame(rdata, index=index, columns=columns)
            else:
                return DataFrame(rdata, index=index)

        columns = item.dtype.metadata['columns']
        df = DataFrame(data=item[column_fields], index=index, columns=columns)

        if multi_column is not None:
            df.columns = MultiIndex.from_arrays(multi_column["values"], names=multi_column["names"])

        if force_bytes_to_unicode:
            # This is needed because a py2 'str' read back in py3 arrives as 'bytes', which breaks
            # the workflow of people migrating to py3. See https://github.com/manahl/arctic/issues/598
            # This should not be used in a normal flow; instead, write unicode strings
            # if you want to work with str in py3.

            for c in df.select_dtypes(object):
                # Unlike the index, the conversion does not use astype, as pandas has a bug where it
                # tries to convert the data columns with a plain unicode cast; the object here would be
                # bytes, e.g. b'abc', which gets converted to u"b'abc'", i.e. it includes the b prefix
                # as well! This generally happens when str() is called without specifying the encoding,
                # e.g. str(b'abc') -> "b'abc'"; the fix is to supply the encoding,
                # i.e. str(b'abc', 'utf-8') -> "abc"
                if type(df[c].iloc[0]) == bytes:
                    df[c] = df[c].str.decode('utf-8')

            if isinstance(df.index, MultiIndex):
                unicode_indexes = []
                # MultiIndex requires a conversion at each level.
                for level in range(len(df.index.levels)):
                    _index = df.index.get_level_values(level)
                    if isinstance(_index[0], bytes):
                        _index = _index.astype('unicode')
                    unicode_indexes.append(_index)
                df.index = unicode_indexes
            else:
                if type(df.index[0]) == bytes:
                    df.index = df.index.astype('unicode')

            if type(df.columns[0]) == bytes:
                df.columns = df.columns.astype('unicode')

        return df
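A self-contained sketch of the bytes-decoding pattern above, with a hypothetical frame standing in for data written under py2 and read back under py3:

import pandas as pd

df = pd.DataFrame({'raw': [b'abc', b'def'], 'n': [1, 2]})
for c in df.select_dtypes(object):   # iterating a DataFrame yields its column names
    if isinstance(df[c].iloc[0], bytes):
        df[c] = df[c].str.decode('utf-8')
print(df['raw'].tolist())  # ['abc', 'def']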
Example #13
    def test_select_dtypes_include(self):
        df = DataFrame({'a': list('abc'),
                        'b': list(range(1, 4)),
                        'c': np.arange(3, 6).astype('u1'),
                        'd': np.arange(4.0, 7.0, dtype='float64'),
                        'e': [True, False, True],
                        'f': pd.Categorical(list('abc')),
                        'g': pd.date_range('20130101', periods=3),
                        'h': pd.date_range('20130101', periods=3,
                                           tz='US/Eastern'),
                        'i': pd.date_range('20130101', periods=3,
                                           tz='CET'),
                        'j': pd.period_range('2013-01', periods=3,
                                             freq='M'),
                        'k': pd.timedelta_range('1 day', periods=3)})

        ri = df.select_dtypes(include=[np.number])
        ei = df[['b', 'c', 'd', 'k']]
        assert_frame_equal(ri, ei)

        ri = df.select_dtypes(include=[np.number], exclude=['timedelta'])
        ei = df[['b', 'c', 'd']]
        assert_frame_equal(ri, ei)

        ri = df.select_dtypes(include=[np.number, 'category'],
                              exclude=['timedelta'])
        ei = df[['b', 'c', 'd', 'f']]
        assert_frame_equal(ri, ei)

        ri = df.select_dtypes(include=['datetime'])
        ei = df[['g']]
        assert_frame_equal(ri, ei)

        ri = df.select_dtypes(include=['datetime64'])
        ei = df[['g']]
        assert_frame_equal(ri, ei)

        ri = df.select_dtypes(include=['datetimetz'])
        ei = df[['h', 'i']]
        assert_frame_equal(ri, ei)

        ri = df.select_dtypes(include=['timedelta'])
        ei = df[['k']]
        assert_frame_equal(ri, ei)

        self.assertRaises(NotImplementedError,
                          lambda: df.select_dtypes(include=['period']))
Example #14
    def test_select_dtypes_duplicate_columns(self):
        # GH20839
        odict = compat.OrderedDict
        df = DataFrame(odict([('a', list('abc')),
                              ('b', list(range(1, 4))),
                              ('c', np.arange(3, 6).astype('u1')),
                              ('d', np.arange(4.0, 7.0, dtype='float64')),
                              ('e', [True, False, True]),
                              ('f', pd.date_range('now', periods=3).values)]))
        df.columns = ['a', 'a', 'b', 'b', 'b', 'c']

        expected = DataFrame({'a': list(range(1, 4)),
                              'b': np.arange(3, 6).astype('u1')})

        result = df.select_dtypes(include=[np.number], exclude=['floating'])
        assert_frame_equal(result, expected)
Example #15
 def remove_discrete_variables_with_too_many_states(df: pd.DataFrame, num_states=30):
     # Boolean Series over the object columns: True when a column has at least num_states unique values
     column_names = df.select_dtypes(include=['object']).apply(lambda x: len(x.unique()) >= num_states)
     cols = list(set(df.columns.tolist()) - set(column_names[column_names].index.tolist()))
     return df[cols]
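Hypothetical usage of the helper above; an object column with as many states as rows is dropped, while everything else is kept (column order is not preserved because of the set arithmetic):

import pandas as pd

df = pd.DataFrame({'id': [str(i) for i in range(100)],  # 100 unique states -> dropped
                   'grade': ['a', 'b'] * 50,            # 2 states -> kept
                   'score': range(100)})                # numeric -> never considered
out = remove_discrete_variables_with_too_many_states(df)
print(sorted(out.columns))  # ['grade', 'score']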
Example #16
 def test_select_dtypes_empty(self):
     df = DataFrame({'a': list('abc'), 'b': list(range(1, 4))})
     msg = 'at least one of include or exclude must be nonempty'
     with pytest.raises(ValueError, match=msg):
         df.select_dtypes()
Example #17
    def calculate(self, reference_data: pd.DataFrame,
                  production_data: pd.DataFrame, column_mapping):
        if column_mapping:
            date_column = column_mapping.get('datetime')
            id_column = column_mapping.get('id')
            target_column = column_mapping.get('target')
            prediction_column = column_mapping.get('prediction')
            num_feature_names = column_mapping.get('numerical_features')
            target_names = column_mapping.get('target_names')
            if num_feature_names is None:
                num_feature_names = []
            else:
                num_feature_names = [
                    name for name in num_feature_names
                    if is_numeric_dtype(reference_data[name])
                ]

            cat_feature_names = column_mapping.get('categorical_features')
            if cat_feature_names is None:
                cat_feature_names = []
            else:
                cat_feature_names = [
                    name for name in cat_feature_names
                    if is_numeric_dtype(reference_data[name])
                ]

        else:
            date_column = 'datetime' if 'datetime' in reference_data.columns else None
            id_column = None
            target_column = 'target' if 'target' in reference_data.columns else None
            prediction_column = 'prediction' if 'prediction' in reference_data.columns else None

            utility_columns = [
                date_column, id_column, target_column, prediction_column
            ]

            num_feature_names = list(
                set(reference_data.select_dtypes([np.number]).columns) -
                set(utility_columns))
            cat_feature_names = list(
                set(reference_data.select_dtypes([object]).columns) -
                set(utility_columns))

            target_names = None

        if target_column is not None and prediction_column is not None:
            reference_data.replace([np.inf, -np.inf], np.nan, inplace=True)
            reference_data.dropna(axis=0, how='any', inplace=True)

            #plot confusion matrix
            conf_matrix = metrics.confusion_matrix(
                reference_data[target_column],
                reference_data[prediction_column])

            z = conf_matrix.astype(int)

            labels = target_names if target_names else sorted(
                set(reference_data[target_column]))

            # change each element of z to type string for annotations
            z_text = [[str(y) for y in x] for x in z]

            fig = ff.create_annotated_heatmap(z,
                                              x=labels,
                                              y=labels,
                                              annotation_text=z_text,
                                              colorscale='bluered',
                                              showscale=True)

            fig.update_layout(xaxis_title="Predicted value",
                              yaxis_title="Actual value")

            conf_matrix_json = json.loads(fig.to_json())

            self.wi = BaseWidgetInfo(
                title=self.title,
                type="big_graph",
                details="",
                alertStats=AlertStats(),
                alerts=[],
                alertsPosition="row",
                insights=[],
                size=1 if production_data is not None else 2,
                params={
                    "data": conf_matrix_json['data'],
                    "layout": conf_matrix_json['layout']
                },
                additionalGraphs=[],
            )
        else:
            self.wi = None
Example #18
    def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping): 
        if column_mapping:
            date_column = column_mapping.get('datetime')
            id_column = column_mapping.get('id')
            target_column = column_mapping.get('target')
            prediction_column = column_mapping.get('prediction')
            num_feature_names = column_mapping.get('numerical_features')
            if num_feature_names is None:
                num_feature_names = []
            else:
                num_feature_names = [name for name in num_feature_names if is_numeric_dtype(reference_data[name])] 

            cat_feature_names = column_mapping.get('categorical_features')
            if cat_feature_names is None:
                cat_feature_names = []
            else:
                cat_feature_names = [name for name in cat_feature_names if is_numeric_dtype(reference_data[name])] 
        
        else:
            date_column = 'datetime' if 'datetime' in reference_data.columns else None
            id_column = None
            target_column = 'target' if 'target' in reference_data.columns else None
            prediction_column = 'prediction' if 'prediction' in reference_data.columns else None

            utility_columns = [date_column, id_column, target_column, prediction_column]

            num_feature_names = list(set(reference_data.select_dtypes([np.number]).columns) - set(utility_columns))
            cat_feature_names = list(set(reference_data.select_dtypes([object]).columns) - set(utility_columns))

        if prediction_column is not None:
            #calculate output drift
            reference_data.replace([np.inf, -np.inf], np.nan, inplace=True)
            reference_data.dropna(axis=0, how='any', inplace=True)

            production_data.replace([np.inf, -np.inf], np.nan, inplace=True)
            production_data.dropna(axis=0, how='any', inplace=True)

            #ref_feature_vc = reference_data[prediction_column][np.isfinite(reference_data[prediction_column])].value_counts()
            #prod_feature_vc = production_data[prediction_column][np.isfinite(production_data[prediction_column])].value_counts()

            #keys = set(list(reference_data[prediction_column][np.isfinite(reference_data[prediction_column])].unique()) + 
            #    list(production_data[prediction_column][np.isfinite(production_data[prediction_column])].unique()))

            ref_feature_vc = reference_data[prediction_column].value_counts()
            prod_feature_vc = production_data[prediction_column].value_counts()

            keys = set(list(reference_data[prediction_column].unique()) + 
                list(production_data[prediction_column].unique()))

            ref_feature_dict = dict.fromkeys(keys, 0)
            for key, item in zip(ref_feature_vc.index, ref_feature_vc.values):
                ref_feature_dict[key] = item

            prod_feature_dict = dict.fromkeys(keys, 0)
            for key, item in zip(prod_feature_vc.index, prod_feature_vc.values):
                prod_feature_dict[key] = item

            f_exp = [value[1] for value in sorted(ref_feature_dict.items())]
            f_obs = [value[1] for value in sorted(prod_feature_dict.items())]

            pred_p_value = chisquare(f_exp, f_obs)[1]

            pred_sim_test = "detected" if pred_p_value < 0.05 else "not detected"

            #plot output distributions
            fig = go.Figure()
            
            fig.add_trace(go.Histogram(x=reference_data[prediction_column], 
                 marker_color=grey, opacity=0.6, nbinsx=10,  name='Reference', histnorm='probability'))

            fig.add_trace(go.Histogram(x=production_data[prediction_column],
                 marker_color=red, opacity=0.6,nbinsx=10, name='Current', histnorm='probability'))

            fig.update_layout(
                legend = dict(
                orientation="h",
                yanchor="bottom",
                y=1.02,
                xanchor="right",
                x=1
                ),
                xaxis_title = prediction_column,
                yaxis_title = "Share"
            )

            pred_drift_json  = json.loads(fig.to_json())

            self.wi = BaseWidgetInfo(
                title="Prediction Drift: " + pred_sim_test + ", p_value=" + str(round(pred_p_value, 6)),
                type="big_graph",
                details="",
                alertStats=AlertStats(),
                alerts=[],
                alertsPosition="row",
                insights=[],
                size=2,
                params={
                    "data": pred_drift_json['data'],
                    "layout": pred_drift_json['layout']
                },
                additionalGraphs=[],
            )
        else:
            self.wi = None
Example #19
def get_numeric_columns(df: pd.DataFrame) -> List[str]:
    cols = df.select_dtypes([np.number]).columns
    return cols.tolist()
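A quick usage sketch for the helper above:

import pandas as pd

df = pd.DataFrame({'x': [1.0, 2.0], 'y': [1, 2], 'z': ['a', 'b']})
print(get_numeric_columns(df))  # ['x', 'y']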
Example #20
 def test_select_dtypes_empty(self):
     df = DataFrame({"a": list("abc"), "b": list(range(1, 4))})
     msg = "at least one of include or exclude must be nonempty"
     with pytest.raises(ValueError, match=msg):
         df.select_dtypes()
Example #21
    def fit(self, X: pd.DataFrame, y: pd.Series):
        """
        Find the important features.

        Parameters
        ----------
        X : pandas dataframe of shape = [n_samples, n_features]
           The input dataframe

        y : array-like of shape (n_samples)
           Target variable. Required to train the estimator.

        Returns
        -------
        self
        """
        # check input dataframe
        X = _is_dataframe(X)

        # check variables
        self.variables = _find_all_variables(X, self.variables)

        # check if df contains na
        _check_contains_na(X, self.variables)

        # limit df to variables to smooth code below
        X = X[self.variables].copy()

        # find categorical and numerical variables
        self.variables_categorical_ = list(
            X.select_dtypes(include="O").columns)
        self.variables_numerical_ = list(
            X.select_dtypes(include=["float", "integer"]).columns)

        # obtain cross-validation indices
        skf = StratifiedKFold(n_splits=self.cv,
                              shuffle=True,
                              random_state=self.random_state)
        skf.get_n_splits(X, y)

        if self.variables_categorical_ and self.variables_numerical_:
            _pipeline = self._make_combined_pipeline()

        elif self.variables_categorical_:
            _pipeline = self._make_categorical_pipeline()

        else:
            _pipeline = self._make_numerical_pipeline()

        # obtain feature performance with cross-validation
        feature_importances_cv = []

        for train_index, test_index in skf.split(X, y):
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y[train_index], y[test_index]

            _pipeline.fit(X_train, y_train)

            X_test = _pipeline.transform(X_test)

            if self.scoring == "roc_auc_score":
                tmp_split = {
                    f: roc_auc_score(y_test, X_test[f])
                    for f in self.variables
                }
            else:
                tmp_split = {
                    f: r2_score(y_test, X_test[f])
                    for f in self.variables
                }

            feature_importances_cv.append(pd.Series(tmp_split))

        feature_importances_cv = pd.concat(feature_importances_cv, axis=1)

        self.feature_performance_ = feature_importances_cv.mean(  # type: ignore
            axis=1).to_dict()

        self.selected_features_ = [
            f for f in self.variables
            if self.feature_performance_[f] > self.threshold
        ]

        self.input_shape_ = X.shape

        return self
Example #22
def dist_plot(
    data: pd.DataFrame,
    mean_color: str = "orange",
    figsize: Tuple = (16, 2),
    fill_range: Tuple = (0.025, 0.975),
    showall: bool = False,
    kde_kws: Dict[str, Any] = None,
    rug_kws: Dict[str, Any] = None,
    fill_kws: Dict[str, Any] = None,
    font_kws: Dict[str, Any] = None,
):
    """ Two-dimensional visualization of the distribution of non binary numerical features.
    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into Pandas DataFrame. If a Pandas DataFrame \
        is provided, the index/column information is used to label the plots
    mean_color : str, optional
        Color of the vertical line indicating the mean of the data, by default "orange"
    figsize : Tuple, optional
        Controls the figure size, by default (16, 2)
    fill_range : Tuple, optional
        Set the quantiles for shading. Default spans 95% of the data, which is about \
        two std. deviations above and below the mean, by default (0.025, 0.975)
    showall : bool, optional
        Set to True to remove the output limit of 20 plots, by default False
    kde_kws : Dict[str, Any], optional
        Keyword arguments for kdeplot(), by default {"alpha": 0.75, \
        "linewidth": 1.5, "bw": 0.4}
    rug_kws : Dict[str, Any], optional
        Keyword arguments for rugplot(), by default {"color": "#ff3333", \
        "alpha": 0.05, "linewidth": 4, "height": 0.075}
    fill_kws : Dict[str, Any], optional
        Keyword arguments to control the fill, by default {"color": "#80d4ff", \
        "alpha": 0.2}
    font_kws : Dict[str, Any], optional
        Keyword arguments to control the font, by default {"color":  "#111111", \
        "weight": "normal", "size": 11}
    Returns
    -------
    ax: matplotlib Axes
        Returns the Axes object with the plot for further tweaking.
    """

    # Handle dictionary defaults
    kde_kws = ({
        "alpha": 0.75,
        "linewidth": 1.5,
        "bw": 0.4
    } if kde_kws is None else kde_kws.copy())
    rug_kws = ({
        "color": "#ff3333",
        "alpha": 0.05,
        "linewidth": 4,
        "height": 0.075
    } if rug_kws is None else rug_kws.copy())
    fill_kws = ({
        "color": "#80d4ff",
        "alpha": 0.2
    } if fill_kws is None else fill_kws.copy())
    font_kws = ({
        "color": "#111111",
        "weight": "normal",
        "size": 11
    } if font_kws is None else font_kws.copy())

    data = pd.DataFrame(data.copy()).dropna(axis=1, how="all")
    data = data.loc[:, data.nunique() > 2]
    cols = list(data.select_dtypes(include=["number"]).columns)
    data = data[cols]
    data = data.loc[:, data.nunique() > 2]

    if len(cols) == 0:
        print("No columns with numeric data were detected.")
        return

    elif len(cols) >= 20 and showall is False:
        print(
            "Note: The number of non binary numerical features is very large "
            f"({len(cols)}), please consider splitting the data. Showing plots for "
            "the first 20 numerical features. Override this by setting showall=True."
        )
        cols = cols[:20]

    for col in cols:
        num_dropped_vals = data[col].isna().sum()
        if num_dropped_vals > 0:
            col_data = data[col].dropna(axis=0)
            print(
                f"Dropped {num_dropped_vals} missing values from column {col}."
            )

        else:
            col_data = data[col]

        _, ax = plt.subplots(figsize=figsize)
        ax = sns.distplot(
            col_data,
            hist=False,
            rug=True,
            kde_kws=kde_kws,
            rug_kws=rug_kws,
        )

        # Vertical lines and fill
        x, y = ax.lines[0].get_xydata().T
        ax.fill_between(
            x,
            y,
            where=((x >= np.quantile(col_data, fill_range[0]))
                   & (x <= np.quantile(col_data, fill_range[1]))),
            label=f"{fill_range[0]*100:.1f}% - {fill_range[1]*100:.1f}%",
            **fill_kws,
        )

        mean = np.mean(col_data)
        std = scipy.stats.tstd(col_data)
        ax.vlines(
            x=mean,
            ymin=0,
            ymax=np.interp(mean, x, y),
            ls="dotted",
            color=mean_color,
            lw=2,
            label="mean",
        )
        ax.vlines(
            x=np.median(col_data),
            ymin=0,
            ymax=np.interp(np.median(col_data), x, y),
            ls=":",
            color=".3",
            label="median",
        )
        ax.vlines(
            x=[mean - std, mean + std],
            ymin=0,
            ymax=[np.interp(mean - std, x, y),
                  np.interp(mean + std, x, y)],
            ls=":",
            color=".5",
            label="\u03BC \u00B1 \u03C3",
        )

        ax.set_ylim(0)
        ax.set_xlim(ax.get_xlim()[0] * 1.15, ax.get_xlim()[1] * 1.15)

        # Annotations and legend
        ax.text(0.01,
                0.85,
                f"Mean: {mean:.2f}",
                fontdict=font_kws,
                transform=ax.transAxes)
        ax.text(0.01,
                0.7,
                f"Std. dev: {std:.2f}",
                fontdict=font_kws,
                transform=ax.transAxes)
        ax.text(
            0.01,
            0.55,
            f"Skew: {scipy.stats.skew(col_data):.2f}",
            fontdict=font_kws,
            transform=ax.transAxes,
        )
        ax.text(
            0.01,
            0.4,
            f"Kurtosis: {scipy.stats.kurtosis(col_data):.2f}",  # Excess Kurtosis
            fontdict=font_kws,
            transform=ax.transAxes,
        )
        ax.text(
            0.01,
            0.25,
            f"Count: {len(col_data)}",
            fontdict=font_kws,
            transform=ax.transAxes,
        )
        ax.legend(loc="upper right")

    return ax
Example #23
    def calculate(self, reference_data: pd.DataFrame,
                  production_data: pd.DataFrame, column_mapping):
        if column_mapping:
            date_column = column_mapping.get('datetime')
            id_column = column_mapping.get('id')
            target_column = column_mapping.get('target')
            prediction_column = column_mapping.get('prediction')
            num_feature_names = column_mapping.get('numerical_features')
            #target_names = column_mapping.get('target_names')
            if num_feature_names is None:
                num_feature_names = []
            else:
                num_feature_names = [
                    name for name in num_feature_names
                    if is_numeric_dtype(reference_data[name])
                ]

            cat_feature_names = column_mapping.get('categorical_features')
            if cat_feature_names is None:
                cat_feature_names = []
            else:
                cat_feature_names = [
                    name for name in cat_feature_names
                    if is_numeric_dtype(reference_data[name])
                ]

        else:
            date_column = 'datetime' if 'datetime' in reference_data.columns else None
            id_column = None
            target_column = 'target' if 'target' in reference_data.columns else None
            prediction_column = 'prediction' if 'prediction' in reference_data.columns else None

            utility_columns = [
                date_column, id_column, target_column, prediction_column
            ]

            num_feature_names = list(
                set(reference_data.select_dtypes([np.number]).columns) -
                set(utility_columns))
            cat_feature_names = list(
                set(reference_data.select_dtypes([object]).columns) -
                set(utility_columns))

            #target_names = None

        if production_data is not None and target_column is not None and prediction_column is not None:
            production_data.replace([np.inf, -np.inf], np.nan, inplace=True)
            production_data.dropna(axis=0, how='any', inplace=True)

            array_prediction = production_data[prediction_column].to_numpy()

            prediction_ids = np.argmax(array_prediction, axis=-1)
            prediction_labels = [prediction_column[x] for x in prediction_ids]

            #plot support bar
            graphs = []

            for label in prediction_column:
                pred_distr = ff.create_distplot([
                    production_data[production_data[target_column] == label]
                    [label], production_data[
                        production_data[target_column] != label][label]
                ], [str(label), "other"],
                                                colors=[red, grey],
                                                bin_size=0.05,
                                                show_curve=False,
                                                show_rug=True)

                pred_distr.update_layout(xaxis_title="Probability",
                                         yaxis_title="Share",
                                         legend=dict(orientation="h",
                                                     yanchor="bottom",
                                                     y=1.02,
                                                     xanchor="right",
                                                     x=1))

                pred_distr_json = json.loads(pred_distr.to_json())

                graphs.append({
                    "id": "tab_" + str(label),
                    "title": str(label),
                    "graph": {
                        "data": pred_distr_json["data"],
                        "layout": pred_distr_json["layout"],
                    }
                })

            self.wi = BaseWidgetInfo(
                title=self.title,
                type="tabbed_graph",
                details="",
                alertStats=AlertStats(),
                alerts=[],
                alertsPosition="row",
                insights=[],
                size=1,
                params={"graphs": graphs},
                additionalGraphs=[],
            )
        else:
            self.wi = None
Example #24
    def calculate(self, reference_data: pd.DataFrame,
                  production_data: pd.DataFrame, column_mapping):
        if column_mapping:
            date_column = column_mapping.get('datetime')
            id_column = column_mapping.get('id')
            target_column = column_mapping.get('target')
            prediction_column = column_mapping.get('prediction')
            num_feature_names = column_mapping.get('numerical_features')
            if num_feature_names is None:
                num_feature_names = []
            else:
                num_feature_names = [
                    name for name in num_feature_names
                    if is_numeric_dtype(reference_data[name])
                ]

            cat_feature_names = column_mapping.get('categorical_features')
            if cat_feature_names is None:
                cat_feature_names = []
            else:
                cat_feature_names = [
                    name for name in cat_feature_names
                    if is_numeric_dtype(reference_data[name])
                ]

        else:
            date_column = 'datetime' if 'datetime' in reference_data.columns else None
            id_column = None
            target_column = 'target' if 'target' in reference_data.columns else None
            prediction_column = 'prediction' if 'prediction' in reference_data.columns else None

            utility_columns = [
                date_column, id_column, target_column, prediction_column
            ]

            num_feature_names = list(
                set(reference_data.select_dtypes([np.number]).columns) -
                set(utility_columns))
            cat_feature_names = list(
                set(reference_data.select_dtypes([object]).columns) -
                set(utility_columns))

        if target_column is not None and prediction_column is not None:
            reference_data.replace([np.inf, -np.inf], np.nan, inplace=True)
            reference_data.dropna(axis=0, how='any', inplace=True)

            #plot output correlations
            abs_perc_error_time = go.Figure()

            abs_perc_error = list(
                map(
                    lambda x: 100 * abs(x[0] - x[1]) / x[0],
                    zip(reference_data[target_column],
                        reference_data[prediction_column])))

            error_trace = go.Scatter(x=reference_data[date_column]
                                     if date_column else reference_data.index,
                                     y=abs_perc_error,
                                     mode='lines',
                                     name='Absolute Percentage Error',
                                     marker=dict(size=6, color=red))

            zero_trace = go.Scatter(
                x=reference_data[date_column]
                if date_column else reference_data.index,
                y=[0] * reference_data.shape[0],
                mode='lines',
                opacity=0.5,
                marker=dict(
                    size=6,
                    color='green',
                ),
                showlegend=False,
            )

            abs_perc_error_time.add_trace(error_trace)
            abs_perc_error_time.add_trace(zero_trace)

            abs_perc_error_time.update_layout(
                xaxis_title="Timestamp" if date_column else "Index",
                yaxis_title="Percent",
                legend=dict(orientation="h",
                            yanchor="bottom",
                            y=1.02,
                            xanchor="right",
                            x=1))

            abs_perc_error_time_json = json.loads(
                abs_perc_error_time.to_json())

            self.wi = BaseWidgetInfo(
                title=self.title,
                type="big_graph",
                details="",
                alertStats=AlertStats(),
                alerts=[],
                alertsPosition="row",
                insights=[],
                size=1,
                params={
                    "data": abs_perc_error_time_json['data'],
                    "layout": abs_perc_error_time_json['layout']
                },
                additionalGraphs=[],
            )
        else:
            self.wi = None
Example #25
def get_numerical_cols(data: pd.DataFrame) -> pd.Index:
    numerical_columns: pd.Index = data.select_dtypes(exclude='object').columns

    return numerical_columns
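Note that exclude='object' keeps every non-object dtype, so datetime or category columns still land in the "numerical" set; a sketch of the difference versus include='number':

import pandas as pd

df = pd.DataFrame({'n': [1, 2],
                   's': ['a', 'b'],
                   't': pd.to_datetime(['2021-01-01', '2021-01-02'])})
print(get_numerical_cols(df).tolist())              # ['n', 't'] -- datetime slips through
print(df.select_dtypes('number').columns.tolist())  # ['n']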
Example #26
def _impute_data(df: pd.DataFrame):
    for float_col in df.select_dtypes('float64'):
        df[float_col].fillna(df[float_col].mean(), inplace=True)

    for col in df.columns:
        df[col].fillna(df[col].mode().iloc[0], inplace=True)
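A usage sketch; note that chained df[col].fillna(..., inplace=True) as written above only mutates the parent frame on older pandas and is rejected under copy-on-write, so an assignment-based equivalent is safer:

import numpy as np
import pandas as pd

df = pd.DataFrame({'x': [1.0, np.nan, 3.0], 'c': ['a', None, 'a']})

# Copy-on-write-safe equivalent of _impute_data:
for float_col in df.select_dtypes('float64'):
    df[float_col] = df[float_col].fillna(df[float_col].mean())
for col in df.columns:
    df[col] = df[col].fillna(df[col].mode().iloc[0])

print(df['x'].tolist())  # [1.0, 2.0, 3.0] -- mean-imputed
print(df['c'].tolist())  # ['a', 'a', 'a'] -- mode-imputed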
Example #27
def set_categories(df: pandas.DataFrame,
                   column_categories: Dict[str, pandas.Categorical]):
    for c in df.select_dtypes(include='category').columns:
        df[c].cat.set_categories(column_categories[c].categories, inplace=True)
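The inplace parameter of Series.cat.set_categories was deprecated in pandas 1.3 and removed in 2.0; a minimal non-inplace sketch of the same operation (hypothetical name, same column_categories mapping):

from typing import Dict
import pandas

def set_categories_non_inplace(df: pandas.DataFrame,
                               column_categories: Dict[str, pandas.Categorical]):
    # Assign the re-categorized column back instead of mutating in place.
    for c in df.select_dtypes(include='category').columns:
        df[c] = df[c].cat.set_categories(column_categories[c].categories)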
Example #28
def correlation_analysis(
    data: pd.DataFrame,
    col_list=None,
    row_list=None,
    check_norm=False,
    method: str = "pearson",
    dropna: str = "pairwise",
    permutation_test: bool = False,
    n_permutations: int = 1000,
    random_state=None,
):
    """Run correlations for numerical features and return output in different formats
    Different methods to compute correlations and to handle missing values are implemented.
    Inspired by `researchpy.corr_case` and `researchpy.corr_pair`.
    
    Parameters
    ----------
    data : pd.DataFrame
        Dataframe with variables in columns, cases in rows
    row_list: list or None (default: None)
        List with names of columns in `data` that should be in the rows of the correlogram.
        If None, all columns are used, but only every unique combination is computed.
    col_list: list or None (default: None)
        List with names of columns in `data` that should be in the columns of the correlogram.
        If None, all columns are used, but only every unique combination is computed.
    check_norm: bool (default: False)
        If True, normality will be checked for columns in `data` using `normal_check`. This influences the used method
        for correlations, i.e. Pearson or Spearman. Note: normality check ignores missing values.
    method: {'pearson', 'kendall', 'spearman'}, default 'pearson'
        Type of correlation, either Pearson's r, Spearman's rho, or Kendall's tau, implemented via respectively
        `scipy.stats.pearsonr`, `scipy.stats.spearmanr`, and `scipy.stats.kendalltau`
        Will be ignored if check_norm=True. Instead, Pearson's r is used for every combination of normally distributed
        columns and Spearman's rho is used for all other combinations.
    dropna : {'listwise', 'pairwise'}, default 'pairwise'
        Should rows with missing values be dropped over the complete `data` ('listwise') or for every correlation
        separately ('pairwise')
    permutation_test: bool (default: False)
        If true, a permutation test will be added
    n_permutations: int (default: 1000)
        Number of permutations in the permutation test
    random_state: None or int (default: None)
        Random state for permutation_test. If not None, random_state will be updated for every permutation
    
    Returns
    -------
    result_dict: dict
        Dictionary with the following keys:
        info: pd.DataFrame
            Description of correlation method, missing values handling and number of observations
        r-values: pd.DataFrame
            Dataframe with correlation coefficients. Indices and columns are column names from `data`. Only lower
            triangle is filled.
        p-values: pd.DataFrame
            Dataframe with p-values. Indices and columns are column names from `data`. Only lower triangle is filled.
        N: pd.DataFrame
            Dataframe with numbers of observations. Indices and columns are column names from `data`. Only lower
            triangle is filled. If dropna ='listwise', every correlation will have the same number of observations.
        summary: pd.DataFrame
            Dataframe with columns ['analysis', 'feature1', 'feature2', 'r-value', 'p-value', 'N', 'stat-sign']
            which indicate the type of test used for the correlation, the pair of columns, the correlation coefficient,
            the p-value, the number of observations for each combination of columns in `data` and whether the r-value is
            statistically significant.
    
    Examples
    --------
    >>> from jmspack.frequentist_statistics import correlation_analysis
    >>> import seaborn as sns
    >>> iris = sns.load_dataset('iris')
    >>> dict_results = correlation_analysis(iris, method='pearson', dropna='listwise', permutation_test=True,
    ...                                        n_permutations=100, check_norm=True)
    >>> dict_results['summary']
    
    References
    ----------
    Bryant, C (2018). researchpy's documentation [Revision 9ae5ed63]. Retrieved from
    https://researchpy.readthedocs.io/en/latest/
    
    """

    # Settings test
    if method == "pearson":
        test, test_name = stats.pearsonr, "Pearson"
    elif method == "spearman":
        test, test_name = stats.spearmanr, "Spearman Rank"
    elif method == "kendall":
        test, test_name = stats.kendalltau, "Kendall's Tau-b"
    else:
        raise ValueError("method not in {'pearson', 'kendall', 'spearman'}")

    # Copy numerical data from the original data
    data = data.copy().select_dtypes("number")

    # Get correct lists
    if col_list and not row_list:
        row_list = data.select_dtypes("number").drop(col_list,
                                                     axis=1).columns.tolist()
    elif row_list and not col_list:
        col_list = data.select_dtypes("number").drop(row_list,
                                                     axis=1).columns.tolist()

    # Initializing dataframes to store results
    info = pd.DataFrame()
    summary = pd.DataFrame()
    if not col_list and not row_list:
        r_vals = pd.DataFrame(columns=data.columns, index=data.columns)
        p_vals = pd.DataFrame(columns=data.columns, index=data.columns)
        n_vals = pd.DataFrame(columns=data.columns, index=data.columns)
        iterator = combinations(data.columns, 2)
    else:
        r_vals = pd.DataFrame(columns=col_list, index=row_list)
        p_vals = pd.DataFrame(columns=col_list, index=row_list)
        n_vals = pd.DataFrame(columns=col_list, index=row_list)
        iterator = product(col_list, row_list)

    if dropna == "listwise":
        # Remove rows with missing values
        data = data.dropna(how="any", axis="index")
        # DataFrame.append was removed in pandas 2.0; build the info row with pd.concat instead
        info = pd.concat(
            [info, pd.DataFrame([{
                f"{test_name} correlation test using {dropna} deletion":
                f"Total observations used = {len(data)}"
            }])],
            ignore_index=True,
        )
    elif dropna == "pairwise":
        info = pd.concat(
            [info, pd.DataFrame([{
                f"{test_name} correlation test using {dropna} deletion":
                f"Observations in the data = {len(data)}"
            }])],
            ignore_index=True,
        )
    else:
        raise ValueError("dropna not in {'listwise', 'pairwise'}")

    if check_norm:
        # Check normality of all columns in the data
        df_normality = normal_check(data)
        norm_names = df_normality.loc[df_normality["normality"],
                                      "feature"].tolist()

    # Iterating through the Pandas series and performing the correlation
    for col1, col2 in iterator:
        if dropna == "pairwise":
            # Remove rows with missing values in the pair of columns
            test_data = data[[col1, col2]].dropna()
        else:
            test_data = data

        if check_norm:
            # Select Pearson's r only if both columns are normally distributed
            if (col1 in norm_names) and (col2 in norm_names):
                test, test_name = stats.pearsonr, "Pearson"
            else:
                test, test_name = stats.spearmanr, "Spearman Rank"

        # Run correlations
        r_value, p_value = test(test_data.loc[:, col1], test_data.loc[:, col2])
        n_value = len(test_data)

        # Store output in matrix format
        try:
            r_vals.loc[col2, col1] = r_value
            p_vals.loc[col2, col1] = p_value
            n_vals.loc[col2, col1] = n_value
        except KeyError:
            r_vals.loc[col1, col2] = r_value
            p_vals.loc[col1, col2] = p_value
            n_vals.loc[col1, col2] = n_value

        # Store output in dataframe format
        dict_summary = {
            "analysis": test_name,
            "feature1": col1,
            "feature2": col2,
            "r-value": r_value,
            "p-value": p_value,
            "stat-sign": (p_value < 0.05),
            "N": n_value,
        }

        if permutation_test:
            raise ValueError("permutation_test has yet to be implemented")

            # # Copy the complete data
            # col2_shuffle = np.array(test_data.loc[:, col2])
            # col2_shuffle = np.repeat(
            #     col2_shuffle[:, np.newaxis], n_permutations, axis=1
            # )
            # # Shuffle within the columns
            # np.random.seed(random_state)
            # ix_i = np.random.sample(col2_shuffle.shape).argsort(axis=0)
            # ix_j = np.tile(np.arange(col2_shuffle.shape[1]), (col2_shuffle.shape[0], 1))
            # col2_shuffle = col2_shuffle[ix_i, ix_j]
            # permutations = np.apply_along_axis(
            #     permute_test,
            #     axis=0,
            #     arr=col2_shuffle,
            #     test_type="correlation",
            #     test=test,
            #     a2=np.array(test_data.loc[:, col1]),
            # )
            #
            # extreme_permutation = np.where(permutations < p_value, 1, 0)
            # p_permutation = extreme_permutation.sum() / len(permutations)
            # dict_summary["permutation-p-value"] = p_permutation
            #
            # # Reset random seed numpy
            # np.random.seed(None)

        summary = pd.concat(
            [summary, pd.DataFrame(data=dict_summary, index=[0])],
            axis=0,
            ignore_index=True,
            sort=False,
        )

    # Embed results within a dictionary
    result_dict = {
        "r-value": r_vals,
        "p-value": p_vals,
        "N": n_vals,
        "info": info,
        "summary": summary,
    }

    return result_dict
Example #29
 def check_dataframe(self, dataframe: pd.DataFrame) -> pd.DataFrame:
     dataframe = super().check_dataframe(dataframe=dataframe)
     dataframe["luid"] = dataframe["filepath"]
     for column in dataframe.select_dtypes("number").columns:
         dataframe[column] = dataframe[column].map(str)
     return dataframe
Example #30
def optimize_ints(df: pd.DataFrame) -> pd.DataFrame:
    ints = df.select_dtypes(include=['int64']).columns.tolist()
    df[ints] = df[ints].apply(pd.to_numeric, downcast='integer')
    return df
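Hypothetical usage of optimize_ints; pd.to_numeric(..., downcast='integer') picks the smallest integer subtype that can hold the values:

import pandas as pd

df = pd.DataFrame({'a': [0, 1, 2], 'b': [1.5, 2.5, 3.5]})
df = optimize_ints(df)
print(df.dtypes.to_dict())  # {'a': dtype('int8'), 'b': dtype('float64')}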
Example #31
    def calculate(self, reference_data: pd.DataFrame,
                  production_data: pd.DataFrame, column_mapping):
        if column_mapping:
            date_column = column_mapping.get('datetime')
            id_column = column_mapping.get('id')
            target_column = column_mapping.get('target')
            prediction_column = column_mapping.get('prediction')
            num_feature_names = column_mapping.get('numerical_features')
            #target_names = column_mapping.get('target_names')
            if num_feature_names is None:
                num_feature_names = []
            else:
                num_feature_names = [
                    name for name in num_feature_names
                    if is_numeric_dtype(reference_data[name])
                ]

            cat_feature_names = column_mapping.get('categorical_features')
            if cat_feature_names is None:
                cat_feature_names = []
            else:
                cat_feature_names = [
                    name for name in cat_feature_names
                    if is_numeric_dtype(reference_data[name])
                ]

        else:
            date_column = 'datetime' if 'datetime' in reference_data.columns else None
            id_column = None
            target_column = 'target' if 'target' in reference_data.columns else None
            prediction_column = 'prediction' if 'prediction' in reference_data.columns else None

            utility_columns = [
                date_column, id_column, target_column, prediction_column
            ]

            num_feature_names = list(
                set(reference_data.select_dtypes([np.number]).columns) -
                set(utility_columns))
            cat_feature_names = list(
                set(reference_data.select_dtypes([object]).columns) -
                set(utility_columns))

            #target_names = None

        if target_column is not None and prediction_column is not None:
            reference_data.replace([np.inf, -np.inf], np.nan, inplace=True)
            reference_data.dropna(axis=0, how='any', inplace=True)

            #array_prediction = reference_data[prediction_column].to_numpy()

            #prediction_ids = np.argmax(array_prediction, axis=-1)
            #prediction_labels = [prediction_column[x] for x in prediction_ids]
            if len(prediction_column) <= 2:
                binaraizer = preprocessing.LabelBinarizer()
                binaraizer.fit(reference_data[target_column])
                binaraized_target = pd.DataFrame(
                    binaraizer.transform(reference_data[target_column]))
                binaraized_target.columns = ['target']

                p, r, thrs = metrics.precision_recall_curve(
                    binaraized_target, reference_data[prediction_column[0]])
                fig = go.Figure()

                fig.add_trace(
                    go.Scatter(x=r,
                               y=p,
                               mode='lines',
                               name='PR',
                               marker=dict(
                                   size=6,
                                   color=red,
                               )))

                fig.update_layout(yaxis_title="Precision",
                                  xaxis_title="Recall",
                                  showlegend=True)

                fig_json = json.loads(fig.to_json())

                self.wi = BaseWidgetInfo(
                    title=self.title,
                    type="big_graph",
                    details="",
                    alertStats=AlertStats(),
                    alerts=[],
                    alertsPosition="row",
                    insights=[],
                    size=1 if production_data is not None else 2,
                    params={
                        "data": fig_json['data'],
                        "layout": fig_json['layout']
                    },
                    additionalGraphs=[],
                )
            else:
                binaraizer = preprocessing.LabelBinarizer()
                binaraizer.fit(reference_data[target_column])
                binaraized_target = pd.DataFrame(
                    binaraizer.transform(reference_data[target_column]))
                binaraized_target.columns = prediction_column
                #plot support bar
                graphs = []

                for label in prediction_column:
                    p, r, thrs = metrics.precision_recall_curve(
                        binaraized_target[label], reference_data[label])
                    fig = go.Figure()

                    fig.add_trace(
                        go.Scatter(x=r,
                                   y=p,
                                   mode='lines',
                                   name='PR',
                                   marker=dict(
                                       size=6,
                                       color=red,
                                   )))

                    fig.update_layout(yaxis_title="Precision",
                                      xaxis_title="Recall",
                                      showlegend=True)

                    fig_json = json.loads(fig.to_json())

                    graphs.append({
                        "id": "tab_" + str(label),
                        "title": str(label),
                        "graph": {
                            "data": fig_json["data"],
                            "layout": fig_json["layout"],
                        }
                    })

                self.wi = BaseWidgetInfo(
                    title=self.title,
                    type="tabbed_graph",
                    details="",
                    alertStats=AlertStats(),
                    alerts=[],
                    alertsPosition="row",
                    insights=[],
                    size=1 if production_data is not None else 2,
                    params={"graphs": graphs},
                    additionalGraphs=[],
                )
        else:
            self.wi = None
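A minimal driver sketch for the method above, with all names hedged: PrecisionRecallWidget is a hypothetical class name (the real widget class is not shown in this excerpt), the 'p0'/'p1' probability columns are made up, and the widget module is assumed to define the color constants its plots use:

# Hypothetical driver; the class name and columns are assumptions, not the real API.
import pandas as pd

reference = pd.DataFrame({
    'target': [0, 1, 0, 1],
    'p0': [0.9, 0.2, 0.7, 0.4],
    'p1': [0.1, 0.8, 0.3, 0.6],
})
widget = PrecisionRecallWidget(title="Precision-Recall")
widget.calculate(reference, None,
                 {'target': 'target', 'prediction': ['p0', 'p1']})
print(widget.wi.type)  # "big_graph" in the binary branch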
Example #32
0
    def calculate(self, reference_data: pd.DataFrame,
                  production_data: pd.DataFrame, column_mapping):
        if column_mapping:
            date_column = column_mapping.get('datetime')
            id_column = column_mapping.get('id')
            target_column = column_mapping.get('target')
            prediction_column = column_mapping.get('prediction')
            num_feature_names = column_mapping.get('numerical_features')
            target_names = column_mapping.get('target_names')
            if num_feature_names is None:
                num_feature_names = []
            else:
                num_feature_names = [
                    name for name in num_feature_names
                    if is_numeric_dtype(reference_data[name])
                ]

            cat_feature_names = column_mapping.get('categorical_features')
            if cat_feature_names is None:
                cat_feature_names = []
            else:
                cat_feature_names = [
                    name for name in cat_feature_names
                    if is_numeric_dtype(reference_data[name])
                ]

        else:
            date_column = 'datetime' if 'datetime' in reference_data.columns else None
            id_column = None
            target_column = 'target' if 'target' in reference_data.columns else None
            prediction_column = 'prediction' if 'prediction' in reference_data.columns else None

            utility_columns = [
                date_column, id_column, target_column, prediction_column
            ]

            num_feature_names = list(
                set(reference_data.select_dtypes([np.number]).columns) -
                set(utility_columns))
            cat_feature_names = list(
                set(reference_data.select_dtypes(['object']).columns) -
                set(utility_columns))

            target_names = None

        if production_data is not None and target_column is not None and prediction_column is not None:
            production_data.replace([np.inf, -np.inf], np.nan, inplace=True)
            production_data.dropna(axis=0, how='any', inplace=True)

            #plot support bar
            metrics_matrix = metrics.classification_report(
                production_data[target_column],
                production_data[prediction_column],
                output_dict=True)
            metrics_frame = pd.DataFrame(metrics_matrix)
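            # the last row of the report frame is per-class support; the final
            # three columns (accuracy, macro avg, weighted avg) are aggregates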
            support = metrics_frame.iloc[-1:, :-3].values[0]

            fig = go.Figure()

            fig.add_trace(
                go.Bar(x=target_names if target_names else
                       metrics_frame.columns.tolist()[:-3],
                       y=support,
                       marker_color=red,
                       name='Support'))

            fig.update_layout(
                xaxis_title="Class",
                yaxis_title="Number of Objects",
            )

            support_bar_json = json.loads(fig.to_json())

            self.wi = BaseWidgetInfo(
                title=self.title,
                type="big_graph",
                details="",
                alertStats=AlertStats(),
                alerts=[],
                alertsPosition="row",
                insights=[],
                size=1,
                params={
                    "data": support_bar_json['data'],
                    "layout": support_bar_json['layout']
                },
                additionalGraphs=[],
            )
        else:
            self.wi = None
Example #33
0
 def test_select_dtypes_empty(self):
     df = DataFrame({'a': list('abc'), 'b': list(range(1, 4))})
     msg = 'at least one of include or exclude must be nonempty'
     with pytest.raises(ValueError, match=msg):
         df.select_dtypes()
    def calculate(self, reference_data: pd.DataFrame,
                  production_data: pd.DataFrame, column_mapping):
        if column_mapping:
            date_column = column_mapping.get('datetime')
            id_column = column_mapping.get('id')
            target_column = column_mapping.get('target')
            prediction_column = column_mapping.get('prediction')
            num_feature_names = column_mapping.get('numerical_features')
            if num_feature_names is None:
                num_feature_names = []
            else:
                num_feature_names = [
                    name for name in num_feature_names
                    if is_numeric_dtype(reference_data[name])
                ]

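            # note: categorical features are kept only when numerically encoded,
            # since the drift checks below rely on np.isfinite and chisquare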
            cat_feature_names = column_mapping.get('categorical_features')
            if cat_feature_names is None:
                cat_feature_names = []
            else:
                cat_feature_names = [
                    name for name in cat_feature_names
                    if is_numeric_dtype(reference_data[name])
                ]

        else:
            date_column = 'datetime' if 'datetime' in reference_data.columns else None
            id_column = None
            target_column = 'target' if 'target' in reference_data.columns else None
            prediction_column = 'prediction' if 'prediction' in reference_data.columns else None

            utility_columns = [
                date_column, id_column, target_column, prediction_column
            ]

            num_feature_names = list(
                set(reference_data.select_dtypes([np.number]).columns) -
                set(utility_columns))
            cat_feature_names = list(
                set(reference_data.select_dtypes(['object']).columns) -
                set(utility_columns))

        #set params data
        params_data = []
        drifted_features_count = 0
        for feature_name in num_feature_names:
            prod_small_hist = np.histogram(
                production_data[feature_name][np.isfinite(
                    production_data[feature_name])],
                bins=10,
                density=True)
            ref_small_hist = np.histogram(
                reference_data[feature_name][np.isfinite(
                    reference_data[feature_name])],
                bins=10,
                density=True)

            feature_type = 'num'

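            # two-sample Kolmogorov-Smirnov test: a small p-value means the
            # reference and production distributions differ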
            p_value = ks_2samp(reference_data[feature_name],
                               production_data[feature_name])[1]

            distr_sim_test = "Detected" if p_value < 0.05 else "Not Detected"
            drifted_features_count += 1 if p_value < 0.05 else 0

            params_data.append({
                "details": {
                    "parts": [{
                        "title": "Data drift",
                        "id": feature_name + "_drift",
                        "type": "widget"
                    }, {
                        "title": "Data distribution",
                        "id": feature_name + "_distr"
                    }],
                    "insights": []
                },
                "f1": feature_name,
                "f6": feature_type,
                "f3": {
                    "x": list(ref_small_hist[1]),
                    "y": list(ref_small_hist[0])
                },
                "f4": {
                    "x": list(prod_small_hist[1]),
                    "y": list(prod_small_hist[0])
                },
                "f2": distr_sim_test,
                "f5": round(p_value, 6)
            })

        for feature_name in cat_feature_names:
            prod_small_hist = np.histogram(
                production_data[feature_name][np.isfinite(
                    production_data[feature_name])],
                bins=10,
                density=True)
            ref_small_hist = np.histogram(
                reference_data[feature_name][np.isfinite(
                    reference_data[feature_name])],
                bins=10,
                density=True)

            feature_type = 'cat'

            #p_value = ks_2samp(reference_data[feature_name], production_data[feature_name])[1]
            #CHI2 to be implemented for cases with different categories
            ref_feature_vc = reference_data[feature_name][np.isfinite(
                reference_data[feature_name])].value_counts()
            prod_feature_vc = production_data[feature_name][np.isfinite(
                production_data[feature_name])].value_counts()

            keys = set(
                list(reference_data[feature_name][np.isfinite(
                    reference_data[feature_name])].unique()) +
                list(production_data[feature_name][np.isfinite(
                    production_data[feature_name])].unique()))

            ref_feature_dict = dict.fromkeys(keys, 0)
            for key, item in zip(ref_feature_vc.index, ref_feature_vc.values):
                ref_feature_dict[key] = item

            prod_feature_dict = dict.fromkeys(keys, 0)
            for key, item in zip(prod_feature_vc.index,
                                 prod_feature_vc.values):
                prod_feature_dict[key] = item

            f_exp = [value[1] for value in sorted(ref_feature_dict.items())]
            f_obs = [value[1] for value in sorted(prod_feature_dict.items())]

            # chisquare expects observed counts first; rescale the expected
            # counts so both sum to the same total (sample sizes can differ)
            scale = sum(f_obs) / sum(f_exp)
            p_value = chisquare(f_obs, [count * scale for count in f_exp])[1]

            distr_sim_test = "Detected" if p_value < 0.05 else "Not Detected"
            drifted_features_count += 1 if p_value < 0.05 else 0

            params_data.append({
                "details": {
                    "parts": [{
                        "title": "Data drift",
                        "id": feature_name + "_drift",
                        "type": "widget"
                    }, {
                        "title": "Data distribution",
                        "id": feature_name + "_distr"
                    }],
                    "insights": []
                },
                "f1": feature_name,
                "f6": feature_type,
                "f3": {
                    "x": list(ref_small_hist[1]),
                    "y": list(ref_small_hist[0])
                },
                "f4": {
                    "x": list(prod_small_hist[1]),
                    "y": list(prod_small_hist[0])
                },
                "f2": distr_sim_test,
                "f5": round(p_value, 6)
            })

        #set additionalGraphs
        additional_graphs_data = []
        for feature_name in num_feature_names + cat_feature_names:

            #plot distributions
            fig = go.Figure()
            fig.add_trace(
                go.Histogram(x=reference_data[feature_name],
                             marker_color=grey,
                             opacity=0.6,
                             nbinsx=10,
                             name='Reference',
                             histnorm='probability'))

            fig.add_trace(
                go.Histogram(x=production_data[feature_name],
                             marker_color=red,
                             opacity=0.6,
                             nbinsx=10,
                             name='Production',
                             histnorm='probability'))

            fig.update_layout(legend=dict(orientation="h",
                                          yanchor="bottom",
                                          y=1.02,
                                          xanchor="right",
                                          x=1),
                              xaxis_title=feature_name,
                              yaxis_title="Share")

            distr_figure = json.loads(fig.to_json())

            #plot drift
            reference_mean = np.mean(reference_data[feature_name][np.isfinite(
                reference_data[feature_name])])
            reference_std = np.std(reference_data[feature_name][np.isfinite(
                reference_data[feature_name])],
                                   ddof=1)
            x_title = "Timestamp" if date_column else "Index"

            fig = go.Figure()

            fig.add_trace(
                go.Scatter(x=production_data[date_column]
                           if date_column else production_data.index,
                           y=production_data[feature_name],
                           mode='markers',
                           name='Production',
                           marker=dict(size=6, color=grey)))

            fig.update_layout(
                xaxis_title=x_title,
                yaxis_title=feature_name,
                showlegend=True,
                legend=dict(orientation="h",
                            yanchor="bottom",
                            y=1.02,
                            xanchor="right",
                            x=1),
                shapes=[
                    dict(
                        type="rect",
                        # x in paper coordinates: the band spans the full plot width
                        xref="paper",
                        # y in data coordinates: the band covers mean +/- one std
                        yref="y",
                        x0=0,
                        y0=reference_mean - reference_std,
                        x1=1,
                        y1=reference_mean + reference_std,
                        fillcolor="LightGreen",
                        opacity=0.5,
                        layer="below",
                        line_width=0,
                    ),
                    dict(
                        type="line",
                        name='Reference',
                        xref="paper",
                        yref="y",
                        x0=0,
                        y0=reference_mean,
                        x1=1,
                        y1=reference_mean,
                        line=dict(color="Green", width=3)),
                ])

            drift_figure = json.loads(fig.to_json())

            #add distributions data
            additional_graphs_data.append(
                AdditionalGraphInfo(feature_name + '_distr', {
                    "data": distr_figure['data'],
                    "layout": distr_figure['layout']
                }))

            #add drift data
            additional_graphs_data.append(
                AdditionalGraphInfo(
                    feature_name + '_drift', {
                        "title": feature_name + "drift",
                        "size": 2,
                        "text": "",
                        "type": "big_graph",
                        "params": {
                            "data": drift_figure['data'],
                            "layout": drift_figure['layout']
                        }
                    }))

        self.wi = BaseWidgetInfo(
            title="Data Drift: drift detected for " +
            str(drifted_features_count) + " out of " +
            str(len(num_feature_names) + len(cat_feature_names)) + " features",
            type="big_table",
            details="",
            alertStats=AlertStats(),
            alerts=[],
            alertsPosition="row",
            insights=[],
            size=2,
            params={
                "rowsPerPage":
                min(len(num_feature_names) + len(cat_feature_names), 10),
                "columns": [{
                    "title": "Feature",
                    "field": "f1"
                }, {
                    "title": "Type",
                    "field": "f6"
                }, {
                    "title": "Reference Distribution",
                    "field": "f3",
                    "type": "histogram",
                    "options": {
                        "xField": "x",
                        "yField": "y"
                    }
                }, {
                    "title": "Production Distribution",
                    "field": "f4",
                    "type": "histogram",
                    "options": {
                        "xField": "x",
                        "yField": "y"
                    }
                }, {
                    "title": "Data drift",
                    "field": "f2"
                }, {
                    "title": "P-Value for Similarity Test",
                    "field": "f5",
                    "sort": "asc"
                }],
                "data":
                params_data
            },
            additionalGraphs=additional_graphs_data)
Example #35
0
class DataWorker(object):
	
	
	@staticmethod
	def feat_value2int(series):
	    value_dict = {name: i for i, name in enumerate(np.unique(series))}
	    return value_dict
	
	
	def __init__(self, data=None):
		"""
		Init DataWorker with a pandas.DataFrame.
		Otherwise make sure that the raw data can be transformed to a DataFrame.
		"""
		if data is None:
			self.__data = DataFrame()
		elif isinstance(data, DataFrame):
			self.__data = data.copy()
		else:
			self.__data = DataFrame(data)

		self.__featureDict = None

		
	@property
	def featureDict(self):
		return self.__data.select_dtypes(include=['object'])
		
	@featureDict.setter
	def featureDict(self,value):
		pass
		
	@property
	def data(self):
		return self.__data
	@data.setter
	def data(self, df):
		self.__data = df
	
	def getColNamesWithNan(self):
		s = self.__data.isnull().any()
		return s.index[s].tolist()
	
	def dataClean(self, transDict=None, fillna=None, yCol=-1):
		"""
		yCol: the column you want to predict
		fillna:
			{column: method_name} dictionary
			default: {'all': 'most_frequent'}
			provided methods are: 'most_frequent', 'mean', 'median',
			'first_n_frequent,n' (where the trailing n is a number)
			when key == 'all': fill every column that contains NaN with the
			same method; this key is best placed at the end
		"""
		if fillna is None:
			fillna = {'all': 'most_frequent'}
		
		# try to map all data to numeric

		self.__data = cd.fillna(self.__data,fillna)

		if transDict is None:
			transDict = self.featureDict  # fall back to object-dtype columns
		if yCol != -1:
			self.__data = cd.change_yCol(self.__data,yCol)
		
		
	def algorithmUsing(self):
		pass

	def showFeature(self):
		pass

	def getResult(self):
		pass
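A short usage sketch for the class above, exercising only the methods with no external dependencies (dataClean needs the cd helper module, which is not shown here); the sample frame is made up:

# Hypothetical usage; assumes numpy and pandas are imported alongside the class.
import pandas as pd

worker = DataWorker(pd.DataFrame({'city': ['a', 'b', None], 'n': [1, 2, 3]}))
print(worker.getColNamesWithNan())  # ['city']
mapping = DataWorker.feat_value2int(pd.Series(['x', 'y', 'x']))
print(mapping['y'])  # 1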
Example #36
0
    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        if self.columns is None:
            self.columns = list(X.select_dtypes('number').columns)

        raise NotImplementedError
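The transform above records the numeric columns and then raises NotImplementedError. A plausible completion, assuming the intent is to subset to those columns, could look like this sketch:

# Hypothetical completion: return only the remembered numeric columns.
def transform(self, X: pd.DataFrame) -> pd.DataFrame:
    if self.columns is None:
        self.columns = list(X.select_dtypes('number').columns)
    return X[self.columns]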
Example #37
0
 def test_select_dtypes_empty(self):
     df = DataFrame({'a': list('abc'), 'b': list(range(1, 4))})
     with tm.assertRaisesRegexp(
             ValueError, 'at least one of include or '
             'exclude must be nonempty'):
         df.select_dtypes()
Example #38
0
 def _get_date_columns(dataframe: pd.DataFrame):
     return dataframe.select_dtypes(include=[np.datetime64]).columns.values
def get_text_categorical_columns(df: pd.DataFrame) -> List[str]:
    # note: excluding only int/float also keeps bool and datetime columns
    return list(df.select_dtypes(exclude=['int', 'float']).columns)
    def calculate(self, reference_data: pd.DataFrame,
                  production_data: pd.DataFrame, column_mapping):
        if column_mapping:
            date_column = column_mapping.get('datetime')
            id_column = column_mapping.get('id')
            target_column = column_mapping.get('target')
            prediction_column = column_mapping.get('prediction')
            num_feature_names = column_mapping.get('numerical_features')
            if num_feature_names is None:
                num_feature_names = []
            else:
                num_feature_names = [
                    name for name in num_feature_names
                    if is_numeric_dtype(reference_data[name])
                ]

            cat_feature_names = column_mapping.get('categorical_features')
            if cat_feature_names is None:
                cat_feature_names = []
            else:
                cat_feature_names = [
                    name for name in cat_feature_names
                    if is_numeric_dtype(reference_data[name])
                ]

        else:
            date_column = 'datetime' if 'datetime' in reference_data.columns else None
            id_column = None
            target_column = 'target' if 'target' in reference_data.columns else None
            prediction_column = 'prediction' if 'prediction' in reference_data.columns else None

            utility_columns = [
                date_column, id_column, target_column, prediction_column
            ]

            num_feature_names = list(
                set(reference_data.select_dtypes([np.number]).columns) -
                set(utility_columns))
            cat_feature_names = list(
                set(reference_data.select_dtypes(['object']).columns) -
                set(utility_columns))

        if production_data is not None:
            production_data.replace([np.inf, -np.inf], np.nan, inplace=True)
            production_data.dropna(axis=0, how='any', inplace=True)

            reference_data.replace([np.inf, -np.inf], np.nan, inplace=True)
            reference_data.dropna(axis=0, how='any', inplace=True)

            ref_error = reference_data[prediction_column] - reference_data[
                target_column]
            prod_error = production_data[prediction_column] - production_data[
                target_column]

            ref_quntile_5 = np.quantile(ref_error, .05)
            ref_quntile_95 = np.quantile(ref_error, .95)

            prod_quntile_5 = np.quantile(prod_error, .05)
            prod_quntile_95 = np.quantile(prod_error, .95)
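            # the 5th/95th error percentiles split rows into Underestimation,
            # Majority, and Overestimation groups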

            #label each dataset and tag rows by error bias
            reference_data['dataset'] = 'Reference'
            reference_data['Error bias'] = list(
                map(
                    lambda x: 'Underestimation'
                    if x <= ref_quntile_5 else 'Majority'
                    if x < ref_quntile_95 else 'Overestimation', ref_error))

            production_data['dataset'] = 'Production'
            production_data['Error bias'] = list(
                map(
                    lambda x: 'Underestimation'
                    if x <= prod_quntile_5 else 'Majority'
                    if x < prod_quntile_95 else 'Overestimation', prod_error))
            merged_data = pd.concat([reference_data, production_data])

            reference_data.drop(['dataset', 'Error bias'],
                                axis=1,
                                inplace=True)
            production_data.drop(['dataset', 'Error bias'],
                                 axis=1,
                                 inplace=True)

            params_data = []
            additional_graphs_data = []

            for feature_name in num_feature_names:
                feature_type = 'num'

                ref_overal_value = np.mean(reference_data[feature_name])
                ref_under_value = np.mean(
                    reference_data[ref_error <= ref_quntile_5][feature_name])
                ref_expected_value = np.mean(
                    reference_data[(ref_error > ref_quntile_5) &
                                   (ref_error < ref_quntile_95)][feature_name])
                ref_over_value = np.mean(
                    reference_data[ref_error >= ref_quntile_95][feature_name])
                ref_range_value = 0 if ref_over_value == ref_under_value else 100 * abs(
                    ref_over_value -
                    ref_under_value) / (np.max(reference_data[feature_name]) -
                                        np.min(reference_data[feature_name]))

                prod_overal_value = np.mean(production_data[feature_name])
                prod_under_value = np.mean(production_data[
                    prod_error <= prod_quntile_5][feature_name])
                prod_expected_value = np.mean(production_data[
                    (prod_error > prod_quntile_5)
                    & (prod_error < prod_quntile_95)][feature_name])
                prod_over_value = np.mean(production_data[
                    prod_error >= prod_quntile_95][feature_name])
                prod_range_value = 0 if prod_over_value == prod_under_value else 100 * abs(
                    prod_over_value - prod_under_value) / (
                        np.max(production_data[feature_name]) -
                        np.min(production_data[feature_name]))

                feature_hist = px.histogram(
                    merged_data,
                    x=feature_name,
                    color='Error bias',
                    facet_col="dataset",
                    histnorm='percent',
                    barmode='overlay',
                    category_orders={
                        "dataset": ["Reference", "Production"],
                        "Error bias":
                        ["Underestimation", "Overestimation", "Majority"]
                    })

                feature_hist_json = json.loads(feature_hist.to_json())

                params_data.append({
                    "details": {
                        "parts": [{
                            "title": "Error bias",
                            "id": feature_name + "_hist"
                        }],
                        "insights": []
                    },
                    "f1": feature_name,
                    "f2": feature_type,
                    "f3": round(ref_expected_value, 2),
                    "f4": round(ref_under_value, 2),
                    "f5": round(ref_over_value, 2),
                    "f6": round(ref_range_value, 2),
                    "f7": round(prod_expected_value, 2),
                    "f8": round(prod_under_value, 2),
                    "f9": round(prod_over_value, 2),
                    "f10": round(prod_range_value, 2)
                })

                additional_graphs_data.append(
                    AdditionalGraphInfo(
                        feature_name + '_hist', {
                            "data": feature_hist_json['data'],
                            "layout": feature_hist_json['layout']
                        }))

            for feature_name in cat_feature_names:
                feature_type = 'cat'

                ref_overal_value = reference_data[feature_name].value_counts(
                ).idxmax()
                ref_under_value = reference_data[ref_error <= ref_quntile_5][
                    feature_name].value_counts().idxmax()
                #ref_expected_value = reference_data[(ref_error > ref_quntile_5) & (ref_error < ref_quntile_95)][feature_name].value_counts().idxmax()
                ref_over_value = reference_data[ref_error >= ref_quntile_95][
                    feature_name].value_counts().idxmax()
                ref_range_value = 1 if (ref_under_value != ref_overal_value) or (
                    ref_over_value != ref_overal_value) else 0

                prod_overal_value = production_data[feature_name].value_counts(
                ).idxmax()
                prod_under_value = production_data[
                    prod_error <= prod_quntile_5][feature_name].value_counts(
                    ).idxmax()
                #prod_expected_value = production_data[(prod_error > prod_quntile_5) & (prod_error < prod_quntile_95)][feature_name].value_counts().idxmax()
                prod_over_value = production_data[
                    prod_error >= prod_quntile_95][feature_name].value_counts(
                    ).idxmax()
                prod_range_value = 1 if (prod_under_value != prod_overal_value) or (
                    prod_over_value != prod_overal_value) else 0

                feature_hist = px.histogram(
                    merged_data,
                    x=feature_name,
                    color='Error bias',
                    facet_col="dataset",
                    histnorm='percent',
                    barmode='overlay',
                    category_orders={
                        "dataset": ["Reference", "Production"],
                        "Error bias":
                        ["Underestimation", "Overestimation", "Majority"]
                    })

                feature_hist_json = json.loads(feature_hist.to_json())

                params_data.append({
                    "details": {
                        "parts": [{
                            "title": "Error bias",
                            "id": feature_name + "_hist"
                        }],
                        "insights": []
                    },
                    "f1": feature_name,
                    "f2": feature_type,
                    "f3": str(ref_overal_value),
                    "f4": str(ref_under_value),
                    "f5": str(ref_over_value),
                    "f6": str(ref_range_value),
                    "f7": str(prod_overal_value),
                    "f8": str(prod_under_value),
                    "f9": str(prod_over_value),
                    "f10": int(prod_range_value)
                })

                additional_graphs_data.append(
                    AdditionalGraphInfo(
                        feature_name + '_hist', {
                            "data": feature_hist_json['data'],
                            "layout": feature_hist_json['layout']
                        }))

            self.wi = BaseWidgetInfo(
                title=self.title,
                type="big_table",
                details="",
                alertStats=AlertStats(),
                alerts=[],
                alertsPosition="row",
                insights=[],
                size=2,
                params={
                    "rowsPerPage":
                    min(len(num_feature_names) + len(cat_feature_names), 10),
                    "columns": [{
                        "title": "Feature",
                        "field": "f1"
                    }, {
                        "title": "Type",
                        "field": "f2"
                    }, {
                        "title": "REF: Majority",
                        "field": "f3"
                    }, {
                        "title": "REF: Under",
                        "field": "f4"
                    }, {
                        "title": "REF: Over",
                        "field": "f5"
                    }, {
                        "title": "REF: Range(%)",
                        "field": "f6"
                    }, {
                        "title": "PROD: Majority",
                        "field": "f7"
                    }, {
                        "title": "PROD: Under",
                        "field": "f8"
                    }, {
                        "title": "PROD: Over",
                        "field": "f9"
                    }, {
                        "title": "PROD: Range(%)",
                        "field": "f10",
                        "sort": "desc"
                    }],
                    "data":
                    params_data
                },
                additionalGraphs=additional_graphs_data)

        else:
            reference_data.replace([np.inf, -np.inf], np.nan, inplace=True)
            reference_data.dropna(axis=0, how='any', inplace=True)

            error = reference_data[prediction_column] - reference_data[
                target_column]

            quntile_5 = np.quantile(error, .05)
            quntile_95 = np.quantile(error, .95)

            reference_data['Error bias'] = list(
                map(
                    lambda x: 'Underestimation'
                    if x <= quntile_5 else 'Majority'
                    if x < quntile_95 else 'Overestimation', error))

            params_data = []
            additional_graphs_data = []

            for feature_name in num_feature_names:

                feature_type = 'num'
                ref_overal_value = np.mean(reference_data[feature_name])
                ref_under_value = np.mean(
                    reference_data[error <= quntile_5][feature_name])
                #ref_expected_value = np.mean(reference_data[(error > quntile_5) & (error < quntile_95)][feature_name])
                ref_over_value = np.mean(
                    reference_data[error >= quntile_95][feature_name])
                ref_range_value = 0 if ref_over_value == ref_under_value else 100 * abs(
                    ref_over_value -
                    ref_under_value) / (np.max(reference_data[feature_name]) -
                                        np.min(reference_data[feature_name]))

                hist = px.histogram(
                    reference_data,
                    x=feature_name,
                    color='Error bias',
                    histnorm='percent',
                    barmode='overlay',
                    category_orders={
                        "Error bias":
                        ["Underestimation", "Overestimation", "Majority"]
                    })

                #hist_fig = px.histogram(reference_data, x=feature_name, color=target_column, facet_col="dataset",
                #        category_orders={"dataset": ["Reference", "Production"]})

                hist_figure = json.loads(hist.to_json())

                params_data.append({
                    "details": {
                        "parts": [{
                            "title": "Error bias",
                            "id": feature_name + "_hist"
                        }],
                        "insights": []
                    },
                    "f1": feature_name,
                    "f2": feature_type,
                    "f3": round(ref_overal_value, 2),
                    "f4": round(ref_under_value, 2),
                    "f5": round(ref_over_value, 2),
                    "f6": round(ref_range_value, 2)
                })

                additional_graphs_data.append(
                    AdditionalGraphInfo(
                        feature_name + '_hist', {
                            "data": hist_figure['data'],
                            "layout": hist_figure['layout']
                        }))

            for feature_name in cat_feature_names:

                feature_type = 'cat'
                ref_overal_value = reference_data[feature_name].value_counts(
                ).idxmax()
                ref_under_value = reference_data[
                    error <= quntile_5][feature_name].value_counts().idxmax()
                #ref_expected_value = reference_data[(error > quntile_5) & (error < quntile_95)][feature_name].value_counts().idxmax()
                ref_over_value = reference_data[
                    error >= quntile_95][feature_name].value_counts().idxmax()
                ref_range_value = 1 if (ref_under_value != ref_overal_value) or (
                    ref_over_value != ref_overal_value) else 0

                hist = px.histogram(
                    reference_data,
                    x=feature_name,
                    color='Error bias',
                    histnorm='percent',
                    barmode='overlay',
                    category_orders={
                        "Error bias":
                        ["Underestimation", "Overestimation", "Majority"]
                    })

                #hist_fig = px.histogram(reference_data, x=feature_name, color=target_column, facet_col="dataset",
                #        category_orders={"dataset": ["Reference", "Production"]})

                hist_figure = json.loads(hist.to_json())

                params_data.append({
                    "details": {
                        "parts": [{
                            "title": "Error bias",
                            "id": feature_name + "_hist"
                        }],
                        "insights": []
                    },
                    "f1": feature_name,
                    "f2": feature_type,
                    "f3": str(ref_overal_value),
                    "f4": str(ref_under_value),
                    "f5": str(ref_over_value),
                    "f6": int(ref_range_value)
                })

                additional_graphs_data.append(
                    AdditionalGraphInfo(
                        feature_name + '_hist', {
                            "data": hist_figure['data'],
                            "layout": hist_figure['layout']
                        }))

            reference_data.drop('Error bias', axis=1, inplace=True)

            self.wi = BaseWidgetInfo(
                title=self.title,
                type="big_table",
                details="",
                alertStats=AlertStats(),
                alerts=[],
                alertsPosition="row",
                insights=[],
                size=2,
                params={
                    "rowsPerPage":
                    min(len(num_feature_names) + len(cat_feature_names), 10),
                    "columns": [{
                        "title": "Feature",
                        "field": "f1"
                    }, {
                        "title": "Type",
                        "field": "f2"
                    }, {
                        "title": "Majority",
                        "field": "f3"
                    }, {
                        "title": "Underestimation",
                        "field": "f4"
                    }, {
                        "title": "Overestimation",
                        "field": "f5"
                    }, {
                        "title": "Range(%)",
                        "field": "f6",
                        "sort": "desc"
                    }],
                    "data":
                    params_data
                },
                additionalGraphs=additional_graphs_data)
Example #41
0
def determine_numeric_features(df: pd.DataFrame) -> pd.Index:
    return df.select_dtypes(include=["int64", "float64"]).columns
Example #42
0
def normalize(df: pd.DataFrame, scaler) -> pd.DataFrame:
    # np.float and np.int were removed from NumPy; use the abstract dtypes
    df_num = df.select_dtypes(include=[np.floating, np.integer])
    df[list(df_num.columns)] = scaler.transform(df_num)
    return df
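A usage sketch, assuming the scaler was already fit on the same numeric columns that normalize selects (sklearn's StandardScaler serves as the example here):

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

df = pd.DataFrame({'a': [1.0, 2.0, 3.0], 'b': ['x', 'y', 'z']})
scaler = StandardScaler().fit(df.select_dtypes(include=[np.floating, np.integer]))
print(normalize(df, scaler)['a'].round(2).tolist())  # [-1.22, 0.0, 1.22]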
Example #43
0
 def fit(self, X: pd.DataFrame, y=None):
     #data.select_dtypes(include=['float', 'int']) #WHAT
     #self.std = X.std()
     self.std = X.select_dtypes(include=['float', 'int']).std()
     self.columns = self.std.index.values
     return self
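The matching transform is not shown; one plausible counterpart, assuming the intent is column-wise scaling by the stored standard deviations, is sketched below:

# Hypothetical counterpart to the fit above.
def transform(self, X: pd.DataFrame) -> pd.DataFrame:
    X = X.copy()
    X[self.columns] = X[self.columns] / self.std
    return X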
Example #44
0
def impute_continuous_missing_values(dataframe: pd.DataFrame) -> pd.DataFrame:
    new_value = 0
    continuous_columns = dataframe.select_dtypes(include='number').columns
    for column_name in continuous_columns:
        dataframe[column_name] = dataframe[column_name].fillna(new_value)
    return dataframe
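A quick usage sketch: NaNs in numeric columns become 0, while other columns are left untouched:

import numpy as np
import pandas as pd

df = pd.DataFrame({'a': [1.0, np.nan], 'b': ['x', None]})
print(impute_continuous_missing_values(df)['a'].tolist())  # [1.0, 0.0]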
Example #45
0
 def test_select_dtypes_empty(self):
     df = DataFrame({'a': list('abc'), 'b': list(range(1, 4))})
     with tm.assert_raises_regex(ValueError, 'at least one of '
                                 'include or exclude '
                                 'must be nonempty'):
         df.select_dtypes()
Example #46
0
from typing import List

def get_categorical_column_names(dataframe: pd.DataFrame) -> List[str]:
    categorical_columns = list(dataframe.select_dtypes(include='object').columns)
    return categorical_columns
def optimize_floats(df: pd.DataFrame) -> pd.DataFrame:
    floats = df.select_dtypes(include=['float64']).columns.tolist()
    df[floats] = df[floats].apply(pd.to_numeric, downcast='float')
    return df
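A quick demonstration of the downcast (values chosen to be exactly representable in float32, so pd.to_numeric can downcast without loss):

import pandas as pd

df = pd.DataFrame({'x': [0.5, 1.5, 2.5]})
print(df['x'].dtype)                   # float64
print(optimize_floats(df)['x'].dtype)  # float32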
Example #48
0
import pandas as pd