Example #1
    def test_get_numeric_data_preserve_dtype(self):

        # get the numeric data
        o = DataFrame({'A': [1, '2', 3.]})
        result = o._get_numeric_data()
        expected = DataFrame(index=[0, 1, 2], dtype=object)
        self._compare(result, expected)
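A minimal standalone sketch of the behaviour this test asserts (note that _get_numeric_data is a private pandas method, so details can shift between versions): the mixed list is stored as a single object-dtype column, which is dropped while the index is preserved.

import pandas as pd

o = pd.DataFrame({'A': [1, '2', 3.]})
print(o.dtypes)                     # column 'A' is object dtype because of the mixed values
print(o._get_numeric_data().shape)  # (3, 0): no numeric columns, original index kept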
Example #2
    def test_get_numeric_data(self):
        # TODO(wesm): unused?
        intname = np.dtype(np.int_).name  # noqa
        floatname = np.dtype(np.float_).name  # noqa

        datetime64name = np.dtype('M8[ns]').name
        objectname = np.dtype(np.object_).name

        df = DataFrame(
            {
                'a': 1.,
                'b': 2,
                'c': 'foo',
                'f': Timestamp('20010102')
            },
            index=np.arange(10))
        result = df.dtypes
        expected = Series([
            np.dtype('float64'),
            np.dtype('int64'),
            np.dtype(objectname),
            np.dtype(datetime64name)
        ],
                          index=['a', 'b', 'c', 'f'])
        assert_series_equal(result, expected)

        df = DataFrame(
            {
                'a': 1.,
                'b': 2,
                'c': 'foo',
                'd': np.array([1.] * 10, dtype='float32'),
                'e': np.array([1] * 10, dtype='int32'),
                'f': np.array([1] * 10, dtype='int16'),
                'g': Timestamp('20010102')
            },
            index=np.arange(10))

        result = df._get_numeric_data()
        expected = df.loc[:, ['a', 'b', 'd', 'e', 'f']]
        assert_frame_equal(result, expected)

        only_obj = df.loc[:, ['c', 'g']]
        result = only_obj._get_numeric_data()
        expected = df.loc[:, []]
        assert_frame_equal(result, expected)

        df = DataFrame.from_dict({
            'a': [1, 2],
            'b': ['foo', 'bar'],
            'c': [np.pi, np.e]
        })
        result = df._get_numeric_data()
        expected = DataFrame.from_dict({'a': [1, 2], 'c': [np.pi, np.e]})
        assert_frame_equal(result, expected)

        df = result.copy()
        result = df._get_numeric_data()
        expected = df
        assert_frame_equal(result, expected)
Example #3
class GetNumericData:
    def setup(self):
        self.df = DataFrame(np.random.randn(10000, 25))
        self.df['foo'] = 'bar'
        self.df['bar'] = 'baz'
        self.df = self.df._consolidate()

    def time_frame_get_numeric_data(self):
        self.df._get_numeric_data()
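This class (and the near-identical variants in the following examples) follows the airspeed velocity (asv) benchmark convention: setup() builds the fixture and the time_* method is what the runner times. A rough manual run outside asv, assuming numpy and pandas are imported as in the benchmark module:

bench = GetNumericData()
bench.setup()
bench.time_frame_get_numeric_data()  # executes once; only asv repeats and times it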
Example #4
class GetNumericData:
    def setup(self):
        self.df = DataFrame(np.random.randn(10000, 25))
        self.df["foo"] = "bar"
        self.df["bar"] = "baz"
        self.df = self.df._consolidate()

    def time_frame_get_numeric_data(self):
        self.df._get_numeric_data()
Example #5
class GetNumericData(object):
    def setup(self):
        self.df = DataFrame(np.random.randn(10000, 25))
        self.df['foo'] = 'bar'
        self.df['bar'] = 'baz'
        with warnings.catch_warnings(record=True):
            self.df = self.df.consolidate()

    def time_frame_get_numeric_data(self):
        self.df._get_numeric_data()
Example #6
class GetNumericData(object):

    def setup(self):
        self.df = DataFrame(np.random.randn(10000, 25))
        self.df['foo'] = 'bar'
        self.df['bar'] = 'baz'
        self.df = self.df._consolidate()

    def time_frame_get_numeric_data(self):
        self.df._get_numeric_data()
Example #7
    def test_get_numeric_data(self):
        # TODO(wesm): unused?
        intname = np.dtype(np.int_).name  # noqa
        floatname = np.dtype(np.float_).name  # noqa

        datetime64name = np.dtype("M8[ns]").name
        objectname = np.dtype(np.object_).name

        df = DataFrame(
            {"a": 1.0, "b": 2, "c": "foo", "f": Timestamp("20010102")},
            index=np.arange(10),
        )
        result = df.dtypes
        expected = Series(
            [
                np.dtype("float64"),
                np.dtype("int64"),
                np.dtype(objectname),
                np.dtype(datetime64name),
            ],
            index=["a", "b", "c", "f"],
        )
        assert_series_equal(result, expected)

        df = DataFrame(
            {
                "a": 1.0,
                "b": 2,
                "c": "foo",
                "d": np.array([1.0] * 10, dtype="float32"),
                "e": np.array([1] * 10, dtype="int32"),
                "f": np.array([1] * 10, dtype="int16"),
                "g": Timestamp("20010102"),
            },
            index=np.arange(10),
        )

        result = df._get_numeric_data()
        expected = df.loc[:, ["a", "b", "d", "e", "f"]]
        assert_frame_equal(result, expected)

        only_obj = df.loc[:, ["c", "g"]]
        result = only_obj._get_numeric_data()
        expected = df.loc[:, []]
        assert_frame_equal(result, expected)

        df = DataFrame.from_dict({"a": [1, 2], "b": ["foo", "bar"], "c": [np.pi, np.e]})
        result = df._get_numeric_data()
        expected = DataFrame.from_dict({"a": [1, 2], "c": [np.pi, np.e]})
        assert_frame_equal(result, expected)

        df = result.copy()
        result = df._get_numeric_data()
        expected = df
        assert_frame_equal(result, expected)
Example #8
class GetNumericData(object):

    def setup(self):
        self.df = DataFrame(np.random.randn(10000, 25))
        self.df['foo'] = 'bar'
        self.df['bar'] = 'baz'
        with warnings.catch_warnings(record=True):
            self.df = self.df.consolidate()

    def time_frame_get_numeric_data(self):
        self.df._get_numeric_data()
Example #9
def basicInfoAnalysis(df: DataFrame):
    numeric_col_num = len(df._get_numeric_data().columns)
    object_col_num = len(df.select_dtypes(['object']).columns)
    categorical_col_num = len(df.select_dtypes(['category']).columns)
    bool_col_num = len(df.select_dtypes(['bool']).columns)
    print('# of numeric columns:', numeric_col_num)
    print('# of object columns:', object_col_num)
    print('# of category columns:', categorical_col_num)
    print('# of bool columns:', bool_col_num)
    
    if numeric_col_num != 0:
        print('*'*10+' Numeric Variable Insight '+'*'*10)
        print(df[df._get_numeric_data().columns].describe())
Example #10
def plot_boxes(df: pd.DataFrame, cols: list = None, out_path: str = None, show_p: bool = True, return_p: bool = False,
               h: int = None, w: int = None, spacing: float = 0.05, theme: str = 'simple_white',
               renderer: str = 'browser', n_cols: int = 3, shared_yaxes: bool = True, cols_like: list = None):
    """plot box plots"""
    # get cols to plot
    if not cols:
        if cols_like:
            cols = get_cols_like(df, cols_like)
        else:
            cols = df._get_numeric_data().columns
    n_rows = math.ceil(len(cols) / n_cols)
    p = make_subplots(rows=n_rows, cols=n_cols, shared_yaxes=shared_yaxes, vertical_spacing=spacing, horizontal_spacing=spacing)
    # figure out what to plot where on the subplot
    axes_dict = dict()
    i = 0
    for index, x in np.ndenumerate(np.zeros((n_cols, n_rows))):
        axes_dict[i] = index
        i += 1
    # make each plot
    for i, col in enumerate(cols):
        p.add_trace(go.Box(name=col, y=df[col]), row=axes_dict[i][1]+1, col=axes_dict[i][0]+1)
    if h:
        p.update_layout(height=h)
    if w:
        p.update_layout(width=w)
    p.update_layout(template=theme)
    if out_path:
        plotly.offline.plot(p, filename=out_path, auto_open=False)
    if show_p:
        p.show(renderer=renderer)
    if return_p:
        return p
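The only non-obvious part above is the axes_dict bookkeeping: ndenumerate over an (n_cols, n_rows) array of zeros yields (column_index, row_index) pairs, so traces fill the subplot grid column by column. A small sketch of just that mapping, with hypothetical n_cols=3 and n_rows=2:

import numpy as np

n_cols, n_rows = 3, 2
axes_dict = {}
i = 0
for index, _ in np.ndenumerate(np.zeros((n_cols, n_rows))):
    axes_dict[i] = index
    i += 1
print(axes_dict)
# {0: (0, 0), 1: (0, 1), 2: (1, 0), 3: (1, 1), 4: (2, 0), 5: (2, 1)}
# trace i then goes to row=axes_dict[i][1] + 1, col=axes_dict[i][0] + 1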
Example #11
def vif(df: pd.DataFrame, dependent: str) -> pd.DataFrame:
    """Get Variance Inflation Factor for each feature in df via a simple, multiple regression.

	Arguments:
		df {pd.DataFrame} -- dataset
		dependent {str} -- column name of dependent feature in df

	Returns:
		pd.DataFrame -- DataFrame containing feature names and VIF measures.
	"""

    # https://etav.github.io/python/vif_factor_python.html
    df = df.dropna()
    df = df._get_numeric_data()  #drop non-numeric cols

    #gather features
    features = "+".join(df.columns.drop(dependent).tolist())

    # get y and X dataframes based on this regression:
    y, X = dmatrices('{} ~'.format(dependent) + features,
                     df,
                     return_type='dataframe')

    # For each X, calculate VIF and save in dataframe
    vif = pd.DataFrame()
    vif["VIF Factor"] = [
        variance_inflation_factor(X.values, i) for i in range(X.shape[1])
    ]
    vif["features"] = X.columns

    return vif.round(1)
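A hypothetical call to the vif helper above, assuming patsy (dmatrices) and statsmodels (variance_inflation_factor) are importable as the function requires; the column names and data here are made up:

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
x1 = rng.normal(size=200)
x2 = 0.9 * x1 + rng.normal(scale=0.1, size=200)  # nearly collinear with x1
y = x1 + rng.normal(size=200)
df = pd.DataFrame({'y': y, 'x1': x1, 'x2': x2})
print(vif(df, dependent='y'))  # x1 and x2 should show large VIF factors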
Example #12
        def LinearReg_Term(data: pd.DataFrame):

            # Handle with negative term values
            # way no1
            data = data._get_numeric_data()  # <- this increases accuracy
            data[data < 0] = 0

            # way no2
            # data['profit'] = data['profit'] + 1 - data['profit'].min()

            # Log Transformation
            data['price'] = np.log1p(data['price_meter_sq'])
            data['profit'] = np.log1p(data['profit'])
            data['term'] = np.log1p(data['term'])

            # Create X and y for Linear Model training
            X = data[['profit', 'price_meter_sq']]
            y = data[['term']].values.ravel()

            # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

            # Create LinearModel and fitting
            reg = LinearRegression().fit(X, y)
            print("Term linear regression fitted", flush=True)
            return reg
Example #13
    def test_get_numeric_data_preserve_dtype(self):

        # get the numeric data
        o = DataFrame({'A': [1, '2', 3.]})
        result = o._get_numeric_data()
        expected = DataFrame(index=[0, 1, 2], dtype=object)
        self._compare(result, expected)
Example #14
def bin_data(data: pd.DataFrame,
             cols: Union[list, np.ndarray, tuple] = (),
             bins: Union[int, list, np.ndarray, dict] = 10,
             quantile: bool = False,
             retbins: bool = False):
    """
    Index the input DataFrame given the bin_edges for the columns specified in cols.

    :param DataFrame data: input data
    :param list cols: list of columns with numeric data which needs to be indexed
    :param bins: number of bins, or a list of bin edges (same for all columns), or a dictionary where per column the bins are specified. (default=10)\
    E.g.: bins = {'mileage':5, 'driver_age':[18,25,35,45,55,65,125]}
    :param quantile: when bins is an integer, uniform bins (False) or bins based on quantiles (True)
    :param retbins: if True, also return a dictionary of bin labels per column (default=False)
    :returns: rebinned DataFrame
    :rtype: pandas.DataFrame
    """

    if isinstance(bins, dict):
        for col in cols:
            if col not in bins:
                raise ValueError(
                    'column {0} is not included in bins dictionary.'.format(
                        col))

    # check for numeric bins
    for col in list(set(data._get_numeric_data().columns) - set(cols)):
        nuq = data[col].nunique()
        if (nuq > 0.9 * len(data)) or (nuq > 100):
            warnings.warn(
                "numeric variable {1:s} has {0:d} unique values. Are you sure you don't want to bin it?"
                .format(nuq, str(col)), Warning)

    binned_data = data.copy()

    if isinstance(bins, (list, np.ndarray)):
        xbins = bins

    bins_dict = {}
    for col in cols:
        if isinstance(bins, (int, float)):
            xbins = bin_edges(data[col].astype(float),
                              int(bins),
                              quantile=quantile)
        if isinstance(bins, dict):
            if isinstance(bins[col], (int, float)):
                xbins = bin_edges(data[col].astype(float),
                                  int(bins[col]),
                                  quantile=quantile)
            elif isinstance(bins[col], (list, np.ndarray)):
                xbins = bins[col]
        binned_data[col], bin_labels = bin_array(
            data[col].astype(float).values, xbins)
        if retbins:
            bins_dict[col] = bin_labels

    if retbins:
        return binned_data, bins_dict

    return binned_data
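bin_data depends on the module's own bin_edges and bin_array helpers, which are not shown here, so it cannot be run standalone. As a rough analogue of what indexing a numeric column against explicit edges means, pandas.cut behaves similarly (hypothetical column, edge list taken from the docstring example):

import pandas as pd

ages = pd.DataFrame({'driver_age': [19, 27, 41, 66, 80]})
edges = [18, 25, 35, 45, 55, 65, 125]
binned = pd.cut(ages['driver_age'], bins=edges)
print(binned.value_counts(sort=False))  # row counts per age bin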
Example #15
    def _fit(self, X: pd.DataFrame, y: pd.DataFrame = None):
        self.mapping_ = self.mapping
        if self.auto_input:
            for col in X._get_numeric_data().columns:
                if col not in self.mapping_.keys():
                    self.mapping_[col] = self.numeric_input

        return self
Example #16
def get_list_non_numerical_columns_of_df(dataframe: pd.DataFrame) -> List:
    """
    Returns a list of column names, where this names corresponds to non numerical attributes.
    """
    cols = dataframe.columns
    num_cols = dataframe._get_numeric_data().columns

    return list(set(cols) - set(num_cols))
Example #17
def plot_mean_std(real: pd.DataFrame, fake: pd.DataFrame, ax=None, fname=None):
    """
    Plot the means and standard deviations of each dataset.

    :param real: DataFrame containing the real data
    :param fake: DataFrame containing the fake data
    :param ax: Axis to plot on. If none, a new figure is made.
    :param fname: If not none, saves the plot with this file name. 
    """
    # remember whether we created the axes here, so the final plt.show() can fire
    show = ax is None
    if ax is None:
        fig, ax = plt.subplots(1, 2, figsize=(10, 5))
        fig.suptitle('Absolute Log Mean and STDs of numeric data\n',
                     fontsize=16)

    ax[0].grid(True)
    ax[1].grid(True)
    real = real._get_numeric_data()
    fake = fake._get_numeric_data()
    real_mean = np.log(np.add(abs(real.mean()).values, 1e-5))
    fake_mean = np.log(np.add(abs(fake.mean()).values, 1e-5))
    min_mean = min(real_mean) - 1
    max_mean = max(real_mean) + 1
    line = np.arange(min_mean, max_mean)
    sns.lineplot(x=line, y=line, ax=ax[0])
    sns.scatterplot(x=real_mean, y=fake_mean, ax=ax[0])
    ax[0].set_title('Means of real and fake data')
    ax[0].set_xlabel('real data mean (log)')
    ax[0].set_ylabel('fake data mean (log)')

    real_std = np.log(np.add(real.std().values, 1e-5))
    fake_std = np.log(np.add(fake.std().values, 1e-5))
    min_std = min(real_std) - 1
    max_std = max(real_std) + 1
    line = np.arange(min_std, max_std)
    sns.lineplot(x=line, y=line, ax=ax[1])
    sns.scatterplot(x=real_std, y=fake_std, ax=ax[1])
    ax[1].set_title('Stds of real and fake data')
    ax[1].set_xlabel('real data std (log)')
    ax[1].set_ylabel('fake data std (log)')

    if fname is not None:
        plt.savefig(fname)

    if show:
        plt.show()
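A hypothetical call, assuming matplotlib, seaborn, numpy and pandas are imported as the function requires; the 'fake' frame is just a perturbed copy of the 'real' one:

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
real = pd.DataFrame(rng.normal(10, 2, size=(500, 4)), columns=list('abcd'))
fake = real + rng.normal(0, 0.5, size=real.shape)
plot_mean_std(real, fake, fname='mean_std.png')  # writes the comparison figure to disk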
Example #18
    def test_get_numeric_data(self):
        # TODO(wesm): unused?
        intname = np.dtype(np.int_).name  # noqa
        floatname = np.dtype(np.float_).name  # noqa

        datetime64name = np.dtype('M8[ns]').name
        objectname = np.dtype(np.object_).name

        df = DataFrame({'a': 1., 'b': 2, 'c': 'foo',
                        'f': Timestamp('20010102')},
                       index=np.arange(10))
        result = df.get_dtype_counts()
        expected = Series({'int64': 1, 'float64': 1,
                           datetime64name: 1, objectname: 1})
        result = result.sort_index()
        expected = expected.sort_index()
        assert_series_equal(result, expected)

        df = DataFrame({'a': 1., 'b': 2, 'c': 'foo',
                        'd': np.array([1.] * 10, dtype='float32'),
                        'e': np.array([1] * 10, dtype='int32'),
                        'f': np.array([1] * 10, dtype='int16'),
                        'g': Timestamp('20010102')},
                       index=np.arange(10))

        result = df._get_numeric_data()
        expected = df.loc[:, ['a', 'b', 'd', 'e', 'f']]
        assert_frame_equal(result, expected)

        only_obj = df.loc[:, ['c', 'g']]
        result = only_obj._get_numeric_data()
        expected = df.loc[:, []]
        assert_frame_equal(result, expected)

        df = DataFrame.from_dict(
            {'a': [1, 2], 'b': ['foo', 'bar'], 'c': [np.pi, np.e]})
        result = df._get_numeric_data()
        expected = DataFrame.from_dict({'a': [1, 2], 'c': [np.pi, np.e]})
        assert_frame_equal(result, expected)

        df = result.copy()
        result = df._get_numeric_data()
        expected = df
        assert_frame_equal(result, expected)
Example #19
def plot_hists(df: pd.DataFrame, cols: list = None, out_path: str = None, show_p: bool = True, return_p: bool = False,
               h: int = None, w: int = None, spacing: float = 0.05, theme: str = 'simple_white',
               renderer: str = 'browser', n_cols: int = 3, shared_yaxes: bool = True, cols_like: list = None,
               cumulative: bool = False, dim: str = None):
    """plot histogram"""
    # get cols to plot
    if not cols:
        if cols_like:
            cols = get_cols_like(df, cols_like)
        else:
            cols = df._get_numeric_data().columns
    n_rows = math.ceil(len(cols) / n_cols)
    p = make_subplots(rows=n_rows, cols=n_cols, shared_yaxes=shared_yaxes, vertical_spacing=spacing, horizontal_spacing=spacing)
    # figure out what to plot where on the subplot
    axes_dict = dict()
    i = 0
    for index, x in np.ndenumerate(np.zeros((n_cols, n_rows))):
        axes_dict[i] = index
        i += 1
    # make each plot
    for i, col in enumerate(cols):
        if dim:
            for dim_value in df[dim].unique():
                p.add_trace(
                    go.Histogram(
                        name=f'{col} - {dim_value}',
                        x=df[df[dim] == dim_value][col],
                        cumulative_enabled=cumulative,
                        bingroup=1,
                        histnorm='probability density'
                    ),
                    row=axes_dict[i][1]+1,
                    col=axes_dict[i][0]+1
                )
            p.update_layout(barmode='overlay')
            p.update_traces(opacity=0.5)
        else:
            p.add_trace(
                go.Histogram(
                    name=col, x=df[col], cumulative_enabled=cumulative
                ),
                row=axes_dict[i][1] + 1,
                col=axes_dict[i][0] + 1
            )

    if h:
        p.update_layout(height=h)
    if w:
        p.update_layout(width=w)
    p.update_layout(template=theme)
    if out_path:
        plotly.offline.plot(p, filename=out_path, auto_open=False)
    if show_p:
        p.show(renderer=renderer)
    if return_p:
        return p
Example #20
    def test_get_numeric_data_extension_dtype(self):
        # GH 22290
        df = DataFrame({
            'A': integer_array([-10, np.nan, 0, 10, 20, 30], dtype='Int64'),
            'B': Categorical(list('abcabc')),
            'C': integer_array([0, 1, 2, 3, np.nan, 5], dtype='UInt8'),
            'D': IntervalArray.from_breaks(range(7))})
        result = df._get_numeric_data()
        expected = df.loc[:, ['A', 'C']]
        assert_frame_equal(result, expected)
Example #21
def col_numeric_cat_split(df: pd.DataFrame) -> list:
    """
    takes in a pandas dataframe
    returns a list of numeric columns, and another list of categorical columns
    """
    num_cols = df._get_numeric_data().columns
    cols = df.columns
    categorical_cols = list(set(cols) - set(num_cols))
    num_cols = list(num_cols)
    return num_cols, categorical_cols
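A quick illustration with a made-up frame; note that bool columns are also returned by _get_numeric_data(), so they land in the numeric list:

import pandas as pd

df = pd.DataFrame({'age': [25, 32, 40],
                   'city': ['NY', 'LA', 'SF'],
                   'active': [True, False, True]})
num_cols, cat_cols = col_numeric_cat_split(df)
print(num_cols)  # ['age', 'active']
print(cat_cols)  # ['city']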
Example #22
    def test_get_numeric_data_extension_dtype(self):
        # GH 22290
        df = DataFrame({
            'A': integer_array([-10, np.nan, 0, 10, 20, 30], dtype='Int64'),
            'B': Categorical(list('abcabc')),
            'C': integer_array([0, 1, 2, 3, np.nan, 5], dtype='UInt8'),
            'D': IntervalArray.from_breaks(range(7))})
        result = df._get_numeric_data()
        expected = df.loc[:, ['A', 'C']]
        assert_frame_equal(result, expected)
Example #23
    def _check_num_nans(self, data: pd.DataFrame) -> bool:
        """
        Check Nans in numeric features in data

        Args:
            data (pd.DataFrame, shape (n_samples, n_features)): the input data
        Return:
            bool: True or False
        """
        data = data._get_numeric_data()
        return len(list(data.columns[data.isnull().sum() > 0])) > 0
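The check reduces to one expression over the numeric subset; a standalone sketch with a toy frame (the helper above is a method, so this just mirrors its body):

import numpy as np
import pandas as pd

data = pd.DataFrame({'a': [1.0, np.nan, 3.0],
                     'b': ['x', 'y', 'z'],
                     'c': [4, 5, 6]})
numeric = data._get_numeric_data()
has_num_nans = len(list(numeric.columns[numeric.isnull().sum() > 0])) > 0
print(has_num_nans)  # True: 'a' has a NaN, while the non-numeric 'b' is ignored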
Example #24
    def test_get_X_columns(self):
        # numeric and object columns

        df = DataFrame({'a': [1, 2, 3],
                        'b': [True, False, True],
                        'c': ['foo', 'bar', 'baz'],
                        'd': [None, None, None],
                        'e': [3.14, 0.577, 2.773]})

        tm.assert_index_equal(df._get_numeric_data().columns,
                              pd.Index(['a', 'b', 'e']))
Example #25
    def test_get_X_columns(self):
        # numeric and object columns

        df = DataFrame({'a': [1, 2, 3],
                        'b': [True, False, True],
                        'c': ['foo', 'bar', 'baz'],
                        'd': [None, None, None],
                        'e': [3.14, 0.577, 2.773]})

        tm.assert_index_equal(df._get_numeric_data().columns,
                              pd.Index(['a', 'b', 'e']))
Example #26
def get_n(recalculate: bool,
          data_df: pd.DataFrame,
          filepath: Optional[str] = None) -> pd.DataFrame:
    """Get sample sizes

    Parameters
    ----------
    recalculate :
        If True, recalculate the sample sizes
    data_df :
        Original raw data as a dataframe
    filepath :
        If `recalculate==False`: read the sample size values from this file.
        If `recalculate==True`: write the sample size values to this file.
        If not provided, run the calculation and return the sample size data
        without writing it to a file.

    Returns
    -------
    :
        A dataframe holding the sample sizes
    """
    start = time()
    if recalculate or filepath is None:
        logger.info('Calculating sampling values')
        num_cols = data_df.shape[1]
        data_mat = data_df._get_numeric_data().to_numpy(dtype=float,
                                                        na_value=np.nan,
                                                        copy=False)
        n_mat = np.zeros((num_cols, num_cols))
        group_start = time()
        for a_ix in range(num_cols):
            if a_ix % 100 == 0:
                print(a_ix, '%.2f' % (time() - group_start), 'sec per round,',
                      int(time() - start), 'sec total')
                group_start = time()
            n_mat[a_ix, a_ix] = (~np.isnan(data_mat[:, a_ix])).sum()
            for b_ix in range(a_ix + 1, num_cols):
                n = (~np.isnan(data_mat[:, a_ix])
                     & ~np.isnan(data_mat[:, b_ix])).sum()
                n_mat[a_ix, b_ix] = n
                n_mat[b_ix, a_ix] = n
        data_n = pd.DataFrame(n_mat,
                              index=data_df.columns,
                              columns=data_df.columns)
        if filepath is not None:
            logger.info(f'Saving sampling matrix to {"%s.h5" % filepath}')
            data_n.to_hdf('%s.h5' % filepath, filepath.split('/')[-1])
    else:
        logger.info(f'Reading sampling values from file {filepath}')
        data_n = pd.read_hdf('%s.h5' % filepath)
    elapsed = time() - start
    print(int(elapsed), 'sec')
    return data_n
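The double loop in get_n computes, for every pair of columns, how many rows are non-NaN in both. The same matrix can be obtained with one product over the NaN mask; this is an equivalent reformulation for illustration, not part of the original code:

import numpy as np
import pandas as pd

df = pd.DataFrame({'a': [1.0, np.nan, 3.0, 4.0],
                   'b': [np.nan, 2.0, 3.0, 4.0],
                   'c': [1.0, 2.0, 3.0, np.nan]})
mask = (~df.isna()).to_numpy(dtype=float)
n_mat = mask.T @ mask  # n_mat[i, j] = rows where columns i and j are both present
print(pd.DataFrame(n_mat, index=df.columns, columns=df.columns))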
Example #27
    def test_get_numeric_data_mixed_dtype(self):
        # numeric and object columns

        df = DataFrame({
            "a": [1, 2, 3],
            "b": [True, False, True],
            "c": ["foo", "bar", "baz"],
            "d": [None, None, None],
            "e": [3.14, 0.577, 2.773],
        })
        result = df._get_numeric_data()
        tm.assert_index_equal(result.columns, Index(["a", "b", "e"]))
Example #28
def remove_negatives(micro_df: pd.DataFrame):
    """
    Replaces negative values with NaN.
    Change df inplace.

    Parameters
    ----------
    micro_df: pd.DataFrame
    """

    numeric = micro_df._get_numeric_data()
    numeric.where(numeric >= 0, np.nan, inplace=True)
Example #29
    def test_get_numeric_data_extension_dtype(self):
        # GH 22290
        df = DataFrame(
            {
                "A": integer_array([-10, np.nan, 0, 10, 20, 30], dtype="Int64"),
                "B": Categorical(list("abcabc")),
                "C": integer_array([0, 1, 2, 3, np.nan, 5], dtype="UInt8"),
                "D": IntervalArray.from_breaks(range(7)),
            }
        )
        result = df._get_numeric_data()
        expected = df.loc[:, ["A", "C"]]
        assert_frame_equal(result, expected)
Example #30
    def test_get_X_columns(self):
        # numeric and object columns

        df = DataFrame({
            "a": [1, 2, 3],
            "b": [True, False, True],
            "c": ["foo", "bar", "baz"],
            "d": [None, None, None],
            "e": [3.14, 0.577, 2.773],
        })

        tm.assert_index_equal(df._get_numeric_data().columns,
                              pd.Index(["a", "b", "e"]))
Example #31
    def fit(self,
            data: pd.DataFrame,
            cols: Optional[List[str]] = None) -> None:
        """
        Parameters
        ----------
        data : pd.DataFrame
            dataset (pd.DataFrame shape = (n_samples, n_features))
        cols : Optional[List[str]], optional
            cols list features, by default None

        Returns
        -------
        self

        Raises
        ------
        ValueError
            No numerical features
        """
        if cols is not None:
            data = data[cols]

        data = data._get_numeric_data()
        self.columns = data.columns
        count_columns = len(self.columns)

        if count_columns < 1:
            raise ValueError("No numerical features")

        self.scaler = MinMaxScaler().fit(data)
        s_data = self.scaler.transform(data)

        units = 512
        if count_columns > 512:
            units = count_columns

        self.autoencoder = self._get_dae(count_columns, units=units)
        self.autoencoder.fit(
            s_data,
            s_data,
            epochs=50,
            batch_size=124,
            shuffle=True,
            verbose=self.verbose,
        )
        return self
Example #32
def plot_lines(df: pd.DataFrame,
               cols: list = None,
               cols_like: list = None,
               x: str = None,
               h: int = 300,
               w: int = 1200,
               t_str: str = 'box_zoom,pan,hover,reset,save',
               x_type: str = 'datetime',
               show_p: bool = True,
               t_loc: str = 'right',
               out_path: str = None,
               return_p: bool = False,
               palette: str = 'Category20',
               p_theme: str = 'light_minimal',
               notebook: bool = False):
    """Plot lines.
    """
    # get cols to plot
    if not cols:
        if cols_like:
            cols = get_cols_like(df, cols_like)
        else:
            cols = df._get_numeric_data().columns
    # define x axis if needed
    if not x:
        x = df.index.name
    # define source
    source = ColumnDataSource(df)
    # define palette
    if palette == 'Category20':
        p_palette = Category20[20]
    else:
        raise NotImplementedError(f'... palette {palette} not implemented ...')
    p = make_figure(h=h, w=w, x_type=x_type, t_loc=t_loc, t_str=t_str)
    for i, col in enumerate(cols):
        p.line(x, col, source=source, name=col, color=p_palette[i])
        add_hover(p, cols)
    if out_path:
        output_file(out_path)
    curdoc().theme = p_theme
    if notebook:
        output_notebook()
    if show_p:
        show(p)
    if return_p:
        return p
Example #33
    def __init__(self, real: pd.DataFrame, fake: pd.DataFrame, cat_cols=None, unique_thresh=0, metric='pearsonr', verbose=False, n_samples=None,
                 name: str = None):
        """

        :param real: Real dataset (pd.DataFrame)
        :param fake: Synthetic dataset (pd.DataFrame)
        :param unique_thresh: Threshold for automatic evaluation if column is numeric
        :param cat_cols: The columns that are to be evaluated as discrete. If passed, unique_thresh is ignored.
        :param metric: the metric to use for evaluation linear relations. Pearson's r by default, but supports all models in scipy.stats
        :param verbose: Whether to print verbose output
        :param n_samples: Number of samples to evaluate. If none, it will take the minimal length of both datasets and cut the larger one off to make sure they
            are the same length.
        :param name: Name of the TableEvaluator. Used in some plotting functions like `helpers.plot_correlation_comparison` to indicate your model.
        """
        self.name = name
        self.unique_thresh = unique_thresh
        self.real = real.copy()
        self.fake = fake.copy()
        self.comparison_metric = getattr(stats, metric)
        self.verbose = verbose

        if cat_cols is None:
            self.numerical_columns = [column for column in real._get_numeric_data().columns if
                                      len(real[column].unique()) > unique_thresh]
            self.categorical_columns = [column for column in real.columns if column not in self.numerical_columns]
        else:
            self.categorical_columns = cat_cols
            self.numerical_columns = [column for column in real.columns if column not in cat_cols]

        if n_samples is None:
            self.n_samples = min(len(self.real), len(self.fake))
        elif len(fake) >= n_samples and len(real) >= n_samples:
            self.n_samples = n_samples
        else:
            raise Exception(f'Make sure n_samples < len(fake/real). len(real): {len(real)}, len(fake): {len(fake)}')
        
        self.real = self.real.sample(self.n_samples)
        self.fake = self.fake.sample(self.n_samples)
        assert len(self.real) == len(self.fake), f'len(real) != len(fake)'

        self.real.loc[:, self.categorical_columns] = self.real.loc[:, self.categorical_columns].fillna('[NAN]')
        self.fake.loc[:, self.categorical_columns] = self.fake.loc[:, self.categorical_columns].fillna('[NAN]')
        self.real.loc[:, self.numerical_columns] = self.real.loc[:, self.numerical_columns].fillna(self.real[self.numerical_columns].mean()) 
        self.fake.loc[:, self.numerical_columns] = self.fake.loc[:, self.numerical_columns].fillna(self.fake[self.numerical_columns].mean()) 
Example #34
    def preprocess_numerical_data(data: pd.DataFrame,
                                  drop_cols: list,
                                  scaler_filename: str,
                                  fit=True):
        data_transformed: pd.DataFrame = pd.DataFrame([])
        try:
            # Extract Numerical Features
            df_num: pd.DataFrame = data._get_numeric_data()
            df_num.drop(drop_cols, axis=1, inplace=True)

            # Scale data
            data_transformed: pd.DataFrame = FeatureExtraction.scale_data(
                data=df_num, fit=fit, filename=scaler_filename)

            # Add drop cols
            data_transformed: pd.DataFrame = pd.concat(
                [data[drop_cols], data_transformed], axis=1, sort=False)

        except Exception as e:
            logger.error(e)
        return data_transformed
Example #35
def plot_heatmap(df: pd.DataFrame,
                 cols: list = None,
                 cols_like: list = None,
                 id_vars: list = None,
                 out_path: str = None,
                 show_p: bool = True,
                 return_p: bool = False,
                 h: int = None,
                 w: int = None,
                 theme: str = 'plotly_white',
                 renderer: str = 'browser',
                 colorscale: str = 'RdBu',
                 showscale: bool = False):
    """plot heatmap"""
    # get cols to plot
    if not cols:
        if cols_like:
            cols = get_cols_like(df, cols_like)
        else:
            cols = df._get_numeric_data().columns
    if not id_vars:
        id_vars = list(df.index.names)
    df = pd.melt(df.reset_index(), id_vars=id_vars, value_vars=cols)
    p = go.Figure(data=go.Heatmap(z=df['value'],
                                  x=df[','.join(id_vars)],
                                  y=df['variable'],
                                  colorscale=colorscale,
                                  showscale=showscale))
    if h:
        p.update_layout(height=h)
    if w:
        p.update_layout(width=w)
    p.update_layout(template=theme)
    if out_path:
        plotly.offline.plot(p, filename=out_path, auto_open=False)
    if show_p:
        p.show(renderer=renderer)
    if return_p:
        return p
Example #36
    def fit(self,
            data: pd.DataFrame,
            cols: Optional[List[str]] = None) -> None:
        """

        Parameters
        ----------
        data : pd.DataFrame
            dataset (pd.DataFrame shape = (n_samples, n_features))
        cols : Optional[List[str]], optional
            cols list features, by default None

        Returns
        -------
        self
        """
        if cols is not None:
            data = data[cols]

        data = data._get_numeric_data()

        if self.verbose:
            for col in data.columns:
                pct_missing = np.mean(data[col].isnull())
                if pct_missing > 0.25:
                    logger.warning("! Attention {} - {}% Nans!".format(
                        col, round(pct_missing * 100)))

        self.nan_columns = list(data.columns[data.isnull().sum() > 0])
        if not self.nan_columns:
            logger.info("No nans features")

        if self.method == "median":
            self.fill_value = data.median()
        elif self.method == "mean":
            self.fill_value = data.mean()
        else:
            raise ValueError("Wrong fill method")
        return self