Example #1
def test_data_characters_types():
    from pandas.api.types import is_object_dtype
    from pandas.api.types import is_float_dtype
    las = lasio.read(egfn('data_characters.las'))
    assert is_object_dtype(las.df().index.dtype)
    assert is_object_dtype(las.df()['DATE'].dtype)
    assert is_float_dtype(las.df()['DEPT'].dtype)
    assert is_float_dtype(las.df()['ARC_GR_UNC_RT'].dtype)
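
A minimal self-contained sketch of the same dtype assertions on a toy DataFrame (the lasio file and the egfn fixture helper above are project-specific):

import pandas as pd
from pandas.api.types import is_object_dtype, is_float_dtype

df = pd.DataFrame({'DATE': ['2020-01-01', '2020-01-02'],
                   'DEPT': [100.0, 100.5]},
                  index=pd.Index(['A', 'B']))

assert is_object_dtype(df.index.dtype)    # string index -> object dtype
assert is_object_dtype(df['DATE'].dtype)  # raw date strings stay object
assert is_float_dtype(df['DEPT'].dtype)   # numeric curve is float64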
Example #2
def _write_header(data, fp, relation_name, index):
    """Write header containing attribute names and types"""
    fp.write("@relation {0}\n\n".format(relation_name))

    if index:
        data = data.reset_index()

    attribute_names = _sanitize_column_names(data)

    for column, series in data.iteritems():
        name = attribute_names[column]
        fp.write("@attribute {0}\t".format(name))

        if is_categorical_dtype(series) or is_object_dtype(series):
            _write_attribute_categorical(series, fp)
        elif numpy.issubdtype(series.dtype, numpy.floating):
            fp.write("real")
        elif numpy.issubdtype(series.dtype, numpy.integer):
            fp.write("integer")
        elif numpy.issubdtype(series.dtype, numpy.datetime64):
            fp.write("date 'yyyy-MM-dd HH:mm:ss'")
        else:
            raise TypeError('unsupported type %s' % series.dtype)

        fp.write("\n")
    return data
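
A self-contained sketch of the dtype dispatch used above on a toy frame (the ARFF helpers _sanitize_column_names and _write_attribute_categorical are project-specific and omitted):

import numpy
import pandas as pd
from pandas.api.types import is_categorical_dtype, is_object_dtype

df = pd.DataFrame({'label': ['a', 'b'], 'x': [1.5, 2.0], 'n': [1, 2],
                   'when': pd.to_datetime(['2020-01-01', '2020-01-02'])})
for column, series in df.items():
    if is_categorical_dtype(series) or is_object_dtype(series):
        kind = 'categorical'
    elif numpy.issubdtype(series.dtype, numpy.floating):
        kind = 'real'
    elif numpy.issubdtype(series.dtype, numpy.integer):
        kind = 'integer'
    elif numpy.issubdtype(series.dtype, numpy.datetime64):
        kind = "date 'yyyy-MM-dd HH:mm:ss'"
    else:
        kind = 'unsupported'
    print(column, '->', kind)  # label -> categorical, x -> real, n -> integer, when -> date ...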
Example #3
File: utils.py Project: eggls6/difi
def _checkColumnTypes(df, cols, column_mapping):
    """
    Checks that each dataframe column listed in cols has Pandas dtype "Object".
    
    Parameters
    ----------
    df : `~pandas.DataFrame`
        Pandas dataframe
    cols : list
        Columns to check for appropriate data type. 
    column_mapping : dict
        Column name mapping to internally used column names (truth, linkage_id, obs_id).
    
    Raises
    ------
    TypeError : If any column is not of type "Object" or string.
    
    Returns
    -------
    None
    """
    error_text = ""
    for col in cols:
        value = column_mapping[col]
        if not is_object_dtype(df[value].dtype):
            error = "\n{1} column ('{0}') should have type string. " \
                    "Please convert column using: \n" \
                    "dataframe['{0}'] = dataframe['{0}'].astype(str)`\n"
            error = error.format(value, col)
            error_text += error
    
    if len(error_text) > 0:
        raise TypeError(error_text)
    return
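
A short sketch of the check that drives the error message, using a hypothetical obs_id column:

import pandas as pd
from pandas.api.types import is_object_dtype

df = pd.DataFrame({'obs_id': [1, 2, 3]})         # integer ids, not strings
assert not is_object_dtype(df['obs_id'].dtype)   # would trigger the TypeError above
df['obs_id'] = df['obs_id'].astype(str)          # the fix suggested in the error text
assert is_object_dtype(df['obs_id'].dtype)       # now passes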
Example #4
    def astype(self, dtype, copy=True):
        """Cast to a NumPy array with 'dtype'.

        Parameters
        ----------
        dtype : str or dtype
            Typecode or data-type to which the array is cast.
        copy : bool, default True
            Whether to copy the data, even if not necessary. If False,
            a copy is made only if the old dtype does not match the
            new dtype.

        Returns
        -------
        array : ndarray
            NumPy ndarray with 'dtype' for its dtype.
        """
        if isinstance(dtype, str) and (dtype.startswith("Pint[")
                                       or dtype.startswith("pint[")):
            dtype = PintType(dtype)
        if isinstance(dtype, PintType):
            if dtype == self._dtype and not copy:
                return self
            else:
                return PintArray(
                    self.quantity.to(dtype.units).magnitude, dtype)
        # do *not* delegate to __array__ -> is required to return a numpy array,
        # but somebody may be requesting another pandas array
        # examples are e.g. PyArrow arrays as requested by "string[pyarrow]"
        if is_object_dtype(dtype):
            return self._to_array_of_quantity(copy=copy)
        if is_string_dtype(dtype):
            return pd.array([str(x) for x in self.quantity], dtype=dtype)
        return pd.array(self.quantity, dtype, copy)
Example #5
    def find_atoms(self, data: pd.DataFrame):
        """
        Find the numeric atoms and categorical levels to be modeled.
        """
        self._dtypes = data.dtypes
        atoms_dict = {}
        levels_dict = {}
        for i in range(data.shape[1]):
            vname = data.columns[i]
            dt = self._dtypes[i]

            if is_numeric_dtype(dt):
                variable = data.iloc[:, i]
                counts = variable.value_counts().sort_values(ascending=False)
                number_observed = counts.sum()
                atom_indicator = counts > 0.05 * number_observed
                atoms = counts[atom_indicator].index.tolist()
                if len(atoms) > 3:
                    atoms = atoms[:3]
                atoms_dict[vname] = atoms
                self._numeric_colnames.append(data.columns[i])

            elif (is_categorical_dtype(dt) or is_object_dtype(dt)
                  or is_bool_dtype(dt)):
                # TODO: put in some cardinality protections.
                levels = data.iloc[:, i].value_counts()
                levels_dict[vname] = levels
                self._categorical_colnames.append(data.columns[i])

            else:
                raise Exception(
                    "Only categorical or numeric types are supported.")

        return atoms_dict, levels_dict
Example #6
def to_data_table(data: pd.DataFrame):
    """
    Create a BOOM DataTable object from a pandas DataFrame.  The categories of
    any categorical variables will be handled as strings.
    """
    dtypes = data.dtypes
    ans = boom.DataTable()
    for i in range(data.shape[1]):
        dt = dtypes[i]
        vname = data.columns[i]
        if is_numeric_dtype(dt) or is_bool_dtype(dt):
            ans.add_numeric(
                boom.Vector(data.iloc[:, i].values.astype("float")), vname)
        elif is_categorical_dtype(dt):
            x = data.iloc[:, i]
            values = x.cat.codes
            codes = x.cat.categories
            ans.add_categorical(values, codes, vname)
        elif is_object_dtype(dt):
            labels = data.iloc[:, i].astype("str")
            ans.add_categorical_from_labels(labels.values, vname)
        else:
            raise Exception(
                f"Only numeric or categorical data are supported.  "
                f"Column {i} ({data.columns[i]}) has dtype {dt}.")
    return ans
Example #7
def _sanitize_anndata(adata: AnnData) -> None:
    """Sanitization and sanity checks on IR-anndata object.
    Should be executed by every read_xxx function"""
    assert (len(adata.X.shape) == 2
            ), "X needs to have two dimensions, otherwise concat doesn't work."

    # Pending updates to anndata to properly handle boolean columns.
    # For now, let's turn them into a categorical with "True/False"
    BOOLEAN_COLS = ("has_ir", "is_cell", "multi_chain", "high_confidence",
                    "productive")

    # explicitly convert those to categoricals. All IR_ columns that are strings
    # will be converted to categoricals, too
    CATEGORICAL_COLS = ("extra_chains", )

    # Sanitize has_ir column into categorical
    # This should always be a categorical with True / False
    for col in adata.obs.columns:
        if col.endswith(BOOLEAN_COLS):
            adata.obs[col] = pd.Categorical(
                [
                    "True"
                    if _is_true2(x) else "False" if _is_false2(x) else "None"
                    for x in adata.obs[col]
                ],
                categories=["True", "False", "None"],
            )
        elif col.endswith(CATEGORICAL_COLS) or (
                col.startswith("IR_") and is_object_dtype(adata.obs[col])):
            # Turn all IR_VJ columns that are of type string or object to categoricals
            # otherwise saving anndata doesn't work.
            adata.obs[col] = pd.Categorical(adata.obs[col])

    adata.strings_to_categoricals()
Example #8
def _check_Xy(X: pd.DataFrame,
              y: pd.Series, *,
              norm_y=False) -> Tuple[pd.Series, pd.Series]:
    if np.ndim(X) == 1:
        X = pd.Series(X).to_frame()
    elif np.ndim(X) == 2:
        X = pd.DataFrame(X)

    assert X.ndim == 2
    assert np.ndim(y) == 1
    assert len(X) == len(y)

    valid = ~X.isnull().any(1).values
    X = pd.Series(list(zip(*X.values[valid].T)),
                  name=tuple(X.columns)).astype('category')
    y = pd.Series(y).reset_index(drop=True)[valid]

    if is_object_dtype(y):
        y = pd.Categorical(y)

    if norm_y:
        assert is_numeric_dtype(y)
        y = (y - y.mean()) / y.std()

    return X, y
Example #9
    def get_object_dtypes(self, dtypes_validated: TYPE_DSTR) -> TYPE_DSTR:
        """Inspect all columns of dtype object and ensure no mixed dtypes are
        present. Raises type error otherwise. Ignores columns for which dtypes
        are already explicitly set.

        Parameters
        ----------
        dtypes_validated: dict
            Represents already given column/dtype pairs. Keys refer to column
            names and values represent dtypes.

        Returns
        -------
        dtypes_object: dict
            Keys refer to column names and values represent dtypes.

        """

        dtypes_object = {}

        for column in self.df.columns:
            if column in dtypes_validated:
                continue

            if pd_types.is_object_dtype(self.df[column]):
                dtypes_object[column] = self.inspect_dtype_object(column)

        return dtypes_object
Example #10
def series_is_boolean(col: pd.Series or pd.Index):
    """
    returns:
        None if column is all None;
        True if a pd.Series only contains True, False, and None;
        False otherwise

    caveat: does not interpret all-zero or all-one columns as boolean"""
    if len(col.unique()) == 1 and col.unique()[0] is None:
        # return None for all-None columns
        return None
    elif col.isna().all():
        return None
    elif is_bool_dtype(col):
        return True
    elif is_object_dtype(col):
        for val in col.unique():
            if val not in [True, False, None]:
                return False
        if not (False in col.unique() and True in col.unique()):
            return False
        return True
    elif is_integer_dtype(col) or is_float_dtype(col):
        for val in col.unique():
            if pd.isna(val):
                continue
            if val not in [1, 0, None]:
                return False
            if not (0 in col.unique() and 1 in col.unique()):
                return False
        return True
    return False
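
A short usage sketch of the possible outcomes, assuming series_is_boolean and its pandas.api.types imports are in scope:

import pandas as pd

print(series_is_boolean(pd.Series([None, None])))         # None  (all-None column)
print(series_is_boolean(pd.Series([True, False, None])))  # True  (object column of bools/None)
print(series_is_boolean(pd.Series([1, 0, 1])))            # True  (0/1 integer column)
print(series_is_boolean(pd.Series(['yes', 'no'])))        # False (non-boolean strings)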
Example #11
    def test_upload_pandas_categorical_ipc(self, con):

        con.execute("DROP TABLE IF EXISTS test_categorical;")

        df = pd.DataFrame({"A": ["a", "b", "c", "a"]})
        df["B"] = df["A"].astype('category')

        # test that table created correctly when it doesn't exist on server
        con.load_table("test_categorical", df)
        ans = con.execute("select * from test_categorical").fetchall()

        assert ans == [('a', 'a'), ('b', 'b'), ('c', 'c'), ('a', 'a')]

        assert con.get_table_details("test_categorical") == [
            ColumnDetails(
                name='A',
                type='STR',
                nullable=True,
                precision=0,
                scale=0,
                comp_param=32,
                encoding='DICT',
                is_array=False,
            ),
            ColumnDetails(
                name='B',
                type='STR',
                nullable=True,
                precision=0,
                scale=0,
                comp_param=32,
                encoding='DICT',
                is_array=False,
            ),
        ]

        # load row-wise
        con.load_table("test_categorical", df, method="rows")

        # load columnar
        con.load_table("test_categorical", df, method="columnar")

        # load arrow
        con.load_table("test_categorical", df, method="arrow")

        # test end result
        df_ipc = con.select_ipc("select * from test_categorical")
        assert df_ipc.shape == (16, 2)

        res = df.append([df, df, df]).reset_index(drop=True)
        res["A"] = res["A"].astype('category')
        res["B"] = res["B"].astype('category')
        assert pd.DataFrame.equals(df_ipc, res)

        # test that input df wasn't mutated
        # original input is object, categorical
        # to load via Arrow, converted internally to object, object
        assert is_object_dtype(df["A"])
        assert is_categorical_dtype(df["B"])
        con.execute("DROP TABLE IF EXISTS test_categorical;")
Example #12
def df_string_to_cat(df:pd.DataFrame) -> dict:
    catencoders = {}
    for colname in df.columns:
        if is_string_dtype(df[colname]) or is_object_dtype(df[colname]):
            df[colname] = df[colname].astype('category').cat.as_ordered()
            catencoders[colname] = df[colname].cat.categories
    return catencoders
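
A usage sketch, assuming df_string_to_cat and its dtype-check imports are in scope:

import pandas as pd

df = pd.DataFrame({'color': ['red', 'blue', 'red'], 'size': [1, 2, 3]})
encoders = df_string_to_cat(df)

print(df.dtypes['color'])       # category (ordered) -- converted in place
print(df.dtypes['size'])        # int64 -- numeric columns untouched
print(list(encoders['color']))  # ['blue', 'red'] -- stored category levels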
Example #13
def _check_Xy(X: pd.DataFrame,
              y: pd.Series,
              *,
              norm_y=False) -> Tuple[pd.Series, pd.Series]:
    if np.ndim(X) == 1:
        X = pd.Series(X).to_frame()
    elif np.ndim(X) == 2:
        X = pd.DataFrame(X)

    assert X.ndim == 2
    assert np.ndim(y) == 1
    assert len(X) == len(y)

    valid = ~X.isnull().any(1).values
    X = pd.Series(list(zip(*X.values[valid].T)),
                  name=tuple(X.columns)).astype('category')
    y = pd.Series(y).reset_index(drop=True)[valid]

    if is_object_dtype(y):
        y = pd.Categorical(y)

    if norm_y:
        assert is_numeric_dtype(y)
        y = (y - y.mean()) / y.std()

    return X, y
Example #14
def string_contains(series: pd.Series, state: dict) -> bool:
    if pdt.is_categorical_dtype(series):
        return False
    elif not pdt.is_object_dtype(series):
        return pandas_has_string_dtype_flag and pdt.is_string_dtype(series)

    return _is_string(series, state)
Example #15
def explode(df):
    """
    Based on this answer:
    https://stackoverflow.com/questions/12680754/split-explode-pandas\
    -dataframe-string-entry-to-separate-rows/40449726#40449726
    """
    if df is None or df.empty:
        return df

    # get the list columns
    lst_cols = [col for col, dtype in df.dtypes.items() if is_object_dtype(dtype)]
    # Be more specific about which objects are ok
    lst_cols = [col for col in lst_cols if isinstance(df[col].iloc[0], _explodable_types)]
    if not lst_cols:
        return df

    # all columns except `lst_cols`
    idx_cols = df.columns.difference(lst_cols)

    # check all lists have same length
    lens = pd.DataFrame({col: df[col].str.len() for col in lst_cols})
    different_length = (lens.nunique(axis=1) > 1).any()
    if different_length:
        raise ValueError("Cannot bin multiple arrays with different jaggedness")
    lens = lens[lst_cols[0]]

    # create "exploded" DF
    flattened = {col: df.loc[lens > 0, col].values for col in lst_cols}
    flattened = {col: sum(map(list, vals), []) for col, vals in flattened.items()}
    res = pd.DataFrame({col: np.repeat(df[col].values, lens) for col in idx_cols})
    res = res.assign(**flattened)

    # Check that rows are fully "exploded"
    return explode(res)
Example #16
def analyze_cat(df, save_pic=True, visual=True, path='') -> None:
    '''
    :Description: This function plots the normalized frequency of the top 25 values of each non-numeric column as a bar chart.

    :param df: The data to be investigated.
    :type df: pandas data frame
    :param save_pic: whether to save the results. Default is True, which saves each plot as a png file.
    :type save_pic: bool
    :param visual: whether to produce the plots at all.
    :type visual: bool
    :param path: the path to save the plot in.
    :type path: str

    :return: None, a plot.
    '''

    if visual:
        print(
            "******************Plotting for non-numeric variables*********************"
        )
        obj_cols = [
            cols for cols in df.columns
            if is_object_dtype(df[cols]) and len(df[cols].dropna()) > 0
        ]

        print('These are non numeric columns\n', obj_cols)

        # For each object column in the list
        for x, col_name in enumerate(obj_cols):
            # print(x + 1, " of ", iter_len, " completed   ", col_name)
            values_freq_threshold = 25

            # If unique values count is below the threshold value then store the details of unique values
            # normalize True/False for counts
            col_unique_vals = df[col_name].value_counts(normalize=True,
                                                        sort=True)
            #generating a data frame for the normalized count value data
            f = pd.DataFrame(np.array(
                col_unique_vals.head(values_freq_threshold).reset_index()),
                             columns=['Values', 'Count'])

            # Plot the graphs
            fig, ax = plt.subplots(figsize=(17, 9), constrained_layout=True)
            fig.suptitle("Profile of column  " + str(col_name).strip(),
                         fontsize=25)
            ax.bar(f.Values, f.Count, color=perc_color[1])
            ax.set_title("Normalized bar chart for top 25 values")
            plt.xticks(rotation=90)
            ax.set_ylabel('Count')
            ax.set_xlabel('Values')
            for p in ax.patches:
                ax.annotate(str(round(p.get_height(), 2)),
                            (p.get_x(), p.get_height() * 1.01))

            fig_name = path + 'EDA_' + str(col_name).strip() + '.png'
            if save_pic: fig.savefig(fig_name, dpi=100)
            plt.show()
            plt.close(fig)
    else:
        print('****************Nothing will be plotted******************')
Example #17
def fillna(data):
    data = data.drop(columns=['乙肝表面抗原', '乙肝表面抗体', '乙肝e抗原', '乙肝e抗体', '乙肝核心抗体'])
    if is_object_dtype(data['性别']):
        data['性别'] = data['性别'].map({'男':0, '女':1})   
    feature_col = [column for column in data.columns if column not in ['id', '体检日期', '血糖']]
    
    # feature_min = data[feature_col].min()
    # feature_max = data[feature_col].max()
    # scaled_feature = (data[feature_col] - feature_min) / (feature_max - feature_min)

    # data.loc[:, feature_col] = scaled_feature.values
    columns_na = data.columns[data.isna().sum() > 0]
    complete_sample = data.loc[data.isna().sum(axis=1) == 0, :]
    incomplete_sample = data.loc[data.isna().sum(axis=1) > 0, :]

    params = {
        'objective': 'regression',
        'boosting': 'rf',
        'learning_rate': 0.01,
        'num_leaves': 15,
        'num_threads':  multiprocessing.cpu_count() // 2,
        'min_data_in_leaf': 50,
        'min_sum_hessian_in_leaf': 1e-2,
        'feature_fraction': 0.7,
        'feature_fraction_seed': 2018,
        'bagging_fraction': 0.7,
        'bagging_freq': 5,
        'bagging_seed': 2018,
        'tree_learner': 'feature',
        'verbose': -1,
        'metric': 'mse',
    }
    kf = KFold(n_splits=5, shuffle=True, random_state=2018)
    for target in columns_na:
        X = complete_sample.loc[:, [column for column in feature_col if column != target]]
        y = complete_sample.loc[:, target]
        na_sample_idxer = incomplete_sample[target].isna()
        XTest = incomplete_sample.loc[na_sample_idxer, [column for column in feature_col if column != target]].values
        
        result_to_fill = np.zeros((XTest.shape[0], 5))
        for cv_idx, (train_idx, valid_idx) in enumerate(kf.split(X)):
            train_set = lgb.Dataset(X.iloc[train_idx], label=y.iloc[train_idx])
            valid_set = lgb.Dataset(X.iloc[valid_idx], label=y.iloc[valid_idx])
            
            gbm = lgb.train(params, train_set,
                        num_boost_round=3000,
                        categorical_feature=['性别'],
                        valid_sets=valid_set, valid_names='valid',
                        early_stopping_rounds=100,
                        verbose_eval=False)
            
            result_to_fill[:, cv_idx] = gbm.predict(XTest, num_iteration=gbm.best_iteration)
        incomplete_sample.loc[na_sample_idxer, target] = result_to_fill.mean(axis=1)
    
    data = pd.concat([complete_sample, incomplete_sample])
    # inverse_values = data[feature_col]*(feature_max - feature_min) + feature_min
    # data.loc[:, feature_col] = inverse_values

    return data
Example #18
def to_pandas_time_index(
    time: Union[pint.Quantity, np.ndarray, pd.TimedeltaIndex, pd.DatetimeIndex,
                xr.DataArray, "tf.LocalCoordinateSystem", ],
) -> Union[pd.TimedeltaIndex, pd.DatetimeIndex]:
    """Convert a time variable to the corresponding pandas time index type.

    Parameters
    ----------
    time :
        Variable that should be converted.

    Returns
    -------
    Union[pandas.TimedeltaIndex, pandas.DatetimeIndex] :
        Time union of all input objects

    """
    from weldx.transformations import LocalCoordinateSystem

    _input_type = type(time)

    if isinstance(time, (pd.DatetimeIndex, pd.TimedeltaIndex)):
        return time

    if isinstance(time, LocalCoordinateSystem):
        return to_pandas_time_index(time.time)

    if isinstance(time, pint.Quantity):
        base = "s"  # using low base unit could cause rounding errors
        if not np.iterable(time):  # catch zero-dim arrays
            time = np.expand_dims(time, 0)
        return pd.TimedeltaIndex(data=time.to(base).magnitude, unit=base)

    if isinstance(time, (xr.DataArray, xr.Dataset)):
        if "time" in time.coords:
            time = time.time
        time_index = pd.Index(time.values)
        if is_timedelta64_dtype(time_index) and time.weldx.time_ref:
            time_index = time_index + time.weldx.time_ref
        return time_index

    if not np.iterable(time) or isinstance(time, str):
        time = [time]
    time = pd.Index(time)

    if isinstance(time, (pd.DatetimeIndex, pd.TimedeltaIndex)):
        return time

    # try manual casting for object dtypes (i.e. strings), should avoid integers
    # warning: this allows something like ["1","2","3"] which will be ns !!
    if is_object_dtype(time):
        for func in (pd.DatetimeIndex, pd.TimedeltaIndex):
            try:
                return func(time)
            except (ValueError, TypeError):
                continue

    raise TypeError(f"Could not convert {_input_type} "
                    f"to pd.DatetimeIndex or pd.TimedeltaIndex")
Example #19
def generate_missing_value_indicator(df: pd.DataFrame, columns: list, fill_value="NA"):
    """Fill any na values in columns with fill_value."""
    for column in columns:
        if not is_object_dtype(df[column]):
            print("skipping non-object column {}".format(column))
        else:
            df.loc[:, column] = df[column].fillna(fill_value)
    return df
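
A usage sketch, assuming generate_missing_value_indicator is in scope:

import numpy as np
import pandas as pd

df = pd.DataFrame({'city': ['Oslo', None, 'Lima'], 'temp': [5.0, np.nan, 20.0]})
df = generate_missing_value_indicator(df, ['city', 'temp'])

print(df['city'].tolist())  # ['Oslo', 'NA', 'Lima'] -- object column filled
print(df['temp'].tolist())  # [5.0, nan, 20.0]       -- non-object column skipped with a printed notice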
Example #20
    def contains_op(cls, series: pd.Series, state: dict) -> bool:
        if pdt.is_object_dtype(series):
            try:
                return series.isin({True, False}).all()
            except:
                return False

        return pdt.is_bool_dtype(series)
Example #21
    def contains_op(cls, series: pd.Series, state: dict) -> bool:
        # TODO: without the object check this passes string categories... is there a better way?
        if pdt.is_categorical_dtype(series):
            return False
        elif not pdt.is_object_dtype(series):
            return pandas_has_string_dtype_flag and pdt.is_string_dtype(series)

        return series_is_string(series)
Example #22
def _dtype_represents_categories(series) -> bool:
    "Determines if the dtype of the series represents categorical values"
    return (
        is_bool_dtype(series)
        or is_object_dtype(series)
        or is_string_dtype(series)
        or is_categorical_dtype(series)
    )
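
A short usage sketch, assuming _dtype_represents_categories and its pandas.api.types imports are in scope:

import pandas as pd

print(_dtype_represents_categories(pd.Series([True, False])))                 # True  (bool)
print(_dtype_represents_categories(pd.Series(['a', 'b'])))                    # True  (object/string)
print(_dtype_represents_categories(pd.Series(['a', 'b'], dtype='category')))  # True  (categorical)
print(_dtype_represents_categories(pd.Series([1.5, 2.0])))                    # False (numeric)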
Example #23
    def fit_transform(self, X, y=None, **fit_params):

        # preserve mlm_dtypes if it exists
        try:
            self.meta_mlm_dtypes = X.mlm_dtypes
            self.no_meta_mlm_dtypes = False
        except AttributeError:
            self.no_meta_mlm_dtypes = True
            pass

        self._validate_transformers()
        result = Parallel(n_jobs=self.n_jobs)(
            delayed(_fit_transform_one)(
                transformer=trans,
                X=X,
                y=y,
                weight=weight,
                **fit_params)
            for name, trans, weight in self._iter())

        if not result:
            # All transformers are None
            return np.zeros((X.shape[0], 0))
        Xs, transformers = zip(*result)
        self._update_transformer_list(transformers)
        if any(sparse.issparse(f) for f in Xs):
            Xs = sparse.hstack(Xs).tocsr()
        else:
            Xs = self.merge_dataframes_by_column(Xs)

        if not self.no_meta_mlm_dtypes:
            Xs = Xs.loc[:, ~Xs.columns.duplicated()]
            Xs = PreserveMetaData(Xs)
            Xs.mlm_dtypes = self.meta_mlm_dtypes

            # reset dtype for any columns that were turned into object columns
            for mlm_dtype in Xs.mlm_dtypes.keys():
                for column in Xs.mlm_dtypes[mlm_dtype]:
                    try:
                        if is_object_dtype(Xs[column]):
                            if mlm_dtype == "boolean":
                                Xs[column] = Xs[column].astype("boolean")
                            elif mlm_dtype == "continuous":
                                Xs[column] = Xs[column].astype("float64")
                            elif mlm_dtype == "category":
                                Xs[column] = Xs[column].astype("category")
                            elif mlm_dtype == "count":
                                Xs[column] = Xs[column].astype("int64")
                            elif mlm_dtype == "date":
                                Xs[column] = Xs[column].astype("datetime64[ns]")
                            elif mlm_dtype == "nominal":
                                Xs[column] = Xs[column].astype("category")
                            elif mlm_dtype == "ordinal":
                                Xs[column] = Xs[column].astype("category")
                    except KeyError:
                        continue

        return Xs
Example #24
    def contains_op(series: pd.Series, state: dict) -> bool:
        is_valid_dtype = pdt.is_categorical_dtype(
            series) and not pdt.is_bool_dtype(series)
        if is_valid_dtype:
            return True
        elif not pdt.is_object_dtype(series):
            return pandas_has_string_dtype_flag and pdt.is_string_dtype(series)

        return series_is_string(series, state)
Example #25
def getFeatureCategorical(data):
    import pandas.api.types as types
    import Tools
    feature_categorical = []
    for column in list(data.columns):
        if types.is_object_dtype(data[column]):
            feature_categorical.append(column)

    return feature_categorical
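
An equivalent inline check on a toy frame (note the `import Tools` in the function above is unused):

import pandas as pd
import pandas.api.types as types

data = pd.DataFrame({'gender': ['m', 'f'], 'age': [30, 40]})
feature_categorical = [c for c in data.columns if types.is_object_dtype(data[c])]
print(feature_categorical)  # ['gender']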
Example #26
 def contains_op(cls, series: pd.Series) -> bool:
     is_object = pdt.is_object_dtype(series)
     if is_object:
         ret = True
     elif pandas_has_string_dtype_flag:
         ret = pdt.is_string_dtype(series) and not pdt.is_categorical_dtype(series)
     else:
         ret = False
     return ret
Example #27
File: array.py Project: tnir/pandas
 def astype(self, dtype, copy=True):
     if isinstance(dtype, type(self.dtype)) and dtype == self.dtype:
         if copy:
             return self.copy()
         return self
     elif is_string_dtype(dtype) and not is_object_dtype(dtype):
         # numpy has problems with astype(str) for nested elements
         return np.array([str(x) for x in self.data], dtype=dtype)
     return np.array(self.data, dtype=dtype, copy=copy)
Example #28
def df_normalize_strings(df):
    for col in df.columns:
        if is_string_dtype(df[col]) or is_object_dtype(df[col]):
            df[col] = df[col].str.lower()
            df[col] = df[col].fillna(np.nan)  # make None -> np.nan
            df[col] = df[col].replace('none or unspecified', np.nan)
            df[col] = df[col].replace('none', np.nan)
            df[col] = df[col].replace('#name?', np.nan)
            df[col] = df[col].replace('', np.nan)
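
A usage sketch, assuming df_normalize_strings and its dtype-check imports are in scope (the function mutates df in place):

import numpy as np
import pandas as pd

df = pd.DataFrame({'name': ['Alice', 'NONE', '', None], 'score': [1, 2, 3, 4]})
df_normalize_strings(df)

print(df['name'].tolist())  # ['alice', nan, nan, nan] -- lower-cased, placeholders mapped to NaN
print(df['score'].dtype)    # int64 -- non-string column untouched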
Example #29
def format_missings(df):
  for column in df.columns:
    if is_numeric_dtype(df[column]):
      fill_value = df[column].mean()
      df[column] = df[column].fillna(fill_value, downcast=False)
    elif is_object_dtype(df[column]) or is_string_dtype(df[column]):
      df[column] = df[column].fillna('MISSING', downcast=False)
  print("Shape after format_missing:", df.shape)
  return df
Example #30
 def _is_datetime(s):
     if is_datetime64_any_dtype(s):
         return True
     try:
         if is_object_dtype(s):
             pd.to_datetime(s, infer_datetime_format=True)
             return True
     except Exception:
         pass
     return False
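
A usage sketch, assuming _is_datetime and its imports are in scope:

import pandas as pd

print(_is_datetime(pd.Series(pd.to_datetime(['2021-01-01']))))  # True: already datetime64
print(_is_datetime(pd.Series(['2021-01-01', '2021-02-01'])))    # True: object strings parse as dates
print(_is_datetime(pd.Series([1, 2, 3])))                       # False: neither datetime64 nor object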
Example #31
def object_is_bool(series: pd.Series, state) -> bool:
    if pdt.is_object_dtype(series):
        bool_set = {True, False}
        try:
            ret = all(item in bool_set for item in series)
        except:
            ret = False

        return ret
    return False
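
A usage sketch, assuming object_is_bool and its pandas.api.types import are in scope:

import pandas as pd

print(object_is_bool(pd.Series([True, False, True], dtype=object), state={}))  # True
print(object_is_bool(pd.Series(['yes', 'no'], dtype=object), state={}))        # False
print(object_is_bool(pd.Series([True, False]), state={}))                      # False (bool dtype, not object)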
Example #32
 def _is_datetime(s):
     if is_datetime64_any_dtype(s):
         return True
     try:
         if is_object_dtype(s):
             pd.to_datetime(s, infer_datetime_format=True)
             return True
     except Exception:  # pylint: disable=broad-except
         pass
     return False
Example #33
    def contains_op(cls, series: pd.Series) -> bool:
        # TODO: without the object check this passes string categories... is there a better way?
        if not pdt.is_object_dtype(series):
            return False
        elif series.hasnans:
            series = series.dropna()
            if series.empty:
                return False

        return all(isinstance(v, str) for v in series)
Example #34
    def is_object(self):
        """
        Return if the current index type is a object type.

        Examples
        --------
        >>> ks.DataFrame({'a': [1]}, index=["a"]).index.is_object()
        True
        """
        return is_object_dtype(self.dtype)
Example #35
def convert_col_dtype(col, int_to_category=True, force_fp32=True):
    """Convert datatypes for columns according to "sensible" rules for the
    tasks in this module:

    * integer types are reduced to smallest integer type without losing
      information, or to a categorical if that uses less memory (roughly)
    * float types are all made the same: either the type of the first element,
      or all are reduced to single precision
    * object types that contain strings are converted to categoricals
    * object types that contain numbers are converted according to the rules
      above to either floats, shortest-possible ints, or a categorical
    * bool types are forced to ``numpy.dtype('bool')``

    Parameters
    ----------
    col : pandas.Series
        Column

    int_to_category : bool
        Whether to convert integer types to categoricals in the case that this
        will save memory.

    force_fp32 : bool
        Force all floating-point data types to be single precision (fp32). If
        False, the type of the first element is used instead (for all values in
        the column).

    Returns
    -------
    col : pandas.Series

    """
    from pisa.utils.fileio import fsort

    categorical_dtype = CategoricalDtype()

    recognized_dtype = False
    original_dtype = col.dtype
    col_name = col.name

    if len(col) == 0: #pylint: disable=len-as-condition
        return col

    first_item = col.iloc[0]

    # Default: keep current dtype
    new_dtype = original_dtype

    if (is_categorical_dtype(original_dtype)
            or is_datetime64_any_dtype(original_dtype)
            or is_timedelta64_dtype(original_dtype)
            or is_timedelta64_ns_dtype(original_dtype)):
        recognized_dtype = True
        new_dtype = original_dtype
    elif is_object_dtype(original_dtype):
        if isinstance(first_item, basestring):
            recognized_dtype = True
            new_dtype = categorical_dtype
        # NOTE: Must check bool before int since bools look like ints (but not
        # vice versa)
        elif isinstance(first_item, BOOL_TYPES):
            recognized_dtype = True
            new_dtype = np.dtype('bool')
        elif isinstance(first_item, INT_TYPES + UINT_TYPES):
            recognized_dtype = True
            new_dtype = np.dtype('int')
        elif isinstance(first_item, FLOAT_TYPES):
            recognized_dtype = True
            new_dtype = np.dtype(type(first_item))

    # Convert ints to either shortest int possible or categorical,
    # whichever is smaller (use int if same size)
    if new_dtype in INT_DTYPES + UINT_DTYPES:
        recognized_dtype = True
        # See how large an int would be necessary
        col_min, col_max = col.min(), col.max()
        found_int_dtype = False
        int_dtype = None
        for int_dtype in INT_DTYPES:
            exponent = 8*int_dtype.itemsize - 1
            min_representable = -2 ** exponent
            max_representable = (2 ** exponent) - 1
            if col_min >= min_representable and col_max <= max_representable:
                found_int_dtype = True
                break
        if not found_int_dtype:
            raise ValueError('Value(s) in column "%s" exceed %s bounds'
                             % (col_name, int_dtype))

        # Check if categorical is probably smaller than int dtype; note that
        # the below is not perfect (i.e. is not based on exact internal
        # representation of categoricals in Pandas...) but should get us pretty
        # close, so that at least order-of-magnitude efficiencies will be
        # found.
        if int_to_category:
            num_unique = len(col.unique())
            category_bytes = int(np.ceil(np.log2(num_unique) / 8))
            if category_bytes < int_dtype.itemsize:
                new_dtype = categorical_dtype
            else:
                new_dtype = int_dtype

    elif new_dtype in FLOAT_DTYPES:
        recognized_dtype = True
        if force_fp32:
            new_dtype = np.dtype('float32')
        else:
            new_dtype = np.dtype(type(first_item))

    elif new_dtype in BOOL_DTYPES:
        recognized_dtype = True
        new_dtype = np.dtype('bool')

    if not recognized_dtype:
        wstderr('WARNING: Not modifying column "%s" with unhandled dtype "%s"'
                ' and/or sub-type "%s"\n'
                % (col_name, original_dtype.name, type(first_item)))

    if is_dtype_equal(new_dtype, original_dtype):
        if isinstance(first_item, basestring):
            return col.cat.reorder_categories(fsort(col.cat.categories))
        return col

    if is_categorical_dtype(new_dtype):
        new_col = col.astype('category')
        if isinstance(first_item, basestring):
            new_col.cat.reorder_categories(fsort(new_col.cat.categories),
                                           inplace=True)
        return new_col

    try:
        return col.astype(new_dtype)
    except ValueError:
        wstderr('WARNING: Could not convert column "%s" to dtype "%s"; keeping'
                ' original dtype "%s"\n'
                % (col_name, new_dtype, original_dtype))
        return col
Example #36
def coerce_dtypes(df, dtypes):
    """ Coerce dataframe to dtypes safely

    Operates in place

    Parameters
    ----------
    df: Pandas DataFrame
    dtypes: dict like {'x': float}
    """
    bad_dtypes = []
    bad_dates = []
    errors = []
    for c in df.columns:
        if c in dtypes and df.dtypes[c] != dtypes[c]:
            actual = df.dtypes[c]
            desired = dtypes[c]
            if is_float_dtype(actual) and is_integer_dtype(desired):
                bad_dtypes.append((c, actual, desired))
            elif is_object_dtype(actual) and is_datetime64_any_dtype(desired):
                # This can only occur when parse_dates is specified, but an
                # invalid date is encountered. Pandas then silently falls back
                # to object dtype. Since `object_array.astype(datetime)` will
                # silently overflow, error here and report.
                bad_dates.append(c)
            else:
                try:
                    df[c] = df[c].astype(dtypes[c])
                except Exception as e:
                    bad_dtypes.append((c, actual, desired))
                    errors.append((c, e))

    if bad_dtypes:
        if errors:
            ex = '\n'.join("- %s\n  %r" % (c, e) for c, e in
                           sorted(errors, key=lambda x: str(x[0])))
            exceptions = ("The following columns also raised exceptions on "
                          "conversion:\n\n%s\n\n") % ex
            extra = ""
        else:
            exceptions = ""
            # All mismatches are int->float, also suggest `assume_missing=True`
            extra = ("\n\nAlternatively, provide `assume_missing=True` "
                     "to interpret\n"
                     "all unspecified integer columns as floats.")

        bad_dtypes = sorted(bad_dtypes, key=lambda x: str(x[0]))
        table = asciitable(['Column', 'Found', 'Expected'], bad_dtypes)
        dtype_kw = ('dtype={%s}' % ',\n'
                    '       '.join("%r: '%s'" % (k, v)
                                   for (k, v, _) in bad_dtypes))

        dtype_msg = (
            "{table}\n\n"
            "{exceptions}"
            "Usually this is due to dask's dtype inference failing, and\n"
            "*may* be fixed by specifying dtypes manually by adding:\n\n"
            "{dtype_kw}\n\n"
            "to the call to `read_csv`/`read_table`."
            "{extra}").format(table=table, exceptions=exceptions,
                              dtype_kw=dtype_kw, extra=extra)
    else:
        dtype_msg = None

    if bad_dates:
        also = " also " if bad_dtypes else " "
        cols = '\n'.join("- %s" % c for c in bad_dates)
        date_msg = (
            "The following columns{also}failed to properly parse as dates:\n\n"
            "{cols}\n\n"
            "This is usually due to an invalid value in that column. To\n"
            "diagnose and fix it's recommended to drop these columns from the\n"
            "`parse_dates` keyword, and manually convert them to dates later\n"
            "using `dd.to_datetime`.").format(also=also, cols=cols)
    else:
        date_msg = None

    if bad_dtypes or bad_dates:
        rule = "\n\n%s\n\n" % ('-' * 61)
        msg = ("Mismatched dtypes found in `pd.read_csv`/`pd.read_table`.\n\n"
               "%s" % (rule.join(filter(None, [dtype_msg, date_msg]))))
        raise ValueError(msg)
Example #37
 def _is_discrete(s):
     return (is_categorical_dtype(s) or
             is_object_dtype(s) and (force_nominal or
                                     s.nunique() < s.size**.666))
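
A self-contained restatement of the heuristic above, with force_nominal made an explicit argument (the name _is_discrete_standalone is introduced here for illustration):

import pandas as pd
from pandas.api.types import is_categorical_dtype, is_object_dtype

def _is_discrete_standalone(s, force_nominal=False):
    return (is_categorical_dtype(s) or
            is_object_dtype(s) and (force_nominal or
                                    s.nunique() < s.size**.666))

print(_is_discrete_standalone(pd.Series(['a', 'b', 'a', 'a'])))          # True: few unique object values
print(_is_discrete_standalone(pd.Series([str(i) for i in range(100)])))  # False: too many unique values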
Example #38
    def universal_dataset_check(self, dataset_name, object_headers=None,
                                numeric_headers=None, bool_headers=None,
                                test_func=None):

        # "Hard" integrity checks that take a long time.
        # These tests only run if the MATMINER_DATASET_FULL_TEST
        # environment variable is set to True
        if do_complete_test:
            # Get rid of dataset if it's on the disk already
            data_path = os.path.join(
                self.dataset_dir,
                dataset_name + "." + self.dataset_dict[dataset_name][
                    'file_type'
                ]
            )
            if os.path.exists(data_path):
                os.remove(data_path)

            # Test that dataset can be downloaded
            load_dataset(dataset_name)
            self.assertTrue(os.path.exists(data_path))

            # Test that data is now available and has all its elements
            df = load_dataset(dataset_name, download_if_missing=False)
            self.assertEqual(
                len(df), self.dataset_dict[dataset_name]["num_entries"]
            )

            # Test all columns are there
            self.assertEqual(sorted(list(df)), sorted(
                [header for header in
                 self.dataset_dict[dataset_name]['columns'].keys()]
            ))

            # Test each column for appropriate type
            if object_headers is None:
                object_headers = []
            if numeric_headers is None:
                numeric_headers = []
            if bool_headers is None:
                bool_headers = []

            df = load_dataset(dataset_name, download_if_missing=False)
            if object_headers:
                self.assertTrue(is_object_dtype(df[object_headers].values))
            if numeric_headers:
                self.assertTrue(is_numeric_dtype(df[numeric_headers].values))
            if bool_headers:
                self.assertTrue(is_bool_dtype(df[bool_headers].values))

            # Make sure all columns are accounted for
            column_headers = object_headers + numeric_headers + bool_headers
            self.assertEqual(sorted(list(df)), sorted(column_headers))

            # Run tests unique to the dataset
            if test_func is not None:
                test_func(df)

        # "Soft" check that just makes sure the dataset download page is active
        # This runs when on a system with the CI environment var present
        # (e.g. when running a continuous integration VCS system)
        else:
            download_page = requests.head(
                self.dataset_dict[dataset_name]["url"]
            )
            self.assertTrue(download_page.ok)
Example #39
def pandas_to_table(df):
    # type: (pd.DataFrame) -> Orange.data.Table
    """
    Convert a pandas.DataFrame to an Orange.data.Table instance.
    """
    index = df.index
    if not isinstance(index, pd.RangeIndex):
        df = df.reset_index()

    columns = []  # type: List[Tuple[Orange.data.Variable, np.ndarray]]

    for header, series in df.items():  # type: (Any, pd.Series)
        if pdtypes.is_categorical(series):
            coldata = series.values  # type: pd.Categorical
            categories = [str(c) for c in coldata.categories]
            var = Orange.data.DiscreteVariable.make(
                str(header), values=categories, ordered=coldata.ordered
            )
            # Remap the coldata into the var.values order/set
            coldata = pd.Categorical(
                coldata, categories=var.values, ordered=coldata.ordered
            )
            codes = coldata.codes
            assert np.issubdtype(codes.dtype, np.integer)
            orangecol = np.array(codes, dtype=np.float)
            orangecol[codes < 0] = np.nan
        elif pdtypes.is_datetime64_any_dtype(series):
            # Check that this converts tz local to UTC
            series = series.astype(np.dtype("M8[ns]"))
            coldata = series.values  # type: np.ndarray
            assert coldata.dtype == "M8[ns]"
            mask = np.isnat(coldata)
            orangecol = coldata.astype(np.int64) / 10 ** 9
            orangecol[mask] = np.nan
            var = Orange.data.TimeVariable.make(str(header))
            var.have_date = var.have_time = 1
        elif pdtypes.is_object_dtype(series):
            coldata = series.values
            assert isinstance(coldata, np.ndarray)
            orangecol = coldata
            var = Orange.data.StringVariable.make(str(header))
        elif pdtypes.is_integer_dtype(series):
            coldata = series.values
            var = Orange.data.ContinuousVariable.make(str(header))
            var.number_of_decimals = 0
            orangecol = coldata.astype(np.float64)
        elif pdtypes.is_numeric_dtype(series):
            orangecol = series.values.astype(np.float64)
            var = Orange.data.ContinuousVariable.make(str(header))
            var._out_format = "%.15g"
        else:
            warnings.warn(
                "Column '{}' with dtype: {} skipped."
                .format(header, series.dtype),
                UserWarning
            )
            continue
        columns.append((var, orangecol))

    cols_x = [(var, col) for var, col in columns if var.is_primitive()]
    cols_m = [(var, col) for var, col in columns if not var.is_primitive()]

    variables = [v for v, _ in cols_x]
    if cols_x:
        X = np.column_stack([a for _, a in cols_x])
    else:
        X = np.empty((df.shape[0], 0), dtype=np.float)
    metas = [v for v, _ in cols_m]
    if cols_m:
        M = np.column_stack([a for _, a in cols_m])
    else:
        M = None

    domain = Orange.data.Domain(variables, metas=metas)
    return Orange.data.Table.from_numpy(domain, X, None, M)