Example #1
    def train(self, df):

        #read csv into pandas dataframe
        df = pd.read_csv(df)
        #df = df.iloc[:1000,:]
        print('training model')

        #create dataframe for items to get content vectors
        df_items = df.iloc[:, 3:]
        df_items.drop_duplicates(inplace=True, subset='catalog_item_id')
        df_items.reset_index(inplace=True)

        #create df with just columns needed to return final output
        self.df_items_lookup = df_items.sort_values(
            by='catalog_item_name').reset_index(
            ).loc[:, ['catalog_item_id', 'catalog_item_name', 'brand_name']]

        #create pivot table for matrix factorization
        df = df.groupby(['user_id_hash',
                         'catalog_item_name'])['quantity'].sum().unstack()

        #create sparse matrix
        sdf = df.astype(pd.SparseDtype("float", np.nan))
        sdf2 = sdf.sparse.to_coo()

        #get list of user and item indexes for lookup later
        self.user_index = list(df.index)
        self.item_index = pd.DataFrame(df.columns)

        # initialize a model
        self.model = implicit.lmf.LogisticMatrixFactorization(factors=20)
        # train the model on a sparse matrix of item/user/confidence weights
        fit = self.model.fit(sdf2.T)

        # recommend items for a user
        self.user_items = sdf2.tocsr()
        print('training complete!')
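
A hedged follow-up sketch, not part of the original class: one way the trained factors could be queried, assuming the pre-0.5 implicit API implied by the fit(item_user) call above (recommend takes the full user-item CSR and returns (item_id, score) pairs). The recommend method name and the n parameter are illustrative.

    def recommend(self, user_id_hash, n=10):
        # map the raw user hash to its row in the user-item matrix
        user_idx = self.user_index.index(user_id_hash)
        recs = self.model.recommend(user_idx, self.user_items, N=n)
        # map recommended column indices back to item names, then join metadata
        item_names = [self.item_index.iloc[i, 0] for i, _score in recs]
        return self.df_items_lookup[
            self.df_items_lookup['catalog_item_name'].isin(item_names)]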
Example #2
def load_data(k=20):
    ratings_df = pd.read_csv("./ml-latest-small/ratings.csv")
    movies_df = pd.read_csv("./ml-latest-small/movies.csv")

    ratings_df = ratings_df.drop(columns=['timestamp'])
    films_nb = len(set(ratings_df.movieId))

    ratings_df = ratings_df.astype(pd.SparseDtype(np.float32, np.nan)).pivot(
        index='userId', columns='movieId', values='rating')
    iid_map = dict()
    i = 0
    for item in ratings_df:
        iid_map[i] = item
        i += 1

    users_mean = ratings_df.mean(axis=1).values
    R_demeaned = ratings_df.sub(ratings_df.mean(axis=1), axis=0)
    R_demeaned = coo_matrix(R_demeaned.fillna(0).values)
    del ratings_df

    U, sigma, Vt = svds(R_demeaned, k=k)
    sigma = np.diag(sigma)

    return U, sigma, Vt, movies_df, films_nb, iid_map, users_mean
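
A possible usage sketch (not in the source): rebuild the low-rank rating estimates from the SVD factors returned above, add the per-user means back, and map column indices to movieIds through iid_map.

import numpy as np

U, sigma, Vt, movies_df, films_nb, iid_map, users_mean = load_data(k=20)
pred = U @ sigma @ Vt + users_mean.reshape(-1, 1)

# top-5 estimated movieIds for the first user (iid_map maps column index -> movieId)
top_cols = np.argsort(pred[0])[::-1][:5]
print([iid_map[c] for c in top_cols])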
Example #3
def construct_load_shed(scenario_info, grid, infeasibilities=None):
    """Constructs load_shed dataframe from relevant scenario/grid data.

    :param dict scenario_info: info attribute of Scenario object.
    :param powersimdata.input.grid.Grid grid: grid to construct load_shed for.
    :param dict/None infeasibilities: dictionary of
        {interval (int): load shed percentage (int)}, or None.
    :return: (*pandas.DataFrame*) -- data frame of load_shed.
    """
    hours = pd.date_range(start=scenario_info["start_date"],
                          end=scenario_info["end_date"],
                          freq="1H").tolist()
    buses = grid.bus.index
    if infeasibilities is None:
        print("No infeasibilities, constructing DataFrame")
        load_shed_data = coo_matrix((len(hours), len(buses)))
        load_shed = pd.DataFrame.sparse.from_spmatrix(load_shed_data)
    else:
        print("Infeasibilities, constructing DataFrame")
        bus_demand = get_bus_demand(scenario_info, grid)
        load_shed = np.zeros((len(hours), len(buses)))
        # Convert '24H' to 24
        interval = int(scenario_info["interval"][:-1])
        for i, v in infeasibilities.items():
            start = i * interval
            end = (i + 1) * interval
            base_demand = bus_demand.iloc[start:end, :].to_numpy()
            shed_demand = base_demand * (v / 100)
            load_shed[start:end, :] = shed_demand
        load_shed = pd.DataFrame(load_shed, columns=buses, index=hours)
        load_shed = load_shed.astype(pd.SparseDtype("float", 0))
    load_shed.index = hours
    load_shed.index.name = "UTC"
    load_shed.columns = buses

    return load_shed
Example #4
    def run(self):
        weight = []
        ret = []
        for i, adjust_date in enumerate(self.adjust_dates[1:]):
            self.account.trail(adjust_date)
            new_weight = self.get_new_weight()
            self.account.rebalance(adjust_date, new_weight)

            if i == 0:
                ret.append(self.account._ret)
            else:
                ret.append(self.account._ret.iloc[1:])
            weight.append(self.account._weight.iloc[:-1])

        self.account.trail(datetime.strptime(end_date, '%Y-%m-%d'))
        ret.append(self.account._ret.iloc[1:])
        weight.append(self.account._weight)

        ret = pd.concat(ret, sort=True)
        self.ret = ret
        nv = (ret + 1).cumprod()
        nv.iloc[0] = 1  # initial net value is 1
        self.net_value = nv
        self._weight = pd.concat(weight, sort=True).astype(pd.SparseDtype())
Example #5
    def df(self):
        """
        :class:`pandas.SparseDataFrame` :
            DataFrame representation of the contact matrix

            Rows/columns correspond to indices and the values correspond to
            the count
        """
        mtx = self.sparse_matrix
        index = list(range(self.max_size))
        columns = list(range(self.max_size))

        if _PD_VERSION < (0, 25):  # py27 only  -no-cov-
            mtx = mtx.tocoo()
            return pd.SparseDataFrame(mtx, index=index, columns=columns)

        df = pd.DataFrame.sparse.from_spmatrix(mtx, index=index,
                                               columns=columns)
        # note: I think we can always use float here for dtype; but in
        # principle maybe we need to inspect and get the internal type?
        # Problem is, pandas technically stores a different dtype for each
        # column.
        df = df.astype(pd.SparseDtype("float", np.nan))
        return df
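
The comment above worries about per-column dtypes; a small illustration (with made-up data) of inspecting each column's SparseDtype before unifying the subtype, as the method does:

import numpy as np
import pandas as pd
from scipy.sparse import coo_matrix

df = pd.DataFrame.sparse.from_spmatrix(coo_matrix(np.eye(3)))
print(df.dtypes)                                   # one SparseDtype per column
print({col: t.subtype for col, t in df.dtypes.items()})
df = df.astype(pd.SparseDtype("float", np.nan))    # unify to a single subtype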
Example #6
    def test_dataframe_dummies_prefix_dict(self, sparse):
        prefixes = {"A": "from_A", "B": "from_B"}
        df = DataFrame({
            "C": [1, 2, 3],
            "A": ["a", "b", "a"],
            "B": ["b", "b", "c"]
        })
        result = get_dummies(df, prefix=prefixes, sparse=sparse)

        expected = DataFrame({
            "C": [1, 2, 3],
            "from_A_a": [1, 0, 1],
            "from_A_b": [0, 1, 0],
            "from_B_b": [1, 1, 0],
            "from_B_c": [0, 0, 1],
        })

        columns = ["from_A_a", "from_A_b", "from_B_b", "from_B_c"]
        expected[columns] = expected[columns].astype(np.uint8)
        if sparse:
            expected[columns] = expected[columns].astype(
                pd.SparseDtype("uint8", 0))

        tm.assert_frame_equal(result, expected)
Example #7
def mask_tissue(image: np.ndarray, counts: pd.DataFrame,
                label: np.ndarray) -> Tuple[pd.DataFrame, np.ndarray]:
    r"""
    Detects the tissue in `image`. The area outside of the tissue is given a
    new label with zero counts everywhere.
    """
    mask = compute_tissue_mask(image)

    counts.index += 1
    label[label != 0] += 1

    in_mask = np.unique(label[mask & (label != 0)])
    label[~mask.astype(bool) & ~np.isin(label, in_mask)] = 1

    counts = pd.concat([
        pd.DataFrame(
            [np.repeat(0, counts.shape[1])],
            columns=counts.columns,
            index=[1],
        ).astype(pd.SparseDtype("float", 0)),
        counts,
    ])

    return counts, label
Example #8
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error  # mean squared error
from sklearn.metrics import mean_absolute_error  # mean absolute error
from sklearn.metrics import r2_score  # R squared
from sklearn.metrics import roc_auc_score
from sklearn.externals import joblib


IDIR = 'G:\\bigdata\\badou\\00-data//'
df_train = pd.read_csv(IDIR + 'train_feat.csv').fillna(0.).astype(pd.SparseDtype("float", np.nan))
labels = np.load(IDIR + 'labels.npy')
X_train, X_test, y_train, y_test = train_test_split(df_train, labels, test_size=0.2, random_state=2020)
print('load_model')
rfr=joblib.load('randomForestRegressor.m')
print('W:',rfr.feature_importances_)
y_pred = rfr.predict(X_test)
# y_pred_train = rfr.predict(X_train)
# 0.8272266140627522
print('auc_test0:',roc_auc_score(y_test,y_pred))
# print('auc_train0:',roc_auc_score(y_train,y_pred_train))
print('train again...')
# retrain the model
rfr.fit(X_train, y_train)
print('W2:',rfr.feature_importances_)
y_pred = rfr.predict(X_test)
print('auc_test1:',roc_auc_score(y_test,y_pred))
Example #9
def dataframe_to_sparse(x, fill_value=0.0):
    return x.astype(pd.SparseDtype(float, fill_value=fill_value))
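
A short usage sketch for the helper above (data is illustrative):

import pandas as pd

dense = pd.DataFrame({"a": [0.0, 1.0, 0.0], "b": [0.0, 0.0, 2.0]})
sparse = dataframe_to_sparse(dense)
print(sparse.dtypes)            # Sparse[float64, 0.0] for each column
print(sparse.sparse.density)    # fraction of explicitly stored values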
Example #10
 def test_take_all_empty(self):
     a = pd.array([0, 0], dtype=pd.SparseDtype("int64"))
     result = a.take([0, 1], allow_fill=True, fill_value=np.nan)
     tm.assert_sp_array_equal(a, result)
Example #11
 def __post_init__(self):
     object.__setattr__(
         self,
         "type",
         pd.SparseDtype(dtype=self.dtype, fill_value=self.fill_value),
     )
Example #12
    pandas_engine.DateTime(unit="ns", tz="CET"):
    "datetime64[ns, CET]",  # type: ignore
}

timedelta_dtypes = {
    datetime.timedelta: "timedelta64",
    np.timedelta64: "timedelta64",
    pd.Timedelta: "timedelta64",
    pa.Timedelta: "timedelta64",
}

period_dtypes = {pd.PeriodDtype(freq="D"): "period[D]"}
# Series.astype does not accept a string alias for SparseDtype.
sparse_dtypes = {
    pd.SparseDtype: pd.SparseDtype(),
    pd.SparseDtype(np.float64): pd.SparseDtype(np.float64),
}
interval_dtypes = {pd.IntervalDtype(subtype=np.int64): "interval[int64]"}

dtype_fixtures: List[Tuple[Dict, List]] = [
    (int_dtypes, [-1]),
    (nullable_int_dtypes, [-1, None]),
    (uint_dtypes, [1]),
    (nullable_uint_dtypes, [1, None]),
    (float_dtypes, [1.0]),
    (complex_dtypes, [complex(1)]),
    (boolean_dtypes, [True, False]),
    (nullable_boolean_dtypes, [True, None]),
    (string_dtypes, ["A", "B"]),
    (object_dtypes, ["A", "B"]),
Example #13
def test_pandas_sparse_iloc():
    X = pd.DataFrame([[0, 1, 1], [0, 0, 1],
                      [0, 0, 0]]).astype(pd.SparseDtype(float, fill_value=0.0))
    assert np.all(~np.isnan(X.iloc[[0, 1]].to_numpy()))
Example #14
 def test_astype_str(self, data):
     result = pd.Series(data[:5]).astype(str)
     expected_dtype = pd.SparseDtype(str, str(data.fill_value))
     expected = pd.Series([str(x) for x in data[:5]], dtype=expected_dtype)
     self.assert_series_equal(result, expected)
Example #15
from pathlib import Path
from pandas.api.types import union_categoricals

pathdata = Path() / "data"
csvpath = pathdata / "airlinetrain1m.csv"
df = pd.read_csv(csvpath.resolve())

label = df["dep_delayed_15min"].map({"N": 0, "Y": 1})

covariates = ["DepTime", "Distance"]
factors = [
    "Month", "DayofMonth", "DayOfWeek", "UniqueCarrier", "Origin", "Dest"
]

sp_covariates = list(
    map(lambda col: df[col].astype(pd.SparseDtype("float32", 0.0)),
        covariates))
sp_factors = list(
    map(
        lambda col: pd.get_dummies(
            df[col], prefix=col, sparse=True, dtype=np.float32), factors))

data = pd.concat(sp_factors + sp_covariates, axis=1)
spdata = coo_matrix(data.sparse.to_coo()).tocsr()

# featmappath = pathdata / "featmap.txt"
# with open(featmappath.resolve(), "w") as f:
#     lines = ["{fid} {fname} {ftype}\n".format(fid=i, fname=col, ftype="int" if col.startswith(("deptime", "distance")) else "i") for (i, col) in enumerate(data.columns)]
#     f.writelines(lines)

eta = 0.1
Example #16
File: utility.py, Project: roromaniac/xfuse
def write_data(
    counts: pd.DataFrame,
    image: np.ndarray,
    label: np.ndarray,
    annotation: Dict[str, np.ndarray],
    type_label: str,
    path: str = "data.h5",
) -> None:
    r"""Writes data to the format used by XFuse."""
    if image.shape[:2] != label.shape[:2]:
        raise RuntimeError(
            f"Image shape ({image.shape[:2]}) is not equal to"
            f" the shape of the label image ({label.shape[:2]}).")

    if np.max(image.shape[:2]) > 5000:
        log(
            WARNING,
            "The image resolution is very large! 😱"
            " XFuse typically works best on medium resolution images"
            " (approximately 1000x1000 px)."
            " If you experience performance issues, please consider reducing"
            " the resolution.",
        )

    if counts.columns.duplicated().any():
        log(
            WARNING,
            "Count matrix contains duplicated columns."
            " Counts will be summed by column name.",
        )
        counts = counts.sum(axis=1, level=0)

    log(DEBUG, "writing data to %s", path)
    os.makedirs(os.path.normpath(os.path.dirname(path)), exist_ok=True)
    with h5py.File(path, "w") as data_file:
        data = (counts.astype(pd.SparseDtype("float",
                                             0.0)).sparse.to_coo().tocsr())
        data_file.create_dataset("counts/data", data.data.shape, float,
                                 data.data.astype(float))
        data_file.create_dataset(
            "counts/indices",
            data.indices.shape,
            data.indices.dtype,
            data.indices,
        )
        data_file.create_dataset("counts/indptr", data.indptr.shape,
                                 data.indptr.dtype, data.indptr)
        data_file.create_dataset(
            "counts/columns",
            counts.columns.shape,
            h5py.string_dtype(),
            counts.columns.values,
        )
        data_file.create_dataset("counts/index", counts.index.shape, int,
                                 counts.index.astype(int))
        data_file.create_dataset("image", image.shape, np.uint8, image)
        data_file.create_dataset("label", label.shape, np.int16, label)
        data_file.create_group("annotation", track_order=True)
        for k, v in annotation.items():
            data_file.create_dataset(f"annotation/{k}", v.shape, np.uint16, v)
        data_file.create_dataset("type",
                                 data=type_label,
                                 dtype=h5py.string_dtype())
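
A hedged read-back sketch (assumed, not part of XFuse) showing how the datasets written above could be reassembled into a counts DataFrame:

import h5py
import pandas as pd
from scipy.sparse import csr_matrix

def read_counts(path: str = "data.h5") -> pd.DataFrame:
    with h5py.File(path, "r") as data_file:
        columns = [c.decode() if isinstance(c, bytes) else c
                   for c in data_file["counts/columns"][()]]
        index = data_file["counts/index"][()]
        matrix = csr_matrix(
            (data_file["counts/data"][()],
             data_file["counts/indices"][()],
             data_file["counts/indptr"][()]),
            shape=(len(index), len(columns)),
        )
    return pd.DataFrame.sparse.from_spmatrix(matrix, index=index, columns=columns)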
Example #17
 def _convert_to_dense(cls, series):
     if isinstance(series.dtype, pd.SparseDtype):
         return series.astype(pd.SparseDtype(series.dtype.subtype,
                                             np.nan)).sparse.to_dense()
     return series
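
A rough usage sketch; SomeClass stands in for whatever class defines the classmethod above:

import numpy as np
import pandas as pd

s = pd.Series([1.0, np.nan, 2.0], dtype=pd.SparseDtype("float", np.nan))
dense = SomeClass._convert_to_dense(s)   # back to a plain float64 Series
print(dense.dtype)                       # float64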
Example #18
    def _sparse_reindex(cls, inp, index=None, columns=None):
        if inp.ndim == 2:
            columns = inp.columns if columns is None else columns
            index_shape = len(index) if index is not None else len(inp)
            i_to_columns = dict()

            for i, col in enumerate(columns):
                if col in inp.dtypes:
                    if index is None:
                        i_to_columns[i] = inp[col]
                    else:
                        indexer = inp.index.reindex(index)[1]
                        cond = indexer >= 0
                        available_indexer = indexer[cond]
                        del indexer
                        data = inp[col].iloc[available_indexer].to_numpy()
                        ind = cond.nonzero()[0]
                        spmatrix = sps.csc_matrix(
                            (data, (ind, np.zeros_like(ind))),
                            shape=(index_shape, 1),
                            dtype=inp[col].dtype)
                        sparse_array = pd.arrays.SparseArray.from_spmatrix(
                            spmatrix)
                        # convert to SparseDtype(xxx, np.nan)
                        # to ensure 0 in sparse_array not converted to np.nan
                        sparse_array = pd.arrays.SparseArray(
                            sparse_array.sp_values,
                            sparse_index=sparse_array.sp_index,
                            fill_value=np.nan,
                            dtype=pd.SparseDtype(sparse_array.dtype, np.nan))
                        series = pd.Series(sparse_array, index=index)

                        i_to_columns[i] = series
                else:
                    ind = index if index is not None else inp.index
                    i_to_columns[i] = pd.DataFrame.sparse.from_spmatrix(
                        sps.coo_matrix((index_shape, 1), dtype=np.float64),
                        index=ind).iloc[:, 0]

            df = pd.DataFrame(i_to_columns)
            df.columns = columns
            return df
        else:
            indexer = inp.index.reindex(index)[1]
            cond = indexer >= 0
            available_indexer = indexer[cond]
            del indexer
            data = inp.iloc[available_indexer].to_numpy()
            ind = cond.nonzero()[0]
            spmatrix = sps.csc_matrix((data, (ind, np.zeros_like(ind))),
                                      shape=(len(index), 1),
                                      dtype=inp.dtype)
            sparse_array = pd.arrays.SparseArray.from_spmatrix(spmatrix)
            # convert to SparseDtype(xxx, np.nan)
            # to ensure 0 in sparse_array not converted to np.nan
            sparse_array = pd.arrays.SparseArray(
                sparse_array.sp_values,
                sparse_index=sparse_array.sp_index,
                fill_value=np.nan,
                dtype=pd.SparseDtype(sparse_array.dtype, np.nan))
            series = pd.Series(sparse_array, index=index, name=inp.name)
            return series
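
The "fill_value=np.nan" comments above rely on the distinction between explicitly stored values and fill positions in a SparseArray; a minimal standalone illustration (values are made up):

import numpy as np
import pandas as pd

arr = pd.arrays.SparseArray([0.0, 2.0], fill_value=np.nan)
print(arr.sp_values)    # [0. 2.]  -- both values are stored explicitly
print(arr.to_dense())   # [0. 2.]  -- the explicit zero is not turned into NaN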
Example #19
def read_csv(filename):
    return (pd.read_csv(filename, index_col=["store_id",
                                             "item_id"]).fillna(0).astype(
                                                 pd.SparseDtype("float32", 0)))
Example #20
class TestDataFrameAppend:
    def test_append_empty_list(self):
        # GH 28769
        df = DataFrame()
        result = df.append([])
        expected = df
        tm.assert_frame_equal(result, expected)
        assert result is not df

        df = DataFrame(np.random.randn(5, 4),
                       columns=["foo", "bar", "baz", "qux"])
        result = df.append([])
        expected = df
        tm.assert_frame_equal(result, expected)
        assert result is not df  # .append() should return a new object

    def test_append_series_dict(self):
        df = DataFrame(np.random.randn(5, 4),
                       columns=["foo", "bar", "baz", "qux"])

        series = df.loc[4]
        msg = "Indexes have overlapping values"
        with pytest.raises(ValueError, match=msg):
            df.append(series, verify_integrity=True)

        series.name = None
        msg = "Can only append a Series if ignore_index=True"
        with pytest.raises(TypeError, match=msg):
            df.append(series, verify_integrity=True)

        result = df.append(series[::-1], ignore_index=True)
        expected = df.append(DataFrame({
            0: series[::-1]
        }, index=df.columns).T,
                             ignore_index=True)
        tm.assert_frame_equal(result, expected)

        # dict
        result = df.append(series.to_dict(), ignore_index=True)
        tm.assert_frame_equal(result, expected)

        result = df.append(series[::-1][:3], ignore_index=True)
        expected = df.append(DataFrame({
            0: series[::-1][:3]
        }).T,
                             ignore_index=True,
                             sort=True)
        tm.assert_frame_equal(result, expected.loc[:, result.columns])

        # can append when name set
        row = df.loc[4]
        row.name = 5
        result = df.append(row)
        expected = df.append(df[-1:], ignore_index=True)
        tm.assert_frame_equal(result, expected)

    def test_append_list_of_series_dicts(self):
        df = DataFrame(np.random.randn(5, 4),
                       columns=["foo", "bar", "baz", "qux"])

        dicts = [x.to_dict() for idx, x in df.iterrows()]

        result = df.append(dicts, ignore_index=True)
        expected = df.append(df, ignore_index=True)
        tm.assert_frame_equal(result, expected)

        # different columns
        dicts = [
            {
                "foo": 1,
                "bar": 2,
                "baz": 3,
                "peekaboo": 4
            },
            {
                "foo": 5,
                "bar": 6,
                "baz": 7,
                "peekaboo": 8
            },
        ]
        result = df.append(dicts, ignore_index=True, sort=True)
        expected = df.append(DataFrame(dicts), ignore_index=True, sort=True)
        tm.assert_frame_equal(result, expected)

    def test_append_missing_cols(self):
        # GH22252
        # exercise the conditional branch in append method where the data
        # to be appended is a list and does not contain all columns that are in
        # the target DataFrame
        df = DataFrame(np.random.randn(5, 4),
                       columns=["foo", "bar", "baz", "qux"])

        dicts = [{"foo": 9}, {"bar": 10}]
        with tm.assert_produces_warning(None):
            result = df.append(dicts, ignore_index=True, sort=True)

        expected = df.append(DataFrame(dicts), ignore_index=True, sort=True)
        tm.assert_frame_equal(result, expected)

    def test_append_empty_dataframe(self):

        # Empty df append empty df
        df1 = DataFrame()
        df2 = DataFrame()
        result = df1.append(df2)
        expected = df1.copy()
        tm.assert_frame_equal(result, expected)

        # Non-empty df append empty df
        df1 = DataFrame(np.random.randn(5, 2))
        df2 = DataFrame()
        result = df1.append(df2)
        expected = df1.copy()
        tm.assert_frame_equal(result, expected)

        # Empty df with columns append empty df
        df1 = DataFrame(columns=["bar", "foo"])
        df2 = DataFrame()
        result = df1.append(df2)
        expected = df1.copy()
        tm.assert_frame_equal(result, expected)

        # Non-Empty df with columns append empty df
        df1 = DataFrame(np.random.randn(5, 2), columns=["bar", "foo"])
        df2 = DataFrame()
        result = df1.append(df2)
        expected = df1.copy()
        tm.assert_frame_equal(result, expected)

    def test_append_dtypes(self):

        # GH 5754
        # row appends of different dtypes (so need to do by-item)
        # can sometimes infer the correct type

        df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(5))
        df2 = DataFrame()
        result = df1.append(df2)
        expected = df1.copy()
        tm.assert_frame_equal(result, expected)

        df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(1))
        df2 = DataFrame({"bar": "foo"}, index=range(1, 2))
        result = df1.append(df2)
        expected = DataFrame({"bar": [Timestamp("20130101"), "foo"]})
        tm.assert_frame_equal(result, expected)

        df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(1))
        df2 = DataFrame({"bar": np.nan}, index=range(1, 2))
        result = df1.append(df2)
        expected = DataFrame(
            {"bar": Series([Timestamp("20130101"), np.nan], dtype="M8[ns]")})
        tm.assert_frame_equal(result, expected)

        df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(1))
        df2 = DataFrame({"bar": np.nan}, index=range(1, 2), dtype=object)
        result = df1.append(df2)
        expected = DataFrame(
            {"bar": Series([Timestamp("20130101"), np.nan], dtype="M8[ns]")})
        tm.assert_frame_equal(result, expected)

        df1 = DataFrame({"bar": np.nan}, index=range(1))
        df2 = DataFrame({"bar": Timestamp("20130101")}, index=range(1, 2))
        result = df1.append(df2)
        expected = DataFrame(
            {"bar": Series([np.nan, Timestamp("20130101")], dtype="M8[ns]")})
        tm.assert_frame_equal(result, expected)

        df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(1))
        df2 = DataFrame({"bar": 1}, index=range(1, 2), dtype=object)
        result = df1.append(df2)
        expected = DataFrame({"bar": Series([Timestamp("20130101"), 1])})
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize(
        "timestamp", ["2019-07-19 07:04:57+0100", "2019-07-19 07:04:57"])
    def test_append_timestamps_aware_or_naive(self, tz_naive_fixture,
                                              timestamp):
        # GH 30238
        tz = tz_naive_fixture
        df = pd.DataFrame([pd.Timestamp(timestamp, tz=tz)])
        result = df.append(df.iloc[0]).iloc[-1]
        expected = pd.Series(pd.Timestamp(timestamp, tz=tz), name=0)
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize(
        "data, dtype",
        [
            ([1], pd.Int64Dtype()),
            ([1], pd.CategoricalDtype()),
            ([pd.Interval(left=0, right=5)], pd.IntervalDtype()),
            ([pd.Period("2000-03", freq="M")], pd.PeriodDtype("M")),
            ([1], pd.SparseDtype()),
        ],
    )
    def test_other_dtypes(self, data, dtype):
        df = pd.DataFrame(data, dtype=dtype)
        result = df.append(df.iloc[0]).iloc[-1]
        expected = pd.Series(data, name=0, dtype=dtype)
        tm.assert_series_equal(result, expected)
Example #21
def main():
    # ---------------- Set MKL Environment Variables for better Gensim LDA performance ----------------
    os.environ["MKL_NUM_THREADS"] = "1"
    os.environ["NUMEXPR_NUM_THREADS"] = "1"
    os.environ["OMP_NUM_THREADS"] = "1"
    # ---------------- Prepare LDA Inputs & Run LDA ----------------
    # Parse command line args
    save_name = str(sys.argv[1])
    cap = str(sys.argv[2])
    num_topics = int(sys.argv[3])
    # Load data
    bow_matrix_train_path = save_name + "_train_sparse.npz"
    bow_matrix_test_path = save_name + "_test_sparse.npz"

    if False: #os.path.exists(bow_matrix_train_path):
        #X_train = scipy.sparse.load_npz(bow_matrix_train_path)
        #X_test = scipy.sparse.load_npz(bow_matrix_test_path)
        pass
    else: 
        data = load_pickle("FINRA_TRACE_2014.pkl.zip")
        #data = data.append(load_pickle("FINRA_TRACE_2014.pkl.zip"),ignore_index=True)
        #data = data.append(load_pickle("FINRA_TRACE_2013.pkl.zip"),ignore_index=True)
        #data = data.append(load_pickle("FINRA_TRACE_2012.pkl.zip"),ignore_index=True)
        # Compute a version of bag_of_words given the save_name
        if save_name=="trade_frac_out":
            bag_of_words = trade_frac_out(data)
            del data
            save_name = save_name
        elif save_name=="trade_vol_BoW":
            bag_of_words = trade_vol_BoW(data,cap)
            del data
            save_name = save_name + "_" + cap
        elif save_name=="trade_vol_BoW_norm":
            bag_of_words = trade_vol_BoW_norm(data,cap)
            del data
            save_name = save_name + "_" + cap
        elif save_name=="trade_count":
            bag_of_words = compute_count(data)
            del data
        else:
            raise Exception("the save_name does not have a corresponding bag_of_words")
            
        dtype = pd.SparseDtype(float, fill_value=0)
        X = scipy.sparse.csr_matrix(bag_of_words.astype(dtype).sparse.to_coo()) 
        #X = bag_of_words.astype(dtype)
        #cutoff = int(X.shape[0]*0.9)
        #X_train = X[:cutoff]
        #X_test = X[cutoff:]
        X_train, X_test, train_idx, test_idx = train_test_split(X, np.arange(X.shape[0]), test_size=0.1, random_state=42)
        scipy.sparse.save_npz(save_name + "_train_sparse.npz", X_train) 
        scipy.sparse.save_npz(save_name + "_test_sparse.npz", X_test)
        # slice our matrix to be just the training data
        #bag_of_words = bag_of_words.iloc[train_idx]
        train_index = bag_of_words.index[train_idx]

    # Compute input for gensim LDA
    corpus = compute_corpus(X_train,save_name)
    test_corpus = compute_corpus(X_test,save_name + "_test") 
    id2word = compute_id2word(bag_of_words,save_name)
    # Run Gensim LDA
    start = time.time()
    lda = compute_topic(save_name,corpus,num_topics,id2word,workers=11,chunksize=12500,passes=10,iterations=600)
    
    lda_time = time.time()-start
    train_perplex = lda.log_perplexity(corpus)
    test_perplex = lda.log_perplexity(test_corpus)
    
    print("perplexity scores: ", train_perplex, test_perplex)
    with open("perplex_scores.csv","a+") as f:
        writer = csv.writer(f)
        writer.writerow([save_name,num_topics,train_perplex,test_perplex,lda_time])


    # ---------------- LDA Analysis  ----------------
    #os.environ["MKL_NUM_THREADS"] = "4"
    #os.environ["NUMEXPR_NUM_THREADS"] = "4"
    #os.environ["OMP_NUM_THREADS"] = "4"
    # Run PyLDAvis
    dictionary = Dictionary.from_corpus(corpus,id2word=id2word)
    save_pyldavis2html(lda, corpus, dictionary,save_name,num_topics)
    # Save document X topic matrix to csv
    document_topic_distribution(corpus,train_index,lda,save_name,num_topics)
Example #22
def get_sparse_series():
    test_series = [
        # Numpy dtypes
        pd.Series([-1, 0, 1, 2, 3],
                  name="int_sparse",
                  dtype=pd.SparseDtype(np.int32, 0)),
        pd.Series(
            [np.nan, 0, 1, 2, 3],
            name="float_sparse",
            dtype=pd.SparseDtype(np.float64, np.nan),
        ),
        pd.Series(
            [
                np.nan,
                complex(0, 1),
                complex(1, -1),
                complex(2, 4),
                complex(3, -12)
            ],
            name="complex_sparse",
            dtype=pd.SparseDtype(np.complex128, np.nan),
        ),
        pd.Series(
            [True, False, False],
            name="bool_sparse",
            dtype=pd.SparseDtype(np.bool, False),
        ),
        pd.Series(
            pd.arrays.SparseArray([None, None, "gold", "black", "silver"]),
            name="str_obj_sparse",
        ),
        # Pending https://github.com/pandas-dev/pandas/issues/35762
        # pd.Series([NoneT, 0, 1, 2, 3, 4], name="datetime_sparse", dtype=pd.SparseDtype(np.datetime64)),
        # Pandas dtypes
        pd.Series(
            [0, 1, 2, 3, None],
            name="pd_int64_sparse",
            dtype=pd.SparseDtype(pd.Int64Dtype()),
        ),
        # Pending https://github.com/pandas-dev/pandas/issues/35793
        # pd.Series(
        #     ["a", "b", "c", None],
        #     name="pd_categorical_sparse",
        #     dtype=pd.SparseDtype(pd.CategoricalDtype(['a', 'b', 'c', 'd']))
        # )
    ]

    if int(pd.__version__.split(".")[0]) >= 1:
        pandas_1_series = [
            pd.Series(
                ["Patty", "Valentine", "Upper", "", "", ""],
                name="pd_string_sparse",
                dtype=pd.SparseDtype(pd.StringDtype(), ""),
            ),
            pd.Series(
                [True, False, False, None],
                name="pd_bool_sparse",
                dtype=pd.SparseDtype(pd.BooleanDtype(), None),
            ),
        ]
        test_series.extend(pandas_1_series)

    return test_series
Example #23
def _deserialize_sparse_dtype(obj):
    dtype, fill_value = obj
    return pd.SparseDtype(dtype=dtype, fill_value=fill_value)
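
A matching serializer sketch (hypothetical; the source only shows the deserializer): the inverse direction simply unpacks the SparseDtype's subtype and fill value.

def _serialize_sparse_dtype(dtype):
    # assumed counterpart of _deserialize_sparse_dtype above
    return (dtype.subtype, dtype.fill_value)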
Example #24
 def test_all_sparse(self):
     df = pd.DataFrame({"A": pd.array([0, 0], dtype=pd.SparseDtype("int64"))})
     result = df.loc[[0, 1]]
     tm.assert_frame_equal(result, df)
Example #25
    def test_concat_empty_series_dtypes(self):

        # booleans
        assert (pd.concat([Series(dtype=np.bool_),
                           Series(dtype=np.int32)]).dtype == np.int32)
        assert (pd.concat([Series(dtype=np.bool_),
                           Series(dtype=np.float32)]).dtype == np.object_)

        # datetime-like
        assert (pd.concat([Series(dtype="m8[ns]"),
                           Series(dtype=np.bool)]).dtype == np.object_)
        assert (pd.concat([Series(dtype="m8[ns]"),
                           Series(dtype=np.int64)]).dtype == np.object_)
        assert (pd.concat([Series(dtype="M8[ns]"),
                           Series(dtype=np.bool)]).dtype == np.object_)
        assert (pd.concat([Series(dtype="M8[ns]"),
                           Series(dtype=np.int64)]).dtype == np.object_)
        assert (pd.concat([
            Series(dtype="M8[ns]"),
            Series(dtype=np.bool_),
            Series(dtype=np.int64)
        ]).dtype == np.object_)

        # categorical
        assert (pd.concat([Series(dtype="category"),
                           Series(dtype="category")]).dtype == "category")
        # GH 18515
        assert (pd.concat(
            [Series(np.array([]), dtype="category"),
             Series(dtype="float64")]).dtype == "float64")
        assert (pd.concat([Series(dtype="category"),
                           Series(dtype="object")]).dtype == "object")

        # sparse
        # TODO: move?
        result = pd.concat([
            Series(dtype="float64").astype("Sparse"),
            Series(dtype="float64").astype("Sparse"),
        ])
        assert result.dtype == "Sparse[float64]"

        # GH 26705 - Assert .ftype is deprecated
        with tm.assert_produces_warning(FutureWarning):
            assert result.ftype == "float64:sparse"

        result = pd.concat([
            Series(dtype="float64").astype("Sparse"),
            Series(dtype="float64")
        ])
        # TODO: release-note: concat sparse dtype
        expected = pd.SparseDtype(np.float64)
        assert result.dtype == expected

        # GH 26705 - Assert .ftype is deprecated
        with tm.assert_produces_warning(FutureWarning):
            assert result.ftype == "float64:sparse"

        result = pd.concat(
            [Series(dtype="float64").astype("Sparse"),
             Series(dtype="object")])
        # TODO: release-note: concat sparse dtype
        expected = pd.SparseDtype("object")
        assert result.dtype == expected

        # GH 26705 - Assert .ftype is deprecated
        with tm.assert_produces_warning(FutureWarning):
            assert result.ftype == "object:sparse"
Example #26
def load_Seqtable_from_count_files(count_files,
                                   file_list=None,
                                   pattern_filter=None,
                                   black_list=None,
                                   name_template=None,
                                   sort_by=None,
                                   x_values=None,
                                   x_unit=None,
                                   input_sample_name=None,
                                   sample_metadata=None,
                                   note=None,
                                   dry_run=False):

    if isinstance(count_files, list):
        samples = {}
        for cf in count_files:
            samples = {**samples, **_load_single_source(**cf)}
    else:
        samples = _load_single_source(count_files=count_files,
                                      file_list=file_list,
                                      pattern_filter=pattern_filter,
                                      black_list=black_list,
                                      name_template=name_template)

    # add extra metadata
    if sample_metadata is not None:
        for file_name, f_meta in sample_metadata.items():
            samples[file_name].update(f_meta)

    # sort file order if applicable
    sample_names = list(samples.keys())
    if sort_by is not None:
        if isinstance(sort_by, str):

            def sort_fn(sample_name):
                return samples[sample_name].get(sort_by, np.nan)
        elif callable(sort_by):
            sort_fn = sort_by
        else:
            logging.error('Unknown sort_by format', error_type=TypeError)
        sample_names = sorted(sample_names, key=sort_fn)

    if dry_run:
        # return a list of samples without importing
        return pd.DataFrame(samples)[sample_names].transpose()

    data_mtx = {
        sample: read_count_file(file_path=samples[sample]['file_path'],
                                as_dict=True)[2]
        for sample in sample_names
    }
    data_mtx = pd.DataFrame.from_dict(data_mtx).fillna(
        0, inplace=False).astype(pd.SparseDtype(dtype='int'))
    if input_sample_name is not None:
        grouper = {
            'input':
            [name for name in sample_names if name in input_sample_name],
            'reacted':
            [name for name in sample_names if name not in input_sample_name]
        }
    else:
        grouper = None

    if x_values is not None and isinstance(x_values, str):
        x_values = {
            sample: sample_metadata.pop(x_values, None)
            for sample, sample_metadata in samples.items()
        }

    from .seq_data import SeqData

    return SeqData(data_mtx,
                   data_unit='count',
                   grouper=grouper,
                   sample_metadata={'info': samples},
                   x_values=x_values,
                   x_unit=x_unit,
                   note=note)
Example #27
def test_dataframe_with_sparse_array_int_columns(vineyard_client):
    df = pd.DataFrame(np.random.randn(100, 4), columns=[1, 2, 3, 4])
    df.iloc[:98] = np.nan
    sdf = df.astype(pd.SparseDtype("float", np.nan))
    object_id = vineyard_client.put(sdf)
    pd.testing.assert_frame_equal(df, vineyard_client.get(object_id))
Example #28
    def test_uses_first_kind(self, kind):
        other = "integer" if kind == "block" else "block"
        a = SparseArray([1, 0, 0, 2], kind=kind)
        b = SparseArray([1, 0, 2, 2], kind=other)

        result = SparseArray._concat_same_type([a, b])
        expected = np.array([1, 2, 1, 2, 2], dtype="int64")
        tm.assert_numpy_array_equal(result.sp_values, expected)
        assert result.kind == kind


@pytest.mark.parametrize(
    "other, expected_dtype",
    [
        # compatible dtype -> preserve sparse
        (pd.Series([3, 4, 5], dtype="int64"), pd.SparseDtype("int64", 0)),
        # (pd.Series([3, 4, 5], dtype="Int64"), pd.SparseDtype("int64", 0)),
        # incompatible dtype -> Sparse[common dtype]
        (pd.Series([1.5, 2.5, 3.5],
                   dtype="float64"), pd.SparseDtype("float64", 0)),
        # incompatible dtype -> Sparse[object] dtype
        (pd.Series(["a", "b", "c"], dtype=object), pd.SparseDtype(object, 0)),
        # categorical with compatible categories -> dtype of the categories
        (pd.Series([3, 4, 5], dtype="category"), np.dtype("int64")),
        (pd.Series([1.5, 2.5, 3.5], dtype="category"), np.dtype("float64")),
        # categorical with incompatible categories -> object dtype
        (pd.Series(["a", "b", "c"], dtype="category"), np.dtype(object)),
    ],
)
def test_concat_with_non_sparse(other, expected_dtype):
    # https://github.com/pandas-dev/pandas/issues/34336
Example #29
class TestDataFrameAppend:
    @pytest.mark.filterwarnings(
        "ignore:.*append method is deprecated.*:FutureWarning")
    def test_append_multiindex(self, multiindex_dataframe_random_data,
                               frame_or_series):
        obj = multiindex_dataframe_random_data
        obj = tm.get_obj(obj, frame_or_series)

        a = obj[:5]
        b = obj[5:]

        result = a.append(b)
        tm.assert_equal(result, obj)

    def test_append_empty_list(self):
        # GH 28769
        df = DataFrame()
        result = df._append([])
        expected = df
        tm.assert_frame_equal(result, expected)
        assert result is not df

        df = DataFrame(np.random.randn(5, 4),
                       columns=["foo", "bar", "baz", "qux"])
        result = df._append([])
        expected = df
        tm.assert_frame_equal(result, expected)
        assert result is not df  # ._append() should return a new object

    def test_append_series_dict(self):
        df = DataFrame(np.random.randn(5, 4),
                       columns=["foo", "bar", "baz", "qux"])

        series = df.loc[4]
        msg = "Indexes have overlapping values"
        with pytest.raises(ValueError, match=msg):
            df._append(series, verify_integrity=True)

        series.name = None
        msg = "Can only append a Series if ignore_index=True"
        with pytest.raises(TypeError, match=msg):
            df._append(series, verify_integrity=True)

        result = df._append(series[::-1], ignore_index=True)
        expected = df._append(DataFrame({
            0: series[::-1]
        }, index=df.columns).T,
                              ignore_index=True)
        tm.assert_frame_equal(result, expected)

        # dict
        result = df._append(series.to_dict(), ignore_index=True)
        tm.assert_frame_equal(result, expected)

        result = df._append(series[::-1][:3], ignore_index=True)
        expected = df._append(DataFrame({
            0: series[::-1][:3]
        }).T,
                              ignore_index=True,
                              sort=True)
        tm.assert_frame_equal(result, expected.loc[:, result.columns])

        msg = "Can only append a dict if ignore_index=True"
        with pytest.raises(TypeError, match=msg):
            df._append(series.to_dict())

        # can append when name set
        row = df.loc[4]
        row.name = 5
        result = df._append(row)
        expected = df._append(df[-1:], ignore_index=True)
        tm.assert_frame_equal(result, expected)

    def test_append_list_of_series_dicts(self):
        df = DataFrame(np.random.randn(5, 4),
                       columns=["foo", "bar", "baz", "qux"])

        dicts = [x.to_dict() for idx, x in df.iterrows()]

        result = df._append(dicts, ignore_index=True)
        expected = df._append(df, ignore_index=True)
        tm.assert_frame_equal(result, expected)

        # different columns
        dicts = [
            {
                "foo": 1,
                "bar": 2,
                "baz": 3,
                "peekaboo": 4
            },
            {
                "foo": 5,
                "bar": 6,
                "baz": 7,
                "peekaboo": 8
            },
        ]
        result = df._append(dicts, ignore_index=True, sort=True)
        expected = df._append(DataFrame(dicts), ignore_index=True, sort=True)
        tm.assert_frame_equal(result, expected)

    def test_append_list_retain_index_name(self):
        df = DataFrame([[1, 2], [3, 4]],
                       index=pd.Index(["a", "b"], name="keepthisname"))

        serc = Series([5, 6], name="c")

        expected = DataFrame(
            [[1, 2], [3, 4], [5, 6]],
            index=pd.Index(["a", "b", "c"], name="keepthisname"),
        )

        # append series
        result = df._append(serc)
        tm.assert_frame_equal(result, expected)

        # append list of series
        result = df._append([serc])
        tm.assert_frame_equal(result, expected)

    def test_append_missing_cols(self):
        # GH22252
        # exercise the conditional branch in append method where the data
        # to be appended is a list and does not contain all columns that are in
        # the target DataFrame
        df = DataFrame(np.random.randn(5, 4),
                       columns=["foo", "bar", "baz", "qux"])

        dicts = [{"foo": 9}, {"bar": 10}]
        result = df._append(dicts, ignore_index=True, sort=True)

        expected = df._append(DataFrame(dicts), ignore_index=True, sort=True)
        tm.assert_frame_equal(result, expected)

    def test_append_empty_dataframe(self):

        # Empty df append empty df
        df1 = DataFrame()
        df2 = DataFrame()
        result = df1._append(df2)
        expected = df1.copy()
        tm.assert_frame_equal(result, expected)

        # Non-empty df append empty df
        df1 = DataFrame(np.random.randn(5, 2))
        df2 = DataFrame()
        result = df1._append(df2)
        expected = df1.copy()
        tm.assert_frame_equal(result, expected)

        # Empty df with columns append empty df
        df1 = DataFrame(columns=["bar", "foo"])
        df2 = DataFrame()
        result = df1._append(df2)
        expected = df1.copy()
        tm.assert_frame_equal(result, expected)

        # Non-Empty df with columns append empty df
        df1 = DataFrame(np.random.randn(5, 2), columns=["bar", "foo"])
        df2 = DataFrame()
        result = df1._append(df2)
        expected = df1.copy()
        tm.assert_frame_equal(result, expected)

    def test_append_dtypes(self, using_array_manager):

        # GH 5754
        # row appends of different dtypes (so need to do by-item)
        # can sometimes infer the correct type

        df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(5))
        df2 = DataFrame()
        result = df1._append(df2)
        expected = df1.copy()
        tm.assert_frame_equal(result, expected)

        df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(1))
        df2 = DataFrame({"bar": "foo"}, index=range(1, 2))
        result = df1._append(df2)
        expected = DataFrame({"bar": [Timestamp("20130101"), "foo"]})
        tm.assert_frame_equal(result, expected)

        df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(1))
        df2 = DataFrame({"bar": np.nan}, index=range(1, 2))
        result = df1._append(df2)
        expected = DataFrame(
            {"bar": Series([Timestamp("20130101"), np.nan], dtype="M8[ns]")})
        if using_array_manager:
            # TODO(ArrayManager) decide on exact casting rules in concat
            # With ArrayManager, all-NaN float is not ignored
            expected = expected.astype(object)
        tm.assert_frame_equal(result, expected)

        df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(1))
        df2 = DataFrame({"bar": np.nan}, index=range(1, 2), dtype=object)
        result = df1._append(df2)
        expected = DataFrame(
            {"bar": Series([Timestamp("20130101"), np.nan], dtype="M8[ns]")})
        if using_array_manager:
            # With ArrayManager, all-NaN float is not ignored
            expected = expected.astype(object)
        tm.assert_frame_equal(result, expected)

        df1 = DataFrame({"bar": np.nan}, index=range(1))
        df2 = DataFrame({"bar": Timestamp("20130101")}, index=range(1, 2))
        result = df1._append(df2)
        expected = DataFrame(
            {"bar": Series([np.nan, Timestamp("20130101")], dtype="M8[ns]")})
        if using_array_manager:
            # With ArrayManager, all-NaN float is not ignored
            expected = expected.astype(object)
        tm.assert_frame_equal(result, expected)

        df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(1))
        df2 = DataFrame({"bar": 1}, index=range(1, 2), dtype=object)
        result = df1._append(df2)
        expected = DataFrame({"bar": Series([Timestamp("20130101"), 1])})
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize(
        "timestamp", ["2019-07-19 07:04:57+0100", "2019-07-19 07:04:57"])
    def test_append_timestamps_aware_or_naive(self, tz_naive_fixture,
                                              timestamp):
        # GH 30238
        tz = tz_naive_fixture
        df = DataFrame([Timestamp(timestamp, tz=tz)])
        result = df._append(df.iloc[0]).iloc[-1]
        expected = Series(Timestamp(timestamp, tz=tz), name=0)
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize(
        "data, dtype",
        [
            ([1], pd.Int64Dtype()),
            ([1], pd.CategoricalDtype()),
            ([pd.Interval(left=0, right=5)], pd.IntervalDtype()),
            ([pd.Period("2000-03", freq="M")], pd.PeriodDtype("M")),
            ([1], pd.SparseDtype()),
        ],
    )
    def test_other_dtypes(self, data, dtype, using_array_manager):
        df = DataFrame(data, dtype=dtype)

        warn = None
        if using_array_manager and isinstance(dtype, pd.SparseDtype):
            warn = FutureWarning

        with tm.assert_produces_warning(warn, match="astype from SparseDtype"):
            result = df._append(df.iloc[0]).iloc[-1]

        expected = Series(data, name=0, dtype=dtype)
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize("dtype", ["datetime64[ns]", "timedelta64[ns]"])
    def test_append_numpy_bug_1681(self, dtype):
        # another datetime64 bug
        if dtype == "datetime64[ns]":
            index = date_range("2011/1/1", "2012/1/1", freq="W-FRI")
        else:
            index = timedelta_range("1 days", "10 days", freq="2D")

        df = DataFrame()
        other = DataFrame({"A": "foo", "B": index}, index=index)

        result = df._append(other)
        assert (result["B"] == index).all()

    @pytest.mark.filterwarnings("ignore:The values in the array:RuntimeWarning"
                                )
    def test_multiindex_column_append_multiple(self):
        # GH 29699
        df = DataFrame(
            [[1, 11], [2, 12], [3, 13]],
            columns=pd.MultiIndex.from_tuples([("multi", "col1"),
                                               ("multi", "col2")],
                                              names=["level1", None]),
        )
        df2 = df.copy()
        for i in range(1, 10):
            df[i, "colA"] = 10
            df = df._append(df2, ignore_index=True)
            result = df["multi"]
            expected = DataFrame({
                "col1": [1, 2, 3] * (i + 1),
                "col2": [11, 12, 13] * (i + 1)
            })
            tm.assert_frame_equal(result, expected)

    def test_append_raises_future_warning(self):
        # GH#35407
        df1 = DataFrame([[1, 2], [3, 4]])
        df2 = DataFrame([[5, 6], [7, 8]])
        with tm.assert_produces_warning(FutureWarning):
            df1.append(df2)
Example #30
    def testDataSerialize(self):
        for type_, compress in itertools.product(
                (None,) + tuple(dataserializer.SerialType.__members__.values()),
                (None,) + tuple(dataserializer.CompressType.__members__.values())):
            array = np.random.rand(1000, 100)
            assert_array_equal(array, dataserializer.loads(
                dataserializer.dumps(array, serial_type=type_, compress=compress)))

            array = np.random.rand(1000, 100)
            assert_array_equal(array, dataserializer.load(
                BytesIO(dataserializer.dumps(array, serial_type=type_, compress=compress))))

            array = np.random.rand(1000, 100).T  # test non c-contiguous
            assert_array_equal(array, dataserializer.loads(
                dataserializer.dumps(array, serial_type=type_, compress=compress)))

            array = np.float64(0.2345)
            assert_array_equal(array, dataserializer.loads(
                dataserializer.dumps(array, serial_type=type_, compress=compress)))

        # test non-serializable object
        if pyarrow:
            non_serial = type('non_serial', (object,), dict(nbytes=10))
            with self.assertRaises(SerializationFailed):
                dataserializer.dumps(non_serial())

        # test structured arrays.
        rec_dtype = np.dtype([('a', 'int64'), ('b', 'double'), ('c', '<U8')])
        array = np.ones((100,), dtype=rec_dtype)
        array_loaded = dataserializer.loads(dataserializer.dumps(array))
        self.assertEqual(array.dtype, array_loaded.dtype)
        assert_array_equal(array, array_loaded)

        fn = os.path.join(tempfile.gettempdir(), f'test_dump_file_{id(self)}.bin')
        try:
            array = np.random.rand(1000, 100).T  # test non c-contiguous
            with open(fn, 'wb') as dump_file:
                dataserializer.dump(array, dump_file)
            with open(fn, 'rb') as dump_file:
                assert_array_equal(array, dataserializer.load(dump_file))

            with open(fn, 'wb') as dump_file:
                dataserializer.dump(array, dump_file,
                                    compress=dataserializer.CompressType.LZ4)
            with open(fn, 'rb') as dump_file:
                assert_array_equal(array, dataserializer.load(dump_file))

            with open(fn, 'wb') as dump_file:
                dataserializer.dump(array, dump_file,
                                    compress=dataserializer.CompressType.GZIP)
            with open(fn, 'rb') as dump_file:
                assert_array_equal(array, dataserializer.load(dump_file))
        finally:
            if os.path.exists(fn):
                os.unlink(fn)

        # test sparse
        if sps:
            mat = sparse.SparseMatrix(sps.random(100, 100, 0.1, format='csr'))
            des_mat = dataserializer.loads(dataserializer.dumps(mat))
            self.assertTrue((mat.spmatrix != des_mat.spmatrix).nnz == 0)

            des_mat = dataserializer.loads(dataserializer.dumps(
                mat, compress=dataserializer.CompressType.LZ4))
            self.assertTrue((mat.spmatrix != des_mat.spmatrix).nnz == 0)

            des_mat = dataserializer.loads(dataserializer.dumps(
                mat, compress=dataserializer.CompressType.GZIP))
            self.assertTrue((mat.spmatrix != des_mat.spmatrix).nnz == 0)

            vector = sparse.SparseVector(sps.csr_matrix(np.random.rand(2)), shape=(2,))
            des_vector = dataserializer.loads(dataserializer.dumps(vector))
            self.assertTrue((vector.spmatrix != des_vector.spmatrix).nnz == 0)

            des_vector = dataserializer.loads(dataserializer.dumps(
                vector, compress=dataserializer.CompressType.LZ4))
            self.assertTrue((vector.spmatrix != des_vector.spmatrix).nnz == 0)

            des_vector = dataserializer.loads(dataserializer.dumps(
                vector, compress=dataserializer.CompressType.GZIP))
            self.assertTrue((vector.spmatrix != des_vector.spmatrix).nnz == 0)

        # test groupby
        df1 = pd.DataFrame({'a': [3, 4, 5, 3, 5, 4, 1, 2, 3],
                            'b': [1, 3, 4, 5, 6, 5, 4, 4, 4],
                            'c': list('aabaaddce')})
        grouped = wrapped_groupby(df1, 'b')
        restored = dataserializer.loads(dataserializer.dumps(grouped))
        assert_groupby_equal(grouped, restored.groupby_obj)

        grouped = wrapped_groupby(df1, 'b').c
        restored = dataserializer.loads(dataserializer.dumps(grouped))
        assert_groupby_equal(grouped, restored.groupby_obj)

        grouped = wrapped_groupby(df1, 'b')
        getattr(grouped, 'indices')
        restored = dataserializer.loads(dataserializer.dumps(grouped))
        assert_groupby_equal(grouped, restored.groupby_obj)

        grouped = wrapped_groupby(df1.b, lambda x: x % 2)
        restored = dataserializer.loads(dataserializer.dumps(grouped))
        assert_groupby_equal(grouped, restored.groupby_obj)

        grouped = wrapped_groupby(df1.b, lambda x: x % 2)
        getattr(grouped, 'indices')
        restored = dataserializer.loads(dataserializer.dumps(grouped))
        assert_groupby_equal(grouped, restored.groupby_obj)

        # test categorical
        s = np.random.RandomState(0).random(10)
        cat = pd.cut(s, [0.3, 0.5, 0.8])
        self.assertIsInstance(cat, pd.Categorical)
        des_cat = dataserializer.loads(dataserializer.dumps(cat))
        self.assertEqual(len(cat), len(des_cat))
        for c, dc in zip(cat, des_cat):
            np.testing.assert_equal(c, dc)

        # test IntervalIndex
        s = pd.interval_range(10, 100, 3)
        dest_s = dataserializer.loads((dataserializer.dumps(s)))
        pd.testing.assert_index_equal(s, dest_s)

        # test complex
        s = complex(10 + 5j)
        dest_s = dataserializer.loads((dataserializer.dumps(s)))
        self.assertIs(type(s), type(dest_s))
        self.assertEqual(s, dest_s)

        s = np.complex64(10 + 5j)
        dest_s = dataserializer.loads((dataserializer.dumps(s)))
        self.assertIs(type(s), type(dest_s))
        self.assertEqual(s, dest_s)

        # test pickle
        d = ClassToPickle(dict(a=1, b='uvw'))
        dest_d = dataserializer.loads((dataserializer.dumps(d)))
        self.assertIs(type(d), type(dest_d))
        self.assertEqual(d.a, dest_d.a)

        # test ndarray with negative strides
        arr = np.zeros((5, 6, 3))
        arr2 = arr[:, :, ::-1]
        dest_arr2 = dataserializer.loads(dataserializer.dumps(arr2))
        np.testing.assert_array_equal(arr2, dest_arr2)

        # test ArrowArray
        df = pd.DataFrame({'a': ['s1', 's2', 's3'],
                           'b': [['s1', 's2'], ['s3'], ['s4', 's5']]})
        df['a'] = df['a'].astype(ArrowStringDtype())
        df['b'] = df['b'].astype(ArrowListDtype(str))
        dest_df = dataserializer.loads(dataserializer.dumps(df))
        self.assertIs(type(df), type(dest_df))
        pd.testing.assert_frame_equal(df, dest_df)

        # test DataFrame with SparseDtype
        s = pd.Series([1, 2, np.nan, np.nan, 3]).astype(
            pd.SparseDtype(np.dtype(np.float64), np.nan))
        dest_s = dataserializer.loads((dataserializer.dumps(s)))
        pd.testing.assert_series_equal(s, dest_s)
        df = pd.DataFrame({'s': s})
        dest_df = dataserializer.loads((dataserializer.dumps(df)))
        pd.testing.assert_frame_equal(df, dest_df)