def train(self, df):
    # read csv into pandas dataframe
    df = pd.read_csv(df)
    # df = df.iloc[:1000, :]
    print('training model')
    # create dataframe for items to get content vectors
    df_items = df.iloc[:, 3:]
    df_items.drop_duplicates(inplace=True, subset='catalog_item_id')
    df_items.reset_index(inplace=True)
    # create df with just the columns needed to return final output
    self.df_items_lookup = df_items.sort_values(
        by='catalog_item_name').reset_index(
    ).loc[:, ['catalog_item_id', 'catalog_item_name', 'brand_name']]
    # create pivot table for matrix factorization
    df = df.groupby(
        ['user_id_hash', 'catalog_item_name'])['quantity'].sum().unstack()
    # create sparse matrix
    sdf = df.astype(pd.SparseDtype("float", np.nan))
    sdf2 = sdf.sparse.to_coo()
    # keep user and item indexes for lookup later
    self.user_index = list(df.index)
    self.item_index = pd.DataFrame(df.columns)
    # initialize a model
    self.model = implicit.lmf.LogisticMatrixFactorization(factors=20)
    # train the model on a sparse matrix of item/user/confidence weights
    self.model.fit(sdf2.T)
    # keep the user/item matrix in CSR form for recommending items later
    self.user_items = sdf2.tocsr()
    print('training complete!')
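# Hedged usage sketch (not part of the original class): recommending items
# for one user with the model trained above. Because `fit` is given the
# transposed (item x user) matrix, this assumes an `implicit` version older
# than 0.5; from 0.5 onward, `fit`/`recommend` expect user-item inputs and
# `recommend` takes only the user's own row. Names like `recommend_for_user`
# are illustrative.
def recommend_for_user(self, user_id_hash, n=10):
    user_pos = self.user_index.index(user_id_hash)
    recs = self.model.recommend(user_pos, self.user_items, N=n)
    # map item column positions back to catalog item names
    return [(self.item_index.iloc[item_pos, 0], score)
            for item_pos, score in recs]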
def load_data(k=20):
    ratings_df = pd.read_csv("./ml-latest-small/ratings.csv")
    movies_df = pd.read_csv("./ml-latest-small/movies.csv")
    ratings_df = ratings_df.drop(columns=['timestamp'])
    films_nb = len(set(ratings_df.movieId))
    ratings_df = ratings_df.astype(pd.SparseDtype(np.float32, np.nan)).pivot(
        index='userId', columns='movieId', values='rating')
    # map column position -> movieId for lookup after the factorization
    iid_map = {i: item for i, item in enumerate(ratings_df)}
    users_mean = ratings_df.mean(axis=1).values
    R_demeaned = ratings_df.sub(ratings_df.mean(axis=1), axis=0)
    R_demeaned = coo_matrix(R_demeaned.fillna(0).values)
    del ratings_df
    U, sigma, Vt = svds(R_demeaned, k=k)
    sigma = np.diag(sigma)
    return U, sigma, Vt, movies_df, films_nb, iid_map, users_mean
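# Hedged companion sketch: the standard way to turn the factors returned by
# load_data back into predicted ratings, re-adding the per-user mean that was
# subtracted before the SVD. `predict_rating`, `user_row`, and `movie_id` are
# illustrative names, not from the original source.
def predict_rating(U, sigma, Vt, users_mean, iid_map, user_row, movie_id):
    all_preds = U @ sigma @ Vt + users_mean.reshape(-1, 1)
    # iid_map maps column position -> movieId, so invert it for lookup
    col = {v: k for k, v in iid_map.items()}[movie_id]
    return all_preds[user_row, col]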
def construct_load_shed(scenario_info, grid, infeasibilities=None):
    """Constructs load_shed dataframe from relevant scenario/grid data.

    :param dict scenario_info: info attribute of Scenario object.
    :param powersimdata.input.grid.Grid grid: grid to construct load_shed for.
    :param dict/None infeasibilities: dictionary of
        {interval (int): load shed percentage (int)}, or None.
    :return: (*pandas.DataFrame*) -- data frame of load_shed.
    """
    hours = pd.date_range(start=scenario_info["start_date"],
                          end=scenario_info["end_date"],
                          freq="1H").tolist()
    buses = grid.bus.index
    if infeasibilities is None:
        print("No infeasibilities, constructing DataFrame")
        load_shed_data = coo_matrix((len(hours), len(buses)))
        load_shed = pd.DataFrame.sparse.from_spmatrix(load_shed_data)
    else:
        print("Infeasibilities, constructing DataFrame")
        bus_demand = get_bus_demand(scenario_info, grid)
        load_shed = np.zeros((len(hours), len(buses)))
        # Convert '24H' to 24
        interval = int(scenario_info["interval"][:-1])
        for i, v in infeasibilities.items():
            start = i * interval
            end = (i + 1) * interval
            base_demand = bus_demand.iloc[start:end, :].to_numpy()
            shed_demand = base_demand * (v / 100)
            load_shed[start:end, :] = shed_demand
        load_shed = pd.DataFrame(load_shed, columns=buses, index=hours)
        load_shed = load_shed.astype(pd.SparseDtype("float", 0))
    load_shed.index = hours
    load_shed.index.name = "UTC"
    load_shed.columns = buses
    return load_shed
def run(self):
    weight = []
    ret = []
    for i, adjust_date in enumerate(self.adjust_dates[1:]):
        self.account.trail(adjust_date)
        new_weight = self.get_new_weight()
        self.account.rebalance(adjust_date, new_weight)
        if i == 0:
            ret.append(self.account._ret)
        else:
            ret.append(self.account._ret.iloc[1:])
        weight.append(self.account._weight.iloc[:-1])
    self.account.trail(datetime.strptime(end_date, '%Y-%m-%d'))
    ret.append(self.account._ret.iloc[1:])
    weight.append(self.account._weight)
    ret = pd.concat(ret, sort=True)
    self.ret = ret
    nv = (ret + 1).cumprod()
    nv.iloc[0] = 1  # initial net value is 1
    self.net_value = nv
    self._weight = pd.concat(weight, sort=True).astype(pd.SparseDtype())
def df(self):
    """
    :class:`pandas.SparseDataFrame` : DataFrame representation of the
        contact matrix

    Rows/columns correspond to indices and the values correspond to the
    count
    """
    mtx = self.sparse_matrix
    index = list(range(self.max_size))
    columns = list(range(self.max_size))
    if _PD_VERSION < (0, 25):  # py27 only -no-cov-
        mtx = mtx.tocoo()
        return pd.SparseDataFrame(mtx, index=index, columns=columns)
    df = pd.DataFrame.sparse.from_spmatrix(mtx, index=index, columns=columns)
    # note: I think we can always use float here for dtype; but in
    # principle maybe we need to inspect and get the internal type?
    # Problem is, pandas technically stores a different dtype for each
    # column.
    df = df.astype(pd.SparseDtype("float", np.nan))
    return df
def test_dataframe_dummies_prefix_dict(self, sparse):
    prefixes = {"A": "from_A", "B": "from_B"}
    df = DataFrame({
        "C": [1, 2, 3],
        "A": ["a", "b", "a"],
        "B": ["b", "b", "c"],
    })
    result = get_dummies(df, prefix=prefixes, sparse=sparse)

    expected = DataFrame({
        "C": [1, 2, 3],
        "from_A_a": [1, 0, 1],
        "from_A_b": [0, 1, 0],
        "from_B_b": [1, 1, 0],
        "from_B_c": [0, 0, 1],
    })

    columns = ["from_A_a", "from_A_b", "from_B_b", "from_B_c"]
    expected[columns] = expected[columns].astype(np.uint8)
    if sparse:
        expected[columns] = expected[columns].astype(
            pd.SparseDtype("uint8", 0))

    tm.assert_frame_equal(result, expected)
def mask_tissue(
    image: np.ndarray, counts: pd.DataFrame, label: np.ndarray
) -> Tuple[pd.DataFrame, np.ndarray]:
    r"""Detects the tissue in `image`. The area outside of the tissue is
    given a new label with zero counts everywhere.
    """
    mask = compute_tissue_mask(image)
    counts.index += 1
    label[label != 0] += 1
    in_mask = np.unique(label[mask & (label != 0)])
    label[~mask.astype(bool) & ~np.isin(label, in_mask)] = 1
    counts = pd.concat([
        pd.DataFrame(
            [np.repeat(0, counts.shape[1])],
            columns=counts.columns,
            index=[1],
        ).astype(pd.SparseDtype("float", 0)),
        counts,
    ])
    return counts, label
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error  # mean squared error
from sklearn.metrics import mean_absolute_error  # mean absolute error
from sklearn.metrics import r2_score  # R square
from sklearn.metrics import roc_auc_score
import joblib  # sklearn.externals.joblib was removed in scikit-learn 0.23

IDIR = 'G:\\bigdata\\badou\\00-data\\'

df_train = pd.read_csv(IDIR + 'train_feat.csv').fillna(0.).astype(
    pd.SparseDtype("float", np.nan))
labels = np.load(IDIR + 'labels.npy')
X_train, X_test, y_train, y_test = train_test_split(
    df_train, labels, test_size=0.2, random_state=2020)

print('load_model')
rfr = joblib.load('randomForestRegressor.m')
print('W:', rfr.feature_importances_)
y_pred = rfr.predict(X_test)
# y_pred_train = rfr.predict(X_train)  # 0.8272266140627522
print('auc_test0:', roc_auc_score(y_test, y_pred))
# print('auc_train0:', roc_auc_score(y_train, y_pred_train))

print('train again...')
# retrain the model
rfr.fit(X_train, y_train)
print('W2:', rfr.feature_importances_)
y_pred = rfr.predict(X_test)
print('auc_test1:', roc_auc_score(y_test, y_pred))
def dataframe_to_sparse(x, fill_value=0.0):
    return x.astype(pd.SparseDtype(float, fill_value=fill_value))
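# Minimal usage sketch for dataframe_to_sparse; the example frame and the
# density check are illustrative, not from the original. DataFrame.sparse.density
# reports the fraction of points stored explicitly (i.e. not equal to fill_value).
dense = pd.DataFrame({"a": [0.0, 0.0, 1.0], "b": [0.0, 2.0, 0.0]})
sp_df = dataframe_to_sparse(dense)
assert all(dt == pd.SparseDtype(float, fill_value=0.0) for dt in sp_df.dtypes)
assert sp_df.sparse.density == 1 / 3  # only the two non-zero values are stored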
def test_take_all_empty(self):
    a = pd.array([0, 0], dtype=pd.SparseDtype("int64"))
    result = a.take([0, 1], allow_fill=True, fill_value=np.nan)
    tm.assert_sp_array_equal(a, result)
def __post_init__(self):
    object.__setattr__(
        self,
        "type",
        pd.SparseDtype(dtype=self.dtype, fill_value=self.fill_value),
    )
    pandas_engine.DateTime(unit="ns", tz="CET"): "datetime64[ns, CET]",  # type: ignore
}

timedelta_dtypes = {
    datetime.timedelta: "timedelta64",
    np.timedelta64: "timedelta64",
    pd.Timedelta: "timedelta64",
    pa.Timedelta: "timedelta64",
}

period_dtypes = {pd.PeriodDtype(freq="D"): "period[D]"}
# Series.astype does not accept a string alias for SparseDtype.
sparse_dtypes = {
    pd.SparseDtype: pd.SparseDtype(),
    pd.SparseDtype(np.float64): pd.SparseDtype(np.float64),
}
interval_dtypes = {pd.IntervalDtype(subtype=np.int64): "interval[int64]"}

dtype_fixtures: List[Tuple[Dict, List]] = [
    (int_dtypes, [-1]),
    (nullable_int_dtypes, [-1, None]),
    (uint_dtypes, [1]),
    (nullable_uint_dtypes, [1, None]),
    (float_dtypes, [1.0]),
    (complex_dtypes, [complex(1)]),
    (boolean_dtypes, [True, False]),
    (nullable_boolean_dtypes, [True, None]),
    (string_dtypes, ["A", "B"]),
    (object_dtypes, ["A", "B"]),
def test_pandas_sparse_iloc():
    X = pd.DataFrame([[0, 1, 1], [0, 0, 1], [0, 0, 0]]).astype(
        pd.SparseDtype(float, fill_value=0.0))
    assert np.all(~np.isnan(X.iloc[[0, 1]].to_numpy()))
def test_astype_str(self, data):
    result = pd.Series(data[:5]).astype(str)
    expected_dtype = pd.SparseDtype(str, str(data.fill_value))
    expected = pd.Series([str(x) for x in data[:5]], dtype=expected_dtype)
    self.assert_series_equal(result, expected)
from pathlib import Path

from pandas.api.types import union_categoricals

pathdata = Path() / "data"
csvpath = pathdata / "airlinetrain1m.csv"
df = pd.read_csv(csvpath.resolve())
label = df["dep_delayed_15min"].map({"N": 0, "Y": 1})
covariates = ["DepTime", "Distance"]
factors = [
    "Month", "DayofMonth", "DayOfWeek", "UniqueCarrier", "Origin", "Dest"
]
sp_covariates = list(
    map(lambda col: df[col].astype(pd.SparseDtype("float32", 0.0)),
        covariates))
sp_factors = list(
    map(
        lambda col: pd.get_dummies(
            df[col], prefix=col, sparse=True, dtype=np.float32),
        factors))
data = pd.concat(sp_factors + sp_covariates, axis=1)
spdata = coo_matrix(data.sparse.to_coo()).tocsr()

# featmappath = pathdata / "featmap.txt"
# with open(featmappath.resolve(), "w") as f:
#     lines = [
#         "{fid} {fname} {ftype}\n".format(
#             fid=i, fname=col,
#             ftype="int" if col.startswith(("deptime", "distance")) else "i")
#         for (i, col) in enumerate(data.columns)
#     ]
#     f.writelines(lines)

eta = 0.1
def write_data(
    counts: pd.DataFrame,
    image: np.ndarray,
    label: np.ndarray,
    annotation: Dict[str, np.ndarray],
    type_label: str,
    path: str = "data.h5",
) -> None:
    r"""Writes data to the format used by XFuse."""
    if image.shape[:2] != label.shape[:2]:
        raise RuntimeError(
            f"Image shape ({image.shape[:2]}) is not equal to"
            f" the shape of the label image ({label.shape[:2]}).")

    if np.max(image.shape[:2]) > 5000:
        log(
            WARNING,
            "The image resolution is very large! 😱"
            " XFuse typically works best on medium resolution images"
            " (approximately 1000x1000 px)."
            " If you experience performance issues, please consider reducing"
            " the resolution.",
        )

    if counts.columns.duplicated().any():
        log(
            WARNING,
            "Count matrix contains duplicated columns."
            " Counts will be summed by column name.",
        )
        counts = counts.sum(axis=1, level=0)

    log(DEBUG, "writing data to %s", path)
    os.makedirs(os.path.normpath(os.path.dirname(path)), exist_ok=True)
    with h5py.File(path, "w") as data_file:
        data = (counts.astype(pd.SparseDtype("float", 0.0))
                .sparse.to_coo().tocsr())
        data_file.create_dataset("counts/data", data.data.shape, float,
                                 data.data.astype(float))
        data_file.create_dataset(
            "counts/indices",
            data.indices.shape,
            data.indices.dtype,
            data.indices,
        )
        data_file.create_dataset("counts/indptr", data.indptr.shape,
                                 data.indptr.dtype, data.indptr)
        data_file.create_dataset(
            "counts/columns",
            counts.columns.shape,
            h5py.string_dtype(),
            counts.columns.values,
        )
        data_file.create_dataset("counts/index", counts.index.shape, int,
                                 counts.index.astype(int))
        data_file.create_dataset("image", image.shape, np.uint8, image)
        data_file.create_dataset("label", label.shape, np.int16, label)
        data_file.create_group("annotation", track_order=True)
        for k, v in annotation.items():
            data_file.create_dataset(f"annotation/{k}", v.shape, np.uint16, v)
        data_file.create_dataset("type", data=type_label,
                                 dtype=h5py.string_dtype())
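# Hedged round-trip sketch (not from the original source): reading the CSR
# counts written above back out of the HDF5 file. Assumes
# `from scipy.sparse import csr_matrix`; `read_counts` is an illustrative name.
def read_counts(path: str = "data.h5") -> pd.DataFrame:
    with h5py.File(path, "r") as data_file:
        # rebuild the CSR matrix from its three stored component arrays
        matrix = csr_matrix((
            data_file["counts/data"][()],
            data_file["counts/indices"][()],
            data_file["counts/indptr"][()],
        ))
        columns = [c.decode() for c in data_file["counts/columns"][()]]
        index = data_file["counts/index"][()]
    return pd.DataFrame.sparse.from_spmatrix(matrix, index=index,
                                             columns=columns)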
def _convert_to_dense(cls, series):
    if isinstance(series.dtype, pd.SparseDtype):
        return series.astype(
            pd.SparseDtype(series.dtype.subtype, np.nan)).sparse.to_dense()
    return series
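# Illustrative note on why _convert_to_dense re-casts to a NaN fill value
# first: astype between SparseDtypes changes the fill value without
# materializing the fill points, so "absent" points densify to NaN rather
# than to the original fill value (e.g. 0). A small hedged demonstration:
s = pd.Series([0.0, 1.0, 0.0], dtype=pd.SparseDtype("float", 0.0))
dense = s.astype(pd.SparseDtype("float", np.nan)).sparse.to_dense()
# dense is [NaN, 1.0, NaN]: the zeros were fill points, not stored values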
def _sparse_reindex(cls, inp, index=None, columns=None):
    if inp.ndim == 2:
        columns = inp.columns if columns is None else columns
        index_shape = len(index) if index is not None else len(inp)
        i_to_columns = dict()
        for i, col in enumerate(columns):
            if col in inp.dtypes:
                if index is None:
                    i_to_columns[i] = inp[col]
                else:
                    indexer = inp.index.reindex(index)[1]
                    cond = indexer >= 0
                    available_indexer = indexer[cond]
                    del indexer
                    data = inp[col].iloc[available_indexer].to_numpy()
                    ind = cond.nonzero()[0]
                    spmatrix = sps.csc_matrix(
                        (data, (ind, np.zeros_like(ind))),
                        shape=(index_shape, 1),
                        dtype=inp[col].dtype)
                    sparse_array = pd.arrays.SparseArray.from_spmatrix(
                        spmatrix)
                    # convert to SparseDtype(xxx, np.nan) to ensure zeros
                    # in sparse_array are not converted to np.nan
                    sparse_array = pd.arrays.SparseArray(
                        sparse_array.sp_values,
                        sparse_index=sparse_array.sp_index,
                        fill_value=np.nan,
                        dtype=pd.SparseDtype(sparse_array.dtype, np.nan))
                    series = pd.Series(sparse_array, index=index)
                    i_to_columns[i] = series
            else:
                ind = index if index is not None else inp.index
                i_to_columns[i] = pd.DataFrame.sparse.from_spmatrix(
                    sps.coo_matrix((index_shape, 1), dtype=np.float64),
                    index=ind).iloc[:, 0]
        df = pd.DataFrame(i_to_columns)
        df.columns = columns
        return df
    else:
        indexer = inp.index.reindex(index)[1]
        cond = indexer >= 0
        available_indexer = indexer[cond]
        del indexer
        data = inp.iloc[available_indexer].to_numpy()
        ind = cond.nonzero()[0]
        spmatrix = sps.csc_matrix((data, (ind, np.zeros_like(ind))),
                                  shape=(len(index), 1),
                                  dtype=inp.dtype)
        sparse_array = pd.arrays.SparseArray.from_spmatrix(spmatrix)
        # convert to SparseDtype(xxx, np.nan) to ensure zeros
        # in sparse_array are not converted to np.nan
        sparse_array = pd.arrays.SparseArray(
            sparse_array.sp_values,
            sparse_index=sparse_array.sp_index,
            fill_value=np.nan,
            dtype=pd.SparseDtype(sparse_array.dtype, np.nan))
        series = pd.Series(sparse_array, index=index, name=inp.name)
        return series
def read_csv(filename):
    return (pd.read_csv(filename,
                        index_col=["store_id", "item_id"]).fillna(0).astype(
                            pd.SparseDtype("float32", 0)))
class TestDataFrameAppend:
    def test_append_empty_list(self):
        # GH 28769
        df = DataFrame()
        result = df.append([])
        expected = df
        tm.assert_frame_equal(result, expected)
        assert result is not df

        df = DataFrame(np.random.randn(5, 4),
                       columns=["foo", "bar", "baz", "qux"])
        result = df.append([])
        expected = df
        tm.assert_frame_equal(result, expected)
        assert result is not df  # .append() should return a new object

    def test_append_series_dict(self):
        df = DataFrame(np.random.randn(5, 4),
                       columns=["foo", "bar", "baz", "qux"])

        series = df.loc[4]
        msg = "Indexes have overlapping values"
        with pytest.raises(ValueError, match=msg):
            df.append(series, verify_integrity=True)

        series.name = None
        msg = "Can only append a Series if ignore_index=True"
        with pytest.raises(TypeError, match=msg):
            df.append(series, verify_integrity=True)

        result = df.append(series[::-1], ignore_index=True)
        expected = df.append(DataFrame({0: series[::-1]},
                                       index=df.columns).T,
                             ignore_index=True)
        tm.assert_frame_equal(result, expected)

        # dict
        result = df.append(series.to_dict(), ignore_index=True)
        tm.assert_frame_equal(result, expected)

        result = df.append(series[::-1][:3], ignore_index=True)
        expected = df.append(DataFrame({0: series[::-1][:3]}).T,
                             ignore_index=True, sort=True)
        tm.assert_frame_equal(result, expected.loc[:, result.columns])

        # can append when name set
        row = df.loc[4]
        row.name = 5
        result = df.append(row)
        expected = df.append(df[-1:], ignore_index=True)
        tm.assert_frame_equal(result, expected)

    def test_append_list_of_series_dicts(self):
        df = DataFrame(np.random.randn(5, 4),
                       columns=["foo", "bar", "baz", "qux"])

        dicts = [x.to_dict() for idx, x in df.iterrows()]
        result = df.append(dicts, ignore_index=True)
        expected = df.append(df, ignore_index=True)
        tm.assert_frame_equal(result, expected)

        # different columns
        dicts = [
            {"foo": 1, "bar": 2, "baz": 3, "peekaboo": 4},
            {"foo": 5, "bar": 6, "baz": 7, "peekaboo": 8},
        ]
        result = df.append(dicts, ignore_index=True, sort=True)
        expected = df.append(DataFrame(dicts), ignore_index=True, sort=True)
        tm.assert_frame_equal(result, expected)

    def test_append_missing_cols(self):
        # GH22252
        # exercise the conditional branch in append method where the data
        # to be appended is a list and does not contain all columns that
        # are in the target DataFrame
        df = DataFrame(np.random.randn(5, 4),
                       columns=["foo", "bar", "baz", "qux"])

        dicts = [{"foo": 9}, {"bar": 10}]
        with tm.assert_produces_warning(None):
            result = df.append(dicts, ignore_index=True, sort=True)

        expected = df.append(DataFrame(dicts), ignore_index=True, sort=True)
        tm.assert_frame_equal(result, expected)

    def test_append_empty_dataframe(self):
        # Empty df append empty df
        df1 = DataFrame()
        df2 = DataFrame()
        result = df1.append(df2)
        expected = df1.copy()
        tm.assert_frame_equal(result, expected)

        # Non-empty df append empty df
        df1 = DataFrame(np.random.randn(5, 2))
        df2 = DataFrame()
        result = df1.append(df2)
        expected = df1.copy()
        tm.assert_frame_equal(result, expected)

        # Empty df with columns append empty df
        df1 = DataFrame(columns=["bar", "foo"])
        df2 = DataFrame()
        result = df1.append(df2)
        expected = df1.copy()
        tm.assert_frame_equal(result, expected)

        # Non-Empty df with columns append empty df
        df1 = DataFrame(np.random.randn(5, 2), columns=["bar", "foo"])
        df2 = DataFrame()
        result = df1.append(df2)
        expected = df1.copy()
        tm.assert_frame_equal(result, expected)

    def test_append_dtypes(self):
        # GH 5754
        # row appends of different dtypes (so need to do by-item)
        # can sometimes infer the correct type

        df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(5))
        df2 = DataFrame()
        result = df1.append(df2)
        expected = df1.copy()
        tm.assert_frame_equal(result, expected)

        df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(1))
        df2 = DataFrame({"bar": "foo"}, index=range(1, 2))
        result = df1.append(df2)
        expected = DataFrame({"bar": [Timestamp("20130101"), "foo"]})
        tm.assert_frame_equal(result, expected)

        df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(1))
        df2 = DataFrame({"bar": np.nan}, index=range(1, 2))
        result = df1.append(df2)
        expected = DataFrame(
            {"bar": Series([Timestamp("20130101"), np.nan], dtype="M8[ns]")})
        tm.assert_frame_equal(result, expected)

        df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(1))
        df2 = DataFrame({"bar": np.nan}, index=range(1, 2), dtype=object)
        result = df1.append(df2)
        expected = DataFrame(
            {"bar": Series([Timestamp("20130101"), np.nan], dtype="M8[ns]")})
        tm.assert_frame_equal(result, expected)

        df1 = DataFrame({"bar": np.nan}, index=range(1))
        df2 = DataFrame({"bar": Timestamp("20130101")}, index=range(1, 2))
        result = df1.append(df2)
        expected = DataFrame(
            {"bar": Series([np.nan, Timestamp("20130101")], dtype="M8[ns]")})
        tm.assert_frame_equal(result, expected)

        df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(1))
        df2 = DataFrame({"bar": 1}, index=range(1, 2), dtype=object)
        result = df1.append(df2)
        expected = DataFrame({"bar": Series([Timestamp("20130101"), 1])})
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize(
        "timestamp", ["2019-07-19 07:04:57+0100", "2019-07-19 07:04:57"])
    def test_append_timestamps_aware_or_naive(self, tz_naive_fixture,
                                              timestamp):
        # GH 30238
        tz = tz_naive_fixture
        df = pd.DataFrame([pd.Timestamp(timestamp, tz=tz)])
        result = df.append(df.iloc[0]).iloc[-1]
        expected = pd.Series(pd.Timestamp(timestamp, tz=tz), name=0)
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize(
        "data, dtype",
        [
            ([1], pd.Int64Dtype()),
            ([1], pd.CategoricalDtype()),
            ([pd.Interval(left=0, right=5)], pd.IntervalDtype()),
            ([pd.Period("2000-03", freq="M")], pd.PeriodDtype("M")),
            ([1], pd.SparseDtype()),
        ],
    )
    def test_other_dtypes(self, data, dtype):
        df = pd.DataFrame(data, dtype=dtype)
        result = df.append(df.iloc[0]).iloc[-1]
        expected = pd.Series(data, name=0, dtype=dtype)
        tm.assert_series_equal(result, expected)
def main():
    # ---------------- Set MKL environment variables for better Gensim LDA performance ----------------
    os.environ["MKL_NUM_THREADS"] = "1"
    os.environ["NUMEXPR_NUM_THREADS"] = "1"
    os.environ["OMP_NUM_THREADS"] = "1"

    # ---------------- Prepare LDA inputs & run LDA ----------------
    # Parse command line args
    save_name = str(sys.argv[1])
    cap = str(sys.argv[2])
    num_topics = int(sys.argv[3])

    # Load data
    bow_matrix_train_path = save_name + "_train_sparse.npz"
    bow_matrix_test_path = save_name + "_test_sparse.npz"
    if False:  # os.path.exists(bow_matrix_train_path):
        # X_train = scipy.sparse.load_npz(bow_matrix_train_path)
        # X_test = scipy.sparse.load_npz(bow_matrix_test_path)
        pass
    else:
        data = load_pickle("FINRA_TRACE_2014.pkl.zip")
        # data = data.append(load_pickle("FINRA_TRACE_2014.pkl.zip"), ignore_index=True)
        # data = data.append(load_pickle("FINRA_TRACE_2013.pkl.zip"), ignore_index=True)
        # data = data.append(load_pickle("FINRA_TRACE_2012.pkl.zip"), ignore_index=True)

        # Compute a version of bag_of_words given the save_name
        if save_name == "trade_frac_out":
            bag_of_words = trade_frac_out(data)
            del data
        elif save_name == "trade_vol_BoW":
            bag_of_words = trade_vol_BoW(data, cap)
            del data
            save_name = save_name + "_" + cap
        elif save_name == "trade_vol_BoW_norm":
            bag_of_words = trade_vol_BoW_norm(data, cap)
            del data
            save_name = save_name + "_" + cap
        elif save_name == "trade_count":
            bag_of_words = compute_count(data)
            del data
        else:
            raise Exception(
                "the save_name does not have a corresponding bag_of_words")

        dtype = pd.SparseDtype(float, fill_value=0)
        X = scipy.sparse.csr_matrix(
            bag_of_words.astype(dtype).sparse.to_coo())
        # X = bag_of_words.astype(dtype)
        # cutoff = int(X.shape[0] * 0.9)
        # X_train = X[:cutoff]
        # X_test = X[cutoff:]
        X_train, X_test, train_idx, test_idx = train_test_split(
            X, np.arange(X.shape[0]), test_size=0.1, random_state=42)
        scipy.sparse.save_npz(save_name + "_train_sparse.npz", X_train)
        scipy.sparse.save_npz(save_name + "_test_sparse.npz", X_test)

    # slice our matrix to be just the training data
    # bag_of_words = bag_of_words.iloc[train_idx]
    train_index = bag_of_words.index[train_idx]

    # Compute input for gensim LDA
    corpus = compute_corpus(X_train, save_name)
    test_corpus = compute_corpus(X_test, save_name + "_test")
    id2word = compute_id2word(bag_of_words, save_name)

    # Run Gensim LDA
    start = time.time()
    lda = compute_topic(save_name, corpus, num_topics, id2word, workers=11,
                        chunksize=12500, passes=10, iterations=600)
    lda_time = time.time() - start

    train_perplex = lda.log_perplexity(corpus)
    test_perplex = lda.log_perplexity(test_corpus)
    print("perplexity scores: ", train_perplex, test_perplex)
    with open("perplex_scores.csv", "a+") as f:
        writer = csv.writer(f)
        writer.writerow(
            [save_name, num_topics, train_perplex, test_perplex, lda_time])

    # ---------------- LDA Analysis ----------------
    # os.environ["MKL_NUM_THREADS"] = "4"
    # os.environ["NUMEXPR_NUM_THREADS"] = "4"
    # os.environ["OMP_NUM_THREADS"] = "4"

    # Run PyLDAvis
    dictionary = Dictionary.from_corpus(corpus, id2word=id2word)
    save_pyldavis2html(lda, corpus, dictionary, save_name, num_topics)

    # Save document X topic matrix to csv
    document_topic_distribution(corpus, train_index, lda, save_name,
                                num_topics)
def get_sparse_series():
    test_series = [
        # Numpy dtypes
        pd.Series([-1, 0, 1, 2, 3],
                  name="int_sparse",
                  dtype=pd.SparseDtype(np.int32, 0)),
        pd.Series(
            [np.nan, 0, 1, 2, 3],
            name="float_sparse",
            dtype=pd.SparseDtype(np.float64, np.nan),
        ),
        pd.Series(
            [
                np.nan,
                complex(0, 1),
                complex(1, -1),
                complex(2, 4),
                complex(3, -12)
            ],
            name="complex_sparse",
            dtype=pd.SparseDtype(np.complex128, np.nan),
        ),
        pd.Series(
            [True, False, False],
            name="bool_sparse",
            dtype=pd.SparseDtype(np.bool_, False),
        ),
        pd.Series(
            pd.arrays.SparseArray([None, None, "gold", "black", "silver"]),
            name="str_obj_sparse",
        ),
        # Pending https://github.com/pandas-dev/pandas/issues/35762
        # pd.Series([None, 0, 1, 2, 3, 4], name="datetime_sparse",
        #           dtype=pd.SparseDtype(np.datetime64)),
        # Pandas dtypes
        pd.Series(
            [0, 1, 2, 3, None],
            name="pd_int64_sparse",
            dtype=pd.SparseDtype(pd.Int64Dtype()),
        ),
        # Pending https://github.com/pandas-dev/pandas/issues/35793
        # pd.Series(
        #     ["a", "b", "c", None],
        #     name="pd_categorical_sparse",
        #     dtype=pd.SparseDtype(pd.CategoricalDtype(['a', 'b', 'c', 'd']))
        # )
    ]

    if int(pd.__version__.split(".")[0]) >= 1:
        pandas_1_series = [
            pd.Series(
                ["Patty", "Valentine", "Upper", "", "", ""],
                name="pd_string_sparse",
                dtype=pd.SparseDtype(pd.StringDtype(), ""),
            ),
            pd.Series(
                [True, False, False, None],
                name="pd_bool_sparse",
                dtype=pd.SparseDtype(pd.BooleanDtype(), None),
            ),
        ]
        test_series.extend(pandas_1_series)

    return test_series
def _deserialize_sparse_dtype(obj):
    dtype, fill_value = obj
    return pd.SparseDtype(dtype=dtype, fill_value=fill_value)
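# Hedged usage sketch: the deserializer above mirrors a serializer that emits
# (dtype, fill_value) pairs; `_serialize_sparse_dtype` here is an assumed
# counterpart for illustration, not shown in the original source.
def _serialize_sparse_dtype(dtype: pd.SparseDtype):
    return (dtype.subtype, dtype.fill_value)

roundtripped = _deserialize_sparse_dtype(
    _serialize_sparse_dtype(pd.SparseDtype("float64", 0.0)))
assert roundtripped == pd.SparseDtype("float64", 0.0)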
def test_all_sparse(self):
    df = pd.DataFrame({"A": pd.array([0, 0], dtype=pd.SparseDtype("int64"))})
    result = df.loc[[0, 1]]
    tm.assert_frame_equal(result, df)
def test_concat_empty_series_dtypes(self):
    # booleans
    assert (pd.concat([Series(dtype=np.bool_),
                       Series(dtype=np.int32)]).dtype == np.int32)
    assert (pd.concat([Series(dtype=np.bool_),
                       Series(dtype=np.float32)]).dtype == np.object_)

    # datetime-like
    assert (pd.concat([Series(dtype="m8[ns]"),
                       Series(dtype=np.bool_)]).dtype == np.object_)
    assert (pd.concat([Series(dtype="m8[ns]"),
                       Series(dtype=np.int64)]).dtype == np.object_)
    assert (pd.concat([Series(dtype="M8[ns]"),
                       Series(dtype=np.bool_)]).dtype == np.object_)
    assert (pd.concat([Series(dtype="M8[ns]"),
                       Series(dtype=np.int64)]).dtype == np.object_)
    assert (pd.concat([
        Series(dtype="M8[ns]"),
        Series(dtype=np.bool_),
        Series(dtype=np.int64)
    ]).dtype == np.object_)

    # categorical
    assert (pd.concat([Series(dtype="category"),
                       Series(dtype="category")]).dtype == "category")
    # GH 18515
    assert (pd.concat(
        [Series(np.array([]), dtype="category"),
         Series(dtype="float64")]).dtype == "float64")
    assert (pd.concat([Series(dtype="category"),
                       Series(dtype="object")]).dtype == "object")

    # sparse
    # TODO: move?
    result = pd.concat([
        Series(dtype="float64").astype("Sparse"),
        Series(dtype="float64").astype("Sparse"),
    ])
    assert result.dtype == "Sparse[float64]"

    # GH 26705 - Assert .ftype is deprecated
    with tm.assert_produces_warning(FutureWarning):
        assert result.ftype == "float64:sparse"

    result = pd.concat([
        Series(dtype="float64").astype("Sparse"),
        Series(dtype="float64")
    ])
    # TODO: release-note: concat sparse dtype
    expected = pd.SparseDtype(np.float64)
    assert result.dtype == expected

    # GH 26705 - Assert .ftype is deprecated
    with tm.assert_produces_warning(FutureWarning):
        assert result.ftype == "float64:sparse"

    result = pd.concat(
        [Series(dtype="float64").astype("Sparse"),
         Series(dtype="object")])
    # TODO: release-note: concat sparse dtype
    expected = pd.SparseDtype("object")
    assert result.dtype == expected

    # GH 26705 - Assert .ftype is deprecated
    with tm.assert_produces_warning(FutureWarning):
        assert result.ftype == "object:sparse"
def load_Seqtable_from_count_files(
        count_files, file_list=None, pattern_filter=None, black_list=None,
        name_template=None, sort_by=None, x_values=None, x_unit=None,
        input_sample_name=None, sample_metadata=None, note=None,
        dry_run=False):
    if isinstance(count_files, list):
        samples = {}
        for cf in count_files:
            samples = {**samples, **_load_single_source(**cf)}
    else:
        samples = _load_single_source(count_files=count_files,
                                      file_list=file_list,
                                      pattern_filter=pattern_filter,
                                      black_list=black_list,
                                      name_template=name_template)

    # add extra metadata
    if sample_metadata is not None:
        for file_name, f_meta in sample_metadata.items():
            samples[file_name].update(f_meta)

    # sort file order if applicable
    sample_names = list(samples.keys())
    if sort_by is not None:
        if isinstance(sort_by, str):
            def sort_fn(sample_name):
                return samples[sample_name].get(sort_by, np.nan)
        elif callable(sort_by):
            sort_fn = sort_by
        else:
            logging.error('Unknown sort_by format', error_type=TypeError)
        sample_names = sorted(sample_names, key=sort_fn)

    if dry_run:
        # return a list of samples without importing
        return pd.DataFrame(samples)[sample_names].transpose()

    data_mtx = {
        sample: read_count_file(file_path=samples[sample]['file_path'],
                                as_dict=True)[2]
        for sample in sample_names
    }
    data_mtx = pd.DataFrame.from_dict(data_mtx).fillna(
        0, inplace=False).astype(pd.SparseDtype(dtype='int'))

    if input_sample_name is not None:
        grouper = {
            'input': [name for name in sample_names
                      if name in input_sample_name],
            'reacted': [name for name in sample_names
                        if name not in input_sample_name]
        }
    else:
        grouper = None

    if x_values is not None and isinstance(x_values, str):
        x_values = {
            sample: sample_metadata.pop(x_values, None)
            for sample, sample_metadata in samples.items()
        }

    from .seq_data import SeqData
    return SeqData(data_mtx,
                   data_unit='count',
                   grouper=grouper,
                   sample_metadata={'info': samples},
                   x_values=x_values,
                   x_unit=x_unit,
                   note=note)
def test_dataframe_with_sparse_array_int_columns(vineyard_client):
    df = pd.DataFrame(np.random.randn(100, 4), columns=[1, 2, 3, 4])
    df.iloc[:98] = np.nan
    sdf = df.astype(pd.SparseDtype("float", np.nan))
    object_id = vineyard_client.put(sdf)
    pd.testing.assert_frame_equal(df, vineyard_client.get(object_id))
def test_uses_first_kind(self, kind):
    other = "integer" if kind == "block" else "block"
    a = SparseArray([1, 0, 0, 2], kind=kind)
    b = SparseArray([1, 0, 2, 2], kind=other)

    result = SparseArray._concat_same_type([a, b])
    expected = np.array([1, 2, 1, 2, 2], dtype="int64")
    tm.assert_numpy_array_equal(result.sp_values, expected)
    assert result.kind == kind


@pytest.mark.parametrize(
    "other, expected_dtype",
    [
        # compatible dtype -> preserve sparse
        (pd.Series([3, 4, 5], dtype="int64"), pd.SparseDtype("int64", 0)),
        # (pd.Series([3, 4, 5], dtype="Int64"), pd.SparseDtype("int64", 0)),
        # incompatible dtype -> Sparse[common dtype]
        (pd.Series([1.5, 2.5, 3.5], dtype="float64"),
         pd.SparseDtype("float64", 0)),
        # incompatible dtype -> Sparse[object] dtype
        (pd.Series(["a", "b", "c"], dtype=object),
         pd.SparseDtype(object, 0)),
        # categorical with compatible categories -> dtype of the categories
        (pd.Series([3, 4, 5], dtype="category"), np.dtype("int64")),
        (pd.Series([1.5, 2.5, 3.5], dtype="category"), np.dtype("float64")),
        # categorical with incompatible categories -> object dtype
        (pd.Series(["a", "b", "c"], dtype="category"), np.dtype(object)),
    ],
)
def test_concat_with_non_sparse(other, expected_dtype):
    # https://github.com/pandas-dev/pandas/issues/34336
class TestDataFrameAppend:
    @pytest.mark.filterwarnings(
        "ignore:.*append method is deprecated.*:FutureWarning")
    def test_append_multiindex(self, multiindex_dataframe_random_data,
                               frame_or_series):
        obj = multiindex_dataframe_random_data
        obj = tm.get_obj(obj, frame_or_series)

        a = obj[:5]
        b = obj[5:]

        result = a.append(b)
        tm.assert_equal(result, obj)

    def test_append_empty_list(self):
        # GH 28769
        df = DataFrame()
        result = df._append([])
        expected = df
        tm.assert_frame_equal(result, expected)
        assert result is not df

        df = DataFrame(np.random.randn(5, 4),
                       columns=["foo", "bar", "baz", "qux"])
        result = df._append([])
        expected = df
        tm.assert_frame_equal(result, expected)
        assert result is not df  # ._append() should return a new object

    def test_append_series_dict(self):
        df = DataFrame(np.random.randn(5, 4),
                       columns=["foo", "bar", "baz", "qux"])

        series = df.loc[4]
        msg = "Indexes have overlapping values"
        with pytest.raises(ValueError, match=msg):
            df._append(series, verify_integrity=True)

        series.name = None
        msg = "Can only append a Series if ignore_index=True"
        with pytest.raises(TypeError, match=msg):
            df._append(series, verify_integrity=True)

        result = df._append(series[::-1], ignore_index=True)
        expected = df._append(DataFrame({0: series[::-1]},
                                        index=df.columns).T,
                              ignore_index=True)
        tm.assert_frame_equal(result, expected)

        # dict
        result = df._append(series.to_dict(), ignore_index=True)
        tm.assert_frame_equal(result, expected)

        result = df._append(series[::-1][:3], ignore_index=True)
        expected = df._append(DataFrame({0: series[::-1][:3]}).T,
                              ignore_index=True, sort=True)
        tm.assert_frame_equal(result, expected.loc[:, result.columns])

        msg = "Can only append a dict if ignore_index=True"
        with pytest.raises(TypeError, match=msg):
            df._append(series.to_dict())

        # can append when name set
        row = df.loc[4]
        row.name = 5
        result = df._append(row)
        expected = df._append(df[-1:], ignore_index=True)
        tm.assert_frame_equal(result, expected)

    def test_append_list_of_series_dicts(self):
        df = DataFrame(np.random.randn(5, 4),
                       columns=["foo", "bar", "baz", "qux"])

        dicts = [x.to_dict() for idx, x in df.iterrows()]
        result = df._append(dicts, ignore_index=True)
        expected = df._append(df, ignore_index=True)
        tm.assert_frame_equal(result, expected)

        # different columns
        dicts = [
            {"foo": 1, "bar": 2, "baz": 3, "peekaboo": 4},
            {"foo": 5, "bar": 6, "baz": 7, "peekaboo": 8},
        ]
        result = df._append(dicts, ignore_index=True, sort=True)
        expected = df._append(DataFrame(dicts), ignore_index=True, sort=True)
        tm.assert_frame_equal(result, expected)

    def test_append_list_retain_index_name(self):
        df = DataFrame([[1, 2], [3, 4]],
                       index=pd.Index(["a", "b"], name="keepthisname"))

        serc = Series([5, 6], name="c")

        expected = DataFrame(
            [[1, 2], [3, 4], [5, 6]],
            index=pd.Index(["a", "b", "c"], name="keepthisname"),
        )

        # append series
        result = df._append(serc)
        tm.assert_frame_equal(result, expected)

        # append list of series
        result = df._append([serc])
        tm.assert_frame_equal(result, expected)

    def test_append_missing_cols(self):
        # GH22252
        # exercise the conditional branch in append method where the data
        # to be appended is a list and does not contain all columns that
        # are in the target DataFrame
        df = DataFrame(np.random.randn(5, 4),
                       columns=["foo", "bar", "baz", "qux"])

        dicts = [{"foo": 9}, {"bar": 10}]
        result = df._append(dicts, ignore_index=True, sort=True)

        expected = df._append(DataFrame(dicts), ignore_index=True, sort=True)
        tm.assert_frame_equal(result, expected)

    def test_append_empty_dataframe(self):
        # Empty df append empty df
        df1 = DataFrame()
        df2 = DataFrame()
        result = df1._append(df2)
        expected = df1.copy()
        tm.assert_frame_equal(result, expected)

        # Non-empty df append empty df
        df1 = DataFrame(np.random.randn(5, 2))
        df2 = DataFrame()
        result = df1._append(df2)
        expected = df1.copy()
        tm.assert_frame_equal(result, expected)

        # Empty df with columns append empty df
        df1 = DataFrame(columns=["bar", "foo"])
        df2 = DataFrame()
        result = df1._append(df2)
        expected = df1.copy()
        tm.assert_frame_equal(result, expected)

        # Non-Empty df with columns append empty df
        df1 = DataFrame(np.random.randn(5, 2), columns=["bar", "foo"])
        df2 = DataFrame()
        result = df1._append(df2)
        expected = df1.copy()
        tm.assert_frame_equal(result, expected)

    def test_append_dtypes(self, using_array_manager):
        # GH 5754
        # row appends of different dtypes (so need to do by-item)
        # can sometimes infer the correct type

        df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(5))
        df2 = DataFrame()
        result = df1._append(df2)
        expected = df1.copy()
        tm.assert_frame_equal(result, expected)

        df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(1))
        df2 = DataFrame({"bar": "foo"}, index=range(1, 2))
        result = df1._append(df2)
        expected = DataFrame({"bar": [Timestamp("20130101"), "foo"]})
        tm.assert_frame_equal(result, expected)

        df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(1))
        df2 = DataFrame({"bar": np.nan}, index=range(1, 2))
        result = df1._append(df2)
        expected = DataFrame(
            {"bar": Series([Timestamp("20130101"), np.nan], dtype="M8[ns]")})
        if using_array_manager:
            # TODO(ArrayManager) decide on exact casting rules in concat
            # With ArrayManager, all-NaN float is not ignored
            expected = expected.astype(object)
        tm.assert_frame_equal(result, expected)

        df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(1))
        df2 = DataFrame({"bar": np.nan}, index=range(1, 2), dtype=object)
        result = df1._append(df2)
        expected = DataFrame(
            {"bar": Series([Timestamp("20130101"), np.nan], dtype="M8[ns]")})
        if using_array_manager:
            # With ArrayManager, all-NaN float is not ignored
            expected = expected.astype(object)
        tm.assert_frame_equal(result, expected)

        df1 = DataFrame({"bar": np.nan}, index=range(1))
        df2 = DataFrame({"bar": Timestamp("20130101")}, index=range(1, 2))
        result = df1._append(df2)
        expected = DataFrame(
            {"bar": Series([np.nan, Timestamp("20130101")], dtype="M8[ns]")})
        if using_array_manager:
            # With ArrayManager, all-NaN float is not ignored
            expected = expected.astype(object)
        tm.assert_frame_equal(result, expected)

        df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(1))
        df2 = DataFrame({"bar": 1}, index=range(1, 2), dtype=object)
        result = df1._append(df2)
        expected = DataFrame({"bar": Series([Timestamp("20130101"), 1])})
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize(
        "timestamp", ["2019-07-19 07:04:57+0100", "2019-07-19 07:04:57"])
    def test_append_timestamps_aware_or_naive(self, tz_naive_fixture,
                                              timestamp):
        # GH 30238
        tz = tz_naive_fixture
        df = DataFrame([Timestamp(timestamp, tz=tz)])
        result = df._append(df.iloc[0]).iloc[-1]
        expected = Series(Timestamp(timestamp, tz=tz), name=0)
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize(
        "data, dtype",
        [
            ([1], pd.Int64Dtype()),
            ([1], pd.CategoricalDtype()),
            ([pd.Interval(left=0, right=5)], pd.IntervalDtype()),
            ([pd.Period("2000-03", freq="M")], pd.PeriodDtype("M")),
            ([1], pd.SparseDtype()),
        ],
    )
    def test_other_dtypes(self, data, dtype, using_array_manager):
        df = DataFrame(data, dtype=dtype)

        warn = None
        if using_array_manager and isinstance(dtype, pd.SparseDtype):
            warn = FutureWarning

        with tm.assert_produces_warning(warn,
                                        match="astype from SparseDtype"):
            result = df._append(df.iloc[0]).iloc[-1]

        expected = Series(data, name=0, dtype=dtype)
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize("dtype", ["datetime64[ns]", "timedelta64[ns]"])
    def test_append_numpy_bug_1681(self, dtype):
        # another datetime64 bug
        if dtype == "datetime64[ns]":
            index = date_range("2011/1/1", "2012/1/1", freq="W-FRI")
        else:
            index = timedelta_range("1 days", "10 days", freq="2D")

        df = DataFrame()
        other = DataFrame({"A": "foo", "B": index}, index=index)

        result = df._append(other)
        assert (result["B"] == index).all()

    @pytest.mark.filterwarnings(
        "ignore:The values in the array:RuntimeWarning")
    def test_multiindex_column_append_multiple(self):
        # GH 29699
        df = DataFrame(
            [[1, 11], [2, 12], [3, 13]],
            columns=pd.MultiIndex.from_tuples([("multi", "col1"),
                                               ("multi", "col2")],
                                              names=["level1", None]),
        )
        df2 = df.copy()
        for i in range(1, 10):
            df[i, "colA"] = 10
            df = df._append(df2, ignore_index=True)
            result = df["multi"]
            expected = DataFrame({
                "col1": [1, 2, 3] * (i + 1),
                "col2": [11, 12, 13] * (i + 1)
            })
            tm.assert_frame_equal(result, expected)

    def test_append_raises_future_warning(self):
        # GH#35407
        df1 = DataFrame([[1, 2], [3, 4]])
        df2 = DataFrame([[5, 6], [7, 8]])
        with tm.assert_produces_warning(FutureWarning):
            df1.append(df2)
def testDataSerialize(self):
    for type_, compress in itertools.product(
            (None,) + tuple(dataserializer.SerialType.__members__.values()),
            (None,) + tuple(dataserializer.CompressType.__members__.values())):
        array = np.random.rand(1000, 100)
        assert_array_equal(array, dataserializer.loads(
            dataserializer.dumps(array, serial_type=type_,
                                 compress=compress)))

        array = np.random.rand(1000, 100)
        assert_array_equal(array, dataserializer.load(
            BytesIO(dataserializer.dumps(array, serial_type=type_,
                                         compress=compress))))

        array = np.random.rand(1000, 100).T  # test non c-contiguous
        assert_array_equal(array, dataserializer.loads(
            dataserializer.dumps(array, serial_type=type_,
                                 compress=compress)))

        array = np.float64(0.2345)
        assert_array_equal(array, dataserializer.loads(
            dataserializer.dumps(array, serial_type=type_,
                                 compress=compress)))

    # test non-serializable object
    if pyarrow:
        non_serial = type('non_serial', (object,), dict(nbytes=10))
        with self.assertRaises(SerializationFailed):
            dataserializer.dumps(non_serial())

    # test structured arrays.
    rec_dtype = np.dtype([('a', 'int64'), ('b', 'double'), ('c', '<U8')])
    array = np.ones((100,), dtype=rec_dtype)
    array_loaded = dataserializer.loads(dataserializer.dumps(array))
    self.assertEqual(array.dtype, array_loaded.dtype)
    assert_array_equal(array, array_loaded)

    fn = os.path.join(tempfile.gettempdir(),
                      f'test_dump_file_{id(self)}.bin')
    try:
        array = np.random.rand(1000, 100).T  # test non c-contiguous
        with open(fn, 'wb') as dump_file:
            dataserializer.dump(array, dump_file)
        with open(fn, 'rb') as dump_file:
            assert_array_equal(array, dataserializer.load(dump_file))

        with open(fn, 'wb') as dump_file:
            dataserializer.dump(array, dump_file,
                                compress=dataserializer.CompressType.LZ4)
        with open(fn, 'rb') as dump_file:
            assert_array_equal(array, dataserializer.load(dump_file))

        with open(fn, 'wb') as dump_file:
            dataserializer.dump(array, dump_file,
                                compress=dataserializer.CompressType.GZIP)
        with open(fn, 'rb') as dump_file:
            assert_array_equal(array, dataserializer.load(dump_file))
    finally:
        if os.path.exists(fn):
            os.unlink(fn)

    # test sparse
    if sps:
        mat = sparse.SparseMatrix(sps.random(100, 100, 0.1, format='csr'))
        des_mat = dataserializer.loads(dataserializer.dumps(mat))
        self.assertTrue((mat.spmatrix != des_mat.spmatrix).nnz == 0)

        des_mat = dataserializer.loads(dataserializer.dumps(
            mat, compress=dataserializer.CompressType.LZ4))
        self.assertTrue((mat.spmatrix != des_mat.spmatrix).nnz == 0)

        des_mat = dataserializer.loads(dataserializer.dumps(
            mat, compress=dataserializer.CompressType.GZIP))
        self.assertTrue((mat.spmatrix != des_mat.spmatrix).nnz == 0)

        vector = sparse.SparseVector(sps.csr_matrix(np.random.rand(2)),
                                     shape=(2,))
        des_vector = dataserializer.loads(dataserializer.dumps(vector))
        self.assertTrue((vector.spmatrix != des_vector.spmatrix).nnz == 0)

        des_vector = dataserializer.loads(dataserializer.dumps(
            vector, compress=dataserializer.CompressType.LZ4))
        self.assertTrue((vector.spmatrix != des_vector.spmatrix).nnz == 0)

        des_vector = dataserializer.loads(dataserializer.dumps(
            vector, compress=dataserializer.CompressType.GZIP))
        self.assertTrue((vector.spmatrix != des_vector.spmatrix).nnz == 0)

    # test groupby
    df1 = pd.DataFrame({'a': [3, 4, 5, 3, 5, 4, 1, 2, 3],
                        'b': [1, 3, 4, 5, 6, 5, 4, 4, 4],
                        'c': list('aabaaddce')})
    grouped = wrapped_groupby(df1, 'b')
    restored = dataserializer.loads(dataserializer.dumps(grouped))
    assert_groupby_equal(grouped, restored.groupby_obj)

    grouped = wrapped_groupby(df1, 'b').c
    restored = dataserializer.loads(dataserializer.dumps(grouped))
    assert_groupby_equal(grouped, restored.groupby_obj)

    grouped = wrapped_groupby(df1, 'b')
    getattr(grouped, 'indices')
    restored = dataserializer.loads(dataserializer.dumps(grouped))
    assert_groupby_equal(grouped, restored.groupby_obj)

    grouped = wrapped_groupby(df1.b, lambda x: x % 2)
    restored = dataserializer.loads(dataserializer.dumps(grouped))
    assert_groupby_equal(grouped, restored.groupby_obj)

    grouped = wrapped_groupby(df1.b, lambda x: x % 2)
    getattr(grouped, 'indices')
    restored = dataserializer.loads(dataserializer.dumps(grouped))
    assert_groupby_equal(grouped, restored.groupby_obj)

    # test categorical
    s = np.random.RandomState(0).random(10)
    cat = pd.cut(s, [0.3, 0.5, 0.8])
    self.assertIsInstance(cat, pd.Categorical)
    des_cat = dataserializer.loads(dataserializer.dumps(cat))
    self.assertEqual(len(cat), len(des_cat))
    for c, dc in zip(cat, des_cat):
        np.testing.assert_equal(c, dc)

    # test IntervalIndex
    s = pd.interval_range(10, 100, 3)
    dest_s = dataserializer.loads(dataserializer.dumps(s))
    pd.testing.assert_index_equal(s, dest_s)

    # test complex
    s = complex(10 + 5j)
    dest_s = dataserializer.loads(dataserializer.dumps(s))
    self.assertIs(type(s), type(dest_s))
    self.assertEqual(s, dest_s)

    s = np.complex64(10 + 5j)
    dest_s = dataserializer.loads(dataserializer.dumps(s))
    self.assertIs(type(s), type(dest_s))
    self.assertEqual(s, dest_s)

    # test pickle
    d = ClassToPickle(dict(a=1, b='uvw'))
    dest_d = dataserializer.loads(dataserializer.dumps(d))
    self.assertIs(type(d), type(dest_d))
    self.assertEqual(d.a, dest_d.a)

    # test ndarray with negative strides
    arr = np.zeros((5, 6, 3))
    arr2 = arr[:, :, ::-1]
    dest_arr2 = dataserializer.loads(dataserializer.dumps(arr2))
    np.testing.assert_array_equal(arr2, dest_arr2)

    # test ArrowArray
    df = pd.DataFrame({'a': ['s1', 's2', 's3'],
                       'b': [['s1', 's2'], ['s3'], ['s4', 's5']]})
    df['a'] = df['a'].astype(ArrowStringDtype())
    df['b'] = df['b'].astype(ArrowListDtype(str))
    dest_df = dataserializer.loads(dataserializer.dumps(df))
    self.assertIs(type(df), type(dest_df))
    pd.testing.assert_frame_equal(df, dest_df)

    # test DataFrame with SparseDtype
    s = pd.Series([1, 2, np.nan, np.nan, 3]).astype(
        pd.SparseDtype(np.dtype(np.float64), np.nan))
    dest_s = dataserializer.loads(dataserializer.dumps(s))
    pd.testing.assert_series_equal(s, dest_s)
    df = pd.DataFrame({'s': s})
    dest_df = dataserializer.loads(dataserializer.dumps(df))
    pd.testing.assert_frame_equal(df, dest_df)