def test_data_catalog_queries(catalog):
    """Check the column dtypes the catalog reports for its trade ticks."""
    instrument_ids = [
        "Basketball,,29628709,20191221-001000,ODDS,MATCH_ODDS,1.166564490,237491,.BETFAIR",
        "Basketball,,29628709,20191221-001000,ODDS,MATCH_ODDS,1.166564490,60424,.BETFAIR",
    ]
    float_type = dtype("float64")
    int_type = dtype("int64")

    actual_dtypes = catalog.trade_ticks().dtypes.to_dict()

    expected_dtypes = {
        "aggressor_side": CategoricalDtype(categories=["UNKNOWN"], ordered=False),
        "instrument_id": CategoricalDtype(categories=instrument_ids, ordered=False),
        "match_id": dtype("O"),
        "price": float_type,
        "size": float_type,
        "ts_event_ns": int_type,
        "ts_recv_ns": int_type,
        "type": CategoricalDtype(categories=["TradeTick"], ordered=False),
    }
    assert actual_dtypes == expected_dtypes
def test_astype_categorical_to_categorical( self, name, dtype_ordered, series_ordered ): # GH#10696, GH#18593 s_data = list("abcaacbab") s_dtype = CategoricalDtype(list("bac"), ordered=series_ordered) s = Series(s_data, dtype=s_dtype, name=name) # unspecified categories dtype = CategoricalDtype(ordered=dtype_ordered) result = s.astype(dtype) exp_dtype = CategoricalDtype(s_dtype.categories, dtype_ordered) expected = Series(s_data, name=name, dtype=exp_dtype) tm.assert_series_equal(result, expected) # different categories dtype = CategoricalDtype(list("adc"), dtype_ordered) result = s.astype(dtype) expected = Series(s_data, name=name, dtype=dtype) tm.assert_series_equal(result, expected) if dtype_ordered is False: # not specifying ordered, so only test once expected = s result = s.astype("category") tm.assert_series_equal(result, expected)
def create_schema(phases, sources):
    """Build the column-name -> dtype mapping for the pair dataframe.

    Every per-entity column is emitted twice, prefixed with ``left_`` and
    ``right_``; the pair-level columns (judgement, source, phase, features,
    schema) are appended afterwards.

    :param phases: categories for the ``phase`` column
    :param sources: categories for the ``source`` column
    :return: dict mapping column name to dtype
    """
    entity_columns = (
        "name",
        "schema",
        "collection_id",
        "id",
        "country",
        "address",
        "registrationNumber",
        "alias",
        "status",
        "classification",
        "gender",
        "firstName",
        "lastName",
        "birthPlace",
        "birthDate",
        "idNumber",
        "motherName",
        "nationality",
    )

    # Per-entity columns are strings, except "schema" which is categorical.
    entity_types = {}
    for column in entity_columns:
        if column == "schema":
            entity_types[column] = CategoricalDtype(settings.SCHEMAS)
        else:
            entity_types[column] = StringDtype()

    meta = {}
    for side in ("left", "right"):
        for column, column_type in entity_types.items():
            meta[f"{side}_{column}"] = column_type

    meta.update(
        judgement=bool,
        source=CategoricalDtype(sources),
        phase=CategoricalDtype(phases),
        features=object,
        schema=StringDtype(),
    )
    return meta
def test_astype_category(self, name, dtype_ordered, index_ordered): # GH#18630 index = CategoricalIndex( list("aabbca"), categories=list("cab"), ordered=index_ordered ) if name: index = index.rename(name) # standard categories dtype = CategoricalDtype(ordered=dtype_ordered) result = index.astype(dtype) expected = CategoricalIndex( index.tolist(), name=name, categories=index.categories, ordered=dtype_ordered, ) tm.assert_index_equal(result, expected) # non-standard categories dtype = CategoricalDtype(index.unique().tolist()[:-1], dtype_ordered) result = index.astype(dtype) expected = CategoricalIndex(index.tolist(), name=name, dtype=dtype) tm.assert_index_equal(result, expected) if dtype_ordered is False: # dtype='category' can't specify ordered, so only test once result = index.astype("category") expected = index tm.assert_index_equal(result, expected)
def test_astype_from_categorical_with_keywords(self): # with keywords lst = ["a", "b", "c", "a"] ser = Series(lst) exp = Series(Categorical(lst, ordered=True)) res = ser.astype(CategoricalDtype(None, ordered=True)) tm.assert_series_equal(res, exp) exp = Series(Categorical(lst, categories=list("abcdef"), ordered=True)) res = ser.astype(CategoricalDtype(list("abcdef"), ordered=True)) tm.assert_series_equal(res, exp)
def X_feature_onehot(self, dataframe: DataFrame) -> DataFrame:
    """One-hot encode the fields listed in ``self.params['X_feature_onehot']``.

    Field names that differ only by a trailing number (optionally followed by
    ``st``/``nd``/``rd``) share a basename and are encoded into ONE set of
    ``<basename>_<value>`` columns; a column is set when any field of the
    group holds that value.

    The category values come from ``self.data['combined']`` so that every
    dataset passed through this method gets the same output columns.

    Side effects: the listed columns of ``dataframe`` are converted to
    categorical in place, and the encoded field names are appended to
    ``self.params['X_feature_exclude']``.

    :param dataframe: frame containing the fields to encode
    :return: ``dataframe`` joined with the one-hot columns
    """
    # Group field names by basename, e.g. "Condition1"/"Condition2" -> "Condition".
    numeric_suffix = re.compile(r'\d+(st|nd|rd)?$')
    fieldgroups = {}
    for fieldname in self.params['X_feature_onehot']:
        basename = numeric_suffix.sub('', fieldname)
        fieldgroups.setdefault(basename, []).append(fieldname)

    encodings = {}
    for basename, fieldnames in fieldgroups.items():
        # NOTE: in theory the values should be hard-coded from the data
        # description; taking them from the combined dataset keeps the
        # output columns identical across train/test/validate frames.
        unique_values = np.unique(
            self.data['combined'][fieldnames].dropna().values)
        category_dtype = CategoricalDtype(categories=unique_values)

        for fieldname in fieldnames:
            dataframe[fieldname] = dataframe[fieldname].astype(category_dtype)
            onehot = pd.get_dummies(
                dataframe[fieldname], prefix=basename, prefix_sep='_')
            if basename not in encodings:
                encodings[basename] = onehot
            else:
                # Union of the sibling fields: the flag is set when ANY of
                # them has the value. (The previous `&` kept a flag only
                # when all sibling fields agreed, losing every other row.)
                encodings[basename] = encodings[basename] | onehot

    # Add the one-hot columns to the dataframe.
    for onehot in encodings.values():
        dataframe = dataframe.join(onehot)

    # Mark the original categorical columns for exclusion.
    self.params['X_feature_exclude'] += self.params['X_feature_onehot']
    return dataframe
def test_construction_with_categorical_dtype(self):
    """Build categorical indexes from a CategoricalDtype (GH#18109, GH#19032)."""
    data = "a a b b".split()
    cats = "c b a".split()
    ordered = True
    dtype = CategoricalDtype(categories=cats, ordered=ordered)
    expected = CategoricalIndex(data, categories=cats, ordered=ordered)

    # Both constructors accept the dtype and produce the same index.
    for index_cls in (CategoricalIndex, Index):
        tm.assert_index_equal(index_cls(data, dtype=dtype), expected, exact=True)

    # error when combining categories/ordered and dtype kwargs
    msg = "Cannot specify `categories` or `ordered` together with `dtype`."
    for conflicting in ({"categories": cats}, {"ordered": ordered}):
        for index_cls in (CategoricalIndex, Index):
            with pytest.raises(ValueError, match=msg):
                index_cls(data, dtype=dtype, **conflicting)
def to_dataframe(self) -> pd.DataFrame:
    """Return the data of the next-generation difficulty-table format as a DataFrame.

    The "level" column is an ordered Categorical; every other column holds
    strings (object dtype). Each level label is prefixed with the table's
    symbol (for example "▼0"). Missing values become empty strings.

    :return: DataFrame
    """
    assert self.data is not None

    if len(self.data) != 0:
        table = pd.DataFrame.from_dict(self.data, dtype=object).fillna("")
    else:
        # Even when there is no data, provide the columns the spec requires;
        # the processing below needs the "level" column to exist.
        table = pd.DataFrame(columns=["md5", "level"], dtype=object)

    tag = self.header.get("tag") or self.header["symbol"]
    raw_order = self.header.get(
        "level_order") or table["level"].drop_duplicates().values
    # The spec allows Array(String | Integer); normalise everything to str.
    level_order = [str(level) for level in raw_order]

    level_dtype = CategoricalDtype(categories=level_order, ordered=True)
    prefixed_levels = [tag + level for level in level_order]

    # The spec says "level" is a str, but some tables contain ints, so cast
    # to str before converting to the ordered categorical.
    typed = table.astype({"level": str}).astype({"level": level_dtype})
    return typed.assign(
        level=typed["level"].cat.rename_categories(prefixed_levels))
def test_at_setitem_categorical_missing(self): df = DataFrame(index=range(3), columns=range(3), dtype=CategoricalDtype(["foo", "bar"])) df.at[1, 1] = "foo" expected = DataFrame( [ [np.nan, np.nan, np.nan], [np.nan, "foo", np.nan], [np.nan, np.nan, np.nan], ], dtype=CategoricalDtype(["foo", "bar"]), ) tm.assert_frame_equal(df, expected)
def test_astype_categoricaldtype(self): s = Series(["a", "b", "a"]) result = s.astype(CategoricalDtype(["a", "b"], ordered=True)) expected = Series(Categorical(["a", "b", "a"], ordered=True)) tm.assert_series_equal(result, expected) result = s.astype(CategoricalDtype(["a", "b"], ordered=False)) expected = Series(Categorical(["a", "b", "a"], ordered=False)) tm.assert_series_equal(result, expected) result = s.astype(CategoricalDtype(["a", "b", "c"], ordered=False)) expected = Series( Categorical(["a", "b", "a"], categories=["a", "b", "c"], ordered=False) ) tm.assert_series_equal(result, expected) tm.assert_index_equal(result.cat.categories, Index(["a", "b", "c"]))
def __init__(self, *args, categories=None, **kwargs): if not isinstance(categories, Mapping): categories = {str(i): c for i, c in enumerate(categories, start=1)} self.categories = categories self.dtype = CategoricalDtype(self.categories.values(), ordered=None) self.items = ''.join(option(i, c) for i, c in categories.items()).strip('\n') super().__init__(*args, **kwargs)
def test_astype_bool_missing_to_categorical(self): # GH-19182 s = Series([True, False, np.nan]) assert s.dtypes == np.object_ result = s.astype(CategoricalDtype(categories=[True, False])) expected = Series(Categorical([True, False, np.nan], categories=[True, False])) tm.assert_series_equal(result, expected)
def scraper_output():
    """Load the latest scraper output for the data tests.

    The scraper itself can't be run just for data tests, so these tests check
    that the most recent output file is usable. This reads that file with
    explicit column dtypes and returns it with "Finish" as a timedelta.
    """
    csv_path = f"{params.ROOT}/../data/london_marathon_latest.csv"

    age_categories = [
        "18-39",
        "40-44",
        "45-49",
        "50-54",
        "55-59",
        "60-64",
        "65-69",
        "70+",
        "70-74",
        "75-79",
        "80-84",
        "85+",
        "80+",
        "Unknown",
    ]
    column_dtypes = {
        "Place (Overall)": "Int64",
        "Place (Gender)": "Int64",
        "Name": str,
        "Sex": str,
        "Club": str,
        "Running Number": object,
        "Category": CategoricalDtype(categories=age_categories, ordered=False),
        "Year": "Int64",
        "Country": str,
        "FirstName": str,
        "LastName": str,
        "DSQ": bool,
        "Finish (Total Seconds)": "float64",
    }

    results = pd.read_csv(csv_path, dtype=column_dtypes, parse_dates=["Finish"])
    results["Finish"] = pd.to_timedelta(results["Finish"])
    return results
def test_astype_str_int_categories_to_nullable_int(self): # GH#39616 dtype = CategoricalDtype([str(i) for i in range(5)]) codes = np.random.randint(5, size=20) arr = Categorical.from_codes(codes, dtype=dtype) res = arr.astype("Int64") expected = array(codes, dtype="Int64") tm.assert_extension_array_equal(res, expected)
def test_setitem_mix_of_nan_and_interval(self, not_na, nulls_fixture): # GH#27937 dtype = CategoricalDtype(categories=[not_na]) ser = Series( [nulls_fixture, nulls_fixture, nulls_fixture, nulls_fixture], dtype=dtype ) ser.iloc[:3] = [nulls_fixture, not_na, nulls_fixture] exp = Series([nulls_fixture, not_na, nulls_fixture, nulls_fixture], dtype=dtype) tm.assert_series_equal(ser, exp)
def test_update_with_categorical_type(self): # GH 25744 dtype = CategoricalDtype(["a", "b", "c", "d"]) s1 = Series(["a", "b", "c"], index=[1, 2, 3], dtype=dtype) s2 = Series(["b", "a"], index=[1, 2], dtype=dtype) s1.update(s2) result = s1 expected = Series(["b", "a", "c"], index=[1, 2, 3], dtype=dtype) tm.assert_series_equal(result, expected)
def test_impl():
    # Reads "csv_data_cat1.csv" as three columns named C1, C2 and C3 with
    # explicit dtypes, and returns the resulting DataFrame.
    # Column C2 is categorical with the fixed categories A-D.
    ct_dtype = CategoricalDtype(['A', 'B', 'C', 'D'])
    # NOTE(review): `int_type` is not defined in this block; presumably it is
    # supplied by an enclosing scope -- confirm against the caller.
    df = pd.read_csv("csv_data_cat1.csv", names=['C1', 'C2', 'C3'], dtype={
        'C1': int_type,
        'C2': ct_dtype,
        'C3': str
    })
    return df
def read_data(table_name, df, report_columns, report):
    """Format raw data into a report table and upload it to S3 as CSV.

    Exits the process with status -1 when ``df`` is empty or the upload fails.
    For 'metric' tables the Measure column of ``df`` is converted in place to
    an ordered categorical so that rows sort in report order.

    @param table_name: name of table
    @param df: DataFrame of raw data
    @param report_columns: List of column names for report table
    @param report: Report object
    @return: DataFrame of formatted data
    """
    logger = logging.getLogger("SimpleReplayLogger")

    if df.empty:
        logger.error("Data is empty. Failed to generate report.")
        exit(-1)

    cols = [g_columns[x] for x in report_columns]
    table_type = report.tables.get(table_name).get('type')
    measure_col = g_columns.get('Measure')

    report_table = None
    if table_type == 'breakdown':
        report_table = df[cols]
    elif table_type == 'metric':
        # Fixed display order of the measures.
        measure_order = CategoricalDtype([
            'Query Latency', 'Compile Time', 'Queue Time', 'Execution Time',
            'Commit Queue Time', 'Commit Time'
        ], ordered=True)
        df[measure_col] = df[measure_col].astype(measure_order)
        report_table = df.sort_values(measure_col)[cols]
    elif table_type == 'measure':
        # Keep only the rows of the measure this table is named after.
        report_table = df[cols][df[measure_col] == table_name]

    # Round values in the dataframe to two decimal places.
    report_table = pd.DataFrame(report_table).round(2)
    # NOTE(review): reindex returns a new frame and the result is discarded,
    # so this line has no effect on report_table. The original comment said
    # "add columns names to dataframe" -- confirm the intent before changing.
    report_table.reindex(columns=report_columns)

    # Upload the formatted dataframe to S3 as csv.
    try:
        s3_resource = boto3.resource('s3')
        file = f"{table_name.replace(' ', '')}.csv"  # filename for saving
        csv_buffer = StringIO()
        report_table.to_csv(csv_buffer)
        logger.debug(report.bucket)
        target = s3_resource.Object(
            report.bucket.get("bucket_name"),
            f'{report.path}/aggregated_data/{file}')
        target.put(Body=csv_buffer.getvalue())
    except Exception as e:
        logger.error(
            f"Could not upload aggregated data. Please confirm bucket. Error occurred while processing "
            f"data. {e}")
        exit(-1)

    return report_table
def test_astype_categorical_to_other(self):
    """Cast categorical Series to category, numeric, str and object dtypes."""
    bin_labels = Categorical(
        [f"{i} - {i + 499}" for i in range(0, 10000, 500)]
    )
    raw = Series(np.random.RandomState(0).randint(0, 10000, 100)).sort_values()
    ser = cut(raw, range(0, 10500, 500), right=False, labels=bin_labels)

    # Casting to a category dtype without categories leaves it unchanged.
    for target in ("category", CategoricalDtype()):
        tm.assert_series_equal(ser.astype(target), ser)

    msg = r"Cannot cast object dtype to float64"
    with pytest.raises(ValueError, match=msg):
        ser.astype("float64")

    letters = ["a", "b", "b", "a", "a", "c", "c", "c"]
    tm.assert_series_equal(
        Series(Categorical(letters)).astype("str"), Series(letters)
    )

    digit_cat = Series(Categorical(["1", "2", "3", "4"]))
    tm.assert_series_equal(
        digit_cat.astype("int"), Series([1, 2, 3, 4]).astype("int")
    )

    # object don't sort correctly, so just compare that we have the same
    # values
    def assert_same_unique_values(left, right):
        tm.assert_almost_equal(
            np.sort(np.unique(left)), np.sort(np.unique(right))
        )

    as_plain_values = Series(np.array(ser.values), name="value_group")
    for object_target in ("object", np.object_):
        assert_same_unique_values(ser.astype(object_target), as_plain_values)

    # array conversion
    tm.assert_almost_equal(np.array(ser), np.array(ser.values))

    tm.assert_series_equal(ser.astype("category"), ser)
    tm.assert_series_equal(ser.astype(CategoricalDtype()), ser)

    # object -> category round trip: expected categories are the sorted,
    # actually-used ones.
    sorted_categories = ser.cat.categories.sort_values()
    roundtrip_expected = ser.cat.set_categories(
        sorted_categories).cat.remove_unused_categories()
    for target in ("category", CategoricalDtype()):
        tm.assert_series_equal(
            ser.astype("object").astype(target), roundtrip_expected
        )
def test_iloc_getitem_categorical_values(self): # GH#14580 # test iloc() on Series with Categorical data ser = Series([1, 2, 3]).astype("category") # get slice result = ser.iloc[0:2] expected = Series([1, 2]).astype(CategoricalDtype([1, 2, 3])) tm.assert_series_equal(result, expected) # get list of indexes result = ser.iloc[[0, 1]] expected = Series([1, 2]).astype(CategoricalDtype([1, 2, 3])) tm.assert_series_equal(result, expected) # get boolean array result = ser.iloc[[True, False, False]] expected = Series([1]).astype(CategoricalDtype([1, 2, 3])) tm.assert_series_equal(result, expected)
def test_ordinal_encode_category(self):
    """Ordinal-encode two ordered categorical columns and check the codes."""
    frame = pd.DataFrame(
        [['C', '3'], ['D', '4'], ['D', '4']],
        columns=['alpha', 'digits'],
    )
    digits_dtype = CategoricalDtype(categories=["4", "3"], ordered=True)
    alpha_dtype = CategoricalDtype(categories=["D", "C"], ordered=True)
    frame["digits"] = frame["digits"].astype(digits_dtype)
    frame["alpha"] = frame["alpha"].astype(alpha_dtype)

    # Stack rows 1..2 on top of rows 0..1 and renumber the index 0..3.
    tail = frame.loc[1:, :]
    head = frame.loc[:1, :]
    stacked = pd.concat([tail, head])
    stacked.index = range(4)

    encoder = OrdinalEncoder()
    encoder.in_feature_groups = "cat"
    encoder.out_feature_groups = "ordinal"

    container = DataFrameContainer(dataset_instance=stacked)
    container.set_feature_groups(["cat"] * 2)
    encoder.fit(X_train=container)
    result = encoder.transform(X_train=container)["X_train"]
    print(result)

    # Both columns are expected to encode to the same codes per row.
    row_codes = {0: 0, 1: 0, 2: 1, 3: 0}
    should_be = pd.DataFrame({
        'alpha': dict(row_codes),
        'digits': dict(row_codes),
    })
    assert np.all(result.data == should_be)
def _get_dataframe_by_attrs(data, attributes): df = pd.DataFrame(data, columns=[attr_name for attr_name, _ in attributes]) for attr_name, attr_type in attributes: if type(attr_type) == str: if attr_type.upper() in ['NUMERIC', 'REAL']: df[attr_name] = df[attr_name].astype(float) elif type(attr_type) == list: df[attr_name] = df[attr_name].astype( CategoricalDtype(attr_type)) else: raise Exception( f'Unknown attribute type while loading arff: "{attr_type}"' ) return df
def test_astype_category(self, dtype_ordered, cat_ordered): # GH#10696/GH#18593 data = list("abcaacbab") cat = Categorical(data, categories=list("bac"), ordered=cat_ordered) # standard categories dtype = CategoricalDtype(ordered=dtype_ordered) result = cat.astype(dtype) expected = Categorical(data, categories=cat.categories, ordered=dtype_ordered) tm.assert_categorical_equal(result, expected) # non-standard categories dtype = CategoricalDtype(list("adc"), dtype_ordered) result = cat.astype(dtype) expected = Categorical(data, dtype=dtype) tm.assert_categorical_equal(result, expected) if dtype_ordered is False: # dtype='category' can't specify ordered, so only test once result = cat.astype("category") expected = cat tm.assert_categorical_equal(result, expected)
def test_astype_categorical_retains_ordered(self, ordered): index = IntervalIndex.from_breaks(range(5)) arr = index._data dtype = CategoricalDtype(None, ordered=ordered) expected = Categorical(list(arr), ordered=ordered) result = arr.astype(dtype) assert result.ordered is ordered tm.assert_categorical_equal(result, expected) # test IntervalIndex.astype while we're at it. result = index.astype(dtype) expected = Index(expected) tm.assert_index_equal(result, expected)
def test_sort_index_categorical_index(self): df = DataFrame({ "A": np.arange(6, dtype="int64"), "B": Series(list("aabbca")).astype(CategoricalDtype(list("cab"))), }).set_index("B") result = df.sort_index() expected = df.iloc[[4, 0, 1, 5, 2, 3]] tm.assert_frame_equal(result, expected) result = df.sort_index(ascending=False) expected = df.iloc[[2, 3, 0, 1, 5, 4]] tm.assert_frame_equal(result, expected)
def test_unique_index_series(self, ordered): # GH38140 dtype = CategoricalDtype([3, 2, 1], ordered=ordered) c = Categorical([3, 1, 2, 2, 1], dtype=dtype) # Categorical.unique sorts categories by appearance order # if ordered=False exp = Categorical([3, 1, 2], dtype=dtype) tm.assert_categorical_equal(c.unique(), exp) tm.assert_index_equal(Index(c).unique(), Index(exp)) tm.assert_categorical_equal(Series(c).unique(), exp) c = Categorical([1, 1, 2, 2], dtype=dtype) exp = Categorical([1, 2], dtype=dtype) tm.assert_categorical_equal(c.unique(), exp) tm.assert_index_equal(Index(c).unique(), Index(exp)) tm.assert_categorical_equal(Series(c).unique(), exp)
def X_feature_label_encode(self, dataframe: DataFrame) -> DataFrame:
    """Add a ``<field>_Numeric`` label-encoded column for each configured field.

    ``self.params['X_feature_label_encode']`` maps a comma-separated label
    string to the list of fields that use those labels. Each such field is
    converted to an ordered categorical over the labels, missing values are
    replaced by the first label, and the encoded values are stored in
    ``<field>_Numeric``.

    Side effects: ``dataframe`` is modified in place and the encoded field
    names are appended to ``self.params['X_feature_exclude']``.

    :param dataframe: frame containing the fields to encode
    :return: the same ``dataframe`` with the extra columns
    """
    for label_string, fieldnames in self.params[
            'X_feature_label_encode'].items():
        labels = label_string.split(',')
        category_dtype = CategoricalDtype(categories=labels, ordered=True)
        # NOTE(review): LabelEncoder presumably numbers labels in sorted
        # order rather than in the order given in label_string -- confirm
        # that this is the intended encoding.
        encoder = LabelEncoder()
        encoder.fit(labels)

        for fieldname in fieldnames:
            # Replace NaN with the first label (e.g. 'NA');
            # encoder.transform() raises on values it has not seen.
            # Assign the filled column back instead of calling
            # fillna(inplace=True) on the selected column: that chained
            # in-place form is deprecated and does not update the frame
            # under pandas Copy-on-Write.
            as_category = dataframe[fieldname].astype(category_dtype)
            dataframe[fieldname] = as_category.fillna(labels[0])
            dataframe[f"{fieldname}_Numeric"] = encoder.transform(
                dataframe[fieldname])

    self.params['X_feature_exclude'] += list(
        flatten(self.params['X_feature_label_encode'].values()))
    return dataframe
def clean_nsr(df):
    """Rename the columns and turn every column but 'finance' into an ordered categorical.

    Underscores are stripped from all values first; the 'finance' column is
    then restored from the untouched copy, so it keeps its original values.
    The category order of each column is looked up in the module-level
    ``developer`` object by column name (presumably a mapping of column
    name -> ordered list of values -- verify against its definition).

    Side effect: the column names of the frame passed in are overwritten.

    :param df: raw frame with nine columns in the order listed below
    :return: cleaned frame, with 'finance' moved to the last column
    """
    od = developer
    nsr_var = [
        'parents', 'has_nurs', 'form', 'children', 'housing', 'finance',
        'social', 'health', 'target'
    ]
    df.columns = nsr_var
    raw = df.copy()  # keeps the unmodified 'finance' values

    df = df.replace('_', '', regex=True)
    df = df.drop(columns=['finance'])
    for i in df.columns:
        # Impose the declared order on the column's categories.
        df[i] = df[i].astype('category').cat.reorder_categories(
            od[i], ordered=True)

    df['finance'] = raw['finance']
    return df
def test_unique(self, ordered): # GH38140 dtype = CategoricalDtype(["a", "b", "c"], ordered=ordered) # categories are reordered based on value when ordered=False cat = Categorical(["a", "b", "c"], dtype=dtype) res = cat.unique() tm.assert_categorical_equal(res, cat) cat = Categorical(["a", "b", "a", "a"], dtype=dtype) res = cat.unique() tm.assert_categorical_equal(res, Categorical(["a", "b"], dtype=dtype)) cat = Categorical(["c", "a", "b", "a", "a"], dtype=dtype) res = cat.unique() exp_cat = Categorical(["c", "a", "b"], dtype=dtype) tm.assert_categorical_equal(res, exp_cat) # nan must be removed cat = Categorical(["b", np.nan, "b", np.nan, "a"], dtype=dtype) res = cat.unique() exp_cat = Categorical(["b", np.nan, "a"], dtype=dtype) tm.assert_categorical_equal(res, exp_cat)
class TestAstype:
    """Tests exercising ``astype`` on DataFrames (and, in a few cases, Series).

    Fixtures such as ``float_frame``, ``mixed_float_frame``,
    ``mixed_type_frame``, ``timezone_frame``, ``frame_or_series`` and
    ``tz_naive_fixture`` are supplied from outside this class.
    """

    def test_astype_float(self, float_frame):
        """Casting to int / np.int32 matches casting the underlying values."""
        casted = float_frame.astype(int)
        expected = DataFrame(
            float_frame.values.astype(int),
            index=float_frame.index,
            columns=float_frame.columns,
        )
        tm.assert_frame_equal(casted, expected)

        casted = float_frame.astype(np.int32)
        expected = DataFrame(
            float_frame.values.astype(np.int32),
            index=float_frame.index,
            columns=float_frame.columns,
        )
        tm.assert_frame_equal(casted, expected)

        # a column of digit strings is cast along with the floats
        float_frame["foo"] = "5"
        casted = float_frame.astype(int)
        expected = DataFrame(
            float_frame.values.astype(int),
            index=float_frame.index,
            columns=float_frame.columns,
        )
        tm.assert_frame_equal(casted, expected)

    def test_astype_mixed_float(self, mixed_float_frame):
        """Columns A and B can be cast to float32 and float16."""
        # mixed casting
        casted = mixed_float_frame.reindex(
            columns=["A", "B"]).astype("float32")
        _check_cast(casted, "float32")

        casted = mixed_float_frame.reindex(
            columns=["A", "B"]).astype("float16")
        _check_cast(casted, "float16")

    def test_astype_mixed_type(self, mixed_type_frame):
        """The numeric part of a mixed frame casts to several target dtypes."""
        # mixed casting
        mn = mixed_type_frame._get_numeric_data().copy()
        mn["little_float"] = np.array(12345.0, dtype="float16")
        mn["big_float"] = np.array(123456789101112.0, dtype="float64")

        casted = mn.astype("float64")
        _check_cast(casted, "float64")

        casted = mn.astype("int64")
        _check_cast(casted, "int64")

        casted = mn.reindex(columns=["little_float"]).astype("float16")
        _check_cast(casted, "float16")

        casted = mn.astype("float32")
        _check_cast(casted, "float32")

        casted = mn.astype("int32")
        _check_cast(casted, "int32")

        # to object
        casted = mn.astype("O")
        _check_cast(casted, "object")

    @td.skip_array_manager_not_yet_implemented
    def test_astype_with_exclude_string(self, float_frame):
        """With ``errors="ignore"`` a non-castable string column is kept."""
        df = float_frame.copy()
        expected = float_frame.astype(int)
        df["string"] = "foo"
        casted = df.astype(int, errors="ignore")

        expected["string"] = "foo"
        tm.assert_frame_equal(casted, expected)

        df = float_frame.copy()
        expected = float_frame.astype(np.int32)
        df["string"] = "foo"
        casted = df.astype(np.int32, errors="ignore")

        expected["string"] = "foo"
        tm.assert_frame_equal(casted, expected)

    def test_astype_with_view_float(self, float_frame):
        """Smoke test: ``astype(..., copy=False)`` runs; nothing is asserted."""
        # this is the only real reason to do it this way
        tf = np.round(float_frame).astype(np.int32)
        casted = tf.astype(np.float32, copy=False)

        # TODO(wesm): verification?
        tf = float_frame.astype(np.float64)
        casted = tf.astype(np.int64, copy=False)  # noqa

    def test_astype_with_view_mixed_float(self, mixed_float_frame):
        """Smoke test: casting columns A, B, C runs; nothing is asserted."""
        tf = mixed_float_frame.reindex(columns=["A", "B", "C"])

        casted = tf.astype(np.int64)
        casted = tf.astype(np.float32)  # noqa

    @td.skip_array_manager_not_yet_implemented
    @pytest.mark.parametrize("dtype", [np.int32, np.int64])
    @pytest.mark.parametrize("val", [np.nan, np.inf])
    def test_astype_cast_nan_inf_int(self, val, dtype):
        """Casting NaN or inf to an integer dtype raises ValueError."""
        # see GH#14265
        #
        # Check NaN and inf --> raise error when converting to int.
        msg = "Cannot convert non-finite values \\(NA or inf\\) to integer"
        df = DataFrame([val])

        with pytest.raises(ValueError, match=msg):
            df.astype(dtype)

    def test_astype_str(self):
        """``astype(str)`` on datetime, tz-aware, timedelta, int and float."""
        # see GH#9757
        a = Series(date_range("2010-01-04", periods=5))
        b = Series(date_range("3/6/2012 00:00", periods=5, tz="US/Eastern"))
        c = Series([Timedelta(x, unit="d") for x in range(5)])
        d = Series(range(5))
        e = Series([0.0, 0.2, 0.4, 0.6, 0.8])

        df = DataFrame({"a": a, "b": b, "c": c, "d": d, "e": e})

        # Datetime-like
        result = df.astype(str)

        expected = DataFrame({
            "a": [str(Timestamp(x)._date_repr) for x in a._values],
            "b": [str(Timestamp(x)) for x in b._values],
            "c": [Timedelta(x)._repr_base() for x in c._values],
            "d": list(map(str, d._values)),
            "e": list(map(str, e._values)),
        })

        tm.assert_frame_equal(result, expected)

    def test_astype_str_float(self):
        """Floats cast to str: NaN gives "nan"; a long literal gives 17 digits."""
        # see GH#11302
        # np.nan rather than the np.NaN alias, which NumPy 2.0 removed
        result = DataFrame([np.nan]).astype(str)
        expected = DataFrame(["nan"])

        tm.assert_frame_equal(result, expected)
        result = DataFrame([1.12345678901234567890]).astype(str)

        val = "1.1234567890123457"
        expected = DataFrame([val])
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("dtype_class", [dict, Series])
    def test_astype_dict_like(self, dtype_class):
        """``astype`` accepts a dict or Series mapping column name to dtype.

        The original frame must stay unchanged after every call.
        """
        # GH7271 & GH16717
        a = Series(date_range("2010-01-04", periods=5))
        b = Series(range(5))
        c = Series([0.0, 0.2, 0.4, 0.6, 0.8])
        d = Series(["1.0", "2", "3.14", "4", "5.4"])
        df = DataFrame({"a": a, "b": b, "c": c, "d": d})
        original = df.copy(deep=True)

        # change type of a subset of columns
        dt1 = dtype_class({"b": "str", "d": "float32"})
        result = df.astype(dt1)
        expected = DataFrame({
            "a": a,
            "b": Series(["0", "1", "2", "3", "4"]),
            "c": c,
            "d": Series([1.0, 2.0, 3.14, 4.0, 5.4], dtype="float32"),
        })
        tm.assert_frame_equal(result, expected)
        tm.assert_frame_equal(df, original)

        dt2 = dtype_class({"b": np.float32, "c": "float32", "d": np.float64})
        result = df.astype(dt2)
        expected = DataFrame({
            "a": a,
            "b": Series([0.0, 1.0, 2.0, 3.0, 4.0], dtype="float32"),
            "c": Series([0.0, 0.2, 0.4, 0.6, 0.8], dtype="float32"),
            "d": Series([1.0, 2.0, 3.14, 4.0, 5.4], dtype="float64"),
        })
        tm.assert_frame_equal(result, expected)
        tm.assert_frame_equal(df, original)

        # change all columns
        dt3 = dtype_class({"a": str, "b": str, "c": str, "d": str})
        tm.assert_frame_equal(df.astype(dt3), df.astype(str))
        tm.assert_frame_equal(df, original)

        # error should be raised when using something other than column labels
        # in the keys of the dtype dict
        dt4 = dtype_class({"b": str, 2: str})
        dt5 = dtype_class({"e": str})
        msg = "Only a column name can be used for the key in a dtype mappings argument"
        with pytest.raises(KeyError, match=msg):
            df.astype(dt4)
        with pytest.raises(KeyError, match=msg):
            df.astype(dt5)
        tm.assert_frame_equal(df, original)

        # if the dtypes provided are the same as the original dtypes, the
        # resulting DataFrame should be the same as the original DataFrame
        dt6 = dtype_class({col: df[col].dtype for col in df.columns})
        equiv = df.astype(dt6)
        tm.assert_frame_equal(df, equiv)
        tm.assert_frame_equal(df, original)

        # GH#16717
        # if dtypes provided is empty, the resulting DataFrame
        # should be the same as the original DataFrame
        dt7 = dtype_class({}) if dtype_class is dict else dtype_class(
            {}, dtype=object)
        equiv = df.astype(dt7)
        tm.assert_frame_equal(df, equiv)
        tm.assert_frame_equal(df, original)

    def test_astype_duplicate_col(self):
        """``astype`` handles duplicate column labels, whole-frame or by name."""
        a1 = Series([1, 2, 3, 4, 5], name="a")
        b = Series([0.1, 0.2, 0.4, 0.6, 0.8], name="b")
        a2 = Series([0, 1, 2, 3, 4], name="a")
        df = concat([a1, b, a2], axis=1)

        result = df.astype(str)
        a1_str = Series(["1", "2", "3", "4", "5"], dtype="str", name="a")
        b_str = Series(["0.1", "0.2", "0.4", "0.6", "0.8"], dtype=str,
                       name="b")
        a2_str = Series(["0", "1", "2", "3", "4"], dtype="str", name="a")
        expected = concat([a1_str, b_str, a2_str], axis=1)
        tm.assert_frame_equal(result, expected)

        # a dict key naming a duplicated label applies to both columns
        result = df.astype({"a": "str"})
        expected = concat([a1_str, b, a2_str], axis=1)
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize(
        "dtype",
        [
            "category",
            CategoricalDtype(),
            CategoricalDtype(ordered=True),
            CategoricalDtype(ordered=False),
            CategoricalDtype(categories=list("abcdef")),
            CategoricalDtype(categories=list("edba"), ordered=False),
            CategoricalDtype(categories=list("edcb"), ordered=True),
        ],
        ids=repr,
    )
    def test_astype_categorical(self, dtype):
        """Frame-wide categorical cast equals a per-column ``Categorical``."""
        # GH#18099
        d = {"A": list("abbc"), "B": list("bccd"), "C": list("cdde")}
        df = DataFrame(d)
        result = df.astype(dtype)
        expected = DataFrame({k: Categorical(d[k], dtype=dtype) for k in d})
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize(
        "cls", [CategoricalDtype, DatetimeTZDtype, IntervalDtype])
    def test_astype_categoricaldtype_class_raises(self, cls):
        """Passing a dtype *class* instead of an instance raises TypeError."""
        df = DataFrame({"A": ["a", "a", "b", "c"]})
        xpr = f"Expected an instance of {cls.__name__}"
        with pytest.raises(TypeError, match=xpr):
            df.astype({"A": cls})

        with pytest.raises(TypeError, match=xpr):
            df["A"].astype(cls)

    @pytest.mark.parametrize("dtype", ["Int64", "Int32", "Int16"])
    def test_astype_extension_dtypes(self, dtype):
        """Two-column float frame round-trips through nullable integers."""
        # GH#22578
        df = DataFrame([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]],
                       columns=["a", "b"])

        expected1 = DataFrame({
            "a": pd.array([1, 3, 5], dtype=dtype),
            "b": pd.array([2, 4, 6], dtype=dtype),
        })
        tm.assert_frame_equal(df.astype(dtype), expected1)
        tm.assert_frame_equal(df.astype("int64").astype(dtype), expected1)
        tm.assert_frame_equal(df.astype(dtype).astype("float64"), df)

        df = DataFrame([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]],
                       columns=["a", "b"])
        df["b"] = df["b"].astype(dtype)
        expected2 = DataFrame({
            "a": [1.0, 3.0, 5.0],
            "b": pd.array([2, 4, 6], dtype=dtype)
        })
        tm.assert_frame_equal(df, expected2)

        tm.assert_frame_equal(df.astype(dtype), expected1)
        tm.assert_frame_equal(df.astype("int64").astype(dtype), expected1)

    @pytest.mark.parametrize("dtype", ["Int64", "Int32", "Int16"])
    def test_astype_extension_dtypes_1d(self, dtype):
        """Single-column float frame casts to nullable integers."""
        # GH#22578
        df = DataFrame({"a": [1.0, 2.0, 3.0]})

        expected1 = DataFrame({"a": pd.array([1, 2, 3], dtype=dtype)})
        tm.assert_frame_equal(df.astype(dtype), expected1)
        tm.assert_frame_equal(df.astype("int64").astype(dtype), expected1)

        df = DataFrame({"a": [1.0, 2.0, 3.0]})
        df["a"] = df["a"].astype(dtype)
        expected2 = DataFrame({"a": pd.array([1, 2, 3], dtype=dtype)})
        tm.assert_frame_equal(df, expected2)

        tm.assert_frame_equal(df.astype(dtype), expected1)
        tm.assert_frame_equal(df.astype("int64").astype(dtype), expected1)

    @pytest.mark.parametrize("dtype", ["category", "Int64"])
    def test_astype_extension_dtypes_duplicate_col(self, dtype):
        """Extension-dtype cast works with duplicate column labels."""
        # GH#24704
        a1 = Series([0, np.nan, 4], name="a")
        a2 = Series([np.nan, 3, 5], name="a")
        df = concat([a1, a2], axis=1)

        result = df.astype(dtype)
        expected = concat([a1.astype(dtype), a2.astype(dtype)], axis=1)
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("dtype", [{
        100: "float64",
        200: "uint64"
    }, "category", "float64"])
    def test_astype_column_metadata(self, dtype):
        """The columns index (dtype and name) is unchanged by ``astype``."""
        # GH#19920
        columns = UInt64Index([100, 200, 300], name="foo")
        df = DataFrame(np.arange(15).reshape(5, 3), columns=columns)
        df = df.astype(dtype)
        tm.assert_index_equal(df.columns, columns)

    @pytest.mark.parametrize("dtype", ["M8", "m8"])
    @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"])
    def test_astype_from_datetimelike_to_object(self, dtype, unit):
        """Datetime/timedelta data cast to object yields Timestamp/Timedelta."""
        # tests astype to object dtype
        # GH#19223 / GH#12425
        dtype = f"{dtype}[{unit}]"
        arr = np.array([[1, 2, 3]], dtype=dtype)
        df = DataFrame(arr)
        result = df.astype(object)
        assert (result.dtypes == object).all()

        if dtype.startswith("M8"):
            assert result.iloc[0, 0] == Timestamp(1, unit=unit)
        else:
            assert result.iloc[0, 0] == Timedelta(1, unit=unit)

    @pytest.mark.parametrize("arr_dtype", [np.int64, np.float64])
    @pytest.mark.parametrize("dtype", ["M8", "m8"])
    @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"])
    def test_astype_to_datetimelike_unit(self, arr_dtype, dtype, unit):
        """Numeric data cast to a datetime-like unit matches the numpy cast."""
        # tests all units from numeric origination
        # GH#19223 / GH#12425
        dtype = f"{dtype}[{unit}]"
        arr = np.array([[1, 2, 3]], dtype=arr_dtype)
        df = DataFrame(arr)
        result = df.astype(dtype)
        expected = DataFrame(arr.astype(dtype))

        tm.assert_frame_equal(result, expected)

    @td.skip_array_manager_not_yet_implemented
    @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"])
    def test_astype_to_datetime_unit(self, unit):
        """Datetime data cast to its own unit matches the numpy cast."""
        # tests all units from datetime origination
        # GH#19223
        dtype = f"M8[{unit}]"
        arr = np.array([[1, 2, 3]], dtype=dtype)
        df = DataFrame(arr)
        result = df.astype(dtype)
        expected = DataFrame(arr.astype(dtype))

        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("unit", ["ns"])
    def test_astype_to_timedelta_unit_ns(self, unit):
        """Nanosecond timedelta data stays timedelta when cast to m8[ns]."""
        # preserve the timedelta conversion
        # GH#19223
        dtype = f"m8[{unit}]"
        arr = np.array([[1, 2, 3]], dtype=dtype)
        df = DataFrame(arr)
        result = df.astype(dtype)
        expected = DataFrame(arr.astype(dtype))

        tm.assert_frame_equal(result, expected)

    @td.skip_array_manager_not_yet_implemented
    @pytest.mark.parametrize("unit", ["us", "ms", "s", "h", "m", "D"])
    def test_astype_to_timedelta_unit(self, unit):
        """Casting to a non-nanosecond timedelta unit yields floats."""
        # coerce to float
        # GH#19223
        dtype = f"m8[{unit}]"
        arr = np.array([[1, 2, 3]], dtype=dtype)
        df = DataFrame(arr)
        result = df.astype(dtype)
        expected = DataFrame(df.values.astype(dtype).astype(float))

        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"])
    def test_astype_to_incorrect_datetimelike(self, unit):
        """Casting datetime to timedelta, or the reverse, raises TypeError."""
        # trying to astype a m to a M, or vice-versa
        # GH#19224
        dtype = f"M8[{unit}]"
        other = f"m8[{unit}]"

        df = DataFrame(np.array([[1, 2, 3]], dtype=dtype))
        msg = fr"Cannot cast DatetimeArray to dtype timedelta64\[{unit}\]"
        with pytest.raises(TypeError, match=msg):
            df.astype(other)

        msg = fr"Cannot cast TimedeltaArray to dtype datetime64\[{unit}\]"
        df = DataFrame(np.array([[1, 2, 3]], dtype=other))
        with pytest.raises(TypeError, match=msg):
            df.astype(dtype)

    @td.skip_array_manager_not_yet_implemented
    def test_astype_arg_for_errors(self):
        """An invalid ``errors`` value raises; ``"ignore"`` is accepted."""
        # GH#14878

        df = DataFrame([1, 2, 3])

        msg = ("Expected value of kwarg 'errors' to be one of "
               "['raise', 'ignore']. Supplied value is 'True'")
        with pytest.raises(ValueError, match=re.escape(msg)):
            df.astype(np.float64, errors=True)

        df.astype(np.int8, errors="ignore")

    def test_astype_arg_for_errors_dictlist(self):
        """With a dtype dict and ``errors="ignore"`` bad columns are kept."""
        # GH#25905
        df = DataFrame([
            {
                "a": "1",
                "b": "16.5%",
                "c": "test"
            },
            {
                "a": "2.2",
                "b": "15.3",
                "c": "another_test"
            },
        ])
        expected = DataFrame([
            {
                "a": 1.0,
                "b": "16.5%",
                "c": "test"
            },
            {
                "a": 2.2,
                "b": "15.3",
                "c": "another_test"
            },
        ])
        type_dict = {"a": "float64", "b": "float64", "c": "object"}

        result = df.astype(dtype=type_dict, errors="ignore")

        tm.assert_frame_equal(result, expected)

    def test_astype_dt64tz(self, timezone_frame):
        """tz-aware frame cast to object, and (with a warning) to naive."""
        # astype
        expected = np.array(
            [
                [
                    Timestamp("2013-01-01 00:00:00"),
                    Timestamp("2013-01-02 00:00:00"),
                    Timestamp("2013-01-03 00:00:00"),
                ],
                [
                    Timestamp("2013-01-01 00:00:00-0500", tz="US/Eastern"),
                    NaT,
                    Timestamp("2013-01-03 00:00:00-0500", tz="US/Eastern"),
                ],
                [
                    Timestamp("2013-01-01 00:00:00+0100", tz="CET"),
                    NaT,
                    Timestamp("2013-01-03 00:00:00+0100", tz="CET"),
                ],
            ],
            dtype=object,
        ).T
        expected = DataFrame(
            expected,
            index=timezone_frame.index,
            columns=timezone_frame.columns,
            dtype=object,
        )
        result = timezone_frame.astype(object)
        tm.assert_frame_equal(result, expected)

        with tm.assert_produces_warning(FutureWarning):
            # dt64tz->dt64 deprecated
            result = timezone_frame.astype("datetime64[ns]")
        expected = DataFrame({
            "A": date_range("20130101", periods=3),
            "B": (date_range("20130101", periods=3,
                             tz="US/Eastern").tz_convert("UTC").tz_localize(None)),
            "C": (date_range("20130101", periods=3,
                             tz="CET").tz_convert("UTC").tz_localize(None)),
        })
        expected.iloc[1, 1] = NaT
        expected.iloc[1, 2] = NaT
        tm.assert_frame_equal(result, expected)

    def test_astype_dt64tz_to_str(self, timezone_frame):
        """String cast and ``str()`` repr of a tz-aware frame."""
        # str formatting
        result = timezone_frame.astype(str)
        expected = DataFrame(
            [
                [
                    "2013-01-01",
                    "2013-01-01 00:00:00-05:00",
                    "2013-01-01 00:00:00+01:00",
                ],
                ["2013-01-02", "NaT", "NaT"],
                [
                    "2013-01-03",
                    "2013-01-03 00:00:00-05:00",
                    "2013-01-03 00:00:00+01:00",
                ],
            ],
            columns=timezone_frame.columns,
        )
        tm.assert_frame_equal(result, expected)

        with option_context("display.max_columns", 20):
            result = str(timezone_frame)
            assert (
                "0 2013-01-01 2013-01-01 00:00:00-05:00 2013-01-01 00:00:00+01:00"
            ) in result
            # The repr right-aligns NaT to the width of the tz-aware
            # timestamps in the same column.  Build that padding explicitly:
            # a literal with single spaces can never match the aligned row.
            nat_cell = "NaT".rjust(len("2013-01-01 00:00:00-05:00"))
            assert f"1 2013-01-02 {nat_cell} {nat_cell}" in result
            assert (
                "2 2013-01-03 2013-01-03 00:00:00-05:00 2013-01-03 00:00:00+01:00"
            ) in result

    def test_astype_empty_dtype_dict(self):
        """An empty dtype dict returns an equal frame that is a new object."""
        # issue mentioned further down in the following issue's thread
        # https://github.com/pandas-dev/pandas/issues/33113
        df = DataFrame()
        result = df.astype({})
        tm.assert_frame_equal(result, df)
        assert result is not df

    @td.skip_array_manager_not_yet_implemented  # TODO(ArrayManager) ignore keyword
    @pytest.mark.parametrize(
        "df",
        [
            DataFrame(Series(["x", "y", "z"], dtype="string")),
            DataFrame(Series(["x", "y", "z"], dtype="category")),
            DataFrame(Series(3 * [Timestamp("2020-01-01", tz="UTC")])),
            DataFrame(Series(3 * [Interval(0, 1)])),
        ],
    )
    @pytest.mark.parametrize("errors", ["raise", "ignore"])
    def test_astype_ignores_errors_for_extension_dtypes(self, df, errors):
        """``errors`` is honoured for extension dtypes cast to float."""
        # https://github.com/pandas-dev/pandas/issues/35471
        if errors == "ignore":
            expected = df
            result = df.astype(float, errors=errors)
            tm.assert_frame_equal(result, expected)
        else:
            msg = "(Cannot cast)|(could not convert)"
            with pytest.raises((ValueError, TypeError), match=msg):
                df.astype(float, errors=errors)

    def test_astype_tz_conversion(self):
        """Casting tz-aware data to another tz equals ``dt.tz_convert``."""
        # GH 35973
        val = {
            "tz": date_range("2020-08-30", freq="d", periods=2,
                             tz="Europe/London")
        }
        df = DataFrame(val)
        result = df.astype({"tz": "datetime64[ns, Europe/Berlin]"})

        expected = df
        expected["tz"] = expected["tz"].dt.tz_convert("Europe/Berlin")
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("tz", ["UTC", "Europe/Berlin"])
    def test_astype_tz_object_conversion(self, tz):
        """Object-dtype timestamps cast back to the construction timezone."""
        # GH 35973
        val = {
            "tz": date_range("2020-08-30", freq="d", periods=2,
                             tz="Europe/London")
        }
        expected = DataFrame(val)

        # convert expected to object dtype from other tz str (independently tested)
        result = expected.astype({"tz": f"datetime64[ns, {tz}]"})
        result = result.astype({"tz": "object"})

        # do real test: object dtype to a specified tz, different from construction tz.
        result = result.astype({"tz": "datetime64[ns, Europe/London]"})
        tm.assert_frame_equal(result, expected)

    def test_astype_dt64_to_string(self, frame_or_series, tz_naive_fixture,
                                   request):
        """Casting datetimes to "string" matches ``DatetimeArray.astype``."""
        tz = tz_naive_fixture
        if tz is None:
            mark = pytest.mark.xfail(
                reason=
                "GH#36153 uses ndarray formatting instead of DTA formatting")
            request.node.add_marker(mark)

        dti = date_range("2016-01-01", periods=3, tz=tz)
        dta = dti._data
        dta[0] = NaT

        obj = frame_or_series(dta)
        result = obj.astype("string")

        # Check that Series/DataFrame.astype matches DatetimeArray.astype
        expected = frame_or_series(dta.astype("string"))
        tm.assert_equal(result, expected)

        item = result.iloc[0]
        if frame_or_series is DataFrame:
            item = item.iloc[0]
        assert item is pd.NA

        # For non-NA values, we should match what we get for non-EA str
        alt = obj.astype(str)
        assert np.all(alt.iloc[1:] == result.iloc[1:])

    def test_astype_bytes(self):
        """Casting three-character strings to bytes gives dtype ``S3``."""
        # GH#39474
        result = DataFrame(["foo", "bar", "baz"]).astype(bytes)
        assert result.dtypes[0] == np.dtype("S3")