def test_sort_index_categorical_index(self):
    df = DataFrame(
        {
            "A": np.arange(6, dtype="int64"),
            "B": Series(list("aabbca")).astype(CategoricalDtype(list("cab"))),
        }
    ).set_index("B")

    result = df.sort_index()
    expected = df.iloc[[4, 0, 1, 5, 2, 3]]
    tm.assert_frame_equal(result, expected)

    result = df.sort_index(ascending=False)
    expected = df.iloc[[2, 3, 0, 1, 5, 4]]
    tm.assert_frame_equal(result, expected)
def test_unique_index_series(self, ordered):
    # GH38140
    dtype = CategoricalDtype([3, 2, 1], ordered=ordered)

    c = Categorical([3, 1, 2, 2, 1], dtype=dtype)
    # Categorical.unique preserves the dtype's category order,
    # whether or not the dtype is ordered
    exp = Categorical([3, 1, 2], dtype=dtype)
    tm.assert_categorical_equal(c.unique(), exp)
    tm.assert_index_equal(Index(c).unique(), Index(exp))
    tm.assert_categorical_equal(Series(c).unique(), exp)

    c = Categorical([1, 1, 2, 2], dtype=dtype)
    exp = Categorical([1, 2], dtype=dtype)
    tm.assert_categorical_equal(c.unique(), exp)
    tm.assert_index_equal(Index(c).unique(), Index(exp))
    tm.assert_categorical_equal(Series(c).unique(), exp)
def X_feature_label_encode(self, dataframe: DataFrame) -> DataFrame:
    for label_string, fieldnames in self.params['X_feature_label_encode'].items():
        labels = label_string.split(',')
        category_dtype = CategoricalDtype(categories=labels, ordered=True)
        encoder = LabelEncoder()
        encoder.fit(labels)
        for fieldname in fieldnames:
            # Replace NaN with the first label (e.g. 'NA');
            # encoder.transform() raises an exception on unseen values
            dataframe[fieldname] = dataframe[fieldname].astype(category_dtype)
            dataframe[fieldname].fillna(labels[0], inplace=True)
            dataframe[f"{fieldname}_Numeric"] = encoder.transform(
                dataframe[fieldname])
    self.params['X_feature_exclude'] += list(
        flatten(self.params['X_feature_label_encode'].values()))
    return dataframe
def clean_nsr(df):
    # 'developer' is assumed to be defined elsewhere as a mapping of
    # column name -> ordered list of category values
    od = developer
    nsr_var = [
        'parents', 'has_nurs', 'form', 'children', 'housing', 'finance',
        'social', 'health', 'target'
    ]
    df.columns = nsr_var
    raw = df.copy()
    # raw = raw.replace({'inconv': 0, 'convenient': 1})
    df = df.replace('_', '', regex=True)
    # df = df.replace(' ', '', regex=True)
    df = df.drop(columns=['finance'])
    for i in df.columns:
        df[i] = df[i].astype('category')
        r = od[i]
        cat_r = CategoricalDtype(categories=r, ordered=True)  # give the order
        df[i] = df[i].cat.reorder_categories(r, ordered=True)
    df['finance'] = raw['finance']
    return df
def test_unique(self, ordered):
    # GH38140
    dtype = CategoricalDtype(["a", "b", "c"], ordered=ordered)

    # unique preserves the dtype's categories, whether ordered or not
    cat = Categorical(["a", "b", "c"], dtype=dtype)
    res = cat.unique()
    tm.assert_categorical_equal(res, cat)

    cat = Categorical(["a", "b", "a", "a"], dtype=dtype)
    res = cat.unique()
    tm.assert_categorical_equal(res, Categorical(["a", "b"], dtype=dtype))

    cat = Categorical(["c", "a", "b", "a", "a"], dtype=dtype)
    res = cat.unique()
    exp_cat = Categorical(["c", "a", "b"], dtype=dtype)
    tm.assert_categorical_equal(res, exp_cat)

    # duplicate NaNs are dropped; a single NaN is kept
    cat = Categorical(["b", np.nan, "b", np.nan, "a"], dtype=dtype)
    res = cat.unique()
    exp_cat = Categorical(["b", np.nan, "a"], dtype=dtype)
    tm.assert_categorical_equal(res, exp_cat)
def load_dframe(subj_num=0):
    fname = "prodroma.xlsx"
    dframe = pd.read_excel(fname,
                           sheet_name=subj_num,
                           skiprows=10,
                           usecols="B:CR",
                           index_col=0).T
    # rename the Russian diary headers to English identifiers
    dframe = dframe.rename(columns={
        "день": "day",
        "Время заполнения ТП": "fillin_time",
        "ГБ новая": "ha_new",
        "ГБ продолжение": "ha_cont",
        "Начало боли": "ha_start",
        "Окончание боли": "ha_stop",
        "Обезболивающее": "painkiller",
        "Название": "painkiller_name",
        "аура": "aura",
        "Боль сейчас": "ha_now",
        "ВАШ макс": "your_max",
        "односторонняя": "onesided",
        "пульсация": "pulsation",
        "усиление движением": "intens_by_mov",
        "тошнота": "vomiting",
        "чувствительность к свету": "light_sens_bin",
        "чувствительность к звуку": "noise_sens_bin",
        "чувствительность к запахам": "smell_sens_bin",
        "заметил провокатор": "noticed_trigger",
        "какой триггер": "which_trigger",
        "Продолжительность сна": "sleep_duration",
        "Качество сна": "sleep_quality",
        "Свежесть после сна": "sleep_freshness",
        "Больше света, чем обычно": "a_lot_light",
        "Чувствительность к свету": "light_sens_cat",
        "Больше звука чем обычно": "a_lot_noise",
        "Чувствительность к звуку": "noise_sens_cat",
        "Были резкие запахи?": "strong_smells",
        "Чувствительность к запахам": "smell_sens_cat",
        "Пропуск приема пищи": "meal_skip",
        "Чувство голода": "hunger",
        "Воды достаточно?": "hydration",
        "Жажда": "thirst",
        "Алкоголь": "alcohol",
        "кофеин": "caffeine",
        "сыр, шоко, цитрус": "cheese_choco_citrus",
        "Хотелось шоколада": "wanted_choco",
        "Чувство усталости": "tiredness",
        "Сложность концентрации": "focus_difficulty",
        "Тревога": "anxiety",
        "Депрессия": "depression",
        # the sheet contains a misspelled variant of the same header
        "Работоспособность": "productivity",
        "Работосособность": "productivity",
        "Сонливость": "sleepiness",
        "Зевания": "yawning",
        "Напряжение глаз": "eye_strain",
        "боль в шее": "neck_pain",
        "Чувствит кожи головы": "scalp_sens",
        "Физическая ативность": "exercise",
        "какой день": "which_day",
        "Перелеты": "flights",
        "1 день менструации": "pms_1st_day",
        "подташнивает": "nausea",
        "вегетатика": "vegetatics",
        "мочеиспускание": "urination",
        "% заполнения дневника": "journal_completion_percentage",
        "комментарий": "comment",
        "дата": "date",
        "ТП": "TP",
    })
    dframe = dframe.set_index(["date", "TP"])
    dframe.columns.rename(None, inplace=True)
    dframe.fillin_time = pd.to_datetime(dframe.fillin_time)
    # "да"/"нет" are the yes/no answers in the diary
    dframe.replace(to_replace="да", value=True, inplace=True)
    dframe.replace(to_replace="нет", value=False, inplace=True)
    # unanswered yes/no questions count as "no"
    for col in ["ha_new", "ha_cont", "painkiller", "vomiting",
                "intens_by_mov", "pulsation", "light_sens_bin",
                "noise_sens_bin", "smell_sens_bin", "flights",
                "pms_1st_day"]:
        dframe[col] = dframe[col].fillna(False)
    dframe["ha_now"] = dframe["ha_new"] | dframe["ha_cont"]
    cat_type = CategoricalDtype([1, 2, 3, 4, 5], ordered=True)
    for col in [
            "anxiety", "depression", "tiredness", "productivity",
            "sleepiness", "light_sens_cat", "smell_sens_cat",
            "noise_sens_cat", "sleep_quality", "sleep_freshness", "hunger",
    ]:
        dframe[col] = dframe[col].astype(cat_type)
    # dframe["anxiety"] = dframe["depression"].astype(int).astype('category')
    return dframe
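# A minimal sketch, separate from the loader above, of what the ordered 1-5
# CategoricalDtype enables on the Likert-scale columns: comparisons and
# min/max follow the declared order. The values here are made up.
import pandas as pd
from pandas import CategoricalDtype

scale = CategoricalDtype([1, 2, 3, 4, 5], ordered=True)
s = pd.Series([3, 5, 1, 4], dtype=scale)
print((s > 2).tolist())  # [True, True, False, True]
print(s.max())           # 5 -- only defined because ordered=True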
def test_unique(self, data, categories, expected_data, ordered):
    dtype = CategoricalDtype(categories, ordered=ordered)
    idx = CategoricalIndex(data, dtype=dtype)
    expected = CategoricalIndex(expected_data, dtype=dtype)
    tm.assert_index_equal(idx.unique(), expected)
                  columns=list('ABCDEF'))
initMetadata(df)
MARKERS = ['hex', 'circle_x', 'triangle', 'square']
markerFactor = factor_mark('DDC', MARKERS, ["A0", "A1", "A2", "A3", "A4"])
colorFactor = factor_cmap('DDC', 'Category10_6', ["A0", "A1", "A2", "A3", "A4"])
mapDDC = {0: "A0", 1: "A1", 2: "A2", 3: "A3", 4: "A4"}
df.eval("Bool=A>0.5", inplace=True)
df.eval("BoolB=B>0.5", inplace=True)
df.eval("BoolC=C>0.1", inplace=True)
df["A"] = df["A"].round(3)
df["B"] = df["B"].round(3)
df["C"] = df["C"].round(3)
df["D"] = df["D"].round(3)
df["AA"] = ((df.A * 10).round(0)).astype(CategoricalDtype(ordered=True))
df["CC"] = ((df.C * 5).round(0)).astype(int)
df["DD"] = ((df.D * 4).round(0)).astype(int)
df["DDC"] = ((df.D * 4).round(0)).astype(int).map(mapDDC)
df["EE"] = (df.E * 4).round(0)
df['errY'] = df.A * 0.02 + 0.02
df.head(10)
df.meta.metaData = {
    'A.AxisTitle': "A (cm)",
    'B.AxisTitle': "B (cm/s)",
    'C.AxisTitle': "C (s)",
    'D.AxisTitle': "D (a.u.)",
    'Bool.AxisTitle': "A>half",
    'E.AxisTitle': "Category",
}
def tree2Panda(tree, include, selection, **kwargs):
    r"""
    Convert selected items from the tree into a pandas DataFrame

    TODO:
      * consult with uproot
      * currently not able to work with friend trees
      * check the latest version of RDataFrame (in AliRoot latest v16.16.00)
      * add filter on metadata - e.g. class of variables

    :param tree:       input tree
    :param include:    array of regular expressions - processing Tree+Friends, branches, aliases
    :param selection:  tree selection ()
    :param kwargs:
        * exclude      exclude array
        * firstEntry   first entry to convert
        * nEntries     number of entries to convert
        * columnMask   column mask
    :return: pandas DataFrame
    """
    options = {
        "exclude": [],
        "firstEntry": 0,
        "nEntries": 100000000,
        "columnMask": [[".fX$", "_X"], [".fY$", "_y"], [".fElements", ""]],
        "category": 0,
        "verbose": 0,
    }
    options.update(kwargs)
    if not hasattr(tree, 'anyTree'):
        treeToAnyTree(tree)  # expand tree/aliases/variables - if not done before
    anyTree = tree.anyTree
    # check regular expressions in anyTree
    variablesTree = findSelectedBranches(anyTree, include, options["exclude"])
    variables = ""
    for var in variablesTree:
        # if var.length < 2: continue
        var = var.replace("/", ".")
        variables += var + ":"
    # check if valid TTree formula
    for var in include:
        if ".*" in var:
            continue
        formula = ROOT.TTreeFormula('test', var, tree)
        if formula.GetNdim() > 0:
            variables += var + ":"
    variables = variables[0:-1]
    # query data
    entries = tree.Draw(str(variables), selection, "goffpara",
                        options["nEntries"], options["firstEntry"])
    columns = variables.split(":")
    # replace column names:
    # 1.) pandas does not allow dots in names
    # 2.) user can specify their own column mask
    for i, column in enumerate(columns):
        columns[i] = column.replace(".", "_")
    for i, column in enumerate(columns):
        for mask in options["columnMask"]:
            columns[i] = columns[i].replace(mask[0], mask[1])
    ex_dict = {}
    for i, a in enumerate(columns):
        val = tree.GetVal(i)
        ex_dict[a] = np.frombuffer(val, dtype=float, count=entries)
    df = pd.DataFrame(ex_dict, columns=columns)
    for i, a in enumerate(columns):
        if tree.GetLeaf(a):
            if tree.GetLeaf(a).ClassName() == 'TLeafC':
                df[a] = df[a].astype(np.int8)
            if tree.GetLeaf(a).ClassName() == 'TLeafS':
                df[a] = df[a].astype(np.int16)
            if tree.GetLeaf(a).ClassName() == 'TLeafI':
                df[a] = df[a].astype(np.int32)
            if tree.GetLeaf(a).ClassName() == 'TLeafL':
                df[a] = df[a].astype(np.int64)
            if tree.GetLeaf(a).ClassName() == 'TLeafB':
                df[a] = df[a].astype(bool)
        if options["category"] > 0:
            dfUniq = df[a].unique()
            if dfUniq.shape[0] <= options["category"]:
                df[a] = df[a].astype(CategoricalDtype(ordered=True))
    initMetadata(df)
    metaData = tree.GetUserInfo().FindObject("metaTable")
    if metaData:
        for key in metaData:
            df.meta.metaData[key.GetName()] = key.GetTitle()
    return df
def test_getitem_bool_mask_categorical_index(self):
    df3 = DataFrame(
        {"A": np.arange(6, dtype="int64")},
        index=CategoricalIndex(
            [1, 1, 2, 1, 3, 2],
            dtype=CategoricalDtype([3, 2, 1], ordered=True),
            name="B",
        ),
    )
    df4 = DataFrame(
        {"A": np.arange(6, dtype="int64")},
        index=CategoricalIndex(
            [1, 1, 2, 1, 3, 2],
            dtype=CategoricalDtype([3, 2, 1], ordered=False),
            name="B",
        ),
    )

    result = df3[df3.index == "a"]
    expected = df3.iloc[[]]
    tm.assert_frame_equal(result, expected)

    result = df4[df4.index == "a"]
    expected = df4.iloc[[]]
    tm.assert_frame_equal(result, expected)

    result = df3[df3.index == 1]
    expected = df3.iloc[[0, 1, 3]]
    tm.assert_frame_equal(result, expected)

    result = df4[df4.index == 1]
    expected = df4.iloc[[0, 1, 3]]
    tm.assert_frame_equal(result, expected)

    # since we have an ordered categorical

    # CategoricalIndex([1, 1, 2, 1, 3, 2],
    #                  categories=[3, 2, 1],
    #                  ordered=True,
    #                  name='B')
    result = df3[df3.index < 2]
    expected = df3.iloc[[4]]
    tm.assert_frame_equal(result, expected)

    result = df3[df3.index > 1]
    expected = df3.iloc[[]]
    tm.assert_frame_equal(result, expected)

    # unordered
    # cannot be compared

    # CategoricalIndex([1, 1, 2, 1, 3, 2],
    #                  categories=[3, 2, 1],
    #                  ordered=False,
    #                  name='B')
    msg = "Unordered Categoricals can only compare equality or not"
    with pytest.raises(TypeError, match=msg):
        df4[df4.index < 2]
    with pytest.raises(TypeError, match=msg):
        df4[df4.index > 1]
from pandas.api.types import CategoricalDtype  # needed for the dtypes below

from . import loader

REGIONS = {
    "north-america": "North America",
    "south-asia": "South Asia",
    "sub-saharan-africa": "Sub-Saharan Africa",
    "europe": "Europe & Central Asia",
    "latin-america": "Latin America & Caribbean",
    "middle-east": "Middle East & North Africa",
    "east-asia": "East Asia & Pacific",
}
INCOME_GROUPS = {
    "low": "Low income",
    "lower-middle": "Lower middle income",
    "upper-middle": "Upper middle income",
    "high": "High income",
}

# passing a dict uses its keys as the categories
IncomeGroup = CategoricalDtype(categories=INCOME_GROUPS, ordered=True)
Region = CategoricalDtype(categories=REGIONS, ordered=False)


@loader.filtering_from_data(["region"])
def load_region():
    return loader.load_database("un.pkl.gz").astype(Region)


@loader.filtering_from_data(["income_group"])
def load_income_group():
    return loader.load_database("un.pkl.gz").astype(IncomeGroup)
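# A minimal usage sketch (toy values, not part of the module above): with
# ordered=True the income groups sort and compare by the declared order
# rather than alphabetically.
import pandas as pd
from pandas import CategoricalDtype

income = CategoricalDtype(["low", "lower-middle", "upper-middle", "high"],
                          ordered=True)
s = pd.Series(["high", "low", "upper-middle"], dtype=income)
print(s.sort_values().tolist())       # ['low', 'upper-middle', 'high']
print((s < "upper-middle").tolist())  # [False, True, False]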
class TestUpdate:
    def test_update(self):
        s = Series([1.5, np.nan, 3.0, 4.0, np.nan])
        s2 = Series([np.nan, 3.5, np.nan, 5.0])
        s.update(s2)

        expected = Series([1.5, 3.5, 3.0, 5.0, np.nan])
        tm.assert_series_equal(s, expected)

        # GH 3217
        df = DataFrame([{"a": 1}, {"a": 3, "b": 2}])
        df["c"] = np.nan

        df["c"].update(Series(["foo"], index=[0]))
        expected = DataFrame(
            [[1, np.nan, "foo"], [3, 2.0, np.nan]], columns=["a", "b", "c"]
        )
        tm.assert_frame_equal(df, expected)

    @pytest.mark.parametrize(
        "other, dtype, expected",
        [
            # other is int
            ([61, 63], "int32", Series([10, 61, 12], dtype="int32")),
            ([61, 63], "int64", Series([10, 61, 12])),
            ([61, 63], float, Series([10.0, 61.0, 12.0])),
            ([61, 63], object, Series([10, 61, 12], dtype=object)),
            # other is float, but can be cast to int
            ([61.0, 63.0], "int32", Series([10, 61, 12], dtype="int32")),
            ([61.0, 63.0], "int64", Series([10, 61, 12])),
            ([61.0, 63.0], float, Series([10.0, 61.0, 12.0])),
            ([61.0, 63.0], object, Series([10, 61.0, 12], dtype=object)),
            # other is float, cannot be cast to int
            ([61.1, 63.1], "int32", Series([10.0, 61.1, 12.0])),
            ([61.1, 63.1], "int64", Series([10.0, 61.1, 12.0])),
            ([61.1, 63.1], float, Series([10.0, 61.1, 12.0])),
            ([61.1, 63.1], object, Series([10, 61.1, 12], dtype=object)),
            # other is object, cannot be cast
            ([(61,), (63,)], "int32", Series([10, (61,), 12])),
            ([(61,), (63,)], "int64", Series([10, (61,), 12])),
            ([(61,), (63,)], float, Series([10.0, (61,), 12.0])),
            ([(61,), (63,)], object, Series([10, (61,), 12])),
        ],
    )
    def test_update_dtypes(self, other, dtype, expected):
        ser = Series([10, 11, 12], dtype=dtype)
        other = Series(other, index=[1, 3])
        ser.update(other)

        tm.assert_series_equal(ser, expected)

    @pytest.mark.parametrize(
        "series, other, expected",
        [
            # update by key
            (
                Series({"a": 1, "b": 2, "c": 3, "d": 4}),
                {"b": 5, "c": np.nan},
                Series({"a": 1, "b": 5, "c": 3, "d": 4}),
            ),
            # update by position
            (Series([1, 2, 3, 4]), [np.nan, 5, 1], Series([1, 5, 1, 4])),
        ],
    )
    def test_update_from_non_series(self, series, other, expected):
        # GH 33215
        series.update(other)

        tm.assert_series_equal(series, expected)

    @pytest.mark.parametrize(
        "data, other, expected, dtype",
        [
            (["a", None], [None, "b"], ["a", "b"], "string"),
            pytest.param(
                ["a", None],
                [None, "b"],
                ["a", "b"],
                "arrow_string",
                marks=td.skip_if_no("pyarrow", min_version="1.0.0"),
            ),
            ([1, None], [None, 2], [1, 2], "Int64"),
            ([True, None], [None, False], [True, False], "boolean"),
            (
                ["a", None],
                [None, "b"],
                ["a", "b"],
                CategoricalDtype(categories=["a", "b"]),
            ),
            (
                [Timestamp(year=2020, month=1, day=1, tz="Europe/London"), NaT],
                [NaT, Timestamp(year=2020, month=1, day=1, tz="Europe/London")],
                [Timestamp(year=2020, month=1, day=1, tz="Europe/London")] * 2,
                "datetime64[ns, Europe/London]",
            ),
        ],
    )
    def test_update_extension_array_series(self, data, other, expected, dtype):
        result = Series(data, dtype=dtype)
        other = Series(other, dtype=dtype)
        expected = Series(expected, dtype=dtype)

        result.update(other)

        tm.assert_series_equal(result, expected)

    def test_update_with_categorical_type(self):
        # GH 25744
        dtype = CategoricalDtype(["a", "b", "c", "d"])
        s1 = Series(["a", "b", "c"], index=[1, 2, 3], dtype=dtype)
        s2 = Series(["b", "a"], index=[1, 2], dtype=dtype)
        s1.update(s2)
        result = s1
        expected = Series(["b", "a", "c"], index=[1, 2, 3], dtype=dtype)
        tm.assert_series_equal(result, expected)
if kind == "kn": subkind = "tsne" else: sub_kind = kind subset = cl_df[[c + "_" + sub_kind for c in ['x', 'y', 'z']]] print(subset[:10]) points = [list(x) for x in subset.to_numpy()] print(points[:10]) print(len(points)) arr = np.array(points) dist = Y = cdist(arr, arr, 'euclidean') new_path = make_path(np.array(points), dist)[:-1] print(new_path) cl_df[['cl_%s' % k for k in things]] = cl_cols path_order_categories = CategoricalDtype(categories=new_path, ordered=True) cl_df['cl_%s' % kind] = cl_df['cl'].astype(path_order_categories) cl_df.sort_values(['cl_%s' % kind], inplace=True) cl_df['cl_%s' % kind] = cl_df['cl'].astype('int32') cl_df.to_csv('%s_clusters_mean_points.csv' % kind, sep='\t', header=True, index=False) print(kind + " " + str(new_path))
def test_numpy_transpose(index_or_series_obj):
    msg = "the 'axes' parameter is not supported"
    obj = index_or_series_obj
    tm.assert_equal(np.transpose(obj), obj)

    with pytest.raises(ValueError, match=msg):
        np.transpose(obj, axes=1)


@pytest.mark.parametrize(
    "data, transposed_data, index, columns, dtype",
    [
        ([[1], [2]], [[1, 2]], ["a", "a"], ["b"], int),
        ([[1], [2]], [[1, 2]], ["a", "a"], ["b"], CategoricalDtype([1, 2])),
        ([[1, 2]], [[1], [2]], ["b"], ["a", "a"], int),
        ([[1, 2]], [[1], [2]], ["b"], ["a", "a"], CategoricalDtype([1, 2])),
        ([[1, 2], [3, 4]], [[1, 3], [2, 4]], ["a", "a"], ["b", "b"], int),
        (
            [[1, 2], [3, 4]],
            [[1, 3], [2, 4]],
            ["a", "a"],
            ["b", "b"],
            CategoricalDtype([1, 2, 3, 4]),
        ),
    ],
)
def test_duplicate_labels(data, transposed_data, index, columns, dtype):
    # GH 42380
    df = DataFrame(data, index=index, columns=columns, dtype=dtype)
def test_output_attributes(scraper_output):
    results = scraper_output
    exp_cols = [
        "Place (Overall)",
        "Place (Gender)",
        "Place (Category)",
        "Name",
        "Sex",
        "Club",
        "Running Number",
        "Category",
        "Finish",
        "Year",
        "Country",
        "FirstName",
        "LastName",
        "DSQ",
        "Finish (Total Seconds)",
    ]
    exp_dtypes = pd.Series({
        "Place (Overall)": Int64Dtype(),
        "Place (Gender)": Int64Dtype(),
        "Place (Category)": dtype("float64"),
        "Name": dtype("O"),
        "Sex": dtype("O"),
        "Club": dtype("O"),
        "Running Number": dtype("O"),
        "Category": CategoricalDtype(
            categories=[
                "18-39", "40-44", "45-49", "50-54", "55-59", "60-64",
                "65-69", "70+", "70-74", "75-79", "80-84", "85+", "80+",
                "Unknown",
            ],
            ordered=False,
        ),
        "Finish": dtype("<m8[ns]"),
        "Year": Int64Dtype(),
        "Country": dtype("O"),
        "FirstName": dtype("O"),
        "LastName": dtype("O"),
        "DSQ": dtype("bool"),
        "Finish (Total Seconds)": dtype("float64"),
    })
    exp_rows_min = 1000  # One sex for one year should give at least this many

    assert exp_cols == list(results.columns), "Expected columns not found"
    assert exp_rows_min <= results.shape[0], \
        "Less than minimum expected number of rows"
    assert exp_dtypes.values.tolist() == results.dtypes.values.tolist()
import numpy as np
import pandas as pd
from pandas import CategoricalDtype

df = pd.read_csv(
    'D:\\Study\\ML\\Final_Project\\dataset-har-PUC-Rio-ugulino\\Full_Data.csv',
    delimiter=';')
# height uses a decimal comma; removing it yields an integer
# (effectively centimetres)
df['how_tall_in_meters'] = df['how_tall_in_meters'].apply(
    lambda x: int(x.replace(',', '')))
# BMI uses a decimal comma; convert to a float
df['body_mass_index'] = df['body_mass_index'].apply(
    lambda x: float(x.replace(',', '.')))

# fix the category sets so get_dummies always emits the same columns
df["user"] = df["user"].astype(
    CategoricalDtype(['debora', 'katia', 'wallace', 'jose_carlos']))
df = pd.concat([df, pd.get_dummies(df['user'], prefix='user')], axis=1)
df["gender"] = df["gender"].astype(CategoricalDtype(['Woman', 'Man']))
df = pd.concat([df, pd.get_dummies(df['gender'], prefix='gender')], axis=1)
df["class"] = df["class"].astype(
    CategoricalDtype(
        ['sitting', 'sittingdown', 'standing', 'standingup', 'walking']))
df = pd.concat([df, pd.get_dummies(df['class'], prefix='class')], axis=1)
df.drop(['user', 'gender', 'class'], axis=1, inplace=True)

array = df.to_numpy()
np.random.shuffle(array)
train_data = array[:int(len(array) * 0.8)]
test_data = array[int(len(array) * 0.8):]
pd.DataFrame(train_data).to_csv(
    "D:\\Study\\ML\\Final_Project\\Sources\\Datasets\\Train_data.csv",
    header=df.columns, index=False)
pd.DataFrame(test_data).to_csv(
    "D:\\Study\\ML\\Final_Project\\Sources\\Datasets\\Test_data.csv",
    header=df.columns, index=False)
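# Why fix the dtype before get_dummies: a sketch with made-up rows. Without
# the declared categories, a file missing one class would silently produce
# fewer dummy columns; with them, every expected column always appears.
import pandas as pd
from pandas import CategoricalDtype

classes = CategoricalDtype(['sitting', 'standing', 'walking'])
s = pd.Series(['sitting', 'sitting'])      # 'standing'/'walking' absent
print(pd.get_dummies(s).columns.tolist())  # ['sitting']
print(pd.get_dummies(s.astype(classes)).columns.tolist())
# ['sitting', 'standing', 'walking']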
def preprocess_features(fp_processed, only_label=True):
    # Load and merge the datasets
    train = pd.read_csv(fp_processed + 'train.csv', index_col=0)
    valid = pd.read_csv(fp_processed + 'valid.csv', index_col=0)
    test = pd.read_csv(fp_processed + 'test.csv', index_col=0)

    # For easier splitting afterwards
    train['dataset'] = 'train'
    valid['dataset'] = 'valid'
    test['dataset'] = 'test'
    tvt = pd.concat([train, valid, test])

    labels = [
        'concrete_cement', 'healthy_metal', 'incomplete', 'irregular_metal',
        'other'
    ]
    countries = ['colombia', 'guatemala', 'st_lucia']
    places = [
        'borde_rural', 'borde_soacha', 'castries', 'dennery', 'gros_islet',
        'mixco_1_and_ebenezer', 'mixco_3'
    ]
    countries_cat_type = CategoricalDtype(categories=countries, ordered=True)
    places_cat_type = CategoricalDtype(categories=places, ordered=True)
    # + ['unknown'] for the NaNs in the neighbour labels
    labels_cat_type = CategoricalDtype(categories=labels + ['unknown'],
                                       ordered=True)

    # Encode labels
    tvt.loc[:, 'label'] = tvt.loc[:, 'label'].astype(labels_cat_type).cat.codes

    if not only_label:
        # Encode categories.
        # Handle NaN first, otherwise its cat.code is -1, which makes the
        # embedding layer fail (index out of range: -1)
        tvt = tvt.fillna('unknown')
        tvt.loc[:, 'country'] = tvt.loc[:, 'country'].astype(str).astype(
            countries_cat_type).cat.codes
        tvt.loc[:, 'place'] = tvt.loc[:, 'place'].astype(
            places_cat_type).cat.codes
        tvt.loc[:, 'verified'] = tvt.loc[:, 'verified'].astype(int)
        for i in range(1, 21):
            tvt.loc[:, f'l_{i}'] = tvt.loc[:, f'l_{i}'].astype(
                labels_cat_type).cat.codes

        # Normalize continuous features
        continuous_cols = [
            'area', 'complexity', 'z_min', 'z_max', 'z_median', 'z_count',
            'z_majority', 'z_minority', 'z_unique', 'z_range', 'z_sum'
        ]
        for col in continuous_cols:
            mu = tvt[col].mean()
            sigma = tvt[col].std()
            tvt.loc[:, col] = (tvt[col] - mu) / sigma

        # Normalize distances
        mu = tvt.loc[:, 'd_1':'d_19'].values.mean()
        sigma = tvt.loc[:, 'd_1':'d_19'].values.std()
        for i in range(1, 21):
            tvt.loc[:, f'd_{i}'] = (tvt[f'd_{i}'] - mu) / sigma

    # Split and save
    train = tvt[tvt['dataset'] == 'train']
    valid = tvt[tvt['dataset'] == 'valid']
    test = tvt[tvt['dataset'] == 'test']
    train.to_csv(fp_processed + 'train_.csv')
    valid.to_csv(fp_processed + 'valid.csv')
    test.to_csv(fp_processed + 'test.csv')
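# The -1 pitfall in isolation (toy values, independent of the pipeline
# above): anything outside the declared categories, including NaN, encodes
# as -1 under .cat.codes, so it must be mapped to a real category first.
import numpy as np
import pandas as pd
from pandas import CategoricalDtype

dt = CategoricalDtype(['a', 'b', 'unknown'])
s = pd.Series(['a', np.nan, 'zzz'])
print(s.astype(dt).cat.codes.tolist())                    # [0, -1, -1]
print(s.fillna('unknown').astype(dt).cat.codes.tolist())  # [0, 2, -1]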
def test_impl():
    names = ['C1', 'C2', 'C3']
    ct_dtype = CategoricalDtype(['A', 'B', 'C'])
    # np.int was removed from NumPy; use a concrete integer dtype instead
    dtypes = {'C1': np.int64, 'C2': ct_dtype, 'C3': str}
    df = pd.read_csv("csv_data_cat1.csv", names=names, dtype=dtypes)
    return df
class TestDataFrameToRecords:
    def test_to_records_dt64(self):
        df = DataFrame(
            [["one", "two", "three"], ["four", "five", "six"]],
            index=date_range("2012-01-01", "2012-01-02"),
        )
        expected = df.index.values[0]
        result = df.to_records()["index"][0]
        assert expected == result

    def test_to_records_dt64tz_column(self):
        # GH#32535 don't lose tz in to_records
        df = DataFrame(
            {"A": date_range("2012-01-01", "2012-01-02", tz="US/Eastern")})

        result = df.to_records()

        assert result.dtype["A"] == object
        val = result[0][1]
        assert isinstance(val, Timestamp)
        assert val == df.loc[0, "A"]

    def test_to_records_with_multindex(self):
        # GH#3189
        index = [
            ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"],
            ["one", "two", "one", "two", "one", "two", "one", "two"],
        ]
        data = np.zeros((8, 4))
        df = DataFrame(data, index=index)
        r = df.to_records(index=True)["level_0"]
        assert "bar" in r
        assert "one" not in r

    def test_to_records_with_Mapping_type(self):
        import email
        from email.parser import Parser

        abc.Mapping.register(email.message.Message)

        headers = Parser().parsestr("From: <*****@*****.**>\n"
                                    "To: <*****@*****.**>\n"
                                    "Subject: Test message\n"
                                    "\n"
                                    "Body would go here\n")

        frame = DataFrame.from_records([headers])
        all(x in frame for x in ["Type", "Subject", "From"])

    def test_to_records_floats(self):
        df = DataFrame(np.random.rand(10, 10))
        df.to_records()

    def test_to_records_index_name(self):
        df = DataFrame(np.random.randn(3, 3))
        df.index.name = "X"
        rs = df.to_records()
        assert "X" in rs.dtype.fields

        df = DataFrame(np.random.randn(3, 3))
        rs = df.to_records()
        assert "index" in rs.dtype.fields

        df.index = MultiIndex.from_tuples([("a", "x"), ("a", "y"), ("b", "z")])
        df.index.names = ["A", None]
        rs = df.to_records()
        assert "level_0" in rs.dtype.fields

    def test_to_records_with_unicode_index(self):
        # GH#13172
        # unicode_literals conflict with to_records
        result = DataFrame([{"a": "x", "b": "y"}]).set_index("a").to_records()
        expected = np.rec.array([("x", "y")], dtype=[("a", "O"), ("b", "O")])
        tm.assert_almost_equal(result, expected)

    def test_to_records_with_unicode_column_names(self):
        # xref issue: https://github.com/numpy/numpy/issues/2407
        # Issue GH#11879. to_records used to raise an exception when used
        # with column names containing non-ascii characters in Python 2
        result = DataFrame(data={"accented_name_é": [1.0]}).to_records()

        # Note that numpy allows for unicode field names but dtypes need
        # to be specified using dictionary instead of list of tuples.
        expected = np.rec.array(
            [(0, 1.0)],
            dtype={
                "names": ["index", "accented_name_é"],
                "formats": ["=i8", "=f8"]
            },
        )
        tm.assert_almost_equal(result, expected)

    def test_to_records_with_categorical(self):
        # GH#8626

        # dict creation
        df = DataFrame({"A": list("abc")}, dtype="category")
        expected = Series(list("abc"), dtype="category", name="A")
        tm.assert_series_equal(df["A"], expected)

        # list-like creation
        df = DataFrame(list("abc"), dtype="category")
        expected = Series(list("abc"), dtype="category", name=0)
        tm.assert_series_equal(df[0], expected)

        # to record array
        # this coerces
        result = df.to_records()
        expected = np.rec.array([(0, "a"), (1, "b"), (2, "c")],
                                dtype=[("index", "=i8"), ("0", "O")])
        tm.assert_almost_equal(result, expected)

    @pytest.mark.parametrize(
        "kwargs,expected",
        [
            # No dtypes --> default to array dtypes.
            (
                dict(),
                np.rec.array(
                    [(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")],
                    dtype=[("index", "<i8"), ("A", "<i8"), ("B", "<f8"),
                           ("C", "O")],
                ),
            ),
            # Should have no effect in this case.
            (
                dict(index=True),
                np.rec.array(
                    [(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")],
                    dtype=[("index", "<i8"), ("A", "<i8"), ("B", "<f8"),
                           ("C", "O")],
                ),
            ),
            # Column dtype applied across the board. Index unaffected.
            (
                dict(column_dtypes="<U4"),
                np.rec.array(
                    [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                    dtype=[("index", "<i8"), ("A", "<U4"), ("B", "<U4"),
                           ("C", "<U4")],
                ),
            ),
            # Index dtype applied across the board. Columns unaffected.
            (
                dict(index_dtypes="<U1"),
                np.rec.array(
                    [("0", 1, 0.2, "a"), ("1", 2, 1.5, "bc")],
                    dtype=[("index", "<U1"), ("A", "<i8"), ("B", "<f8"),
                           ("C", "O")],
                ),
            ),
            # Pass in a type instance.
            (
                dict(column_dtypes=np.unicode),
                np.rec.array(
                    [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                    dtype=[("index", "<i8"), ("A", "<U"), ("B", "<U"),
                           ("C", "<U")],
                ),
            ),
            # Pass in a dtype instance.
            (
                dict(column_dtypes=np.dtype("unicode")),
                np.rec.array(
                    [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                    dtype=[("index", "<i8"), ("A", "<U"), ("B", "<U"),
                           ("C", "<U")],
                ),
            ),
            # Pass in a dictionary (name-only).
            (
                dict(column_dtypes={"A": np.int8, "B": np.float32, "C": "<U2"}),
                np.rec.array(
                    [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                    dtype=[("index", "<i8"), ("A", "i1"), ("B", "<f4"),
                           ("C", "<U2")],
                ),
            ),
            # Pass in a dictionary (indices-only).
            (
                dict(index_dtypes={0: "int16"}),
                np.rec.array(
                    [(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")],
                    dtype=[("index", "i2"), ("A", "<i8"), ("B", "<f8"),
                           ("C", "O")],
                ),
            ),
            # Ignore index mappings if index is not True.
            (
                dict(index=False, index_dtypes="<U2"),
                np.rec.array(
                    [(1, 0.2, "a"), (2, 1.5, "bc")],
                    dtype=[("A", "<i8"), ("B", "<f8"), ("C", "O")],
                ),
            ),
            # Non-existent names / indices in mapping should not error.
            (
                dict(index_dtypes={0: "int16", "not-there": "float32"}),
                np.rec.array(
                    [(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")],
                    dtype=[("index", "i2"), ("A", "<i8"), ("B", "<f8"),
                           ("C", "O")],
                ),
            ),
            # Names / indices not in mapping default to array dtype.
            (
                dict(column_dtypes={"A": np.int8, "B": np.float32}),
                np.rec.array(
                    [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                    dtype=[("index", "<i8"), ("A", "i1"), ("B", "<f4"),
                           ("C", "O")],
                ),
            ),
            # Names / indices not in dtype mapping default to array dtype.
            (
                dict(column_dtypes={"A": np.dtype("int8"),
                                    "B": np.dtype("float32")}),
                np.rec.array(
                    [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                    dtype=[("index", "<i8"), ("A", "i1"), ("B", "<f4"),
                           ("C", "O")],
                ),
            ),
            # Mixture of everything.
            (
                dict(column_dtypes={"A": np.int8, "B": np.float32},
                     index_dtypes="<U2"),
                np.rec.array(
                    [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                    dtype=[("index", "<U2"), ("A", "i1"), ("B", "<f4"),
                           ("C", "O")],
                ),
            ),
            # Invalid dtype values.
            (
                dict(index=False, column_dtypes=list()),
                (ValueError, "Invalid dtype \\[\\] specified for column A"),
            ),
            (
                dict(index=False, column_dtypes={"A": "int32", "B": 5}),
                (ValueError, "Invalid dtype 5 specified for column B"),
            ),
            # Numpy can't handle EA types, so check error is raised
            (
                dict(
                    index=False,
                    column_dtypes={"A": "int32",
                                   "B": CategoricalDtype(["a", "b"])},
                ),
                (ValueError, "Invalid dtype category specified for column B"),
            ),
            # Check that bad types raise
            (
                dict(index=False, column_dtypes={"A": "int32", "B": "foo"}),
                (TypeError, "data type [\"']foo[\"'] not understood"),
            ),
        ],
    )
    def test_to_records_dtype(self, kwargs, expected):
        # see GH#18146
        df = DataFrame({"A": [1, 2], "B": [0.2, 1.5], "C": ["a", "bc"]})

        if not isinstance(expected, np.recarray):
            with pytest.raises(expected[0], match=expected[1]):
                df.to_records(**kwargs)
        else:
            result = df.to_records(**kwargs)
            tm.assert_almost_equal(result, expected)

    @pytest.mark.parametrize(
        "df,kwargs,expected",
        [
            # MultiIndex in the index.
            (
                DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                          columns=list("abc")).set_index(["a", "b"]),
                dict(column_dtypes="float64",
                     index_dtypes={0: "int32", 1: "int8"}),
                np.rec.array(
                    [(1, 2, 3.0), (4, 5, 6.0), (7, 8, 9.0)],
                    dtype=[("a", "<i4"), ("b", "i1"), ("c", "<f8")],
                ),
            ),
            # MultiIndex in the columns.
            (
                DataFrame(
                    [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                    columns=MultiIndex.from_tuples([("a", "d"), ("b", "e"),
                                                    ("c", "f")]),
                ),
                dict(column_dtypes={0: "<U1", 2: "float32"},
                     index_dtypes="float32"),
                np.rec.array(
                    [(0.0, "1", 2, 3.0), (1.0, "4", 5, 6.0),
                     (2.0, "7", 8, 9.0)],
                    dtype=[
                        ("index", "<f4"),
                        ("('a', 'd')", "<U1"),
                        ("('b', 'e')", "<i8"),
                        ("('c', 'f')", "<f4"),
                    ],
                ),
            ),
            # MultiIndex in both the columns and index.
            (
                DataFrame(
                    [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                    columns=MultiIndex.from_tuples(
                        [("a", "d"), ("b", "e"), ("c", "f")],
                        names=list("ab")),
                    index=MultiIndex.from_tuples(
                        [("d", -4), ("d", -5), ("f", -6)], names=list("cd")),
                ),
                dict(column_dtypes="float64",
                     index_dtypes={0: "<U2", 1: "int8"}),
                np.rec.array(
                    [
                        ("d", -4, 1.0, 2.0, 3.0),
                        ("d", -5, 4.0, 5.0, 6.0),
                        ("f", -6, 7, 8, 9.0),
                    ],
                    dtype=[
                        ("c", "<U2"),
                        ("d", "i1"),
                        ("('a', 'd')", "<f8"),
                        ("('b', 'e')", "<f8"),
                        ("('c', 'f')", "<f8"),
                    ],
                ),
            ),
        ],
    )
    def test_to_records_dtype_mi(self, df, kwargs, expected):
        # see GH#18146
        result = df.to_records(**kwargs)
        tm.assert_almost_equal(result, expected)

    def test_to_records_dict_like(self):
        # see GH#18146
        class DictLike:
            def __init__(self, **kwargs):
                self.d = kwargs.copy()

            def __getitem__(self, key):
                return self.d.__getitem__(key)

            def __contains__(self, key) -> bool:
                return key in self.d

            def keys(self):
                return self.d.keys()

        df = DataFrame({"A": [1, 2], "B": [0.2, 1.5], "C": ["a", "bc"]})

        dtype_mappings = dict(
            column_dtypes=DictLike(**{"A": np.int8, "B": np.float32}),
            index_dtypes="<U2",
        )

        result = df.to_records(**dtype_mappings)
        expected = np.rec.array(
            [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
            dtype=[("index", "<U2"), ("A", "i1"), ("B", "<f4"), ("C", "O")],
        )
        tm.assert_almost_equal(result, expected)

    @pytest.mark.parametrize("tz", ["UTC", "GMT", "US/Eastern"])
    def test_to_records_datetimeindex_with_tz(self, tz):
        # GH#13937
        dr = date_range("2016-01-01", periods=10, freq="S", tz=tz)
        df = DataFrame({"datetime": dr}, index=dr)

        expected = df.to_records()
        result = df.tz_convert("UTC").to_records()

        # both converted to UTC, so they are equal
        tm.assert_numpy_array_equal(result, expected)
class TestDataFrameConvertTo(TestData):

    def test_to_dict_timestamp(self):

        # GH11247
        # split/records producing np.datetime64 rather than Timestamps
        # on datetime64[ns] dtypes only

        tsmp = Timestamp('20130101')
        test_data = DataFrame({'A': [tsmp, tsmp], 'B': [tsmp, tsmp]})
        test_data_mixed = DataFrame({'A': [tsmp, tsmp], 'B': [1, 2]})

        expected_records = [{'A': tsmp, 'B': tsmp},
                            {'A': tsmp, 'B': tsmp}]
        expected_records_mixed = [{'A': tsmp, 'B': 1},
                                  {'A': tsmp, 'B': 2}]

        assert (test_data.to_dict(orient='records') == expected_records)
        assert (test_data_mixed.to_dict(orient='records') ==
                expected_records_mixed)

        expected_series = {
            'A': Series([tsmp, tsmp], name='A'),
            'B': Series([tsmp, tsmp], name='B'),
        }
        expected_series_mixed = {
            'A': Series([tsmp, tsmp], name='A'),
            'B': Series([1, 2], name='B'),
        }

        tm.assert_dict_equal(test_data.to_dict(orient='series'),
                             expected_series)
        tm.assert_dict_equal(test_data_mixed.to_dict(orient='series'),
                             expected_series_mixed)

        expected_split = {
            'index': [0, 1],
            'data': [[tsmp, tsmp], [tsmp, tsmp]],
            'columns': ['A', 'B']
        }
        expected_split_mixed = {
            'index': [0, 1],
            'data': [[tsmp, 1], [tsmp, 2]],
            'columns': ['A', 'B']
        }

        tm.assert_dict_equal(test_data.to_dict(orient='split'),
                             expected_split)
        tm.assert_dict_equal(test_data_mixed.to_dict(orient='split'),
                             expected_split_mixed)

    def test_to_dict_index_not_unique_with_index_orient(self):
        # GH22801
        # Data loss when indexes are not unique. Raise ValueError.
        df = DataFrame({'a': [1, 2], 'b': [0.5, 0.75]}, index=['A', 'A'])
        msg = "DataFrame index must be unique for orient='index'"
        with pytest.raises(ValueError, match=msg):
            df.to_dict(orient='index')

    def test_to_dict_invalid_orient(self):
        df = DataFrame({'A': [0, 1]})
        msg = "orient 'xinvalid' not understood"
        with pytest.raises(ValueError, match=msg):
            df.to_dict(orient='xinvalid')

    def test_to_records_dt64(self):
        df = DataFrame([["one", "two", "three"],
                        ["four", "five", "six"]],
                       index=date_range("2012-01-01", "2012-01-02"))

        # convert_datetime64 defaults to None
        expected = df.index.values[0]
        result = df.to_records()['index'][0]
        assert expected == result

        # check for FutureWarning if convert_datetime64=False is passed
        with tm.assert_produces_warning(FutureWarning):
            expected = df.index.values[0]
            result = df.to_records(convert_datetime64=False)['index'][0]
            assert expected == result

        # check for FutureWarning if convert_datetime64=True is passed
        with tm.assert_produces_warning(FutureWarning):
            expected = df.index[0]
            result = df.to_records(convert_datetime64=True)['index'][0]
            assert expected == result

    def test_to_records_with_multindex(self):
        # GH3189
        index = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
                 ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]
        data = np.zeros((8, 4))
        df = DataFrame(data, index=index)
        r = df.to_records(index=True)['level_0']
        assert 'bar' in r
        assert 'one' not in r

    def test_to_records_with_Mapping_type(self):
        import email
        from email.parser import Parser

        abc.Mapping.register(email.message.Message)

        headers = Parser().parsestr('From: <*****@*****.**>\n'
                                    'To: <*****@*****.**>\n'
                                    'Subject: Test message\n'
                                    '\n'
                                    'Body would go here\n')

        frame = DataFrame.from_records([headers])
        all(x in frame for x in ['Type', 'Subject', 'From'])

    def test_to_records_floats(self):
        df = DataFrame(np.random.rand(10, 10))
        df.to_records()

    def test_to_records_index_name(self):
        df = DataFrame(np.random.randn(3, 3))
        df.index.name = 'X'
        rs = df.to_records()
        assert 'X' in rs.dtype.fields

        df = DataFrame(np.random.randn(3, 3))
        rs = df.to_records()
        assert 'index' in rs.dtype.fields

        df.index = MultiIndex.from_tuples([('a', 'x'), ('a', 'y'),
                                           ('b', 'z')])
        df.index.names = ['A', None]
        rs = df.to_records()
        assert 'level_0' in rs.dtype.fields

    def test_to_records_with_unicode_index(self):
        # GH13172
        # unicode_literals conflict with to_records
        result = DataFrame([{'a': 'x', 'b': 'y'}]).set_index('a') \
            .to_records()
        expected = np.rec.array([('x', 'y')], dtype=[('a', 'O'), ('b', 'O')])
        tm.assert_almost_equal(result, expected)

    def test_to_records_with_unicode_column_names(self):
        # xref issue: https://github.com/numpy/numpy/issues/2407
        # Issue #11879. to_records used to raise an exception when used
        # with column names containing non-ascii characters in Python 2
        result = DataFrame(data={"accented_name_é": [1.0]}).to_records()

        # Note that numpy allows for unicode field names but dtypes need
        # to be specified using dictionary instead of list of tuples.
        expected = np.rec.array(
            [(0, 1.0)],
            dtype={"names": ["index", "accented_name_é"],
                   "formats": ['=i8', '=f8']}
        )
        tm.assert_almost_equal(result, expected)

    def test_to_records_with_categorical(self):

        # GH8626

        # dict creation
        df = DataFrame({'A': list('abc')}, dtype='category')
        expected = Series(list('abc'), dtype='category', name='A')
        tm.assert_series_equal(df['A'], expected)

        # list-like creation
        df = DataFrame(list('abc'), dtype='category')
        expected = Series(list('abc'), dtype='category', name=0)
        tm.assert_series_equal(df[0], expected)

        # to record array
        # this coerces
        result = df.to_records()
        expected = np.rec.array([(0, 'a'), (1, 'b'), (2, 'c')],
                                dtype=[('index', '=i8'), ('0', 'O')])
        tm.assert_almost_equal(result, expected)

    @pytest.mark.parametrize("kwargs,expected", [
        # No dtypes --> default to array dtypes.
        (dict(),
         np.rec.array([(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")],
                      dtype=[("index", "<i8"), ("A", "<i8"),
                             ("B", "<f8"), ("C", "O")])),

        # Should have no effect in this case.
        (dict(index=True),
         np.rec.array([(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")],
                      dtype=[("index", "<i8"), ("A", "<i8"),
                             ("B", "<f8"), ("C", "O")])),

        # Column dtype applied across the board. Index unaffected.
        (dict(column_dtypes="<U4"),
         np.rec.array([("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                      dtype=[("index", "<i8"), ("A", "<U4"),
                             ("B", "<U4"), ("C", "<U4")])),

        # Index dtype applied across the board. Columns unaffected.
        (dict(index_dtypes="<U1"),
         np.rec.array([("0", 1, 0.2, "a"), ("1", 2, 1.5, "bc")],
                      dtype=[("index", "<U1"), ("A", "<i8"),
                             ("B", "<f8"), ("C", "O")])),

        # Pass in a type instance.
        (dict(column_dtypes=np.unicode),
         np.rec.array([("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                      dtype=[("index", "<i8"), ("A", "<U"),
                             ("B", "<U"), ("C", "<U")])),

        # Pass in a dtype instance.
        (dict(column_dtypes=np.dtype('unicode')),
         np.rec.array([("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                      dtype=[("index", "<i8"), ("A", "<U"),
                             ("B", "<U"), ("C", "<U")])),

        # Pass in a dictionary (name-only).
        (dict(column_dtypes={"A": np.int8, "B": np.float32, "C": "<U2"}),
         np.rec.array([("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                      dtype=[("index", "<i8"), ("A", "i1"),
                             ("B", "<f4"), ("C", "<U2")])),

        # Pass in a dictionary (indices-only).
        (dict(index_dtypes={0: "int16"}),
         np.rec.array([(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")],
                      dtype=[("index", "i2"), ("A", "<i8"),
                             ("B", "<f8"), ("C", "O")])),

        # Ignore index mappings if index is not True.
        (dict(index=False, index_dtypes="<U2"),
         np.rec.array([(1, 0.2, "a"), (2, 1.5, "bc")],
                      dtype=[("A", "<i8"), ("B", "<f8"), ("C", "O")])),

        # Non-existent names / indices in mapping should not error.
        (dict(index_dtypes={0: "int16", "not-there": "float32"}),
         np.rec.array([(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")],
                      dtype=[("index", "i2"), ("A", "<i8"),
                             ("B", "<f8"), ("C", "O")])),

        # Names / indices not in mapping default to array dtype.
        (dict(column_dtypes={"A": np.int8, "B": np.float32}),
         np.rec.array([("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                      dtype=[("index", "<i8"), ("A", "i1"),
                             ("B", "<f4"), ("C", "O")])),

        # Names / indices not in dtype mapping default to array dtype.
        (dict(column_dtypes={"A": np.dtype('int8'), "B": np.dtype('float32')}),
         np.rec.array([("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                      dtype=[("index", "<i8"), ("A", "i1"),
                             ("B", "<f4"), ("C", "O")])),

        # Mixture of everything.
        (dict(column_dtypes={"A": np.int8, "B": np.float32},
              index_dtypes="<U2"),
         np.rec.array([("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                      dtype=[("index", "<U2"), ("A", "i1"),
                             ("B", "<f4"), ("C", "O")])),

        # Invalid dtype values.
        (dict(index=False, column_dtypes=list()),
         (ValueError, "Invalid dtype \\[\\] specified for column A")),

        (dict(index=False, column_dtypes={"A": "int32", "B": 5}),
         (ValueError, "Invalid dtype 5 specified for column B")),

        # Numpy can't handle EA types, so check error is raised
        (dict(index=False,
              column_dtypes={"A": "int32", "B": CategoricalDtype(['a', 'b'])}),
         (ValueError, 'Invalid dtype category specified for column B')),

        # Check that bad types raise
        (dict(index=False, column_dtypes={"A": "int32", "B": "foo"}),
         (TypeError, 'data type "foo" not understood')),
    ])
    def test_to_records_dtype(self, kwargs, expected):
        # see gh-18146
        df = DataFrame({"A": [1, 2], "B": [0.2, 1.5], "C": ["a", "bc"]})

        if not isinstance(expected, np.recarray):
            with pytest.raises(expected[0], match=expected[1]):
                df.to_records(**kwargs)
        else:
            result = df.to_records(**kwargs)
            tm.assert_almost_equal(result, expected)

    @pytest.mark.parametrize("df,kwargs,expected", [
        # MultiIndex in the index.
        (DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                   columns=list("abc")).set_index(["a", "b"]),
         dict(column_dtypes="float64", index_dtypes={0: "int32", 1: "int8"}),
         np.rec.array([(1, 2, 3.), (4, 5, 6.), (7, 8, 9.)],
                      dtype=[("a", "<i4"), ("b", "i1"), ("c", "<f8")])),

        # MultiIndex in the columns.
        (DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                   columns=MultiIndex.from_tuples([("a", "d"), ("b", "e"),
                                                   ("c", "f")])),
         dict(column_dtypes={0: "<U1", 2: "float32"}, index_dtypes="float32"),
         np.rec.array([(0., "1", 2, 3.), (1., "4", 5, 6.), (2., "7", 8, 9.)],
                      dtype=[("index", "<f4"), ("('a', 'd')", "<U1"),
                             ("('b', 'e')", "<i8"), ("('c', 'f')", "<f4")])),

        # MultiIndex in both the columns and index.
        (DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                   columns=MultiIndex.from_tuples(
                       [("a", "d"), ("b", "e"), ("c", "f")], names=list("ab")),
                   index=MultiIndex.from_tuples(
                       [("d", -4), ("d", -5), ("f", -6)], names=list("cd"))),
         dict(column_dtypes="float64", index_dtypes={0: "<U2", 1: "int8"}),
         np.rec.array([("d", -4, 1., 2., 3.),
                       ("d", -5, 4., 5., 6.),
                       ("f", -6, 7, 8, 9.)],
                      dtype=[("c", "<U2"), ("d", "i1"),
                             ("('a', 'd')", "<f8"), ("('b', 'e')", "<f8"),
                             ("('c', 'f')", "<f8")]))
    ])
    def test_to_records_dtype_mi(self, df, kwargs, expected):
        # see gh-18146
        result = df.to_records(**kwargs)
        tm.assert_almost_equal(result, expected)

    def test_to_records_dict_like(self):
        # see gh-18146
        class DictLike(object):
            def __init__(self, **kwargs):
                self.d = kwargs.copy()

            def __getitem__(self, key):
                return self.d.__getitem__(key)

            def __contains__(self, key):
                return key in self.d

            def keys(self):
                return self.d.keys()

        df = DataFrame({"A": [1, 2], "B": [0.2, 1.5], "C": ["a", "bc"]})

        dtype_mappings = dict(column_dtypes=DictLike(**{"A": np.int8,
                                                        "B": np.float32}),
                              index_dtypes="<U2")

        result = df.to_records(**dtype_mappings)
        expected = np.rec.array([("0", "1", "0.2", "a"),
                                 ("1", "2", "1.5", "bc")],
                                dtype=[("index", "<U2"), ("A", "i1"),
                                       ("B", "<f4"), ("C", "O")])
        tm.assert_almost_equal(result, expected)

    @pytest.mark.parametrize('mapping', [dict, defaultdict(list), OrderedDict])
    def test_to_dict(self, mapping):
        test_data = {
            'A': {'1': 1, '2': 2},
            'B': {'1': '1', '2': '2', '3': '3'},
        }

        # GH16122
        recons_data = DataFrame(test_data).to_dict(into=mapping)

        for k, v in compat.iteritems(test_data):
            for k2, v2 in compat.iteritems(v):
                assert (v2 == recons_data[k][k2])

        recons_data = DataFrame(test_data).to_dict("l", mapping)

        for k, v in compat.iteritems(test_data):
            for k2, v2 in compat.iteritems(v):
                assert (v2 == recons_data[k][int(k2) - 1])

        recons_data = DataFrame(test_data).to_dict("s", mapping)

        for k, v in compat.iteritems(test_data):
            for k2, v2 in compat.iteritems(v):
                assert (v2 == recons_data[k][k2])

        recons_data = DataFrame(test_data).to_dict("sp", mapping)
        expected_split = {'columns': ['A', 'B'], 'index': ['1', '2', '3'],
                          'data': [[1.0, '1'], [2.0, '2'], [np.nan, '3']]}
        tm.assert_dict_equal(recons_data, expected_split)

        recons_data = DataFrame(test_data).to_dict("r", mapping)
        expected_records = [{'A': 1.0, 'B': '1'},
                            {'A': 2.0, 'B': '2'},
                            {'A': np.nan, 'B': '3'}]
        assert isinstance(recons_data, list)
        assert (len(recons_data) == 3)
        for l, r in zip(recons_data, expected_records):
            tm.assert_dict_equal(l, r)

        # GH10844
        recons_data = DataFrame(test_data).to_dict("i")

        for k, v in compat.iteritems(test_data):
            for k2, v2 in compat.iteritems(v):
                assert (v2 == recons_data[k2][k])

        df = DataFrame(test_data)
        df['duped'] = df[df.columns[0]]
        recons_data = df.to_dict("i")
        comp_data = test_data.copy()
        comp_data['duped'] = comp_data[df.columns[0]]
        for k, v in compat.iteritems(comp_data):
            for k2, v2 in compat.iteritems(v):
                assert (v2 == recons_data[k2][k])

    @pytest.mark.parametrize('mapping', [list, defaultdict, []])
    def test_to_dict_errors(self, mapping):
        # GH16122
        df = DataFrame(np.random.randn(3, 3))
        with pytest.raises(TypeError):
            df.to_dict(into=mapping)

    def test_to_dict_not_unique_warning(self):
        # GH16927: When converting to a dict, if a column has a non-unique name
        # it will be dropped, throwing a warning.
        df = DataFrame([[1, 2, 3]], columns=['a', 'a', 'b'])
        with tm.assert_produces_warning(UserWarning):
            df.to_dict()

    @pytest.mark.parametrize('tz', ['UTC', 'GMT', 'US/Eastern'])
    def test_to_records_datetimeindex_with_tz(self, tz):
        # GH13937
        dr = date_range('2016-01-01', periods=10, freq='S', tz=tz)
        df = DataFrame({'datetime': dr}, index=dr)

        expected = df.to_records()
        result = df.tz_convert("UTC").to_records()

        # both converted to UTC, so they are equal
        tm.assert_numpy_array_equal(result, expected)

    # orient - orient argument to to_dict function
    # item_getter - function for extracting value from
    # the resulting dict using column name and index
    @pytest.mark.parametrize('orient,item_getter', [
        ('dict', lambda d, col, idx: d[col][idx]),
        ('records', lambda d, col, idx: d[idx][col]),
        ('list', lambda d, col, idx: d[col][idx]),
        ('split', lambda d, col, idx: d['data'][idx][d['columns'].index(col)]),
        ('index', lambda d, col, idx: d[idx][col])
    ])
    def test_to_dict_box_scalars(self, orient, item_getter):
        # 14216, 23753
        # make sure that we are boxing properly
        df = DataFrame({'a': [1, 2], 'b': [.1, .2]})
        result = df.to_dict(orient=orient)
        assert isinstance(item_getter(result, 'a', 0), int)
        assert isinstance(item_getter(result, 'b', 0), float)

    def test_frame_to_dict_tz(self):
        # GH18372 When converting to dict with orient='records' columns of
        # datetime that are tz-aware were not converted to required arrays
        data = [(datetime(2017, 11, 18, 21, 53, 0, 219225, tzinfo=pytz.utc),),
                (datetime(2017, 11, 18, 22, 6, 30, 61810, tzinfo=pytz.utc,),)]
        df = DataFrame(list(data), columns=["d", ])

        result = df.to_dict(orient='records')
        expected = [
            {'d': Timestamp('2017-11-18 21:53:00.219225+0000', tz=pytz.utc)},
            {'d': Timestamp('2017-11-18 22:06:30.061810+0000', tz=pytz.utc)},
        ]

        tm.assert_dict_equal(result[0], expected[0])
        tm.assert_dict_equal(result[1], expected[1])

    @pytest.mark.parametrize('into, expected', [
        (dict, {0: {'int_col': 1, 'float_col': 1.0},
                1: {'int_col': 2, 'float_col': 2.0},
                2: {'int_col': 3, 'float_col': 3.0}}),
        (OrderedDict, OrderedDict([(0, {'int_col': 1, 'float_col': 1.0}),
                                   (1, {'int_col': 2, 'float_col': 2.0}),
                                   (2, {'int_col': 3, 'float_col': 3.0})])),
        (defaultdict(list), defaultdict(list,
                                        {0: {'int_col': 1, 'float_col': 1.0},
                                         1: {'int_col': 2, 'float_col': 2.0},
                                         2: {'int_col': 3, 'float_col': 3.0}}))
    ])
    def test_to_dict_index_dtypes(self, into, expected):
        # GH 18580
        # When using to_dict(orient='index') on a dataframe with int
        # and float columns only the int columns were cast to float
        df = DataFrame({'int_col': [1, 2, 3],
                        'float_col': [1.0, 2.0, 3.0]})

        result = df.to_dict(orient='index', into=into)
        cols = ['int_col', 'float_col']
        result = DataFrame.from_dict(result, orient='index')[cols]
        expected = DataFrame.from_dict(expected, orient='index')[cols]
        tm.assert_frame_equal(result, expected)

    def test_to_dict_numeric_names(self):
        # https://github.com/pandas-dev/pandas/issues/24940
        df = DataFrame({str(i): [i] for i in range(5)})
        result = set(df.to_dict('records')[0].keys())
        expected = set(df.columns)
        assert result == expected

    def test_to_dict_wide(self):
        # https://github.com/pandas-dev/pandas/issues/24939
        df = DataFrame({('A_{:d}'.format(i)): [i] for i in range(256)})
        result = df.to_dict('records')[0]
        expected = {'A_{:d}'.format(i): i for i in range(256)}
        assert result == expected
class TestDataFrameConvertTo:
    def test_to_dict_timestamp(self):
        # GH11247
        # split/records producing np.datetime64 rather than Timestamps
        # on datetime64[ns] dtypes only
        tsmp = Timestamp("20130101")
        test_data = DataFrame({"A": [tsmp, tsmp], "B": [tsmp, tsmp]})
        test_data_mixed = DataFrame({"A": [tsmp, tsmp], "B": [1, 2]})

        expected_records = [{"A": tsmp, "B": tsmp}, {"A": tsmp, "B": tsmp}]
        expected_records_mixed = [{"A": tsmp, "B": 1}, {"A": tsmp, "B": 2}]

        assert test_data.to_dict(orient="records") == expected_records
        assert test_data_mixed.to_dict(
            orient="records") == expected_records_mixed

        expected_series = {
            "A": Series([tsmp, tsmp], name="A"),
            "B": Series([tsmp, tsmp], name="B"),
        }
        expected_series_mixed = {
            "A": Series([tsmp, tsmp], name="A"),
            "B": Series([1, 2], name="B"),
        }

        tm.assert_dict_equal(test_data.to_dict(orient="series"),
                             expected_series)
        tm.assert_dict_equal(test_data_mixed.to_dict(orient="series"),
                             expected_series_mixed)

        expected_split = {
            "index": [0, 1],
            "data": [[tsmp, tsmp], [tsmp, tsmp]],
            "columns": ["A", "B"],
        }
        expected_split_mixed = {
            "index": [0, 1],
            "data": [[tsmp, 1], [tsmp, 2]],
            "columns": ["A", "B"],
        }

        tm.assert_dict_equal(test_data.to_dict(orient="split"),
                             expected_split)
        tm.assert_dict_equal(test_data_mixed.to_dict(orient="split"),
                             expected_split_mixed)

    def test_to_dict_index_not_unique_with_index_orient(self):
        # GH22801
        # Data loss when indexes are not unique. Raise ValueError.
        df = DataFrame({"a": [1, 2], "b": [0.5, 0.75]}, index=["A", "A"])
        msg = "DataFrame index must be unique for orient='index'"
        with pytest.raises(ValueError, match=msg):
            df.to_dict(orient="index")

    def test_to_dict_invalid_orient(self):
        df = DataFrame({"A": [0, 1]})
        msg = "orient 'xinvalid' not understood"
        with pytest.raises(ValueError, match=msg):
            df.to_dict(orient="xinvalid")

    def test_to_records_dt64(self):
        df = DataFrame(
            [["one", "two", "three"], ["four", "five", "six"]],
            index=date_range("2012-01-01", "2012-01-02"),
        )

        expected = df.index.values[0]
        result = df.to_records()["index"][0]
        assert expected == result

    def test_to_records_with_multindex(self):
        # GH3189
        index = [
            ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"],
            ["one", "two", "one", "two", "one", "two", "one", "two"],
        ]
        data = np.zeros((8, 4))
        df = DataFrame(data, index=index)
        r = df.to_records(index=True)["level_0"]
        assert "bar" in r
        assert "one" not in r

    def test_to_records_with_Mapping_type(self):
        import email
        from email.parser import Parser

        abc.Mapping.register(email.message.Message)

        headers = Parser().parsestr("From: <*****@*****.**>\n"
                                    "To: <*****@*****.**>\n"
                                    "Subject: Test message\n"
                                    "\n"
                                    "Body would go here\n")

        frame = DataFrame.from_records([headers])
        all(x in frame for x in ["Type", "Subject", "From"])

    def test_to_records_floats(self):
        df = DataFrame(np.random.rand(10, 10))
        df.to_records()

    def test_to_records_index_name(self):
        df = DataFrame(np.random.randn(3, 3))
        df.index.name = "X"
        rs = df.to_records()
        assert "X" in rs.dtype.fields

        df = DataFrame(np.random.randn(3, 3))
        rs = df.to_records()
        assert "index" in rs.dtype.fields

        df.index = MultiIndex.from_tuples([("a", "x"), ("a", "y"), ("b", "z")])
        df.index.names = ["A", None]
        rs = df.to_records()
        assert "level_0" in rs.dtype.fields

    def test_to_records_with_unicode_index(self):
        # GH13172
        # unicode_literals conflict with to_records
        result = DataFrame([{"a": "x", "b": "y"}]).set_index("a").to_records()
        expected = np.rec.array([("x", "y")], dtype=[("a", "O"), ("b", "O")])
        tm.assert_almost_equal(result, expected)

    def test_to_records_with_unicode_column_names(self):
        # xref issue: https://github.com/numpy/numpy/issues/2407
        # Issue #11879. to_records used to raise an exception when used
        # with column names containing non-ascii characters in Python 2
        result = DataFrame(data={"accented_name_é": [1.0]}).to_records()

        # Note that numpy allows for unicode field names but dtypes need
        # to be specified using dictionary instead of list of tuples.
        expected = np.rec.array(
            [(0, 1.0)],
            dtype={
                "names": ["index", "accented_name_é"],
                "formats": ["=i8", "=f8"]
            },
        )
        tm.assert_almost_equal(result, expected)

    def test_to_records_with_categorical(self):
        # GH8626

        # dict creation
        df = DataFrame({"A": list("abc")}, dtype="category")
        expected = Series(list("abc"), dtype="category", name="A")
        tm.assert_series_equal(df["A"], expected)

        # list-like creation
        df = DataFrame(list("abc"), dtype="category")
        expected = Series(list("abc"), dtype="category", name=0)
        tm.assert_series_equal(df[0], expected)

        # to record array
        # this coerces
        result = df.to_records()
        expected = np.rec.array([(0, "a"), (1, "b"), (2, "c")],
                                dtype=[("index", "=i8"), ("0", "O")])
        tm.assert_almost_equal(result, expected)

    @pytest.mark.parametrize(
        "kwargs,expected",
        [
            # No dtypes --> default to array dtypes.
            (
                dict(),
                np.rec.array(
                    [(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")],
                    dtype=[("index", "<i8"), ("A", "<i8"), ("B", "<f8"),
                           ("C", "O")],
                ),
            ),
            # Should have no effect in this case.
            (
                dict(index=True),
                np.rec.array(
                    [(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")],
                    dtype=[("index", "<i8"), ("A", "<i8"), ("B", "<f8"),
                           ("C", "O")],
                ),
            ),
            # Column dtype applied across the board. Index unaffected.
            (
                dict(column_dtypes="<U4"),
                np.rec.array(
                    [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                    dtype=[("index", "<i8"), ("A", "<U4"), ("B", "<U4"),
                           ("C", "<U4")],
                ),
            ),
            # Index dtype applied across the board. Columns unaffected.
            (
                dict(index_dtypes="<U1"),
                np.rec.array(
                    [("0", 1, 0.2, "a"), ("1", 2, 1.5, "bc")],
                    dtype=[("index", "<U1"), ("A", "<i8"), ("B", "<f8"),
                           ("C", "O")],
                ),
            ),
            # Pass in a type instance.
            (
                dict(column_dtypes=np.unicode),
                np.rec.array(
                    [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                    dtype=[("index", "<i8"), ("A", "<U"), ("B", "<U"),
                           ("C", "<U")],
                ),
            ),
            # Pass in a dtype instance.
            (
                dict(column_dtypes=np.dtype("unicode")),
                np.rec.array(
                    [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                    dtype=[("index", "<i8"), ("A", "<U"), ("B", "<U"),
                           ("C", "<U")],
                ),
            ),
            # Pass in a dictionary (name-only).
            (
                dict(column_dtypes={"A": np.int8, "B": np.float32,
                                    "C": "<U2"}),
                np.rec.array(
                    [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                    dtype=[("index", "<i8"), ("A", "i1"), ("B", "<f4"),
                           ("C", "<U2")],
                ),
            ),
            # Pass in a dictionary (indices-only).
            (
                dict(index_dtypes={0: "int16"}),
                np.rec.array(
                    [(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")],
                    dtype=[("index", "i2"), ("A", "<i8"), ("B", "<f8"),
                           ("C", "O")],
                ),
            ),
            # Ignore index mappings if index is not True.
            (
                dict(index=False, index_dtypes="<U2"),
                np.rec.array(
                    [(1, 0.2, "a"), (2, 1.5, "bc")],
                    dtype=[("A", "<i8"), ("B", "<f8"), ("C", "O")],
                ),
            ),
            # Non-existent names / indices in mapping should not error.
            (
                dict(index_dtypes={0: "int16", "not-there": "float32"}),
                np.rec.array(
                    [(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")],
                    dtype=[("index", "i2"), ("A", "<i8"), ("B", "<f8"),
                           ("C", "O")],
                ),
            ),
            # Names / indices not in mapping default to array dtype.
            (
                dict(column_dtypes={"A": np.int8, "B": np.float32}),
                np.rec.array(
                    [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                    dtype=[("index", "<i8"), ("A", "i1"), ("B", "<f4"),
                           ("C", "O")],
                ),
            ),
            # Names / indices not in dtype mapping default to array dtype.
            (
                dict(column_dtypes={"A": np.dtype("int8"),
                                    "B": np.dtype("float32")}),
                np.rec.array(
                    [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                    dtype=[("index", "<i8"), ("A", "i1"), ("B", "<f4"),
                           ("C", "O")],
                ),
            ),
            # Mixture of everything.
            (
                dict(column_dtypes={"A": np.int8, "B": np.float32},
                     index_dtypes="<U2"),
                np.rec.array(
                    [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                    dtype=[("index", "<U2"), ("A", "i1"), ("B", "<f4"),
                           ("C", "O")],
                ),
            ),
            # Invalid dtype values.
            (
                dict(index=False, column_dtypes=list()),
                (ValueError, "Invalid dtype \\[\\] specified for column A"),
            ),
            (
                dict(index=False, column_dtypes={"A": "int32", "B": 5}),
                (ValueError, "Invalid dtype 5 specified for column B"),
            ),
            # Numpy can't handle EA types, so check error is raised
            (
                dict(
                    index=False,
                    column_dtypes={"A": "int32",
                                   "B": CategoricalDtype(["a", "b"])},
                ),
                (ValueError, "Invalid dtype category specified for column B"),
            ),
            # Check that bad types raise
            (
                dict(index=False, column_dtypes={"A": "int32", "B": "foo"}),
                (TypeError, 'data type "foo" not understood'),
            ),
        ],
    )
    def test_to_records_dtype(self, kwargs, expected):
        # see gh-18146
        df = DataFrame({"A": [1, 2], "B": [0.2, 1.5], "C": ["a", "bc"]})

        if not isinstance(expected, np.recarray):
            with pytest.raises(expected[0], match=expected[1]):
                df.to_records(**kwargs)
        else:
            result = df.to_records(**kwargs)
            tm.assert_almost_equal(result, expected)

    @pytest.mark.parametrize(
        "df,kwargs,expected",
        [
            # MultiIndex in the index.
            (
                DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                          columns=list("abc")).set_index(["a", "b"]),
                dict(column_dtypes="float64",
                     index_dtypes={0: "int32", 1: "int8"}),
                np.rec.array(
                    [(1, 2, 3.0), (4, 5, 6.0), (7, 8, 9.0)],
                    dtype=[("a", "<i4"), ("b", "i1"), ("c", "<f8")],
                ),
            ),
            # MultiIndex in the columns.
            (
                DataFrame(
                    [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                    columns=MultiIndex.from_tuples([("a", "d"), ("b", "e"),
                                                    ("c", "f")]),
                ),
                dict(column_dtypes={0: "<U1", 2: "float32"},
                     index_dtypes="float32"),
                np.rec.array(
                    [(0.0, "1", 2, 3.0), (1.0, "4", 5, 6.0),
                     (2.0, "7", 8, 9.0)],
                    dtype=[
                        ("index", "<f4"),
                        ("('a', 'd')", "<U1"),
                        ("('b', 'e')", "<i8"),
                        ("('c', 'f')", "<f4"),
                    ],
                ),
            ),
            # MultiIndex in both the columns and index.
            (
                DataFrame(
                    [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                    columns=MultiIndex.from_tuples(
                        [("a", "d"), ("b", "e"), ("c", "f")],
                        names=list("ab")),
                    index=MultiIndex.from_tuples(
                        [("d", -4), ("d", -5), ("f", -6)], names=list("cd")),
                ),
                dict(column_dtypes="float64",
                     index_dtypes={0: "<U2", 1: "int8"}),
                np.rec.array(
                    [
                        ("d", -4, 1.0, 2.0, 3.0),
                        ("d", -5, 4.0, 5.0, 6.0),
                        ("f", -6, 7, 8, 9.0),
                    ],
                    dtype=[
                        ("c", "<U2"),
                        ("d", "i1"),
                        ("('a', 'd')", "<f8"),
                        ("('b', 'e')", "<f8"),
                        ("('c', 'f')", "<f8"),
                    ],
                ),
            ),
        ],
    )
    def test_to_records_dtype_mi(self, df, kwargs, expected):
        # see gh-18146
        result = df.to_records(**kwargs)
        tm.assert_almost_equal(result, expected)

    def test_to_records_dict_like(self):
        # see gh-18146
        class DictLike:
            def __init__(self, **kwargs):
                self.d = kwargs.copy()

            def __getitem__(self, key):
                return self.d.__getitem__(key)

            def __contains__(self, key):
                return key in self.d

            def keys(self):
                return self.d.keys()

        df = DataFrame({"A": [1, 2], "B": [0.2, 1.5], "C": ["a", "bc"]})

        dtype_mappings = dict(
            column_dtypes=DictLike(**{"A": np.int8, "B": np.float32}),
            index_dtypes="<U2",
        )

        result = df.to_records(**dtype_mappings)
        expected = np.rec.array(
            [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
            dtype=[("index", "<U2"), ("A", "i1"), ("B", "<f4"), ("C", "O")],
        )
        tm.assert_almost_equal(result, expected)

    @pytest.mark.parametrize("mapping", [dict, defaultdict(list), OrderedDict])
    def test_to_dict(self, mapping):
        test_data = {
            "A": {"1": 1, "2": 2},
            "B": {"1": "1", "2": "2", "3": "3"},
        }

        # GH16122
        recons_data = DataFrame(test_data).to_dict(into=mapping)

        for k, v in test_data.items():
            for k2, v2 in v.items():
                assert v2 == recons_data[k][k2]

        recons_data = DataFrame(test_data).to_dict("l", mapping)

        for k, v in test_data.items():
            for k2, v2 in v.items():
                assert v2 == recons_data[k][int(k2) - 1]

        recons_data = DataFrame(test_data).to_dict("s", mapping)

        for k, v in test_data.items():
            for k2, v2 in v.items():
                assert v2 == recons_data[k][k2]

        recons_data = DataFrame(test_data).to_dict("sp", mapping)
        expected_split = {
            "columns": ["A", "B"],
            "index": ["1", "2", "3"],
            "data": [[1.0, "1"], [2.0, "2"], [np.nan, "3"]],
        }
        tm.assert_dict_equal(recons_data, expected_split)

        recons_data = DataFrame(test_data).to_dict("r", mapping)
        expected_records = [
            {"A": 1.0, "B": "1"},
            {"A": 2.0, "B": "2"},
            {"A": np.nan, "B": "3"},
        ]
        assert isinstance(recons_data, list)
        assert len(recons_data) == 3
        for l, r in zip(recons_data, expected_records):
            tm.assert_dict_equal(l, r)

        # GH10844
        recons_data = DataFrame(test_data).to_dict("i")

        for k, v in test_data.items():
            for k2, v2 in v.items():
                assert v2 == recons_data[k2][k]

        df = DataFrame(test_data)
        df["duped"] = df[df.columns[0]]
        recons_data = df.to_dict("i")
        comp_data = test_data.copy()
        comp_data["duped"] = comp_data[df.columns[0]]
        for k, v in comp_data.items():
            for k2, v2 in v.items():
                assert v2 == recons_data[k2][k]

    @pytest.mark.parametrize("mapping", [list, defaultdict, []])
    def test_to_dict_errors(self, mapping):
        # GH16122
        df = DataFrame(np.random.randn(3, 3))
        with pytest.raises(TypeError):
            df.to_dict(into=mapping)

    def test_to_dict_not_unique_warning(self):
        # GH16927: When converting to a dict, if a column has a non-unique
        # name it will be dropped, throwing a warning.
        df = DataFrame([[1, 2, 3]], columns=["a", "a", "b"])
        with tm.assert_produces_warning(UserWarning):
            df.to_dict()

    @pytest.mark.parametrize("tz", ["UTC", "GMT", "US/Eastern"])
    def test_to_records_datetimeindex_with_tz(self, tz):
        # GH13937
        dr = date_range("2016-01-01", periods=10, freq="S", tz=tz)
        df = DataFrame({"datetime": dr}, index=dr)

        expected = df.to_records()
        result = df.tz_convert("UTC").to_records()

        # both converted to UTC, so they are equal
        tm.assert_numpy_array_equal(result, expected)

    # orient - orient argument to to_dict function
    # item_getter - function for extracting value from
    # the resulting dict using column name and index
    @pytest.mark.parametrize(
        "orient,item_getter",
        [
            ("dict", lambda d, col, idx: d[col][idx]),
            ("records", lambda d, col, idx: d[idx][col]),
            ("list", lambda d, col, idx: d[col][idx]),
            ("split",
             lambda d, col, idx: d["data"][idx][d["columns"].index(col)]),
            ("index", lambda d, col, idx: d[idx][col]),
        ],
    )
    def test_to_dict_box_scalars(self, orient, item_getter):
        # 14216, 23753
        # make sure that we are boxing properly
        df = DataFrame({"a": [1, 2], "b": [0.1, 0.2]})
        result = df.to_dict(orient=orient)
        assert isinstance(item_getter(result, "a", 0), int)
        assert isinstance(item_getter(result, "b", 0), float)

    def test_frame_to_dict_tz(self):
        # GH18372 When converting to dict with orient='records' columns of
        # datetime that are tz-aware were not converted to required arrays
        data = [
            (datetime(2017, 11, 18, 21, 53, 0, 219225, tzinfo=pytz.utc),),
            (datetime(2017, 11, 18, 22, 6, 30, 61810, tzinfo=pytz.utc),),
        ]
        df = DataFrame(list(data), columns=["d"])

        result = df.to_dict(orient="records")
        expected = [
            {"d": Timestamp("2017-11-18 21:53:00.219225+0000", tz=pytz.utc)},
            {"d": Timestamp("2017-11-18 22:06:30.061810+0000", tz=pytz.utc)},
        ]

        tm.assert_dict_equal(result[0], expected[0])
        tm.assert_dict_equal(result[1], expected[1])

    @pytest.mark.parametrize(
        "into, expected",
        [
            (
                dict,
                {
                    0: {"int_col": 1, "float_col": 1.0},
                    1: {"int_col": 2, "float_col": 2.0},
                    2: {"int_col": 3, "float_col": 3.0},
                },
            ),
            (
                OrderedDict,
                OrderedDict([
                    (0, {"int_col": 1, "float_col": 1.0}),
                    (1, {"int_col": 2, "float_col": 2.0}),
                    (2, {"int_col": 3, "float_col": 3.0}),
                ]),
            ),
            (
                defaultdict(dict),
                defaultdict(
                    dict,
                    {
                        0: {"int_col": 1, "float_col": 1.0},
                        1: {"int_col": 2, "float_col": 2.0},
                        2: {"int_col": 3, "float_col": 3.0},
                    },
                ),
            ),
        ],
    )
    def test_to_dict_index_dtypes(self, into, expected):
        # GH 18580
        # When using to_dict(orient='index') on a dataframe with int
        # and float columns only the int columns were cast to float
        df = DataFrame({"int_col": [1, 2, 3], "float_col": [1.0, 2.0, 3.0]})

        result = df.to_dict(orient="index", into=into)
        cols = ["int_col", "float_col"]
        result = DataFrame.from_dict(result, orient="index")[cols]
        expected = DataFrame.from_dict(expected, orient="index")[cols]
        tm.assert_frame_equal(result, expected)

    def test_to_dict_numeric_names(self):
        # https://github.com/pandas-dev/pandas/issues/24940
        df = DataFrame({str(i): [i] for i in range(5)})
        result = set(df.to_dict("records")[0].keys())
        expected = set(df.columns)
        assert result == expected

    def test_to_dict_wide(self):
        # https://github.com/pandas-dev/pandas/issues/24939
        df = DataFrame({("A_{:d}".format(i)): [i] for i in range(256)})
        result = df.to_dict("records")[0]
        expected = {"A_{:d}".format(i): i for i in range(256)}
        assert result == expected

    def test_to_dict_orient_dtype(self):
        # https://github.com/pandas-dev/pandas/issues/22620
        # Input Data
        input_data = {
            "a": [1, 2, 3],
            "b": [1.0, 2.0, 3.0],
            "c": ["X", "Y", "Z"]
        }
df = DataFrame(input_data) # Expected Dtypes expected = {"a": int, "b": float, "c": str} # Extracting dtypes out of to_dict operation for df_dict in df.to_dict("records"): result = { "a": type(df_dict["a"]), "b": type(df_dict["b"]), "c": type(df_dict["c"]), } assert result == expected
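# A minimal standalone sketch (not part of the test suite above) of the
# to_records() dtype-override API that the parametrized cases exercise;
# the frame and the chosen dtypes are illustrative only.
import numpy as np
import pandas as pd


def to_records_dtype_demo():
    df = pd.DataFrame({"A": [1, 2], "B": [0.2, 1.5], "C": ["a", "bc"]})
    # column_dtypes takes a single dtype or a column-name -> dtype mapping;
    # index_dtypes applies the same idea to the index level(s).
    rec = df.to_records(
        column_dtypes={"A": np.int8, "B": np.float32}, index_dtypes="<U2"
    )
    # rec.dtype == [('index', '<U2'), ('A', 'i1'), ('B', '<f4'), ('C', 'O')]
    return rec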
def test_london_cleaner(): unclean_input = pd.DataFrame.from_dict( { "Place (Overall)": [12547, 34146], "Place (Gender)": [9390, 20833], "Place (Category)": [4345, 3132], "Name": ["»A Smith, Matthew (GBR) \n", "»Aalders, Jennifer (GBR) \n"], "Sex": ["M", "W"], "Club": ["Lymm Runners", "Tynny Trotters"], "Running Number": ["Runner Number40546", "Runner Number23235"], "Category": ["18-39", pd.NA], "Finish": ["0 days 03:59:33", "0 days 06:22:20"], "Year": [2021, 2021], } ) exp_output = pd.DataFrame.from_dict( { "Place (Overall)": [12547, 34146], "Place (Gender)": [9390, 20833], "Place (Category)": [4345, 3132], "Name": ["A Smith Matthew", "Aalders Jennifer"], "Sex": ["M", "F"], "Club": ["Lymm Runners", "Tynny Trotters"], "Running Number": ["40546", "23235"], "Category": ["18-39", "Unknown"], "Finish": [ pd.Timedelta("0 days 03:59:33"), pd.Timedelta("0 days 06:22:20"), ], "Year": [2021, 2021], "Country": ["GBR", "GBR"], "FirstName": ["Matthew", "Jennifer"], "LastName": ["A Smith", "Aalders"], "DSQ": [False, False], "Finish (Total Seconds)": [14373.0, 22940.0], } ).astype( { "Place (Overall)": Int64Dtype(), "Place (Gender)": Int64Dtype(), "Place (Category)": Int64Dtype(), "Name": dtype("O"), "Sex": dtype("O"), "Club": dtype("O"), "Running Number": dtype("O"), "Category": CategoricalDtype( categories=[ "18-39", "40-44", "45-49", "50-54", "55-59", "60-64", "65-69", "70+", "70-74", "75-79", "80-84", "85+", "80+", "Unknown", ], ordered=False, ), "Finish": dtype("<m8[ns]"), "Year": Int64Dtype(), "Country": dtype("O"), "FirstName": dtype("O"), "LastName": dtype("O"), "DSQ": dtype("bool"), "Finish (Total Seconds)": dtype("float64"), } ) actual_output = london_cleaner(unclean_input) pd.testing.assert_frame_equal(actual_output, exp_output, check_categorical=False)
def get_feature_df(self) -> Tuple[pd.DataFrame, List[Any]]: """ Transform incoming data into pandas dataframe :return: tuple(features pandas.DataFrame, unqualified item id list) """ # prepare features dataframe target_qs = self.get_queryset() all_sample_ids = list(target_qs.values_list('id', flat=True)) # TODO: all documents ref. by all_sample_ids should be in feature_table feature_table: Optional[pd.DataFrame] = None counter = 'counter' for feature_source_item in self.feature_source: msg = f'Get "{feature_source_item}" feature data:' self.log_message(msg) self.log_message('_' * len(msg)) # get aggregation queryset parameters for .annotate function source_model = self.source_models[feature_source_item] source_field = self.source_fields[feature_source_item] target_id_field = self.target_id_field aggregation = {counter: self.aggregation_function} # try to decrease memory usage iterating over chunks and using sparse dataframes # Note: pivot_table takes extra memory so use lower memory limits source_qs = source_model.objects.filter(**{target_id_field + '__in': all_sample_ids}) if hasattr(source_model, 'text_unit'): source_qs = source_qs.filter(**{self.unit_type_filter: self.unit_type}) ids = sorted(source_qs.order_by(target_id_field).values_list(target_id_field, flat=True).distinct()) terms = sorted(source_qs.order_by(source_field).values_list(source_field, flat=True).distinct()) id_count = len(ids) term_count = len(terms) self.log_message(f'{self.source_item}s containing "{feature_source_item}": {id_count}') self.log_message(f'unique "{feature_source_item}" items: {term_count}') if not term_count: self.log_message(f'WARN: there are no "{feature_source_item}" entities found') continue from_mem_chunk_size = self.get_chunk_size(term_count * 2) # np.uint16 - 2 bytes chunk_size = min([self.max_chunk_size, from_mem_chunk_size]) self.log_message(f'chunk_size from_mem/min/final: {from_mem_chunk_size}/{self.max_chunk_size}/{chunk_size}') # TODO: we stopped using pd.SparseDataFrame as there's no such class anymore single_feature_table = SparseSingleFeatureTable(feature_source_item) for step in range(0, id_count, chunk_size): self.log_message(f'...process "{feature_source_item}" feature: "{self.source_item}s" range: {step}-{step + chunk_size}') sample_ids = ids[step:step + chunk_size] chunk_qs = source_qs \ .filter(**{target_id_field + '__in': sample_ids}) \ .order_by(target_id_field, source_field) \ .values(target_id_field, source_field) \ .annotate(**aggregation) df_src = list(chunk_qs) chunk_df = pd.DataFrame.from_records(df_src) del chunk_qs gc.collect() # try to free up memory doc_cat = CategoricalDtype(sample_ids, ordered=True) # TODO: fix for date features: pandas can't compare dates, but datetimes only if terms and isinstance(terms[0], datetime.date): terms = [datetime.datetime.combine(d, datetime.datetime.min.time()) for d in terms] term_cat = CategoricalDtype(terms, ordered=True) row = [] if chunk_df.empty else chunk_df[self.target_id_field].astype(doc_cat).cat.codes col = [] if chunk_df.empty else chunk_df[source_field].astype(term_cat).cat.codes val = [] if chunk_df.empty else chunk_df[counter] sparse_matrix = scp.csr_matrix( (val, (row, col)), shape=(len(sample_ids), term_cat.categories.size), dtype=np.uint16) single_feature_table.join(sparse_matrix) del chunk_df gc.collect() # try to free up memory mem = psutil.virtual_memory() self.log_message(f'......available memory: {get_mb(mem.available)}M ({mem.percent}%)') # join feature_source_item-specific dataframe into results dataframe 
gc.collect() # try to free up memory single_feature_df_src = SparseAllFeaturesTable(ids) single_feature_df_src.add_feature_table(single_feature_table, terms) if feature_table is None: feature_table = single_feature_df_src.to_dataframe() else: feature_table = feature_table.join(single_feature_df_src.to_dataframe(), how='outer') del single_feature_table del single_feature_df_src gc.collect() # try to free up memory # end of "for feature_source_item in self.feature_source" df = feature_table if self.drop_empty_columns: df.dropna(axis=1, how='all', inplace=True) self.log_message(f'df: {get_df_info(df)}') mem = psutil.virtual_memory() self.log_message(f'available memory: {get_mb(mem.available)}M ({mem.percent}%)') if df.empty: msg = 'No features of chosen "feature_source" options {} detected. ' \ 'Empty Data Set.'.format(str(self.feature_source)) raise EmptyDataSetError(msg, feature_source=self.feature_source) # item ids not included in feature df which don't have features at all initial_id_set = set(target_qs.values_list('id', flat=True)) feature_id_set = set(df.index.tolist()) unqualified_item_ids = sorted(list(initial_id_set.difference(feature_id_set))) self.log_message('count unqualified_item_ids: {}'.format(len(unqualified_item_ids))) if not self.drop_empty_rows and unqualified_item_ids: unqualified_items_df = pd.DataFrame(index=unqualified_item_ids, columns=df.columns).fillna(0) self.log_message('unqualified_items_df shape: {} size: {}'.format( unqualified_items_df.shape, unqualified_items_df.memory_usage().sum())) df = pd.concat([df, unqualified_items_df]).fillna(0).astype(np.uint16) self.log_message(f'df: {get_df_info(df)}') return df, unqualified_item_ids
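# A self-contained sketch of the core trick in get_feature_df() above:
# fixing the row/column universes with CategoricalDtype so that .cat.codes
# can serve directly as sparse-matrix coordinates. The ids, terms, and
# counts below are hypothetical.
import numpy as np
import pandas as pd
import scipy.sparse as scp
from pandas import CategoricalDtype


def sparse_counts_demo():
    chunk_df = pd.DataFrame({
        "doc_id": [10, 10, 20],
        "term": ["alpha", "beta", "alpha"],
        "counter": [2, 1, 5],
    })
    doc_cat = CategoricalDtype([10, 20, 30], ordered=True)  # 30 has no rows
    term_cat = CategoricalDtype(["alpha", "beta"], ordered=True)
    # .cat.codes maps each value to its position in the fixed category list,
    # so ids without data still get an (all-zero) row in the matrix.
    row = chunk_df["doc_id"].astype(doc_cat).cat.codes
    col = chunk_df["term"].astype(term_cat).cat.codes
    return scp.csr_matrix(
        (chunk_df["counter"], (row, col)),
        shape=(len(doc_cat.categories), len(term_cat.categories)),
        dtype=np.uint16,
    )  # .toarray() -> [[2, 1], [5, 0], [0, 0]]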
class TestAstype: def test_astype_float(self, float_frame): casted = float_frame.astype(int) expected = DataFrame( float_frame.values.astype(int), index=float_frame.index, columns=float_frame.columns, ) tm.assert_frame_equal(casted, expected) casted = float_frame.astype(np.int32) expected = DataFrame( float_frame.values.astype(np.int32), index=float_frame.index, columns=float_frame.columns, ) tm.assert_frame_equal(casted, expected) float_frame["foo"] = "5" casted = float_frame.astype(int) expected = DataFrame( float_frame.values.astype(int), index=float_frame.index, columns=float_frame.columns, ) tm.assert_frame_equal(casted, expected) def test_astype_mixed_float(self, mixed_float_frame): # mixed casting casted = mixed_float_frame.reindex( columns=["A", "B"]).astype("float32") _check_cast(casted, "float32") casted = mixed_float_frame.reindex( columns=["A", "B"]).astype("float16") _check_cast(casted, "float16") def test_astype_mixed_type(self, mixed_type_frame): # mixed casting mn = mixed_type_frame._get_numeric_data().copy() mn["little_float"] = np.array(12345.0, dtype="float16") mn["big_float"] = np.array(123456789101112.0, dtype="float64") casted = mn.astype("float64") _check_cast(casted, "float64") casted = mn.astype("int64") _check_cast(casted, "int64") casted = mn.reindex(columns=["little_float"]).astype("float16") _check_cast(casted, "float16") casted = mn.astype("float32") _check_cast(casted, "float32") casted = mn.astype("int32") _check_cast(casted, "int32") # to object casted = mn.astype("O") _check_cast(casted, "object") def test_astype_with_exclude_string(self, float_frame): df = float_frame.copy() expected = float_frame.astype(int) df["string"] = "foo" casted = df.astype(int, errors="ignore") expected["string"] = "foo" tm.assert_frame_equal(casted, expected) df = float_frame.copy() expected = float_frame.astype(np.int32) df["string"] = "foo" casted = df.astype(np.int32, errors="ignore") expected["string"] = "foo" tm.assert_frame_equal(casted, expected) def test_astype_with_view_float(self, float_frame): # this is the only real reason to do it this way tf = np.round(float_frame).astype(np.int32) casted = tf.astype(np.float32, copy=False) # TODO(wesm): verification? tf = float_frame.astype(np.float64) casted = tf.astype(np.int64, copy=False) # noqa def test_astype_with_view_mixed_float(self, mixed_float_frame): tf = mixed_float_frame.reindex(columns=["A", "B", "C"]) casted = tf.astype(np.int64) casted = tf.astype(np.float32) # noqa @pytest.mark.parametrize("dtype", [np.int32, np.int64]) @pytest.mark.parametrize("val", [np.nan, np.inf]) def test_astype_cast_nan_inf_int(self, val, dtype): # see GH#14265 # # Check NaN and inf --> raise error when converting to int. 
msg = "Cannot convert non-finite values \\(NA or inf\\) to integer" df = DataFrame([val]) with pytest.raises(ValueError, match=msg): df.astype(dtype) def test_astype_str(self): # see GH#9757 a = Series(date_range("2010-01-04", periods=5)) b = Series(date_range("3/6/2012 00:00", periods=5, tz="US/Eastern")) c = Series([Timedelta(x, unit="d") for x in range(5)]) d = Series(range(5)) e = Series([0.0, 0.2, 0.4, 0.6, 0.8]) df = DataFrame({"a": a, "b": b, "c": c, "d": d, "e": e}) # Datetime-like result = df.astype(str) expected = DataFrame({ "a": list(map(str, map(lambda x: Timestamp(x)._date_repr, a._values))), "b": list(map(str, map(Timestamp, b._values))), "c": list(map(lambda x: Timedelta(x)._repr_base(), c._values)), "d": list(map(str, d._values)), "e": list(map(str, e._values)), }) tm.assert_frame_equal(result, expected) def test_astype_str_float(self): # see GH#11302 result = DataFrame([np.NaN]).astype(str) expected = DataFrame(["nan"]) tm.assert_frame_equal(result, expected) result = DataFrame([1.12345678901234567890]).astype(str) # < 1.14 truncates # >= 1.14 preserves the full repr val = "1.12345678901" if _np_version_under1p14 else "1.1234567890123457" expected = DataFrame([val]) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("dtype_class", [dict, Series]) def test_astype_dict_like(self, dtype_class): # GH7271 & GH16717 a = Series(date_range("2010-01-04", periods=5)) b = Series(range(5)) c = Series([0.0, 0.2, 0.4, 0.6, 0.8]) d = Series(["1.0", "2", "3.14", "4", "5.4"]) df = DataFrame({"a": a, "b": b, "c": c, "d": d}) original = df.copy(deep=True) # change type of a subset of columns dt1 = dtype_class({"b": "str", "d": "float32"}) result = df.astype(dt1) expected = DataFrame({ "a": a, "b": Series(["0", "1", "2", "3", "4"]), "c": c, "d": Series([1.0, 2.0, 3.14, 4.0, 5.4], dtype="float32"), }) tm.assert_frame_equal(result, expected) tm.assert_frame_equal(df, original) dt2 = dtype_class({"b": np.float32, "c": "float32", "d": np.float64}) result = df.astype(dt2) expected = DataFrame({ "a": a, "b": Series([0.0, 1.0, 2.0, 3.0, 4.0], dtype="float32"), "c": Series([0.0, 0.2, 0.4, 0.6, 0.8], dtype="float32"), "d": Series([1.0, 2.0, 3.14, 4.0, 5.4], dtype="float64"), }) tm.assert_frame_equal(result, expected) tm.assert_frame_equal(df, original) # change all columns dt3 = dtype_class({"a": str, "b": str, "c": str, "d": str}) tm.assert_frame_equal(df.astype(dt3), df.astype(str)) tm.assert_frame_equal(df, original) # error should be raised when using something other than column labels # in the keys of the dtype dict dt4 = dtype_class({"b": str, 2: str}) dt5 = dtype_class({"e": str}) msg = "Only a column name can be used for the key in a dtype mappings argument" with pytest.raises(KeyError, match=msg): df.astype(dt4) with pytest.raises(KeyError, match=msg): df.astype(dt5) tm.assert_frame_equal(df, original) # if the dtypes provided are the same as the original dtypes, the # resulting DataFrame should be the same as the original DataFrame dt6 = dtype_class({col: df[col].dtype for col in df.columns}) equiv = df.astype(dt6) tm.assert_frame_equal(df, equiv) tm.assert_frame_equal(df, original) # GH#16717 # if dtypes provided is empty, the resulting DataFrame # should be the same as the original DataFrame dt7 = dtype_class({}) if dtype_class is dict else dtype_class( {}, dtype=object) equiv = df.astype(dt7) tm.assert_frame_equal(df, equiv) tm.assert_frame_equal(df, original) def test_astype_duplicate_col(self): a1 = Series([1, 2, 3, 4, 5], name="a") b = Series([0.1, 0.2, 0.4, 0.6, 0.8], 
name="b") a2 = Series([0, 1, 2, 3, 4], name="a") df = concat([a1, b, a2], axis=1) result = df.astype(str) a1_str = Series(["1", "2", "3", "4", "5"], dtype="str", name="a") b_str = Series(["0.1", "0.2", "0.4", "0.6", "0.8"], dtype=str, name="b") a2_str = Series(["0", "1", "2", "3", "4"], dtype="str", name="a") expected = concat([a1_str, b_str, a2_str], axis=1) tm.assert_frame_equal(result, expected) result = df.astype({"a": "str"}) expected = concat([a1_str, b, a2_str], axis=1) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( "dtype", [ "category", CategoricalDtype(), CategoricalDtype(ordered=True), CategoricalDtype(ordered=False), CategoricalDtype(categories=list("abcdef")), CategoricalDtype(categories=list("edba"), ordered=False), CategoricalDtype(categories=list("edcb"), ordered=True), ], ids=repr, ) def test_astype_categorical(self, dtype): # GH#18099 d = {"A": list("abbc"), "B": list("bccd"), "C": list("cdde")} df = DataFrame(d) result = df.astype(dtype) expected = DataFrame({k: Categorical(d[k], dtype=dtype) for k in d}) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( "cls", [CategoricalDtype, DatetimeTZDtype, IntervalDtype]) def test_astype_categoricaldtype_class_raises(self, cls): df = DataFrame({"A": ["a", "a", "b", "c"]}) xpr = f"Expected an instance of {cls.__name__}" with pytest.raises(TypeError, match=xpr): df.astype({"A": cls}) with pytest.raises(TypeError, match=xpr): df["A"].astype(cls) @pytest.mark.parametrize("dtype", ["Int64", "Int32", "Int16"]) def test_astype_extension_dtypes(self, dtype): # GH#22578 df = DataFrame([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], columns=["a", "b"]) expected1 = DataFrame({ "a": integer_array([1, 3, 5], dtype=dtype), "b": integer_array([2, 4, 6], dtype=dtype), }) tm.assert_frame_equal(df.astype(dtype), expected1) tm.assert_frame_equal(df.astype("int64").astype(dtype), expected1) tm.assert_frame_equal(df.astype(dtype).astype("float64"), df) df = DataFrame([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], columns=["a", "b"]) df["b"] = df["b"].astype(dtype) expected2 = DataFrame({ "a": [1.0, 3.0, 5.0], "b": integer_array([2, 4, 6], dtype=dtype) }) tm.assert_frame_equal(df, expected2) tm.assert_frame_equal(df.astype(dtype), expected1) tm.assert_frame_equal(df.astype("int64").astype(dtype), expected1) @pytest.mark.parametrize("dtype", ["Int64", "Int32", "Int16"]) def test_astype_extension_dtypes_1d(self, dtype): # GH#22578 df = DataFrame({"a": [1.0, 2.0, 3.0]}) expected1 = DataFrame({"a": integer_array([1, 2, 3], dtype=dtype)}) tm.assert_frame_equal(df.astype(dtype), expected1) tm.assert_frame_equal(df.astype("int64").astype(dtype), expected1) df = DataFrame({"a": [1.0, 2.0, 3.0]}) df["a"] = df["a"].astype(dtype) expected2 = DataFrame({"a": integer_array([1, 2, 3], dtype=dtype)}) tm.assert_frame_equal(df, expected2) tm.assert_frame_equal(df.astype(dtype), expected1) tm.assert_frame_equal(df.astype("int64").astype(dtype), expected1) @pytest.mark.parametrize("dtype", ["category", "Int64"]) def test_astype_extension_dtypes_duplicate_col(self, dtype): # GH#24704 a1 = Series([0, np.nan, 4], name="a") a2 = Series([np.nan, 3, 5], name="a") df = concat([a1, a2], axis=1) result = df.astype(dtype) expected = concat([a1.astype(dtype), a2.astype(dtype)], axis=1) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("dtype", [{ 100: "float64", 200: "uint64" }, "category", "float64"]) def test_astype_column_metadata(self, dtype): # GH#19920 columns = UInt64Index([100, 200, 300], name="foo") df = DataFrame(np.arange(15).reshape(5, 3), 
columns=columns) df = df.astype(dtype) tm.assert_index_equal(df.columns, columns) @pytest.mark.parametrize("dtype", ["M8", "m8"]) @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"]) def test_astype_from_datetimelike_to_object(self, dtype, unit): # tests astype to object dtype # GH#19223 / GH#12425 dtype = f"{dtype}[{unit}]" arr = np.array([[1, 2, 3]], dtype=dtype) df = DataFrame(arr) result = df.astype(object) assert (result.dtypes == object).all() if dtype.startswith("M8"): assert result.iloc[0, 0] == Timestamp(1, unit=unit) else: assert result.iloc[0, 0] == Timedelta(1, unit=unit) @pytest.mark.parametrize("arr_dtype", [np.int64, np.float64]) @pytest.mark.parametrize("dtype", ["M8", "m8"]) @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"]) def test_astype_to_datetimelike_unit(self, arr_dtype, dtype, unit): # tests all units from numeric origination # GH#19223 / GH#12425 dtype = f"{dtype}[{unit}]" arr = np.array([[1, 2, 3]], dtype=arr_dtype) df = DataFrame(arr) result = df.astype(dtype) expected = DataFrame(arr.astype(dtype)) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"]) def test_astype_to_datetime_unit(self, unit): # tests all units from datetime origination # GH#19223 dtype = f"M8[{unit}]" arr = np.array([[1, 2, 3]], dtype=dtype) df = DataFrame(arr) result = df.astype(dtype) expected = DataFrame(arr.astype(dtype)) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("unit", ["ns"]) def test_astype_to_timedelta_unit_ns(self, unit): # preserve the timedelta conversion # GH#19223 dtype = f"m8[{unit}]" arr = np.array([[1, 2, 3]], dtype=dtype) df = DataFrame(arr) result = df.astype(dtype) expected = DataFrame(arr.astype(dtype)) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("unit", ["us", "ms", "s", "h", "m", "D"]) def test_astype_to_timedelta_unit(self, unit): # coerce to float # GH#19223 dtype = f"m8[{unit}]" arr = np.array([[1, 2, 3]], dtype=dtype) df = DataFrame(arr) result = df.astype(dtype) expected = DataFrame(df.values.astype(dtype).astype(float)) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"]) def test_astype_to_incorrect_datetimelike(self, unit): # trying to astype a m to a M, or vice-versa # GH#19224 dtype = f"M8[{unit}]" other = f"m8[{unit}]" df = DataFrame(np.array([[1, 2, 3]], dtype=dtype)) msg = (fr"cannot astype a datetimelike from \[datetime64\[ns\]\] to " fr"\[timedelta64\[{unit}\]\]") with pytest.raises(TypeError, match=msg): df.astype(other) msg = (fr"cannot astype a timedelta from \[timedelta64\[ns\]\] to " fr"\[datetime64\[{unit}\]\]") df = DataFrame(np.array([[1, 2, 3]], dtype=other)) with pytest.raises(TypeError, match=msg): df.astype(dtype) def test_astype_arg_for_errors(self): # GH#14878 df = DataFrame([1, 2, 3]) msg = ("Expected value of kwarg 'errors' to be one of " "['raise', 'ignore']. 
Supplied value is 'True'") with pytest.raises(ValueError, match=re.escape(msg)): df.astype(np.float64, errors=True) df.astype(np.int8, errors="ignore") def test_astype_arg_for_errors_dictlist(self): # GH#25905 df = DataFrame([ { "a": "1", "b": "16.5%", "c": "test" }, { "a": "2.2", "b": "15.3", "c": "another_test" }, ]) expected = DataFrame([ { "a": 1.0, "b": "16.5%", "c": "test" }, { "a": 2.2, "b": "15.3", "c": "another_test" }, ]) type_dict = {"a": "float64", "b": "float64", "c": "object"} result = df.astype(dtype=type_dict, errors="ignore") tm.assert_frame_equal(result, expected) def test_astype_dt64tz(self, timezone_frame): # astype expected = np.array( [ [ Timestamp("2013-01-01 00:00:00"), Timestamp("2013-01-02 00:00:00"), Timestamp("2013-01-03 00:00:00"), ], [ Timestamp("2013-01-01 00:00:00-0500", tz="US/Eastern"), NaT, Timestamp("2013-01-03 00:00:00-0500", tz="US/Eastern"), ], [ Timestamp("2013-01-01 00:00:00+0100", tz="CET"), NaT, Timestamp("2013-01-03 00:00:00+0100", tz="CET"), ], ], dtype=object, ).T expected = DataFrame( expected, index=timezone_frame.index, columns=timezone_frame.columns, dtype=object, ) result = timezone_frame.astype(object) tm.assert_frame_equal(result, expected) result = timezone_frame.astype("datetime64[ns]") expected = DataFrame({ "A": date_range("20130101", periods=3), "B": (date_range("20130101", periods=3, tz="US/Eastern").tz_convert("UTC").tz_localize(None)), "C": (date_range("20130101", periods=3, tz="CET").tz_convert("UTC").tz_localize(None)), }) expected.iloc[1, 1] = NaT expected.iloc[1, 2] = NaT tm.assert_frame_equal(result, expected) def test_astype_dt64tz_to_str(self, timezone_frame): # str formatting result = timezone_frame.astype(str) expected = DataFrame( [ [ "2013-01-01", "2013-01-01 00:00:00-05:00", "2013-01-01 00:00:00+01:00", ], ["2013-01-02", "NaT", "NaT"], [ "2013-01-03", "2013-01-03 00:00:00-05:00", "2013-01-03 00:00:00+01:00", ], ], columns=timezone_frame.columns, ) tm.assert_frame_equal(result, expected) with option_context("display.max_columns", 20): result = str(timezone_frame) assert ( "0 2013-01-01 2013-01-01 00:00:00-05:00 2013-01-01 00:00:00+01:00" ) in result assert ( "1 2013-01-02 NaT NaT" ) in result assert ( "2 2013-01-03 2013-01-03 00:00:00-05:00 2013-01-03 00:00:00+01:00" ) in result
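# An illustrative sketch (separate from the test class above) of the
# behaviour the CategoricalDtype cases in test_astype_categorical rely on:
# values absent from an explicit category list become NaN rather than raising.
import pandas as pd
from pandas import CategoricalDtype


def astype_categorical_demo():
    df = pd.DataFrame({"A": list("abbc")})
    out = df.astype(CategoricalDtype(categories=list("ab"), ordered=True))
    # out["A"].tolist() -> ['a', 'b', 'b', nan]; only the two declared
    # categories survive, in the declared (ordered) order.
    return out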
def replot_single(self, i): col = self.joint_data.columns[i] self.outboxes[i].clear_output(wait=True) with self.outboxes[i]: plt.clf() fig = plt.gcf() fig.set_figwidth(3) fig.set_figheight(1) if self.scope.get_dtype(col) in ('cat', 'bool'): if self.scope.get_dtype(col) == 'cat': bar_labels = self.scope.get_cat_values(col) else: bar_labels = [False, True] v = self.joint_data[col].astype( CategoricalDtype(categories=bar_labels, ordered=False)).cat.codes bar_heights, _ = numpy.histogram(v, bins=numpy.arange( 0, len(bar_labels) + 1)) bar_x = numpy.arange(0, len(bar_labels)) plt.bar(bar_x, bar_heights, 0.8, align='edge') filter_vals = self.joint_data.loc[self.joint_filters.all( axis=1), col].astype( CategoricalDtype(categories=bar_labels, ordered=False)).cat.codes bar_heights, _ = numpy.histogram(filter_vals, bins=numpy.arange( 0, len(bar_labels) + 1)) plt.bar(bar_x, bar_heights, 0.8, align='edge') plt.xticks(bar_x + 0.4, [str(lbl) for lbl in bar_labels]) plt.show() else: bins = 20 #n, bins, patches = plt.hist(self.joint_data[col], bins=bins) bar_heights, bar_x = numpy.histogram(self.joint_data[col], bins=bins) plt.bar(bar_x[:-1], bar_heights, bar_x[1:] - bar_x[:-1], align='edge') #n, bins, patches = plt.hist(self.joint_data.loc[self.joint_filters.all(axis=1), col], bins=bins) bar_heights, bar_x = numpy.histogram( self.joint_data.loc[self.joint_filters.all(axis=1), col], bins=bar_x) plt.bar(bar_x[:-1], bar_heights, bar_x[1:] - bar_x[:-1], align='edge') plt.show()
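# A minimal sketch of the histogram-over-categories idiom used in
# replot_single() above: pinning the label order with CategoricalDtype makes
# .cat.codes a stable 0..n-1 encoding, so numpy.histogram bins line up
# one-to-one with the bar labels. The sample data is illustrative.
import numpy
import pandas as pd
from pandas import CategoricalDtype


def categorical_bar_heights_demo():
    values = pd.Series(["low", "high", "low", "mid", "low"])
    bar_labels = ["low", "mid", "high"]
    codes = values.astype(CategoricalDtype(categories=bar_labels)).cat.codes
    bar_heights, _ = numpy.histogram(
        codes, bins=numpy.arange(0, len(bar_labels) + 1))
    return dict(zip(bar_labels, bar_heights))  # {'low': 3, 'mid': 1, 'high': 1}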
class TestAstype: def test_astype_float(self, float_frame): casted = float_frame.astype(int) expected = DataFrame( float_frame.values.astype(int), index=float_frame.index, columns=float_frame.columns, ) tm.assert_frame_equal(casted, expected) casted = float_frame.astype(np.int32) expected = DataFrame( float_frame.values.astype(np.int32), index=float_frame.index, columns=float_frame.columns, ) tm.assert_frame_equal(casted, expected) float_frame["foo"] = "5" casted = float_frame.astype(int) expected = DataFrame( float_frame.values.astype(int), index=float_frame.index, columns=float_frame.columns, ) tm.assert_frame_equal(casted, expected) def test_astype_mixed_float(self, mixed_float_frame): # mixed casting casted = mixed_float_frame.reindex( columns=["A", "B"]).astype("float32") _check_cast(casted, "float32") casted = mixed_float_frame.reindex( columns=["A", "B"]).astype("float16") _check_cast(casted, "float16") def test_astype_mixed_type(self, mixed_type_frame): # mixed casting mn = mixed_type_frame._get_numeric_data().copy() mn["little_float"] = np.array(12345.0, dtype="float16") mn["big_float"] = np.array(123456789101112.0, dtype="float64") casted = mn.astype("float64") _check_cast(casted, "float64") casted = mn.astype("int64") _check_cast(casted, "int64") casted = mn.reindex(columns=["little_float"]).astype("float16") _check_cast(casted, "float16") casted = mn.astype("float32") _check_cast(casted, "float32") casted = mn.astype("int32") _check_cast(casted, "int32") # to object casted = mn.astype("O") _check_cast(casted, "object") @td.skip_array_manager_not_yet_implemented def test_astype_with_exclude_string(self, float_frame): df = float_frame.copy() expected = float_frame.astype(int) df["string"] = "foo" casted = df.astype(int, errors="ignore") expected["string"] = "foo" tm.assert_frame_equal(casted, expected) df = float_frame.copy() expected = float_frame.astype(np.int32) df["string"] = "foo" casted = df.astype(np.int32, errors="ignore") expected["string"] = "foo" tm.assert_frame_equal(casted, expected) def test_astype_with_view_float(self, float_frame): # this is the only real reason to do it this way tf = np.round(float_frame).astype(np.int32) casted = tf.astype(np.float32, copy=False) # TODO(wesm): verification? tf = float_frame.astype(np.float64) casted = tf.astype(np.int64, copy=False) # noqa def test_astype_with_view_mixed_float(self, mixed_float_frame): tf = mixed_float_frame.reindex(columns=["A", "B", "C"]) casted = tf.astype(np.int64) casted = tf.astype(np.float32) # noqa @td.skip_array_manager_not_yet_implemented @pytest.mark.parametrize("dtype", [np.int32, np.int64]) @pytest.mark.parametrize("val", [np.nan, np.inf]) def test_astype_cast_nan_inf_int(self, val, dtype): # see GH#14265 # # Check NaN and inf --> raise error when converting to int. 
msg = "Cannot convert non-finite values \\(NA or inf\\) to integer" df = DataFrame([val]) with pytest.raises(ValueError, match=msg): df.astype(dtype) def test_astype_str(self): # see GH#9757 a = Series(date_range("2010-01-04", periods=5)) b = Series(date_range("3/6/2012 00:00", periods=5, tz="US/Eastern")) c = Series([Timedelta(x, unit="d") for x in range(5)]) d = Series(range(5)) e = Series([0.0, 0.2, 0.4, 0.6, 0.8]) df = DataFrame({"a": a, "b": b, "c": c, "d": d, "e": e}) # Datetime-like result = df.astype(str) expected = DataFrame({ "a": list(map(str, map(lambda x: Timestamp(x)._date_repr, a._values))), "b": list(map(str, map(Timestamp, b._values))), "c": list(map(lambda x: Timedelta(x)._repr_base(), c._values)), "d": list(map(str, d._values)), "e": list(map(str, e._values)), }) tm.assert_frame_equal(result, expected) def test_astype_str_float(self): # see GH#11302 result = DataFrame([np.NaN]).astype(str) expected = DataFrame(["nan"]) tm.assert_frame_equal(result, expected) result = DataFrame([1.12345678901234567890]).astype(str) val = "1.1234567890123457" expected = DataFrame([val]) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("dtype_class", [dict, Series]) def test_astype_dict_like(self, dtype_class): # GH7271 & GH16717 a = Series(date_range("2010-01-04", periods=5)) b = Series(range(5)) c = Series([0.0, 0.2, 0.4, 0.6, 0.8]) d = Series(["1.0", "2", "3.14", "4", "5.4"]) df = DataFrame({"a": a, "b": b, "c": c, "d": d}) original = df.copy(deep=True) # change type of a subset of columns dt1 = dtype_class({"b": "str", "d": "float32"}) result = df.astype(dt1) expected = DataFrame({ "a": a, "b": Series(["0", "1", "2", "3", "4"]), "c": c, "d": Series([1.0, 2.0, 3.14, 4.0, 5.4], dtype="float32"), }) tm.assert_frame_equal(result, expected) tm.assert_frame_equal(df, original) dt2 = dtype_class({"b": np.float32, "c": "float32", "d": np.float64}) result = df.astype(dt2) expected = DataFrame({ "a": a, "b": Series([0.0, 1.0, 2.0, 3.0, 4.0], dtype="float32"), "c": Series([0.0, 0.2, 0.4, 0.6, 0.8], dtype="float32"), "d": Series([1.0, 2.0, 3.14, 4.0, 5.4], dtype="float64"), }) tm.assert_frame_equal(result, expected) tm.assert_frame_equal(df, original) # change all columns dt3 = dtype_class({"a": str, "b": str, "c": str, "d": str}) tm.assert_frame_equal(df.astype(dt3), df.astype(str)) tm.assert_frame_equal(df, original) # error should be raised when using something other than column labels # in the keys of the dtype dict dt4 = dtype_class({"b": str, 2: str}) dt5 = dtype_class({"e": str}) msg = "Only a column name can be used for the key in a dtype mappings argument" with pytest.raises(KeyError, match=msg): df.astype(dt4) with pytest.raises(KeyError, match=msg): df.astype(dt5) tm.assert_frame_equal(df, original) # if the dtypes provided are the same as the original dtypes, the # resulting DataFrame should be the same as the original DataFrame dt6 = dtype_class({col: df[col].dtype for col in df.columns}) equiv = df.astype(dt6) tm.assert_frame_equal(df, equiv) tm.assert_frame_equal(df, original) # GH#16717 # if dtypes provided is empty, the resulting DataFrame # should be the same as the original DataFrame dt7 = dtype_class({}) if dtype_class is dict else dtype_class( {}, dtype=object) equiv = df.astype(dt7) tm.assert_frame_equal(df, equiv) tm.assert_frame_equal(df, original) def test_astype_duplicate_col(self): a1 = Series([1, 2, 3, 4, 5], name="a") b = Series([0.1, 0.2, 0.4, 0.6, 0.8], name="b") a2 = Series([0, 1, 2, 3, 4], name="a") df = concat([a1, b, a2], axis=1) result = 
df.astype(str) a1_str = Series(["1", "2", "3", "4", "5"], dtype="str", name="a") b_str = Series(["0.1", "0.2", "0.4", "0.6", "0.8"], dtype=str, name="b") a2_str = Series(["0", "1", "2", "3", "4"], dtype="str", name="a") expected = concat([a1_str, b_str, a2_str], axis=1) tm.assert_frame_equal(result, expected) result = df.astype({"a": "str"}) expected = concat([a1_str, b, a2_str], axis=1) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( "dtype", [ "category", CategoricalDtype(), CategoricalDtype(ordered=True), CategoricalDtype(ordered=False), CategoricalDtype(categories=list("abcdef")), CategoricalDtype(categories=list("edba"), ordered=False), CategoricalDtype(categories=list("edcb"), ordered=True), ], ids=repr, ) def test_astype_categorical(self, dtype): # GH#18099 d = {"A": list("abbc"), "B": list("bccd"), "C": list("cdde")} df = DataFrame(d) result = df.astype(dtype) expected = DataFrame({k: Categorical(d[k], dtype=dtype) for k in d}) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( "cls", [CategoricalDtype, DatetimeTZDtype, IntervalDtype]) def test_astype_categoricaldtype_class_raises(self, cls): df = DataFrame({"A": ["a", "a", "b", "c"]}) xpr = f"Expected an instance of {cls.__name__}" with pytest.raises(TypeError, match=xpr): df.astype({"A": cls}) with pytest.raises(TypeError, match=xpr): df["A"].astype(cls) @pytest.mark.parametrize("dtype", ["Int64", "Int32", "Int16"]) def test_astype_extension_dtypes(self, dtype): # GH#22578 df = DataFrame([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], columns=["a", "b"]) expected1 = DataFrame({ "a": pd.array([1, 3, 5], dtype=dtype), "b": pd.array([2, 4, 6], dtype=dtype), }) tm.assert_frame_equal(df.astype(dtype), expected1) tm.assert_frame_equal(df.astype("int64").astype(dtype), expected1) tm.assert_frame_equal(df.astype(dtype).astype("float64"), df) df = DataFrame([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], columns=["a", "b"]) df["b"] = df["b"].astype(dtype) expected2 = DataFrame({ "a": [1.0, 3.0, 5.0], "b": pd.array([2, 4, 6], dtype=dtype) }) tm.assert_frame_equal(df, expected2) tm.assert_frame_equal(df.astype(dtype), expected1) tm.assert_frame_equal(df.astype("int64").astype(dtype), expected1) @pytest.mark.parametrize("dtype", ["Int64", "Int32", "Int16"]) def test_astype_extension_dtypes_1d(self, dtype): # GH#22578 df = DataFrame({"a": [1.0, 2.0, 3.0]}) expected1 = DataFrame({"a": pd.array([1, 2, 3], dtype=dtype)}) tm.assert_frame_equal(df.astype(dtype), expected1) tm.assert_frame_equal(df.astype("int64").astype(dtype), expected1) df = DataFrame({"a": [1.0, 2.0, 3.0]}) df["a"] = df["a"].astype(dtype) expected2 = DataFrame({"a": pd.array([1, 2, 3], dtype=dtype)}) tm.assert_frame_equal(df, expected2) tm.assert_frame_equal(df.astype(dtype), expected1) tm.assert_frame_equal(df.astype("int64").astype(dtype), expected1) @pytest.mark.parametrize("dtype", ["category", "Int64"]) def test_astype_extension_dtypes_duplicate_col(self, dtype): # GH#24704 a1 = Series([0, np.nan, 4], name="a") a2 = Series([np.nan, 3, 5], name="a") df = concat([a1, a2], axis=1) result = df.astype(dtype) expected = concat([a1.astype(dtype), a2.astype(dtype)], axis=1) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("dtype", [{ 100: "float64", 200: "uint64" }, "category", "float64"]) def test_astype_column_metadata(self, dtype): # GH#19920 columns = UInt64Index([100, 200, 300], name="foo") df = DataFrame(np.arange(15).reshape(5, 3), columns=columns) df = df.astype(dtype) tm.assert_index_equal(df.columns, columns) @pytest.mark.parametrize("dtype", 
["M8", "m8"]) @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"]) def test_astype_from_datetimelike_to_object(self, dtype, unit): # tests astype to object dtype # GH#19223 / GH#12425 dtype = f"{dtype}[{unit}]" arr = np.array([[1, 2, 3]], dtype=dtype) df = DataFrame(arr) result = df.astype(object) assert (result.dtypes == object).all() if dtype.startswith("M8"): assert result.iloc[0, 0] == Timestamp(1, unit=unit) else: assert result.iloc[0, 0] == Timedelta(1, unit=unit) @pytest.mark.parametrize("arr_dtype", [np.int64, np.float64]) @pytest.mark.parametrize("dtype", ["M8", "m8"]) @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"]) def test_astype_to_datetimelike_unit(self, arr_dtype, dtype, unit): # tests all units from numeric origination # GH#19223 / GH#12425 dtype = f"{dtype}[{unit}]" arr = np.array([[1, 2, 3]], dtype=arr_dtype) df = DataFrame(arr) result = df.astype(dtype) expected = DataFrame(arr.astype(dtype)) tm.assert_frame_equal(result, expected) @td.skip_array_manager_not_yet_implemented @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"]) def test_astype_to_datetime_unit(self, unit): # tests all units from datetime origination # GH#19223 dtype = f"M8[{unit}]" arr = np.array([[1, 2, 3]], dtype=dtype) df = DataFrame(arr) result = df.astype(dtype) expected = DataFrame(arr.astype(dtype)) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("unit", ["ns"]) def test_astype_to_timedelta_unit_ns(self, unit): # preserver the timedelta conversion # GH#19223 dtype = f"m8[{unit}]" arr = np.array([[1, 2, 3]], dtype=dtype) df = DataFrame(arr) result = df.astype(dtype) expected = DataFrame(arr.astype(dtype)) tm.assert_frame_equal(result, expected) @td.skip_array_manager_not_yet_implemented @pytest.mark.parametrize("unit", ["us", "ms", "s", "h", "m", "D"]) def test_astype_to_timedelta_unit(self, unit): # coerce to float # GH#19223 dtype = f"m8[{unit}]" arr = np.array([[1, 2, 3]], dtype=dtype) df = DataFrame(arr) result = df.astype(dtype) expected = DataFrame(df.values.astype(dtype).astype(float)) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"]) def test_astype_to_incorrect_datetimelike(self, unit): # trying to astype a m to a M, or vice-versa # GH#19224 dtype = f"M8[{unit}]" other = f"m8[{unit}]" df = DataFrame(np.array([[1, 2, 3]], dtype=dtype)) msg = fr"Cannot cast DatetimeArray to dtype timedelta64\[{unit}\]" with pytest.raises(TypeError, match=msg): df.astype(other) msg = fr"Cannot cast TimedeltaArray to dtype datetime64\[{unit}\]" df = DataFrame(np.array([[1, 2, 3]], dtype=other)) with pytest.raises(TypeError, match=msg): df.astype(dtype) @td.skip_array_manager_not_yet_implemented def test_astype_arg_for_errors(self): # GH#14878 df = DataFrame([1, 2, 3]) msg = ("Expected value of kwarg 'errors' to be one of " "['raise', 'ignore']. 
Supplied value is 'True'") with pytest.raises(ValueError, match=re.escape(msg)): df.astype(np.float64, errors=True) df.astype(np.int8, errors="ignore") def test_astype_arg_for_errors_dictlist(self): # GH#25905 df = DataFrame([ { "a": "1", "b": "16.5%", "c": "test" }, { "a": "2.2", "b": "15.3", "c": "another_test" }, ]) expected = DataFrame([ { "a": 1.0, "b": "16.5%", "c": "test" }, { "a": 2.2, "b": "15.3", "c": "another_test" }, ]) type_dict = {"a": "float64", "b": "float64", "c": "object"} result = df.astype(dtype=type_dict, errors="ignore") tm.assert_frame_equal(result, expected) def test_astype_dt64tz(self, timezone_frame): # astype expected = np.array( [ [ Timestamp("2013-01-01 00:00:00"), Timestamp("2013-01-02 00:00:00"), Timestamp("2013-01-03 00:00:00"), ], [ Timestamp("2013-01-01 00:00:00-0500", tz="US/Eastern"), NaT, Timestamp("2013-01-03 00:00:00-0500", tz="US/Eastern"), ], [ Timestamp("2013-01-01 00:00:00+0100", tz="CET"), NaT, Timestamp("2013-01-03 00:00:00+0100", tz="CET"), ], ], dtype=object, ).T expected = DataFrame( expected, index=timezone_frame.index, columns=timezone_frame.columns, dtype=object, ) result = timezone_frame.astype(object) tm.assert_frame_equal(result, expected) with tm.assert_produces_warning(FutureWarning): # dt64tz->dt64 deprecated result = timezone_frame.astype("datetime64[ns]") expected = DataFrame({ "A": date_range("20130101", periods=3), "B": (date_range("20130101", periods=3, tz="US/Eastern").tz_convert("UTC").tz_localize(None)), "C": (date_range("20130101", periods=3, tz="CET").tz_convert("UTC").tz_localize(None)), }) expected.iloc[1, 1] = NaT expected.iloc[1, 2] = NaT tm.assert_frame_equal(result, expected) def test_astype_dt64tz_to_str(self, timezone_frame): # str formatting result = timezone_frame.astype(str) expected = DataFrame( [ [ "2013-01-01", "2013-01-01 00:00:00-05:00", "2013-01-01 00:00:00+01:00", ], ["2013-01-02", "NaT", "NaT"], [ "2013-01-03", "2013-01-03 00:00:00-05:00", "2013-01-03 00:00:00+01:00", ], ], columns=timezone_frame.columns, ) tm.assert_frame_equal(result, expected) with option_context("display.max_columns", 20): result = str(timezone_frame) assert ( "0 2013-01-01 2013-01-01 00:00:00-05:00 2013-01-01 00:00:00+01:00" ) in result assert ( "1 2013-01-02 NaT NaT" ) in result assert ( "2 2013-01-03 2013-01-03 00:00:00-05:00 2013-01-03 00:00:00+01:00" ) in result def test_astype_empty_dtype_dict(self): # issue mentioned further down in the following issue's thread # https://github.com/pandas-dev/pandas/issues/33113 df = DataFrame() result = df.astype({}) tm.assert_frame_equal(result, df) assert result is not df @td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) ignore keyword @pytest.mark.parametrize( "df", [ DataFrame(Series(["x", "y", "z"], dtype="string")), DataFrame(Series(["x", "y", "z"], dtype="category")), DataFrame(Series(3 * [Timestamp("2020-01-01", tz="UTC")])), DataFrame(Series(3 * [Interval(0, 1)])), ], ) @pytest.mark.parametrize("errors", ["raise", "ignore"]) def test_astype_ignores_errors_for_extension_dtypes(self, df, errors): # https://github.com/pandas-dev/pandas/issues/35471 if errors == "ignore": expected = df result = df.astype(float, errors=errors) tm.assert_frame_equal(result, expected) else: msg = "(Cannot cast)|(could not convert)" with pytest.raises((ValueError, TypeError), match=msg): df.astype(float, errors=errors) def test_astype_tz_conversion(self): # GH 35973 val = { "tz": date_range("2020-08-30", freq="d", periods=2, tz="Europe/London") } df = DataFrame(val) result = df.astype({"tz": 
"datetime64[ns, Europe/Berlin]"}) expected = df expected["tz"] = expected["tz"].dt.tz_convert("Europe/Berlin") tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("tz", ["UTC", "Europe/Berlin"]) def test_astype_tz_object_conversion(self, tz): # GH 35973 val = { "tz": date_range("2020-08-30", freq="d", periods=2, tz="Europe/London") } expected = DataFrame(val) # convert expected to object dtype from other tz str (independently tested) result = expected.astype({"tz": f"datetime64[ns, {tz}]"}) result = result.astype({"tz": "object"}) # do real test: object dtype to a specified tz, different from construction tz. result = result.astype({"tz": "datetime64[ns, Europe/London]"}) tm.assert_frame_equal(result, expected) def test_astype_dt64_to_string(self, frame_or_series, tz_naive_fixture, request): tz = tz_naive_fixture if tz is None: mark = pytest.mark.xfail( reason= "GH#36153 uses ndarray formatting instead of DTA formatting") request.node.add_marker(mark) dti = date_range("2016-01-01", periods=3, tz=tz) dta = dti._data dta[0] = NaT obj = frame_or_series(dta) result = obj.astype("string") # Check that Series/DataFrame.astype matches DatetimeArray.astype expected = frame_or_series(dta.astype("string")) tm.assert_equal(result, expected) item = result.iloc[0] if frame_or_series is DataFrame: item = item.iloc[0] assert item is pd.NA # For non-NA values, we should match what we get for non-EA str alt = obj.astype(str) assert np.all(alt.iloc[1:] == result.iloc[1:]) def test_astype_bytes(self): # GH#39474 result = DataFrame(["foo", "bar", "baz"]).astype(bytes) assert result.dtypes[0] == np.dtype("S3")
def predict(model: keras.Model, standard_scaler: CustomStandardScaler, tf_idf: TfidfVectorizer, column_dummies, df_past, df_future): """model : Keras Model""" df_past.date_time = pd.to_datetime(df_past.date_time) df_future.date_time = pd.to_datetime(df_future.date_time) df_past.holiday = df_past.holiday != 'None' df_future.holiday = df_future.holiday != 'None' t = tf_idf.transform(df_past.weather_description) t2 = tf_idf.transform(df_future.weather_description) df_past = pd.concat([ df_past, pd.DataFrame(data=t.toarray(), index=df_past.index).add_prefix('tag_') ], axis=1) df_future = pd.concat([ df_future, pd.DataFrame(data=t2.toarray(), index=df_future.index).add_prefix('tag_') ], axis=1) df_past.drop(columns='weather_description', inplace=True) df_future.drop(columns='weather_description', inplace=True) df_past['hour'] = df_past.date_time.dt.hour df_past['weekday'] = df_past.date_time.dt.day_name() df_past['day'] = df_past.date_time.dt.day df_past['month'] = df_past.date_time.dt.month_name() df_past.holiday = df_past.holiday.astype(int) df_future['hour'] = df_future.date_time.dt.hour df_future['weekday'] = df_future.date_time.dt.day_name() df_future['day'] = df_future.date_time.dt.day df_future['month'] = df_future.date_time.dt.month_name() df_future.holiday = df_future.holiday.astype(int) for col, values in column_dummies.items(): df_future[col] = df_future[col].astype(CategoricalDtype(values)) df_past[col] = df_past[col].astype(CategoricalDtype(values)) df_past = df_past.join( pd.get_dummies(df_past.weather_main, prefix='weather')) df_past = df_past.join(pd.get_dummies(df_past.hour, prefix='hour')) df_past = df_past.join(pd.get_dummies(df_past.weekday, prefix='weekday')) df_past = df_past.join(pd.get_dummies(df_past.day, prefix='day')) df_past = df_past.join(pd.get_dummies(df_past.month, prefix='month')) df_future = df_future.join( pd.get_dummies(df_future.weather_main, prefix='weather')) df_future = df_future.join(pd.get_dummies(df_future.hour, prefix='hour')) df_future = df_future.join( pd.get_dummies(df_future.weekday, prefix='weekday')) df_future = df_future.join(pd.get_dummies(df_future.day, prefix='day')) df_future = df_future.join(pd.get_dummies(df_future.month, prefix='month')) df_past = df_past.drop( columns=['weather_main', 'hour', 'weekday', 'day', 'month']) df_future = df_future.drop( columns=['weather_main', 'hour', 'weekday', 'day', 'month']) df_past.drop(columns='date_time', inplace=True) df_future.drop(columns='date_time', inplace=True) traffic = df_past['traffic_volume'].values.reshape(-1, 1) df_past, df_future, traffic = standard_scaler.transform( [df_past, df_future, traffic]) df_future = df_future[np.newaxis, :] df_past = df_past[np.newaxis, :] y = model.predict((df_past, df_future)) y = tf.squeeze(y) y = y.numpy() return standard_scaler.ss[2].inverse_transform(y)
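# A minimal sketch of why predict() above casts columns to a fixed
# CategoricalDtype before pd.get_dummies: dummies are emitted per *category*,
# not per observed value, so the past and future frames get identical one-hot
# columns. The category list here is hypothetical.
import pandas as pd
from pandas import CategoricalDtype


def aligned_dummies_demo():
    weather = CategoricalDtype(["Clear", "Clouds", "Rain"])
    past = pd.Series(["Clear", "Rain"]).astype(weather)
    future = pd.Series(["Clouds"]).astype(weather)
    past_dummies = pd.get_dummies(past, prefix="weather")
    future_dummies = pd.get_dummies(future, prefix="weather")
    # Both frames end up with columns ['weather_Clear', 'weather_Clouds',
    # 'weather_Rain'], even though each saw only a subset of the values.
    return past_dummies, future_dummies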