def test_astype_str(self):
    """Casting string/numeric and datetime-like Series to string types.

    Covers GH4405 (plain str casting equals ``map(text_type)``) and
    GH9757 (str/unicode casting of Timestamp and Timedelta values).
    """
    # GH4405
    digits = string.digits
    all_strings = Series([digits * 10, tm.rands(63), tm.rands(64),
                          tm.rands(1000)])
    with_nan = Series([digits * 10, tm.rands(63), tm.rands(64), nan, 1.0])

    for string_type in (compat.text_type, np.str_):
        for series in (all_strings, with_nan):
            converted = series.astype(string_type)
            via_map = series.map(compat.text_type)
            assert_series_equal(converted, via_map)

    # GH9757
    # Test str and unicode on python 2.x and just str on python 3.x
    for string_type in {str, compat.text_type}:
        stamped = Series([Timestamp('2010-01-04 00:00:00')])
        assert_series_equal(stamped.astype(string_type),
                            Series([string_type('2010-01-04')]))

        stamped = Series([Timestamp('2010-01-04 00:00:00',
                                    tz='US/Eastern')])
        assert_series_equal(
            stamped.astype(string_type),
            Series([string_type('2010-01-04 00:00:00-05:00')]))

        delta = Series([Timedelta(1, unit='d')])
        assert_series_equal(
            delta.astype(string_type),
            Series([string_type('1 days 00:00:00.000000000')]))
def test_astype_categorical_to_categorical(self, name, dtype_ordered,
                                           series_ordered):
    """Cast a categorical Series to another categorical dtype.

    Parametrized over the Series name, the target dtype's ``ordered``
    flag and the source dtype's ``ordered`` flag (GH 10696 / GH 18593).
    Also checks the deprecated ``astype('category', ...)`` keyword form.
    """
    # GH 10696/18593
    s_data = list('abcaacbab')
    s_dtype = CategoricalDtype(list('bac'), ordered=series_ordered)
    s = Series(s_data, dtype=s_dtype, name=name)

    # unspecified categories: the source categories are preserved,
    # only the ordered flag changes
    dtype = CategoricalDtype(ordered=dtype_ordered)
    result = s.astype(dtype)
    exp_dtype = CategoricalDtype(s_dtype.categories, dtype_ordered)
    expected = Series(s_data, name=name, dtype=exp_dtype)
    tm.assert_series_equal(result, expected)

    # legacy keyword spelling must still work, but warns
    with tm.assert_produces_warning(FutureWarning,
                                    check_stacklevel=False):
        result = s.astype('category', ordered=dtype_ordered)
    tm.assert_series_equal(result, expected)

    # different categories: values absent from the target become NaN
    dtype = CategoricalDtype(list('adc'), dtype_ordered)
    result = s.astype(dtype)
    expected = Series(s_data, name=name, dtype=dtype)
    tm.assert_series_equal(result, expected)

    with tm.assert_produces_warning(FutureWarning,
                                    check_stacklevel=False):
        result = s.astype('category', categories=list('adc'),
                          ordered=dtype_ordered)
    tm.assert_series_equal(result, expected)

    if dtype_ordered is False:
        # not specifying ordered, so only test once
        expected = s
        result = s.astype('category')
        tm.assert_series_equal(result, expected)
def test_astype_categorical_to_other(self):
    """Casting categorical Series to non-categorical dtypes.

    Checks no-op category casts, invalid float casts, str/int casts,
    object round-trips, and that passing the Categorical *class* (not
    a dtype instance) to astype raises TypeError.
    """
    # a Series of 100 random ints bucketed into labelled 500-wide bins
    value = np.random.RandomState(0).randint(0, 10000, 100)
    df = DataFrame({'value': value})
    labels = ["{0} - {1}".format(i, i + 499)
              for i in range(0, 10000, 500)]
    cat_labels = Categorical(labels, labels)

    df = df.sort_values(by=['value'], ascending=True)
    df['value_group'] = pd.cut(df.value, range(0, 10500, 500),
                               right=False, labels=cat_labels)

    s = df['value_group']
    expected = s
    # casting to the same categorical dtype is a no-op
    tm.assert_series_equal(s.astype('category'), expected)
    tm.assert_series_equal(s.astype(CategoricalDtype()), expected)
    # string labels cannot become floats
    msg = (r"could not convert string to float|"
           r"invalid literal for float\(\)")
    with pytest.raises(ValueError, match=msg):
        s.astype('float64')

    cat = Series(Categorical(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']))
    exp = Series(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c'])
    tm.assert_series_equal(cat.astype('str'), exp)
    s2 = Series(Categorical(['1', '2', '3', '4']))
    exp2 = Series([1, 2, 3, 4]).astype(int)
    tm.assert_series_equal(s2.astype('int'), exp2)

    # object don't sort correctly, so just compare that we have the same
    # values
    def cmp(a, b):
        tm.assert_almost_equal(np.sort(np.unique(a)),
                               np.sort(np.unique(b)))

    expected = Series(np.array(s.values), name='value_group')
    cmp(s.astype('object'), expected)
    cmp(s.astype(np.object_), expected)

    # array conversion
    tm.assert_almost_equal(np.array(s), np.array(s.values))

    # valid conversion
    for valid in [lambda x: x.astype('category'),
                  lambda x: x.astype(CategoricalDtype()),
                  lambda x: x.astype('object').astype('category'),
                  lambda x: x.astype('object').astype(
                      CategoricalDtype())
                  ]:
        result = valid(s)
        # compare series values
        # internal .categories can't be compared because it is sorted
        tm.assert_series_equal(result, s, check_categorical=False)

    # invalid conversion (these are NOT a dtype)
    msg = (r"invalid type <class 'pandas\.core\.arrays\.categorical\."
           "Categorical'> for astype")
    for invalid in [lambda x: x.astype(Categorical),
                    lambda x: x.astype('object').astype(Categorical)]:
        with pytest.raises(TypeError, match=msg):
            invalid(s)
def test_astype_cast_nan_inf_int(self, dtype, value): # gh-14265: check NaN and inf raise error when converting to int msg = 'Cannot convert non-finite values \\(NA or inf\\) to integer' s = Series([value]) with tm.assert_raises_regex(ValueError, msg): s.astype(dtype)
def test_astype_datetime64tz(self): s = Series(date_range('20130101', periods=3, tz='US/Eastern')) # astype result = s.astype(object) expected = Series(s.astype(object), dtype=object) tm.assert_series_equal(result, expected) result = Series(s.values).dt.tz_localize('UTC').dt.tz_convert(s.dt.tz) tm.assert_series_equal(result, s) # astype - object, preserves on construction result = Series(s.astype(object)) expected = s.astype(object) tm.assert_series_equal(result, expected) # astype - datetime64[ns, tz] result = Series(s.values).astype('datetime64[ns, US/Eastern]') tm.assert_series_equal(result, s) result = Series(s.values).astype(s.dtype) tm.assert_series_equal(result, s) result = s.astype('datetime64[ns, CET]') expected = Series(date_range('20130101 06:00:00', periods=3, tz='CET')) tm.assert_series_equal(result, expected)
def test_interp_scipy_basic(self):
    """scipy-backed interpolation methods on a Series with NaN holes."""
    tm._skip_if_no_scipy()
    s = Series([1, 3, np.nan, 12, np.nan, 25])

    # slinear
    expected = Series([1., 3., 7.5, 12., 18.5, 25.])
    result = s.interpolate(method='slinear')
    assert_series_equal(result, expected)

    # BUG FIX: 'donwcast' was a typo for 'downcast', so the keyword was
    # silently ignored instead of exercising the downcast path
    result = s.interpolate(method='slinear', downcast='infer')
    assert_series_equal(result, expected)

    # nearest
    expected = Series([1, 3, 3, 12, 12, 25])
    result = s.interpolate(method='nearest')
    assert_series_equal(result, expected.astype('float'))

    result = s.interpolate(method='nearest', downcast='infer')
    assert_series_equal(result, expected)

    # zero
    expected = Series([1, 3, 3, 12, 12, 25])
    result = s.interpolate(method='zero')
    assert_series_equal(result, expected.astype('float'))

    result = s.interpolate(method='zero', downcast='infer')
    assert_series_equal(result, expected)

    # quadratic
    expected = Series([1, 3., 6.769231, 12., 18.230769, 25.])
    result = s.interpolate(method='quadratic')
    assert_series_equal(result, expected)

    result = s.interpolate(method='quadratic', downcast='infer')
    assert_series_equal(result, expected)

    # cubic
    expected = Series([1., 3., 6.8, 12., 18.2, 25.])
    result = s.interpolate(method='cubic')
    assert_series_equal(result, expected)
def test_astype_generic_timestamp_no_frequency(self, dtype): # see gh-15524, gh-15987 data = [1] s = Series(data) msg = "dtype has no unit. Please pass in" with pytest.raises(ValueError, match=msg): s.astype(dtype)
def test_arg_for_errors_in_astype(self): # see gh-14878 s = Series([1, 2, 3]) with pytest.raises(ValueError): s.astype(np.float64, errors=False) s.astype(np.int8, errors='raise')
def test_astype_extension_dtypes_duplicate_col(self, dtype): # GH 24704 a1 = Series([0, np.nan, 4], name='a') a2 = Series([np.nan, 3, 5], name='a') df = concat([a1, a2], axis=1) result = df.astype(dtype) expected = concat([a1.astype(dtype), a2.astype(dtype)], axis=1) assert_frame_equal(result, expected)
def test_astype_categories_deprecation(self):
    """astype('category', categories=..., ordered=...) is deprecated.

    The keyword form must warn and produce the same result as passing a
    ``CategoricalDtype`` directly (GH 17636).
    """
    # deprecated 17636
    s = Series(['a', 'b', 'a'])
    expected = s.astype(CategoricalDtype(['a', 'b'], ordered=True))
    with tm.assert_produces_warning(FutureWarning,
                                    check_stacklevel=False):
        result = s.astype('category', categories=['a', 'b'], ordered=True)
    tm.assert_series_equal(result, expected)
def test_arg_for_errors_in_astype(self): # see gh-14878 s = Series([1, 2, 3]) msg = (r"Expected value of kwarg 'errors' to be one of \['raise'," r" 'ignore'\]\. Supplied value is 'False'") with pytest.raises(ValueError, match=msg): s.astype(np.float64, errors=False) s.astype(np.int8, errors='raise')
def test_astype_generic_timestamp_no_frequency(self, dtype): # see gh-15524, gh-15987 data = [1] s = Series(data) msg = ((r"The '{dtype}' dtype has no unit\. " r"Please pass in '{dtype}\[ns\]' instead.") .format(dtype=dtype.__name__)) with pytest.raises(ValueError, match=msg): s.astype(dtype)
def test_astype_cast_nan_inf_int(self):
    """NaN and inf must raise when cast to integer dtypes (GH14265)."""
    # GH14265, check nan and inf raise error when converting to int
    types = [np.int32, np.int64]
    values = [np.nan, np.inf]
    # BUG FIX: raw string — '\(' is an invalid escape sequence in a
    # normal string literal (SyntaxWarning on modern Python)
    msg = r'Cannot convert non-finite values \(NA or inf\) to integer'

    for this_type in types:
        for this_val in values:
            s = Series([this_val])
            # assertRaisesRegexp was removed in Python 3.12;
            # assertRaisesRegex is the supported spelling
            with self.assertRaisesRegex(ValueError, msg):
                s.astype(this_type)
def test_astype_dict_like(self, dtype_class): # see gh-7271 s = Series(range(0, 10, 2), name='abc') dt1 = dtype_class({'abc': str}) result = s.astype(dt1) expected = Series(['0', '2', '4', '6', '8'], name='abc') tm.assert_series_equal(result, expected) dt2 = dtype_class({'abc': 'float64'}) result = s.astype(dt2) expected = Series([0.0, 2.0, 4.0, 6.0, 8.0], dtype='float64', name='abc') tm.assert_series_equal(result, expected) dt3 = dtype_class({'abc': str, 'def': str}) with pytest.raises(KeyError): s.astype(dt3) dt4 = dtype_class({0: str}) with pytest.raises(KeyError): s.astype(dt4) # GH16717 # if dtypes provided is empty, it should error dt5 = dtype_class({}) with pytest.raises(KeyError): s.astype(dt5)
def test_astype_dict(self):
    """Series.astype with a dict keyed by the Series name (GH7271)."""
    # GH7271
    ser = Series(range(0, 10, 2), name='abc')

    converted = ser.astype({'abc': str})
    assert_series_equal(converted,
                        Series(['0', '2', '4', '6', '8'], name='abc'))

    converted = ser.astype({'abc': 'float64'})
    assert_series_equal(converted,
                        Series([0.0, 2.0, 4.0, 6.0, 8.0],
                               dtype='float64', name='abc'))

    # keys other than the Series name are invalid
    self.assertRaises(KeyError, ser.astype, {'abc': str, 'def': str})
    self.assertRaises(KeyError, ser.astype, {0: str})
def test_astype_datetimes(self):
    """Casting datetime64 Series to object always yields object dtype."""
    import pandas.tslib as tslib

    # an all-NaT datetime series
    ser = Series(tslib.iNaT, dtype='M8[ns]', index=lrange(5))
    ser = ser.astype('O')
    self.assertEqual(ser.dtype, np.object_)

    # a single concrete datetime
    ser = Series([datetime(2001, 1, 2, 0, 0)])
    ser = ser.astype('O')
    self.assertEqual(ser.dtype, np.object_)

    # datetimes with a NaN hole stay datetime64 until the cast
    ser = Series([datetime(2001, 1, 2, 0, 0) for i in range(3)])
    ser[1] = np.nan
    self.assertEqual(ser.dtype, 'M8[ns]')
    ser = ser.astype('O')
    self.assertEqual(ser.dtype, np.object_)
def test_is_equal_dtype(self):
    """Dtype equality between Categoricals, Indexes and Series."""
    # test dtype comparisons between cats
    cat_abc = Categorical(list('aabca'), categories=list('abc'),
                          ordered=False)
    cat_cab = Categorical(list('aabca'), categories=list('cab'),
                          ordered=False)
    cat_cab_ord = Categorical(list('aabca'), categories=list('cab'),
                              ordered=True)

    # every categorical equals itself
    assert cat_abc.is_dtype_equal(cat_abc)
    assert cat_cab.is_dtype_equal(cat_cab)
    assert cat_cab_ord.is_dtype_equal(cat_cab_ord)

    # category order does not matter; the ordered flag does
    assert cat_abc.is_dtype_equal(cat_cab)
    assert not cat_abc.is_dtype_equal(cat_cab_ord)
    assert not cat_abc.is_dtype_equal(Index(list('aabca')))
    assert not cat_abc.is_dtype_equal(cat_abc.astype(object))
    assert cat_abc.is_dtype_equal(CategoricalIndex(cat_abc))
    assert cat_abc.is_dtype_equal(
        CategoricalIndex(cat_abc, categories=list('cab')))
    assert not cat_abc.is_dtype_equal(
        CategoricalIndex(cat_abc, ordered=True))

    # GH 16659: the same comparisons against Series wrappers
    ser_abc = Series(cat_abc)
    ser_cab = Series(cat_cab)
    ser_cab_ord = Series(cat_cab_ord)
    assert cat_abc.is_dtype_equal(ser_abc)
    assert cat_cab.is_dtype_equal(ser_cab)
    assert cat_cab_ord.is_dtype_equal(ser_cab_ord)

    assert cat_abc.is_dtype_equal(ser_cab)
    assert not cat_abc.is_dtype_equal(ser_cab_ord)

    assert not cat_abc.is_dtype_equal(ser_abc.astype(object))
def test_custom_grouper(self):
    """TimeGrouper with a 5-minute frequency over a minute-level index."""
    dti = DatetimeIndex(freq='Min', start=datetime(2005,1,1),
                        end=datetime(2005,1,10))

    data = np.array([1]*len(dti))
    s = Series(data, index=dti)
    b = TimeGrouper(Minute(5))
    g = s.groupby(b)

    # 9 days of minutes in 5-minute buckets (+1 bucket for the endpoint)
    self.assertEquals(g.ngroups, 2593)

    # construct expected val
    arr = [5] * 2592
    arr.append(1)
    idx = dti[0:-1:5]
    idx = idx.append(DatetimeIndex([np.datetime64(dti[-1])]))
    expect = Series(arr, index=idx)

    # cython returns float for now
    result = g.agg(np.sum)
    assert_series_equal(result, expect.astype(float))

    # same grouper applied to a DataFrame keeps all columns
    data = np.random.rand(len(dti), 10)
    df = DataFrame(data, index=dti)
    r = df.groupby(b).agg(np.sum)
    self.assertEquals(len(r.columns), 10)
    self.assertEquals(len(r.index), 2593)
def test_timedelta64_conversions(self):
    """Division and astype conversions between timedelta64 units."""
    startdate = Series(date_range('2013-01-01', '2013-01-03'))
    enddate = Series(date_range('2013-03-01', '2013-03-03'))

    s1 = enddate - startdate
    s1[2] = np.nan

    for m in [1, 3, 10]:
        for unit in ['D', 'h', 'm', 's', 'ms', 'us', 'ns']:
            # op
            expected = s1.apply(lambda x: x / np.timedelta64(m, unit))
            result = s1 / np.timedelta64(m, unit)
            assert_series_equal(result, expected)

            if m == 1 and unit != 'ns':
                # astype
                result = s1.astype("timedelta64[{0}]".format(unit))
                assert_series_equal(result, expected)

            # reverse op
            expected = s1.apply(
                lambda x: Timedelta(np.timedelta64(m, unit)) / x)
            result = np.timedelta64(m, unit) / s1
            # BUG FIX: the reverse-op result was computed but never
            # checked; assert it just like the forward op
            assert_series_equal(result, expected)

    # astype to object preserves scalar types
    s = Series(date_range('20130101', periods=3))
    result = s.astype(object)
    self.assertIsInstance(result.iloc[0], datetime)
    self.assertTrue(result.dtype == np.object_)

    result = s1.astype(object)
    self.assertIsInstance(result.iloc[0], timedelta)
    self.assertTrue(result.dtype == np.object_)
def test_hourly(self):
    """pivot_annual on hourly data, including leap-day alignment."""
    rng_hourly = date_range('1/1/1994', periods=(18 * 8760 + 4 * 24),
                            freq='H')
    data_hourly = np.random.randint(100, 350, rng_hourly.size)
    ts_hourly = Series(data_hourly, index=rng_hourly)

    # hour-of-year for every observation, computed independently
    grouped = ts_hourly.groupby(ts_hourly.index.year)
    hoy = grouped.apply(lambda x: x.reset_index(drop=True))
    hoy = hoy.index.droplevel(0).values
    # shift non-leap years past the missing 24-hour Feb-29 block
    hoy[~isleapyear(ts_hourly.index.year) & (hoy >= 1416)] += 24
    hoy += 1

    annual = pivot_annual(ts_hourly)

    ts_hourly = ts_hourly.astype(float)
    # spot-check hours around the leap-day boundary and the extremes
    for i in [1, 1416, 1417, 1418, 1439, 1440, 1441, 8784]:
        subset = ts_hourly[hoy == i]
        subset.index = [x.year for x in subset.index]

        result = annual[i].dropna()
        tm.assert_series_equal(result, subset, check_names=False)
        self.assertEqual(result.name, i)

    # the Feb-29 00:00 column must contain exactly the leap-day values
    leaps = ts_hourly[(ts_hourly.index.month == 2) & (
        ts_hourly.index.day == 29) & (ts_hourly.index.hour == 0)]
    hour = leaps.index.dayofyear[0] * 24 - 23
    leaps.index = leaps.index.year
    leaps.name = 1417
    tm.assert_series_equal(annual[hour].dropna(), leaps)
def test_custom_grouper(self):
    """TimeGrouper(5 min): cython aggregations and group counts."""
    dti = DatetimeIndex(freq="Min", start=datetime(2005, 1, 1),
                        end=datetime(2005, 1, 10))

    data = np.array([1] * len(dti))
    s = Series(data, index=dti)
    b = TimeGrouper(Minute(5))
    g = s.groupby(b)

    # check all cython functions work
    funcs = ["add", "mean", "prod", "ohlc", "min", "max", "var"]
    for f in funcs:
        g._cython_agg_general(f)

    # 9 days of minutes in 5-minute buckets (+1 bucket for the endpoint)
    self.assertEquals(g.ngroups, 2593)
    self.assert_(notnull(g.mean()).all())

    # construct expected val
    arr = [5] * 2592
    arr.append(1)
    idx = dti[0:-1:5]
    idx = idx.append(DatetimeIndex([np.datetime64(dti[-1])]))
    expect = Series(arr, index=idx)

    # cython returns float for now
    result = g.agg(np.sum)
    assert_series_equal(result, expect.astype(float))

    # same grouper applied to a DataFrame keeps all columns
    data = np.random.rand(len(dti), 10)
    df = DataFrame(data, index=dti)
    r = df.groupby(b).agg(np.sum)
    self.assertEquals(len(r.columns), 10)
    self.assertEquals(len(r.index), 2593)
def test_astype_categoricaldtype(self): s = Series(['a', 'b', 'a']) result = s.astype(CategoricalDtype(['a', 'b'], ordered=True)) expected = Series(Categorical(['a', 'b', 'a'], ordered=True)) tm.assert_series_equal(result, expected) result = s.astype(CategoricalDtype(['a', 'b'], ordered=False)) expected = Series(Categorical(['a', 'b', 'a'], ordered=False)) tm.assert_series_equal(result, expected) result = s.astype(CategoricalDtype(['a', 'b', 'c'], ordered=False)) expected = Series(Categorical(['a', 'b', 'a'], categories=['a', 'b', 'c'], ordered=False)) tm.assert_series_equal(result, expected) tm.assert_index_equal(result.cat.categories, Index(['a', 'b', 'c']))
def test_astype(self):
    """astype preserves the Series name across numeric dtypes."""
    ser = Series(np.random.randn(5), name='foo')

    for dtype in ['float32', 'float64', 'int64', 'int32']:
        converted = ser.astype(dtype)
        self.assertEqual(converted.dtype, dtype)
        self.assertEqual(converted.name, ser.name)
def test_freq_conversion(self):
    """timedelta64 division and frequency-converting astype.

    Exercises both the Series and TimedeltaIndex paths; dividing by a
    unit timedelta and ``astype('timedelta64[unit]')`` must agree.
    """
    # doc example
    # series
    td = Series(date_range('20130101', periods=4)) - \
        Series(date_range('20121201', periods=4))
    td[2] += timedelta(minutes=5, seconds=3)
    td[3] = np.nan

    # division keeps fractional days
    result = td / np.timedelta64(1, 'D')
    expected = Series([31, 31, (31 * 86400 + 5 * 60 + 3) / 86400.0, np.nan
                       ])
    assert_series_equal(result, expected)

    # astype truncates to whole days
    result = td.astype('timedelta64[D]')
    expected = Series([31, 31, 31, np.nan])
    assert_series_equal(result, expected)

    result = td / np.timedelta64(1, 's')
    expected = Series([31 * 86400, 31 * 86400, 31 * 86400 + 5 * 60 + 3,
                       np.nan])
    assert_series_equal(result, expected)

    result = td.astype('timedelta64[s]')
    assert_series_equal(result, expected)

    # tdi
    td = TimedeltaIndex(td)

    result = td / np.timedelta64(1, 'D')
    expected = Index([31, 31, (31 * 86400 + 5 * 60 + 3) / 86400.0,
                      np.nan])
    assert_index_equal(result, expected)

    result = td.astype('timedelta64[D]')
    expected = Index([31, 31, 31, np.nan])
    assert_index_equal(result, expected)

    result = td / np.timedelta64(1, 's')
    expected = Index([31 * 86400, 31 * 86400, 31 * 86400 + 5 * 60 + 3,
                      np.nan])
    assert_index_equal(result, expected)

    result = td.astype('timedelta64[s]')
    assert_index_equal(result, expected)
def test_astype_datetime(self):
    """Casting datetime64 Series to object always yields object dtype."""
    # an all-NaT datetime series
    ser = Series(tslib.iNaT, dtype='M8[ns]', index=lrange(5))
    ser = ser.astype('O')
    assert ser.dtype == np.object_

    # a single concrete datetime
    ser = Series([datetime(2001, 1, 2, 0, 0)])
    ser = ser.astype('O')
    assert ser.dtype == np.object_

    # datetimes with a NaN hole stay datetime64 until the cast
    ser = Series([datetime(2001, 1, 2, 0, 0) for i in range(3)])
    ser[1] = np.nan
    assert ser.dtype == 'M8[ns]'

    ser = ser.astype('O')
    assert ser.dtype == np.object_
def test_construction_to_datetimelike_unit(self, arr_dtype, dtype, unit): # tests all units # gh-19223 dtype = "{}[{}]".format(dtype, unit) arr = np.array([1, 2, 3], dtype=arr_dtype) s = Series(arr) result = s.astype(dtype) expected = Series(arr.astype(dtype)) tm.assert_series_equal(result, expected)
def test_astype_generic_timestamp_deprecated(self):
    """astype(np.datetime64 / np.timedelta64) without a unit warns.

    The generic dtypes are still accepted but emit a FutureWarning and
    behave like their '[ns]' counterparts (gh-15524).
    """
    # see gh-15524
    data = [1]

    with tm.assert_produces_warning(FutureWarning,
                                    check_stacklevel=False):
        s = Series(data)
        dtype = np.datetime64
        result = s.astype(dtype)
        expected = Series(data, dtype=dtype)
        tm.assert_series_equal(result, expected)

    with tm.assert_produces_warning(FutureWarning,
                                    check_stacklevel=False):
        s = Series(data)
        dtype = np.timedelta64
        result = s.astype(dtype)
        expected = Series(data, dtype=dtype)
        tm.assert_series_equal(result, expected)
def test_astype_cast_object_int(self):
    """Object Series to int: digit strings convert, words raise."""
    arr = Series(["car", "house", "tree", "1"])

    # non-numeric strings cannot be cast to any int flavour
    self.assertRaises(ValueError, arr.astype, int)
    self.assertRaises(ValueError, arr.astype, np.int64)
    self.assertRaises(ValueError, arr.astype, np.int8)

    # all-digit strings convert cleanly
    arr = Series(['1', '2', '3', '4'], dtype=object)
    result = arr.astype(int)
    self.assert_numpy_array_equal(result, np.arange(1, 5))
def test_astype_str_cast(self):
    """str-casting datetime64, tz-aware and timedelta64 Series (gh-9757)."""
    # see gh-9757
    naive = Series([Timestamp('2010-01-04 00:00:00')])
    result = naive.astype(str)
    tm.assert_series_equal(result, Series([str('2010-01-04')]))

    aware = Series([Timestamp('2010-01-04 00:00:00', tz='US/Eastern')])
    result = aware.astype(str)
    tm.assert_series_equal(result,
                           Series([str('2010-01-04 00:00:00-05:00')]))

    delta = Series([Timedelta(1, unit='d')])
    result = delta.astype(str)
    tm.assert_series_equal(result,
                           Series([str('1 days 00:00:00.000000000')]))
def test_interp_scipy_basic(self):
    """scipy-backed interpolation methods on a Series with NaN holes."""
    tm._skip_if_no_scipy()
    s = Series([1, 3, np.nan, 12, np.nan, 25])

    # slinear
    expected = Series([1., 3., 7.5, 12., 18.5, 25.])
    result = s.interpolate(method='slinear')
    assert_series_equal(result, expected)

    result = s.interpolate(method='slinear', downcast='infer')
    assert_series_equal(result, expected)

    # nearest
    expected = Series([1, 3, 3, 12, 12, 25])
    result = s.interpolate(method='nearest')
    assert_series_equal(result, expected.astype('float'))

    result = s.interpolate(method='nearest', downcast='infer')
    assert_series_equal(result, expected)

    # zero
    expected = Series([1, 3, 3, 12, 12, 25])
    result = s.interpolate(method='zero')
    assert_series_equal(result, expected.astype('float'))

    result = s.interpolate(method='zero', downcast='infer')
    assert_series_equal(result, expected)

    # quadratic
    # GH #15662.
    # new cubic and quadratic interpolation algorithms from scipy 0.19.0.
    # previously `splmake` was used. See scipy/scipy#6710
    if _is_scipy_ge_0190:
        expected = Series([1, 3., 6.823529, 12., 18.058824, 25.])
    else:
        expected = Series([1, 3., 6.769231, 12., 18.230769, 25.])
    result = s.interpolate(method='quadratic')
    assert_series_equal(result, expected)

    result = s.interpolate(method='quadratic', downcast='infer')
    assert_series_equal(result, expected)

    # cubic
    expected = Series([1., 3., 6.8, 12., 18.2, 25.])
    result = s.interpolate(method='cubic')
    assert_series_equal(result, expected)
def setup(self, dtype, M, offset_factor): offset = int(M * offset_factor) tmp = Series(np.random.randint(offset, M + offset, 10**6)) self.series = tmp.astype(dtype) self.values = np.arange(M).astype(dtype)
def infer_problem_type(y: Series, silent=False) -> str:
    """ Identifies which type of prediction problem we are interested in (if user has not specified).
        Ie. binary classification, multi-class classification, or regression.
    """
    if len(y) == 0:
        raise ValueError("provided labels cannot have length = 0")
    y = y.dropna()  # Remove missing values from y (there should not be any though as they were removed in Learner.general_data_processing())
    num_rows = len(y)

    unique_values = y.unique()

    MULTICLASS_LIMIT = 1000  # if numeric and class count would be above this amount, assume it is regression
    if num_rows > 1000:
        REGRESS_THRESHOLD = 0.05  # if the unique-ratio is less than this, we assume multiclass classification, even when labels are integers
    else:
        REGRESS_THRESHOLD = 0.1

    unique_count = len(unique_values)
    if unique_count == 2:
        problem_type = BINARY
        reason = "only two unique label-values observed"
    elif y.dtype.name in ['object', 'category']:
        problem_type = MULTICLASS
        reason = f"dtype of label-column == {y.dtype.name}"
    elif np.issubdtype(y.dtype, np.floating):
        unique_ratio = unique_count / float(num_rows)
        if (unique_ratio <= REGRESS_THRESHOLD) and (unique_count <= MULTICLASS_LIMIT):
            try:
                can_convert_to_int = np.array_equal(y, y.astype(int))
                if can_convert_to_int:
                    problem_type = MULTICLASS
                    reason = "dtype of label-column == float, but few unique label-values observed and label-values can be converted to int"
                else:
                    problem_type = REGRESSION
                    reason = "dtype of label-column == float and label-values can't be converted to int"
            # BUG FIX: was a bare `except:`, which also swallowed
            # KeyboardInterrupt/SystemExit; astype(int) fails with
            # ValueError/TypeError on inf or non-castable values
            except (ValueError, TypeError):
                problem_type = REGRESSION
                reason = "dtype of label-column == float and label-values can't be converted to int"
        else:
            problem_type = REGRESSION
            reason = "dtype of label-column == float and many unique label-values observed"
    elif np.issubdtype(y.dtype, np.integer):
        unique_ratio = unique_count / float(num_rows)
        if (unique_ratio <= REGRESS_THRESHOLD) and (unique_count <= MULTICLASS_LIMIT):
            problem_type = MULTICLASS  # TODO: Check if integers are from 0 to n-1 for n unique values, if they have a wide spread, it could still be regression
            reason = "dtype of label-column == int, but few unique label-values observed"
        else:
            problem_type = REGRESSION
            reason = "dtype of label-column == int and many unique label-values observed"
    else:
        raise NotImplementedError(f'label dtype {y.dtype} not supported!')

    if not silent:
        logger.log(25, f"AutoGluon infers your prediction problem is: '{problem_type}' (because {reason}).")

        # TODO: Move this outside of this function so it is visible even if problem type was not inferred.
        if problem_type in [BINARY, MULTICLASS]:
            if unique_count > 10:
                logger.log(20, f'\tFirst 10 (of {unique_count}) unique label values: {list(unique_values[:10])}')
            else:
                logger.log(20, f'\t{unique_count} unique label values: {list(unique_values)}')
        elif problem_type == REGRESSION:
            y_max = y.max()
            y_min = y.min()
            y_mean = y.mean()
            y_stddev = y.std()
            logger.log(20, f'\tLabel info (max, min, mean, stddev): ({y_max}, {y_min}, {round(y_mean, 5)}, {round(y_stddev, 5)})')

        logger.log(25, f"\tIf '{problem_type}' is not the correct problem_type, please manually specify the problem_type argument in fit() (You may specify problem_type as one of: {[BINARY, MULTICLASS, REGRESSION]})")
    return problem_type
def test_constructor_dtype_timedelta64(self):
    """Dtype inference and casting rules for timedelta64 Series."""
    # basic
    td = Series([timedelta(days=i) for i in range(3)])
    self.assertEqual(td.dtype, 'timedelta64[ns]')

    td = Series([timedelta(days=1)])
    self.assertEqual(td.dtype, 'timedelta64[ns]')

    td = Series(
        [timedelta(days=1), timedelta(days=2), np.timedelta64(1, 's')])
    self.assertEqual(td.dtype, 'timedelta64[ns]')

    # mixed with NaT
    from pandas import tslib
    td = Series([timedelta(days=1), tslib.NaT], dtype='m8[ns]')
    self.assertEqual(td.dtype, 'timedelta64[ns]')

    td = Series([timedelta(days=1), np.nan], dtype='m8[ns]')
    self.assertEqual(td.dtype, 'timedelta64[ns]')

    td = Series([np.timedelta64(300000000), pd.NaT], dtype='m8[ns]')
    self.assertEqual(td.dtype, 'timedelta64[ns]')

    # improved inference
    # GH5689
    td = Series([np.timedelta64(300000000), pd.NaT])
    self.assertEqual(td.dtype, 'timedelta64[ns]')

    # because iNaT is int, not coerced to timedelta
    td = Series([np.timedelta64(300000000), tslib.iNaT])
    self.assertEqual(td.dtype, 'object')

    td = Series([np.timedelta64(300000000), np.nan])
    self.assertEqual(td.dtype, 'timedelta64[ns]')

    td = Series([pd.NaT, np.timedelta64(300000000)])
    self.assertEqual(td.dtype, 'timedelta64[ns]')

    td = Series([np.timedelta64(1, 's')])
    self.assertEqual(td.dtype, 'timedelta64[ns]')

    # these are frequency conversion astypes
    # for t in ['s', 'D', 'us', 'ms']:
    #     self.assertRaises(TypeError, td.astype, 'm8[%s]' % t)

    # valid astype
    td.astype('int64')

    # invalid casting
    self.assertRaises(TypeError, td.astype, 'int32')

    # this is an invalid casting
    def f():
        Series([timedelta(days=1), 'foo'], dtype='m8[ns]')

    self.assertRaises(Exception, f)

    # leave as object here
    td = Series([timedelta(days=i) for i in range(3)] + ['foo'])
    self.assertEqual(td.dtype, 'object')

    # these will correctly infer a timedelta regardless of where the
    # NaN-like values sit
    s = Series([None, pd.NaT, '1 Day'])
    self.assertEqual(s.dtype, 'timedelta64[ns]')
    s = Series([np.nan, pd.NaT, '1 Day'])
    self.assertEqual(s.dtype, 'timedelta64[ns]')
    s = Series([pd.NaT, None, '1 Day'])
    self.assertEqual(s.dtype, 'timedelta64[ns]')
    s = Series([pd.NaT, np.nan, '1 Day'])
    self.assertEqual(s.dtype, 'timedelta64[ns]')
def test_td64_series_astype_object(self): tdser = Series(['59 Days', '59 Days', 'NaT'], dtype='timedelta64[ns]') result = tdser.astype(object) assert isinstance(result.iloc[0], timedelta) assert result.dtype == np.object_
def test_constructor_dtype_timedelta64(self):
    """Dtype inference and casting rules for timedelta64 Series."""
    # basic
    td = Series([timedelta(days=i) for i in range(3)])
    assert td.dtype == 'timedelta64[ns]'

    td = Series([timedelta(days=1)])
    assert td.dtype == 'timedelta64[ns]'

    td = Series([timedelta(days=1), timedelta(days=2), np.timedelta64(
        1, 's')])

    assert td.dtype == 'timedelta64[ns]'

    # mixed with NaT
    td = Series([timedelta(days=1), NaT], dtype='m8[ns]')
    assert td.dtype == 'timedelta64[ns]'

    td = Series([timedelta(days=1), np.nan], dtype='m8[ns]')
    assert td.dtype == 'timedelta64[ns]'

    td = Series([np.timedelta64(300000000), pd.NaT], dtype='m8[ns]')
    assert td.dtype == 'timedelta64[ns]'

    # improved inference
    # GH5689
    td = Series([np.timedelta64(300000000), NaT])
    assert td.dtype == 'timedelta64[ns]'

    # because iNaT is int, not coerced to timedelta
    td = Series([np.timedelta64(300000000), iNaT])
    assert td.dtype == 'object'

    td = Series([np.timedelta64(300000000), np.nan])
    assert td.dtype == 'timedelta64[ns]'

    td = Series([pd.NaT, np.timedelta64(300000000)])
    assert td.dtype == 'timedelta64[ns]'

    td = Series([np.timedelta64(1, 's')])
    assert td.dtype == 'timedelta64[ns]'

    # these are frequency conversion astypes
    # for t in ['s', 'D', 'us', 'ms']:
    #     pytest.raises(TypeError, td.astype, 'm8[%s]' % t)

    # valid astype
    td.astype('int64')

    # invalid casting
    msg = (r"cannot astype a timedelta from \[timedelta64\[ns\]\] to"
           r" \[int32\]")
    with pytest.raises(TypeError, match=msg):
        td.astype('int32')

    # this is an invalid casting
    msg = "Could not convert object to NumPy timedelta"
    with pytest.raises(ValueError, match=msg):
        Series([timedelta(days=1), 'foo'], dtype='m8[ns]')

    # leave as object here
    td = Series([timedelta(days=i) for i in range(3)] + ['foo'])
    assert td.dtype == 'object'

    # these will correctly infer a timedelta regardless of where the
    # NaN-like values sit
    s = Series([None, pd.NaT, '1 Day'])
    assert s.dtype == 'timedelta64[ns]'
    s = Series([np.nan, pd.NaT, '1 Day'])
    assert s.dtype == 'timedelta64[ns]'
    s = Series([pd.NaT, None, '1 Day'])
    assert s.dtype == 'timedelta64[ns]'
    s = Series([pd.NaT, np.nan, '1 Day'])
    assert s.dtype == 'timedelta64[ns]'
class TestSeriesPeriod(object):
    """Tests for Series backed by Period-dtype data."""

    def setup_method(self, method):
        # a fresh 10-day daily Period series for every test
        self.series = Series(period_range('2000-01-01', periods=10,
                                          freq='D'))

    def test_auto_conversion(self):
        # a list of Periods infers Period dtype on construction
        series = Series(list(period_range('2000-01-01', periods=10,
                                          freq='D')))
        assert series.dtype == 'Period[D]'

        series = pd.Series([
            pd.Period('2011-01-01', freq='D'),
            pd.Period('2011-02-01', freq='D')
        ])
        assert series.dtype == 'Period[D]'

    def test_getitem(self):
        assert self.series[1] == pd.Period('2000-01-02', freq='D')

        # fancy indexing keeps the Period dtype
        result = self.series[[2, 4]]
        exp = pd.Series([
            pd.Period('2000-01-03', freq='D'),
            pd.Period('2000-01-05', freq='D')
        ], index=[2, 4], dtype='Period[D]')
        tm.assert_series_equal(result, exp)
        assert result.dtype == 'Period[D]'

    def test_isna(self):
        # GH 13737
        s = Series(
            [pd.Period('2011-01', freq='M'), pd.Period('NaT', freq='M')])
        tm.assert_series_equal(s.isna(), Series([False, True]))
        tm.assert_series_equal(s.notna(), Series([True, False]))

    def test_fillna(self):
        # GH 13737: filling with a Period keeps the Period dtype
        s = Series(
            [pd.Period('2011-01', freq='M'), pd.Period('NaT', freq='M')])

        res = s.fillna(pd.Period('2012-01', freq='M'))
        exp = Series(
            [pd.Period('2011-01', freq='M'),
             pd.Period('2012-01', freq='M')])
        tm.assert_series_equal(res, exp)
        assert res.dtype == 'Period[M]'

    def test_dropna(self):
        # GH 13737
        s = Series(
            [pd.Period('2011-01', freq='M'), pd.Period('NaT', freq='M')])
        tm.assert_series_equal(s.dropna(),
                               Series([pd.Period('2011-01', freq='M')]))

    def test_between(self):
        left, right = self.series[[2, 7]]
        result = self.series.between(left, right)
        expected = (self.series >= left) & (self.series <= right)
        tm.assert_series_equal(result, expected)

    # ---------------------------------------------------------------------
    # NaT support

    @pytest.mark.xfail(reason="PeriodDtype Series not supported yet",
                       strict=True)
    def test_NaT_scalar(self):
        series = Series([0, 1000, 2000, pd._libs.iNaT], dtype='period[D]')

        val = series[3]
        assert pd.isna(val)

        series[2] = val
        assert pd.isna(series[2])

    @pytest.mark.xfail(reason="PeriodDtype Series not supported yet",
                       strict=True)
    def test_NaT_cast(self):
        result = Series([np.nan]).astype('period[D]')
        expected = Series([pd.NaT])
        tm.assert_series_equal(result, expected)

    def test_set_none(self):
        # None assignment coerces to NaT
        self.series[3] = None
        assert self.series[3] is pd.NaT

        self.series[3:5] = None
        assert self.series[4] is pd.NaT

    def test_set_nan(self):
        # Do we want to allow this?
        self.series[5] = np.nan
        assert self.series[5] is pd.NaT

        self.series[5:7] = np.nan
        assert self.series[6] is pd.NaT

    def test_intercept_astype_object(self):
        expected = self.series.astype('object')

        df = DataFrame({
            'a': self.series,
            'b': np.random.randn(len(self.series))
        })

        result = df.values.squeeze()
        assert (result[:, 0] == expected.values).all()

        df = DataFrame({'a': self.series,
                        'b': ['foo'] * len(self.series)})

        result = df.values.squeeze()
        assert (result[:, 0] == expected.values).all()

    def test_align_series(self, join_type):
        rng = period_range('1/1/2000', '1/1/2010', freq='A')
        ts = Series(np.random.randn(len(rng)), index=rng)

        # smoke test: align must not raise for any join type
        ts.align(ts[::2], join=join_type)

    def test_truncate(self):
        # GH 17717: truncate on a PeriodIndex with duplicates
        idx1 = pd.PeriodIndex([
            pd.Period('2017-09-02'),
            pd.Period('2017-09-02'),
            pd.Period('2017-09-03')
        ])
        series1 = pd.Series([1, 2, 3], index=idx1)
        result1 = series1.truncate(after='2017-09-02')

        expected_idx1 = pd.PeriodIndex(
            [pd.Period('2017-09-02'), pd.Period('2017-09-02')])
        tm.assert_series_equal(result1,
                               pd.Series([1, 2], index=expected_idx1))

        idx2 = pd.PeriodIndex([
            pd.Period('2017-09-03'),
            pd.Period('2017-09-02'),
            pd.Period('2017-09-03')
        ])
        series2 = pd.Series([1, 2, 3], index=idx2)
        result2 = series2.sort_index().truncate(after='2017-09-02')

        expected_idx2 = pd.PeriodIndex([pd.Period('2017-09-02')])
        tm.assert_series_equal(result2,
                               pd.Series([2], index=expected_idx2))

    @pytest.mark.parametrize(
        'input_vals',
        [[Period('2016-01', freq='M'), Period('2016-02', freq='M')],
         [Period('2016-01-01', freq='D'), Period('2016-01-02', freq='D')],
         [
             Period('2016-01-01 00:00:00', freq='H'),
             Period('2016-01-01 01:00:00', freq='H')
         ],
         [
             Period('2016-01-01 00:00:00', freq='M'),
             Period('2016-01-01 00:01:00', freq='M')
         ],
         [
             Period('2016-01-01 00:00:00', freq='S'),
             Period('2016-01-01 00:00:01', freq='S')
         ]])
    def test_end_time_timevalues(self, input_vals):
        # GH 17157
        # Check that the time part of the Period is adjusted by end_time
        # when using the dt accessor on a Series
        input_vals = PeriodArray._from_sequence(np.asarray(input_vals))

        s = Series(input_vals)
        result = s.dt.end_time
        expected = s.apply(lambda x: x.end_time)
        tm.assert_series_equal(result, expected)
def expected(self, dtype):
    """Build an object-dtype Series of 0..4 cast from *dtype*, with the
    first element overwritten in-place by a 4ns timedelta64."""
    values = np.arange(5).astype(dtype)
    out = Series(values).astype(object)
    # write straight into the backing ndarray, bypassing Series setitem
    out.values[0] = np.timedelta64(4, "ns")
    return out
class TestSeriesPeriod(object):
    """Tests for Series holding Period objects (stored as object dtype)."""

    def setup_method(self, method):
        # 10 consecutive daily periods starting 2000-01-01
        self.series = Series(period_range('2000-01-01', periods=10, freq='D'))

    def test_auto_conversion(self):
        # a list of Periods stays object dtype (no period extension dtype)
        series = Series(list(period_range('2000-01-01', periods=10,
                                          freq='D')))
        assert series.dtype == 'object'

        series = pd.Series([
            pd.Period('2011-01-01', freq='D'),
            pd.Period('2011-02-01', freq='D')
        ])
        assert series.dtype == 'object'

    def test_getitem(self):
        assert self.series[1] == pd.Period('2000-01-02', freq='D')

        # fancy indexing keeps object dtype
        result = self.series[[2, 4]]
        exp = pd.Series([
            pd.Period('2000-01-03', freq='D'),
            pd.Period('2000-01-05', freq='D')
        ], index=[2, 4])
        tm.assert_series_equal(result, exp)
        assert result.dtype == 'object'

    def test_isnull(self):
        # GH 13737: Period('NaT') is treated as missing
        s = Series(
            [pd.Period('2011-01', freq='M'), pd.Period('NaT', freq='M')])
        tm.assert_series_equal(s.isnull(), Series([False, True]))
        tm.assert_series_equal(s.notnull(), Series([True, False]))

    def test_fillna(self):
        # GH 13737: filling with a Period keeps object dtype
        s = Series(
            [pd.Period('2011-01', freq='M'), pd.Period('NaT', freq='M')])

        res = s.fillna(pd.Period('2012-01', freq='M'))
        exp = Series(
            [pd.Period('2011-01', freq='M'), pd.Period('2012-01', freq='M')])
        tm.assert_series_equal(res, exp)
        assert res.dtype == 'object'

        # filling with an arbitrary object also works
        res = s.fillna('XXX')
        exp = Series([pd.Period('2011-01', freq='M'), 'XXX'])
        tm.assert_series_equal(res, exp)
        assert res.dtype == 'object'

    def test_dropna(self):
        # GH 13737
        s = Series(
            [pd.Period('2011-01', freq='M'), pd.Period('NaT', freq='M')])
        tm.assert_series_equal(s.dropna(),
                               Series([pd.Period('2011-01', freq='M')]))

    def test_series_comparison_scalars(self):
        # compare against a freshly-built Period scalar
        val = pd.Period('2000-01-04', freq='D')
        result = self.series > val
        expected = pd.Series([x > val for x in self.series])
        tm.assert_series_equal(result, expected)

        # and against an element taken from the series itself
        val = self.series[5]
        result = self.series > val
        expected = pd.Series([x > val for x in self.series])
        tm.assert_series_equal(result, expected)

    def test_between(self):
        # between() must match the explicit >= / <= conjunction
        left, right = self.series[[2, 7]]
        result = self.series.between(left, right)
        expected = (self.series >= left) & (self.series <= right)
        tm.assert_series_equal(result, expected)

    # ---------------------------------------------------------------------
    # NaT support

    """
    # ToDo: Enable when support period dtype
    def test_NaT_scalar(self):
        series = Series([0, 1000, 2000, iNaT], dtype='period[D]')

        val = series[3]
        assert isnull(val)

        series[2] = val
        assert isnull(series[2])

    def test_NaT_cast(self):
        result = Series([np.nan]).astype('period[D]')
        expected = Series([NaT])
        tm.assert_series_equal(result, expected)
    """

    def test_set_none_nan(self):
        # currently Period is stored as object dtype, not as NaT
        self.series[3] = None
        assert self.series[3] is None

        self.series[3:5] = None
        assert self.series[4] is None

        self.series[5] = np.nan
        assert np.isnan(self.series[5])

        self.series[5:7] = np.nan
        assert np.isnan(self.series[6])

    def test_intercept_astype_object(self):
        expected = self.series.astype('object')

        # period column next to a float column
        df = DataFrame({
            'a': self.series,
            'b': np.random.randn(len(self.series))
        })
        result = df.values.squeeze()
        assert (result[:, 0] == expected.values).all()

        # period column next to a string column
        df = DataFrame({'a': self.series, 'b': ['foo'] * len(self.series)})
        result = df.values.squeeze()
        assert (result[:, 0] == expected.values).all()

    def test_comp_series_period_scalar(self):
        # GH 13200: Series[Period] vs Period scalar, for several freqs
        for freq in ['M', '2M', '3M']:
            base = Series([
                Period(x, freq=freq)
                for x in ['2011-01', '2011-02', '2011-03', '2011-04']
            ])
            p = Period('2011-02', freq=freq)

            exp = pd.Series([False, True, False, False])
            tm.assert_series_equal(base == p, exp)
            tm.assert_series_equal(p == base, exp)

            exp = pd.Series([True, False, True, True])
            tm.assert_series_equal(base != p, exp)
            tm.assert_series_equal(p != base, exp)

            exp = pd.Series([False, False, True, True])
            tm.assert_series_equal(base > p, exp)
            tm.assert_series_equal(p < base, exp)

            exp = pd.Series([True, False, False, False])
            tm.assert_series_equal(base < p, exp)
            tm.assert_series_equal(p > base, exp)

            exp = pd.Series([False, True, True, True])
            tm.assert_series_equal(base >= p, exp)
            tm.assert_series_equal(p <= base, exp)

            exp = pd.Series([True, True, False, False])
            tm.assert_series_equal(base <= p, exp)
            tm.assert_series_equal(p >= base, exp)

            # different base freq: comparison must raise
            msg = "Input has different freq=A-DEC from Period"
            with tm.assert_raises_regex(period.IncompatibleFrequency, msg):
                base <= Period('2011', freq='A')

            with tm.assert_raises_regex(period.IncompatibleFrequency, msg):
                Period('2011', freq='A') >= base

    def test_comp_series_period_series(self):
        # GH 13200: elementwise Series[Period] vs Series[Period]
        for freq in ['M', '2M', '3M']:
            base = Series([
                Period(x, freq=freq)
                for x in ['2011-01', '2011-02', '2011-03', '2011-04']
            ])

            s = Series([
                Period(x, freq=freq)
                for x in ['2011-02', '2011-01', '2011-03', '2011-05']
            ])

            exp = Series([False, False, True, False])
            tm.assert_series_equal(base == s, exp)

            exp = Series([True, True, False, True])
            tm.assert_series_equal(base != s, exp)

            exp = Series([False, True, False, False])
            tm.assert_series_equal(base > s, exp)

            exp = Series([True, False, False, True])
            tm.assert_series_equal(base < s, exp)

            exp = Series([False, True, True, False])
            tm.assert_series_equal(base >= s, exp)

            exp = Series([True, False, True, True])
            tm.assert_series_equal(base <= s, exp)

            s2 = Series([
                Period(x, freq='A') for x in ['2011', '2011', '2011', '2011']
            ])

            # different base freq: comparison must raise
            msg = "Input has different freq=A-DEC from Period"
            with tm.assert_raises_regex(period.IncompatibleFrequency, msg):
                base <= s2

    def test_comp_series_period_object(self):
        # GH 13200: mixed-freq Periods compare elementwise as objects
        base = Series([
            Period('2011', freq='A'),
            Period('2011-02', freq='M'),
            Period('2013', freq='A'),
            Period('2011-04', freq='M')
        ])

        s = Series([
            Period('2012', freq='A'),
            Period('2011-01', freq='M'),
            Period('2013', freq='A'),
            Period('2011-05', freq='M')
        ])

        exp = Series([False, False, True, False])
        tm.assert_series_equal(base == s, exp)

        exp = Series([True, True, False, True])
        tm.assert_series_equal(base != s, exp)

        exp = Series([False, True, False, False])
        tm.assert_series_equal(base > s, exp)

        exp = Series([True, False, False, True])
        tm.assert_series_equal(base < s, exp)

        exp = Series([False, True, True, False])
        tm.assert_series_equal(base >= s, exp)

        exp = Series([True, False, True, True])
        tm.assert_series_equal(base <= s, exp)

    def test_align_series(self):
        rng = period_range('1/1/2000', '1/1/2010', freq='A')
        ts = Series(np.random.randn(len(rng)), index=rng)

        # adding the decimated series NaN-fills the missing labels
        result = ts + ts[::2]
        expected = ts + ts
        expected[1::2] = np.nan
        tm.assert_series_equal(result, expected)

        # order of the right-hand side does not matter
        result = ts + _permute(ts[::2])
        tm.assert_series_equal(result, expected)

        # it works!
        for kind in ['inner', 'outer', 'left', 'right']:
            ts.align(ts[::2], join=kind)

        # aligning different freqs must raise
        msg = "Input has different freq=D from PeriodIndex\\(freq=A-DEC\\)"
        with tm.assert_raises_regex(period.IncompatibleFrequency, msg):
            ts + ts.asfreq('D', how="end")
def test_dt_accessor_api_for_categorical(self):
    # https://github.com/pandas-dev/pandas/issues/10661
    # a categorical of datetimelike values must expose the same .dt API
    # as the underlying datetimelike Series, with matching results
    from pandas.core.indexes.accessors import Properties

    s_dr = Series(date_range("1/1/2015", periods=5, tz="MET"))
    c_dr = s_dr.astype("category")

    s_pr = Series(period_range("1/1/2015", freq="D", periods=5))
    c_pr = s_pr.astype("category")

    s_tdr = Series(timedelta_range("1 days", "10 days"))
    c_tdr = s_tdr.astype("category")

    # only testing field (like .day)
    # and bool (is_month_start)
    get_ops = lambda x: x._datetimelike_ops

    test_data = [
        ("Datetime", get_ops(DatetimeIndex), s_dr, c_dr),
        ("Period", get_ops(PeriodArray), s_pr, c_pr),
        ("Timedelta", get_ops(TimedeltaIndex), s_tdr, c_tdr),
    ]

    assert isinstance(c_dr.dt, Properties)

    # methods that need positional arguments to be callable
    special_func_defs = [
        ("strftime", ("%Y-%m-%d", ), {}),
        ("tz_convert", ("EST", ), {}),
        ("round", ("D", ), {}),
        ("floor", ("D", ), {}),
        ("ceil", ("D", ), {}),
        ("asfreq", ("D", ), {}),
        # FIXME: don't leave commented-out
        # ('tz_localize', ("UTC",), {}),
    ]
    _special_func_names = [f[0] for f in special_func_defs]

    # the series is already localized
    _ignore_names = ["tz_localize", "components"]

    for name, attr_names, s, c in test_data:
        # every public zero-arg .dt method not handled specially above
        func_names = [
            f for f in dir(s.dt)
            if not (f.startswith("_") or f in attr_names
                    or f in _special_func_names or f in _ignore_names)
        ]

        func_defs = [(f, (), {}) for f in func_names]
        for f_def in special_func_defs:
            if f_def[0] in dir(s.dt):
                func_defs.append(f_def)

        for func, args, kwargs in func_defs:
            with warnings.catch_warnings():
                if func == "to_period":
                    # dropping TZ
                    warnings.simplefilter("ignore", UserWarning)
                res = getattr(c.dt, func)(*args, **kwargs)
                exp = getattr(s.dt, func)(*args, **kwargs)

            tm.assert_equal(res, exp)

        for attr in attr_names:
            res = getattr(c.dt, attr)
            exp = getattr(s.dt, attr)

            # attribute results may be frame-, series- or array-like
            if isinstance(res, DataFrame):
                tm.assert_frame_equal(res, exp)
            elif isinstance(res, Series):
                tm.assert_series_equal(res, exp)
            else:
                tm.assert_almost_equal(res, exp)

    # a non-datetimelike categorical must not expose .dt at all
    invalid = Series([1, 2, 3]).astype("category")
    msg = "Can only use .dt accessor with datetimelike"

    with pytest.raises(AttributeError, match=msg):
        invalid.dt
    assert not hasattr(invalid, "str")
def test_astype_categories_raises(self):
    """astype() no longer accepts `categories`/`ordered` keyword args.

    They were deprecated in GH#17636 and removed in GH#27141; passing
    them now raises TypeError.
    """
    ser = Series(["a", "b", "a"])
    with pytest.raises(TypeError, match="got an unexpected"):
        ser.astype("category", categories=["a", "b"], ordered=True)
def test_astype_from_categorical(self, items): ser = Series(items) exp = Series(Categorical(items)) res = ser.astype("category") tm.assert_series_equal(res, exp)
def test_astype_categorical_to_other(self):
    # interval-style labels for the cut below
    cat = Categorical([f"{i} - {i + 499}" for i in range(0, 10000, 500)])
    ser = Series(np.random.RandomState(0).randint(0, 10000, 100)).sort_values()
    ser = pd.cut(ser, range(0, 10500, 500), right=False, labels=cat)

    # casting to category (or an unconstrained CategoricalDtype) is a no-op
    expected = ser
    tm.assert_series_equal(ser.astype("category"), expected)
    tm.assert_series_equal(ser.astype(CategoricalDtype()), expected)

    # string categories cannot be cast to float
    msg = r"Cannot cast object dtype to float64"
    with pytest.raises(ValueError, match=msg):
        ser.astype("float64")

    # categorical of strings -> str keeps the values
    cat = Series(Categorical(["a", "b", "b", "a", "a", "c", "c", "c"]))
    exp = Series(["a", "b", "b", "a", "a", "c", "c", "c"])
    tm.assert_series_equal(cat.astype("str"), exp)

    # digit-string categories cast cleanly to int
    s2 = Series(Categorical(["1", "2", "3", "4"]))
    exp2 = Series([1, 2, 3, 4]).astype("int64")
    tm.assert_series_equal(s2.astype("int"), exp2)

    # object don't sort correctly, so just compare that we have the same
    # values
    def cmp(a, b):
        tm.assert_almost_equal(np.sort(np.unique(a)), np.sort(np.unique(b)))

    expected = Series(np.array(ser.values), name="value_group")
    cmp(ser.astype("object"), expected)
    cmp(ser.astype(np.object_), expected)

    # array conversion
    tm.assert_almost_equal(np.array(ser), np.array(ser.values))

    tm.assert_series_equal(ser.astype("category"), ser)
    tm.assert_series_equal(ser.astype(CategoricalDtype()), ser)

    # object -> category round-trip re-sorts the categories and drops
    # any that went unused
    roundtrip_expected = ser.cat.set_categories(
        ser.cat.categories.sort_values()).cat.remove_unused_categories()
    result = ser.astype("object").astype("category")
    tm.assert_series_equal(result, roundtrip_expected)
    result = ser.astype("object").astype(CategoricalDtype())
    tm.assert_series_equal(result, roundtrip_expected)
def restore(self, col: pd.Series) -> pd.Series:
    """Cast *col* back to this object's stored dtype (used by to_pandas)."""
    target = self.dtype
    return col.astype(target)
def test_constructor_with_datetime_tz(self):
    # 8260
    # support datetime64 with tz
    dr = date_range('20130101', periods=3, tz='US/Eastern')
    s = Series(dr)
    self.assertTrue(s.dtype.name == 'datetime64[ns, US/Eastern]')
    self.assertTrue(s.dtype == 'datetime64[ns, US/Eastern]')
    self.assertTrue(is_datetime64tz_dtype(s.dtype))
    self.assertTrue('datetime64[ns, US/Eastern]' in str(s))

    # export: .values drops the tz and yields naive datetime64[ns]
    result = s.values
    self.assertIsInstance(result, np.ndarray)
    self.assertTrue(result.dtype == 'datetime64[ns]')

    # re-localizing the exported values as UTC round-trips to the original
    exp = pd.DatetimeIndex(result)
    exp = exp.tz_localize('UTC').tz_convert(tz=s.dt.tz)
    self.assert_index_equal(dr, exp)

    # indexing
    result = s.iloc[0]
    self.assertEqual(
        result,
        Timestamp('2013-01-01 00:00:00-0500', tz='US/Eastern', freq='D'))
    result = s[0]
    self.assertEqual(
        result,
        Timestamp('2013-01-01 00:00:00-0500', tz='US/Eastern', freq='D'))

    result = s[Series([True, True, False], index=s.index)]
    assert_series_equal(result, s[0:2])

    result = s.iloc[0:1]
    assert_series_equal(result, Series(dr[0:1]))

    # concat
    result = pd.concat([s.iloc[0:1], s.iloc[1:]])
    assert_series_equal(result, s)

    # astype
    result = s.astype(object)
    expected = Series(DatetimeIndex(s._values).asobject)
    assert_series_equal(result, expected)

    result = Series(s.values).dt.tz_localize('UTC').dt.tz_convert(s.dt.tz)
    assert_series_equal(result, s)

    # astype - datetime64[ns, tz]
    result = Series(s.values).astype('datetime64[ns, US/Eastern]')
    assert_series_equal(result, s)

    result = Series(s.values).astype(s.dtype)
    assert_series_equal(result, s)

    # converting tz shifts the wall-clock values
    result = s.astype('datetime64[ns, CET]')
    expected = Series(date_range('20130101 06:00:00', periods=3, tz='CET'))
    assert_series_equal(result, expected)

    # short str
    self.assertTrue('datetime64[ns, US/Eastern]' in str(s))

    # formatting with NaT
    result = s.shift()
    self.assertTrue('datetime64[ns, US/Eastern]' in str(result))
    self.assertTrue('NaT' in str(result))

    # long str
    t = Series(date_range('20130101', periods=1000, tz='US/Eastern'))
    self.assertTrue('datetime64[ns, US/Eastern]' in str(t))

    result = pd.DatetimeIndex(s, freq='infer')
    tm.assert_index_equal(result, dr)

    # inference: a uniform tz gives a tz-aware dtype
    s = Series([
        pd.Timestamp('2013-01-01 13:00:00-0800', tz='US/Pacific'),
        pd.Timestamp('2013-01-02 14:00:00-0800', tz='US/Pacific')
    ])
    self.assertTrue(s.dtype == 'datetime64[ns, US/Pacific]')
    self.assertTrue(lib.infer_dtype(s) == 'datetime64')

    # mixed tzs stay object dtype
    s = Series([
        pd.Timestamp('2013-01-01 13:00:00-0800', tz='US/Pacific'),
        pd.Timestamp('2013-01-02 14:00:00-0800', tz='US/Eastern')
    ])
    self.assertTrue(s.dtype == 'object')
    self.assertTrue(lib.infer_dtype(s) == 'datetime')

    # with all NaT
    s = Series(pd.NaT, index=[0, 1], dtype='datetime64[ns, US/Eastern]')
    expected = Series(pd.DatetimeIndex(['NaT', 'NaT'], tz='US/Eastern'))
    assert_series_equal(s, expected)
def test_where_setitem_invalid():
    """GH 2702: list assignment with a mismatched length raises ValueError,
    while a correctly sized list (or a scalar position) is accepted."""

    # full slice
    ser = Series(list('abc'))

    def assign_bad_slice():
        ser[0:3] = list(range(27))

    pytest.raises(ValueError, assign_bad_slice)
    ser[0:3] = list(range(3))
    assert_series_equal(ser.astype(np.int64), Series([0, 1, 2]))

    # slice with step
    ser = Series(list('abcdef'))

    def assign_bad_step():
        ser[0:4:2] = list(range(27))

    pytest.raises(ValueError, assign_bad_step)
    ser = Series(list('abcdef'))
    ser[0:4:2] = list(range(2))
    assert_series_equal(ser, Series([0, 'b', 1, 'd', 'e', 'f']))

    # negative slices
    ser = Series(list('abcdef'))

    def assign_bad_neg():
        ser[:-1] = list(range(27))

    pytest.raises(ValueError, assign_bad_neg)
    ser[-3:-1] = list(range(2))
    assert_series_equal(ser, Series(['a', 'b', 'c', 0, 1, 'f']))

    # list indexer: both too many and too few values must raise
    ser = Series(list('abc'))

    def assign_too_many():
        ser[[0, 1, 2]] = list(range(27))

    pytest.raises(ValueError, assign_too_many)

    ser = Series(list('abc'))

    def assign_too_few():
        ser[[0, 1, 2]] = list(range(2))

    pytest.raises(ValueError, assign_too_few)

    # a scalar position may hold an entire list
    ser = Series(list('abc'))
    ser[0] = list(range(10))
    assert_series_equal(ser, Series([list(range(10)), 'b', 'c']))
def test_astype_cast_object_int(self): arr = Series(["1", "2", "3", "4"], dtype=object) result = arr.astype(int) tm.assert_series_equal(result, Series(np.arange(1, 5)))
def to_bool(series: pd.Series) -> pd.Series:
    """Cast *series* to bool, picking the NaN-tolerant boolean dtype when
    the data contains missing values."""
    if series.hasnans:
        target = hasnan_bool_name
    else:
        target = bool
    return series.astype(target)
def test_astype_cast_object_int_fail(self, dtype):
    """Non-numeric strings cannot be cast to an integer dtype; the error
    surfaces int()'s own message for the first bad value."""
    ser = Series(["car", "house", "tree", "1"])
    expected_msg = r"invalid literal for int\(\) with base 10: 'car'"
    with pytest.raises(ValueError, match=expected_msg):
        ser.astype(dtype)
def test_astype_cast_object_int(self): arr = Series(['1', '2', '3', '4'], dtype=object) result = arr.astype(int) tm.assert_series_equal(result, Series(np.arange(1, 5)))
def test_astype(self, dtype): s = Series(np.random.randn(5), name="foo") as_typed = s.astype(dtype) assert as_typed.dtype == dtype assert as_typed.name == s.name
def test_astype_cast_object_int_fail(self, dtype):
    """Casting non-numeric strings to an integer dtype must raise.

    Previously this asserted only the exception type, so ANY ValueError
    would pass; pin the message too, matching the sibling test that
    already does so.
    """
    arr = Series(["car", "house", "tree", "1"])
    # int()'s own message for the first unconvertible value
    msg = r"invalid literal for int\(\) with base 10: 'car'"
    with pytest.raises(ValueError, match=msg):
        arr.astype(dtype)
def test_astype_to_str_preserves_na(self, value, string_value): # https://github.com/pandas-dev/pandas/issues/36904 s = Series(["a", "b", value], dtype=object) result = s.astype(str) expected = Series(["a", "b", string_value], dtype=object) tm.assert_series_equal(result, expected)
def test_constructor_dtype_datetime64(self):

    # iNaT with an explicit M8 dtype makes every entry missing
    s = Series(iNaT, dtype='M8[ns]', index=lrange(5))
    assert isna(s).all()

    # in theory this should be all nulls, but since
    # we are not specifying a dtype is ambiguous
    s = Series(iNaT, index=lrange(5))
    assert not isna(s).all()

    s = Series(nan, dtype='M8[ns]', index=lrange(5))
    assert isna(s).all()

    s = Series([datetime(2001, 1, 2, 0, 0), iNaT], dtype='M8[ns]')
    assert isna(s[1])
    assert s.dtype == 'M8[ns]'

    s = Series([datetime(2001, 1, 2, 0, 0), nan], dtype='M8[ns]')
    assert isna(s[1])
    assert s.dtype == 'M8[ns]'

    # GH3416
    dates = [
        np.datetime64(datetime(2013, 1, 1)),
        np.datetime64(datetime(2013, 1, 2)),
        np.datetime64(datetime(2013, 1, 3)),
    ]

    s = Series(dates)
    assert s.dtype == 'M8[ns]'

    # setting NaN must not change the dtype
    s.iloc[0] = np.nan
    assert s.dtype == 'M8[ns]'

    # GH3414 related
    expected = Series([
        datetime(2013, 1, 1),
        datetime(2013, 1, 2),
        datetime(2013, 1, 3),
    ], dtype='datetime64[ns]')

    # ns-integers reinterpreted as ms still land on the same instants
    result = Series(
        Series(dates).astype(np.int64) / 1000000, dtype='M8[ms]')
    tm.assert_series_equal(result, expected)

    result = Series(dates, dtype='datetime64[ns]')
    tm.assert_series_equal(result, expected)

    expected = Series([
        pd.NaT,
        datetime(2013, 1, 2),
        datetime(2013, 1, 3),
    ], dtype='datetime64[ns]')
    result = Series([np.nan] + dates[1:], dtype='datetime64[ns]')
    tm.assert_series_equal(result, expected)

    dts = Series(dates, dtype='datetime64[ns]')

    # valid astype
    dts.astype('int64')

    # invalid casting
    msg = (r"cannot astype a datetimelike from \[datetime64\[ns\]\] to"
           r" \[int32\]")
    with pytest.raises(TypeError, match=msg):
        dts.astype('int32')

    # ints are ok
    # we test with np.int64 to get similar results on
    # windows / 32-bit platforms
    result = Series(dts, dtype=np.int64)
    expected = Series(dts.astype(np.int64))
    tm.assert_series_equal(result, expected)

    # invalid dates can be held as object
    result = Series([datetime(2, 1, 1)])
    assert result[0] == datetime(2, 1, 1, 0, 0)

    result = Series([datetime(3000, 1, 1)])
    assert result[0] == datetime(3000, 1, 1, 0, 0)

    # don't mix types
    result = Series([Timestamp('20130101'), 1], index=['a', 'b'])
    assert result['a'] == Timestamp('20130101')
    assert result['b'] == 1

    # GH6529
    # coerce datetime64 non-ns properly
    dates = date_range('01-Jan-2015', '01-Dec-2015', freq='M')
    values2 = dates.view(np.ndarray).astype('datetime64[ns]')
    expected = Series(values2, index=dates)

    for dtype in ['s', 'D', 'ms', 'us', 'ns']:
        values1 = dates.view(np.ndarray).astype('M8[{0}]'.format(dtype))
        result = Series(values1, dates)
        assert_series_equal(result, expected)

    # GH 13876
    # coerce to non-ns to object properly
    expected = Series(values2, index=dates, dtype=object)
    for dtype in ['s', 'D', 'ms', 'us', 'ns']:
        values1 = dates.view(np.ndarray).astype('M8[{0}]'.format(dtype))
        result = Series(values1, index=dates, dtype=object)
        assert_series_equal(result, expected)

    # leave datetime.date alone
    dates2 = np.array([d.date() for d in dates.to_pydatetime()],
                      dtype=object)
    series1 = Series(dates2, dates)
    tm.assert_numpy_array_equal(series1.values, dates2)
    assert series1.dtype == object

    # these will correctly infer a datetime
    s = Series([None, pd.NaT, '2013-08-05 15:30:00.000001'])
    assert s.dtype == 'datetime64[ns]'
    s = Series([np.nan, pd.NaT, '2013-08-05 15:30:00.000001'])
    assert s.dtype == 'datetime64[ns]'
    s = Series([pd.NaT, None, '2013-08-05 15:30:00.000001'])
    assert s.dtype == 'datetime64[ns]'
    s = Series([pd.NaT, np.nan, '2013-08-05 15:30:00.000001'])
    assert s.dtype == 'datetime64[ns]'

    # tz-aware (UTC and other tz's)
    # GH 8411
    dr = date_range('20130101', periods=3)
    assert Series(dr).iloc[0].tz is None
    dr = date_range('20130101', periods=3, tz='UTC')
    assert str(Series(dr).iloc[0].tz) == 'UTC'
    dr = date_range('20130101', periods=3, tz='US/Eastern')
    assert str(Series(dr).iloc[0].tz) == 'US/Eastern'

    # non-convertible
    s = Series([1479596223000, -1479590, pd.NaT])
    assert s.dtype == 'object'
    assert s[2] is pd.NaT
    assert 'NaT' in str(s)

    # if we passed a NaT it remains
    s = Series([datetime(2010, 1, 1), datetime(2, 1, 1), pd.NaT])
    assert s.dtype == 'object'
    assert s[2] is pd.NaT
    assert 'NaT' in str(s)

    # if we passed a nan it remains
    s = Series([datetime(2010, 1, 1), datetime(2, 1, 1), np.nan])
    assert s.dtype == 'object'
    assert s[2] is np.nan
    assert 'NaN' in str(s)
def test_astype_from_float_to_str(self, dtype): # https://github.com/pandas-dev/pandas/issues/36451 s = Series([0.1], dtype=dtype) result = s.astype(str) expected = Series(["0.1"]) tm.assert_series_equal(result, expected)
def test_append_raise(setup_path):
    # HDFStore.append must fail loudly, with a useful message, on every
    # kind of unserializable or incompatible input
    with ensure_clean_store(setup_path) as store:

        # test append with invalid input to get good error messages

        # list in column
        df = tm.makeDataFrame()
        df["invalid"] = [["a"]] * len(df)
        assert df.dtypes["invalid"] == np.object_
        msg = re.escape(
            """Cannot serialize the column [invalid]
because its data contents are not [string] but [mixed] object dtype"""
        )
        with pytest.raises(TypeError, match=msg):
            store.append("df", df)

        # multiple invalid columns
        df["invalid2"] = [["a"]] * len(df)
        df["invalid3"] = [["a"]] * len(df)
        with pytest.raises(TypeError, match=msg):
            store.append("df", df)

        # datetime with embedded nans as object
        df = tm.makeDataFrame()
        s = Series(datetime.datetime(2001, 1, 2), index=df.index)
        s = s.astype(object)
        s[0:5] = np.nan
        df["invalid"] = s
        assert df.dtypes["invalid"] == np.object_
        msg = "too many timezones in this block, create separate data columns"
        with pytest.raises(TypeError, match=msg):
            store.append("df", df)

        # directly ndarray
        msg = "value must be None, Series, or DataFrame"
        with pytest.raises(TypeError, match=msg):
            store.append("df", np.arange(10))

        # series directly
        msg = re.escape(
            "cannot properly create the storer for: "
            "[group->df,value-><class 'pandas.core.series.Series'>]"
        )
        with pytest.raises(TypeError, match=msg):
            store.append("df", Series(np.arange(10)))

        # appending an incompatible table (extra column vs existing table)
        df = tm.makeDataFrame()
        store.append("df", df)

        df["foo"] = "foo"
        msg = re.escape(
            "invalid combination of [non_index_axes] on appending data "
            "[(1, ['A', 'B', 'C', 'D', 'foo'])] vs current table "
            "[(1, ['A', 'B', 'C', 'D'])]"
        )
        with pytest.raises(ValueError, match=msg):
            store.append("df", df)

        # incompatible type (GH 41897): same column, different dtype
        _maybe_remove(store, "df")
        df["foo"] = Timestamp("20130101")
        store.append("df", df)
        df["foo"] = "bar"
        msg = re.escape(
            "invalid combination of [values_axes] on appending data "
            "[name->values_block_1,cname->values_block_1,"
            "dtype->bytes24,kind->string,shape->(1, 30)] "
            "vs current table "
            "[name->values_block_1,cname->values_block_1,"
            "dtype->datetime64,kind->datetime64,shape->None]"
        )
        with pytest.raises(ValueError, match=msg):
            store.append("df", df)
def test_td64_series_astype_object(self): tdser = Series(["59 Days", "59 Days", "NaT"], dtype="timedelta64[ns]") result = tdser.astype(object) assert isinstance(result.iloc[0], timedelta) assert result.dtype == np.object_
def test_spilt_join_roundtrip(any_string_dtype): ser = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"], dtype=any_string_dtype) result = ser.str.split("_").str.join("_") expected = ser.astype(object) tm.assert_series_equal(result, expected)
def test_dt64_series_astype_object(self): dt64ser = Series(date_range("20130101", periods=3)) result = dt64ser.astype(object) assert isinstance(result.iloc[0], datetime) assert result.dtype == np.object_
def infer_problem_type(y: Series):
    """ Identifies which type of prediction problem we are interested in (if user has not specified).
        Ie. binary classification, multi-class classification, or regression.

        Raises ValueError when *y* is empty and NotImplementedError for
        unsupported label dtypes.
    """
    if len(y) == 0:
        raise ValueError("provided labels cannot have length = 0")
    # Remove missing values from y (there should not be any though as they
    # were removed in Learner.general_data_processing())
    y = y.dropna()
    num_rows = len(y)
    unique_values = y.unique()
    unique_count = len(unique_values)
    if unique_count > 10:
        logger.log(
            20,
            f'Here are the first 10 unique label values in your data: {list(unique_values[:10])}'
        )
    else:
        logger.log(
            20,
            f'Here are the {unique_count} unique label values in your data: {list(unique_values)}'
        )

    MULTICLASS_LIMIT = 1000  # if numeric and class count would be above this amount, assume it is regression
    if num_rows > 1000:
        REGRESS_THRESHOLD = 0.05  # if the unique-ratio is less than this, we assume multiclass classification, even when labels are integers
    else:
        REGRESS_THRESHOLD = 0.1

    if unique_count == 2:
        problem_type = BINARY
        reason = "only two unique label-values observed"
    elif y.dtype.name in ['object', 'category']:
        problem_type = MULTICLASS
        reason = f"dtype of label-column == {y.dtype.name}"
    elif np.issubdtype(y.dtype, np.floating):
        unique_ratio = unique_count / float(num_rows)
        if (unique_ratio <= REGRESS_THRESHOLD) and (unique_count <= MULTICLASS_LIMIT):
            # floats that are all whole numbers are probably class labels
            try:
                can_convert_to_int = np.array_equal(y, y.astype(int))
            except Exception:
                # was a bare `except:` — narrowed so SystemExit /
                # KeyboardInterrupt are never swallowed; conversion
                # failure simply means the labels are真 not int-like
                can_convert_to_int = False
            if can_convert_to_int:
                problem_type = MULTICLASS
                reason = "dtype of label-column == float, but few unique label-values observed and label-values can be converted to int"
            else:
                problem_type = REGRESSION
                reason = "dtype of label-column == float and label-values can't be converted to int"
        else:
            problem_type = REGRESSION
            reason = "dtype of label-column == float and many unique label-values observed"
    elif np.issubdtype(y.dtype, np.integer):
        unique_ratio = unique_count / float(num_rows)
        if (unique_ratio <= REGRESS_THRESHOLD) and (unique_count <= MULTICLASS_LIMIT):
            problem_type = MULTICLASS  # TODO: Check if integers are from 0 to n-1 for n unique values, if they have a wide spread, it could still be regression
            reason = "dtype of label-column == int, but few unique label-values observed"
        else:
            problem_type = REGRESSION
            reason = "dtype of label-column == int and many unique label-values observed"
    else:
        raise NotImplementedError(f'label dtype {y.dtype} not supported!')
    logger.log(
        25,
        f"AutoGluon infers your prediction problem is: {problem_type} (because {reason})."
    )
    logger.log(
        25,
        f"If this is wrong, please specify `problem_type` argument in fit() instead "
        f"(You may specify problem_type as one of: {[BINARY, MULTICLASS, REGRESSION]})\n"
    )
    return problem_type
def test_operators_bitwise(self):
    # GH 9016: support bitwise op for integer types
    index = list('bca')

    s_tft = Series([True, False, True], index=index)
    s_fff = Series([False, False, False], index=index)
    s_tff = Series([True, False, False], index=index)
    s_empty = Series([])

    # TODO: unused
    # s_0101 = Series([0, 1, 0, 1])

    s_0123 = Series(range(4), dtype='int64')
    s_3333 = Series([3] * 4)
    s_4444 = Series([4] * 4)

    # bool op with an empty series aligns to all-missing -> all False
    res = s_tft & s_empty
    expected = s_fff
    assert_series_equal(res, expected)

    res = s_tft | s_empty
    expected = s_tft
    assert_series_equal(res, expected)

    # integer bitwise ops
    res = s_0123 & s_3333
    expected = Series(range(4), dtype='int64')
    assert_series_equal(res, expected)

    res = s_0123 | s_4444
    expected = Series(range(4, 8), dtype='int64')
    assert_series_equal(res, expected)

    # bool vs partially-overlapping int series
    s_a0b1c0 = Series([1], list('b'))

    res = s_tft & s_a0b1c0
    expected = s_tff.reindex(list('abc'))
    assert_series_equal(res, expected)

    res = s_tft | s_a0b1c0
    expected = s_tft.reindex(list('abc'))
    assert_series_equal(res, expected)

    # scalar int operands
    n0 = 0
    res = s_tft & n0
    expected = s_fff
    assert_series_equal(res, expected)

    res = s_0123 & n0
    expected = Series([0] * 4)
    assert_series_equal(res, expected)

    n1 = 1
    res = s_tft & n1
    expected = s_tft
    assert_series_equal(res, expected)

    res = s_0123 & n1
    expected = Series([0, 1, 0, 1])
    assert_series_equal(res, expected)

    # mixed int widths upcast to the wider dtype
    s_1111 = Series([1] * 4, dtype='int8')
    res = s_0123 & s_1111
    expected = Series([0, 1, 0, 1], dtype='int64')
    assert_series_equal(res, expected)

    res = s_0123.astype(np.int16) | s_1111.astype(np.int32)
    expected = Series([1, 1, 3, 3], dtype='int32')
    assert_series_equal(res, expected)

    # incompatible operands raise
    with pytest.raises(TypeError):
        s_1111 & 'a'
    with pytest.raises(TypeError):
        s_1111 & ['a', 'b', 'c', 'd']
    with pytest.raises(TypeError):
        s_0123 & np.NaN
    with pytest.raises(TypeError):
        s_0123 & 3.14
    with pytest.raises(TypeError):
        s_0123 & [0.1, 4, 3.14, 2]

    # s_0123 will be all false now because of reindexing like s_tft
    if compat.PY3:
        # unable to sort incompatible object via .union.
        exp = Series([False] * 7, index=['b', 'c', 'a', 0, 1, 2, 3])
        with tm.assert_produces_warning(RuntimeWarning):
            assert_series_equal(s_tft & s_0123, exp)
    else:
        exp = Series([False] * 7, index=[0, 1, 2, 3, 'a', 'b', 'c'])
        assert_series_equal(s_tft & s_0123, exp)

    # s_tft will be all false now because of reindexing like s_0123
    if compat.PY3:
        # unable to sort incompatible object via .union.
        exp = Series([False] * 7, index=[0, 1, 2, 3, 'b', 'c', 'a'])
        with tm.assert_produces_warning(RuntimeWarning):
            assert_series_equal(s_0123 & s_tft, exp)
    else:
        exp = Series([False] * 7, index=[0, 1, 2, 3, 'a', 'b', 'c'])
        assert_series_equal(s_0123 & s_tft, exp)

    # scalar / list bool operands against an int series
    assert_series_equal(s_0123 & False, Series([False] * 4))
    assert_series_equal(s_0123 ^ False, Series([False, True, True, True]))
    assert_series_equal(s_0123 & [False], Series([False] * 4))
    assert_series_equal(s_0123 & (False), Series([False] * 4))
    assert_series_equal(s_0123 & Series([False, np.NaN, False, False]),
                        Series([False] * 4))

    # floats are truthy by value here
    s_ftft = Series([False, True, False, True])
    assert_series_equal(s_0123 & Series([0.1, 4, -3.14, 2]), s_ftft)

    # object series with NaN: truthiness of the int side decides
    s_abNd = Series(['a', 'b', np.NaN, 'd'])
    res = s_0123 & s_abNd
    expected = s_ftft
    assert_series_equal(res, expected)