def test_coercing_dates_outside_of_datetime64_ns_bounds(self): invalid_dates = [ datetime.date(1000, 1, 1), datetime.datetime(1000, 1, 1), '1000-01-01', 'Jan 1, 1000', np.datetime64('1000-01-01'), ] for invalid_date in invalid_dates: self.assertRaises( ValueError, tslib.array_to_datetime, np.array([invalid_date], dtype='object'), errors='raise', ) self.assert_numpy_array_equal( tslib.array_to_datetime(np.array([invalid_date], dtype='object'), errors='coerce'), np.array([tslib.iNaT], dtype='M8[ns]')) arr = np.array(['1/1/1000', '1/1/2000'], dtype=object) self.assert_numpy_array_equal( tslib.array_to_datetime(arr, errors='coerce'), np_array_datetime64_compat( [tslib.iNaT, '2000-01-01T00:00:00.000000000-0000'], dtype='M8[ns]'))
def test_parsing_valid_dates(self): arr = np.array(["01-01-2013", "01-02-2013"], dtype=object) self.assert_numpy_array_equal( tslib.array_to_datetime(arr), np_array_datetime64_compat( ["2013-01-01T00:00:00.000000000-0000", "2013-01-02T00:00:00.000000000-0000"], dtype="M8[ns]" ), ) arr = np.array(["Mon Sep 16 2013", "Tue Sep 17 2013"], dtype=object) self.assert_numpy_array_equal( tslib.array_to_datetime(arr), np_array_datetime64_compat( ["2013-09-16T00:00:00.000000000-0000", "2013-09-17T00:00:00.000000000-0000"], dtype="M8[ns]" ), )
def test_datetime64_dtype_array_returned(self): # GH 9431 expected = np_array_datetime64_compat([ '2015-01-03T00:00:00.000000000+0000', '2015-01-01T00:00:00.000000000+0000' ], dtype='M8[ns]') dt_index = pd.to_datetime([ '2015-01-03T00:00:00.000000000+0000', '2015-01-01T00:00:00.000000000+0000', '2015-01-01T00:00:00.000000000+0000' ]) result = algos.unique(dt_index) tm.assert_numpy_array_equal(result, expected) self.assertEqual(result.dtype, expected.dtype) s = pd.Series(dt_index) result = algos.unique(s) tm.assert_numpy_array_equal(result, expected) self.assertEqual(result.dtype, expected.dtype) arr = s.values result = algos.unique(arr) tm.assert_numpy_array_equal(result, expected) self.assertEqual(result.dtype, expected.dtype)
def test_dateparser_resolution_if_not_ns(self): # issue 10245 data = """\ date,time,prn,rxstatus 2013-11-03,19:00:00,126,00E80000 2013-11-03,19:00:00,23,00E80000 2013-11-03,19:00:00,13,00E80000 """ def date_parser(date, time): datetime = np_array_datetime64_compat( date + 'T' + time + 'Z', dtype='datetime64[s]') return datetime df = read_csv(StringIO(data), date_parser=date_parser, parse_dates={'datetime': ['date', 'time']}, index_col=['datetime', 'prn']) datetimes = np_array_datetime64_compat(['2013-11-03T19:00:00Z'] * 3, dtype='datetime64[s]') df_correct = DataFrame(data={'rxstatus': ['00E80000'] * 3}, index=MultiIndex.from_tuples( [(datetimes[0], 126), (datetimes[1], 23), (datetimes[2], 13)], names=['datetime', 'prn'])) assert_frame_equal(df, df_correct)
def test_parsing_valid_dates(self): arr = np.array(['01-01-2013', '01-02-2013'], dtype=object) self.assert_numpy_array_equal( tslib.array_to_datetime(arr), np_array_datetime64_compat([ '2013-01-01T00:00:00.000000000-0000', '2013-01-02T00:00:00.000000000-0000' ], dtype='M8[ns]')) arr = np.array(['Mon Sep 16 2013', 'Tue Sep 17 2013'], dtype=object) self.assert_numpy_array_equal( tslib.array_to_datetime(arr), np_array_datetime64_compat([ '2013-09-16T00:00:00.000000000-0000', '2013-09-17T00:00:00.000000000-0000' ], dtype='M8[ns]'))
def test_coerce_of_invalid_datetimes(self): arr = np.array(["01-01-2013", "not_a_date", "1"], dtype=object) # Without coercing, the presence of any invalid dates prevents # any values from being converted self.assert_numpy_array_equal(tslib.array_to_datetime(arr, errors="ignore"), arr) # With coercing, the invalid dates becomes iNaT self.assert_numpy_array_equal( tslib.array_to_datetime(arr, errors="coerce"), np_array_datetime64_compat(["2013-01-01T00:00:00.000000000-0000", tslib.iNaT, tslib.iNaT], dtype="M8[ns]"), )
def test_coerce_of_invalid_datetimes(self): arr = np.array(['01-01-2013', 'not_a_date', '1'], dtype=object) # Without coercing, the presence of any invalid dates prevents # any values from being converted self.assert_numpy_array_equal( tslib.array_to_datetime(arr, errors='ignore'), arr) # With coercing, the invalid dates becomes iNaT self.assert_numpy_array_equal( tslib.array_to_datetime(arr, errors='coerce'), np_array_datetime64_compat( ['2013-01-01T00:00:00.000000000-0000', tslib.iNaT, tslib.iNaT], dtype='M8[ns]'))
def test_coercing_dates_outside_of_datetime64_ns_bounds(self): invalid_dates = [ datetime.date(1000, 1, 1), datetime.datetime(1000, 1, 1), "1000-01-01", "Jan 1, 1000", np.datetime64("1000-01-01"), ] for invalid_date in invalid_dates: self.assertRaises( ValueError, tslib.array_to_datetime, np.array([invalid_date], dtype="object"), errors="raise" ) self.assert_numpy_array_equal( tslib.array_to_datetime(np.array([invalid_date], dtype="object"), errors="coerce"), np.array([tslib.iNaT], dtype="M8[ns]"), ) arr = np.array(["1/1/1000", "1/1/2000"], dtype=object) self.assert_numpy_array_equal( tslib.array_to_datetime(arr, errors="coerce"), np_array_datetime64_compat([tslib.iNaT, "2000-01-01T00:00:00.000000000-0000"], dtype="M8[ns]"), )
def date_parser(date, time): datetime = np_array_datetime64_compat( date + 'T' + time + 'Z', dtype='datetime64[s]') return datetime
def test_value_counts_inferred(self): klasses = [Index, Series] for klass in klasses: s_values = ['a', 'b', 'b', 'b', 'b', 'c', 'd', 'd', 'a', 'a'] s = klass(s_values) expected = Series([4, 3, 2, 1], index=['b', 'a', 'd', 'c']) tm.assert_series_equal(s.value_counts(), expected) self.assert_numpy_array_equal(s.unique(), np.unique(s_values)) self.assertEqual(s.nunique(), 4) # don't sort, have to sort after the fact as not sorting is # platform-dep hist = s.value_counts(sort=False).sort_values() expected = Series([3, 1, 4, 2], index=list('acbd')).sort_values() tm.assert_series_equal(hist, expected) # sort ascending hist = s.value_counts(ascending=True) expected = Series([1, 2, 3, 4], index=list('cdab')) tm.assert_series_equal(hist, expected) # relative histogram. hist = s.value_counts(normalize=True) expected = Series([.4, .3, .2, .1], index=['b', 'a', 'd', 'c']) tm.assert_series_equal(hist, expected) # bins self.assertRaises(TypeError, lambda bins: s.value_counts(bins=bins), 1) s1 = Series([1, 1, 2, 3]) res1 = s1.value_counts(bins=1) exp1 = Series({0.998: 4}) tm.assert_series_equal(res1, exp1) res1n = s1.value_counts(bins=1, normalize=True) exp1n = Series({0.998: 1.0}) tm.assert_series_equal(res1n, exp1n) self.assert_numpy_array_equal(s1.unique(), np.array([1, 2, 3])) self.assertEqual(s1.nunique(), 3) res4 = s1.value_counts(bins=4) exp4 = Series({0.998: 2, 1.5: 1, 2.0: 0, 2.5: 1}, index=[0.998, 2.5, 1.5, 2.0]) tm.assert_series_equal(res4, exp4) res4n = s1.value_counts(bins=4, normalize=True) exp4n = Series( {0.998: 0.5, 1.5: 0.25, 2.0: 0.0, 2.5: 0.25}, index=[0.998, 2.5, 1.5, 2.0]) tm.assert_series_equal(res4n, exp4n) # handle NA's properly s_values = ['a', 'b', 'b', 'b', np.nan, np.nan, 'd', 'd', 'a', 'a', 'b'] s = klass(s_values) expected = Series([4, 3, 2], index=['b', 'a', 'd']) tm.assert_series_equal(s.value_counts(), expected) self.assert_numpy_array_equal(s.unique(), np.array( ['a', 'b', np.nan, 'd'], dtype='O')) self.assertEqual(s.nunique(), 3) s = klass({}) expected = Series([], dtype=np.int64) tm.assert_series_equal(s.value_counts(), expected, check_index_type=False) self.assert_numpy_array_equal(s.unique(), np.array([])) self.assertEqual(s.nunique(), 0) # GH 3002, datetime64[ns] # don't test names though txt = "\n".join(['xxyyzz20100101PIE', 'xxyyzz20100101GUM', 'xxyyzz20100101EGG', 'xxyyww20090101EGG', 'foofoo20080909PIE', 'foofoo20080909GUM']) f = StringIO(txt) df = pd.read_fwf(f, widths=[6, 8, 3], names=["person_id", "dt", "food"], parse_dates=["dt"]) s = klass(df['dt'].copy()) s.name = None idx = pd.to_datetime( ['2010-01-01 00:00:00Z', '2008-09-09 00:00:00Z', '2009-01-01 00:00:00X']) expected_s = Series([3, 2, 1], index=idx) tm.assert_series_equal(s.value_counts(), expected_s) expected = np_array_datetime64_compat(['2010-01-01 00:00:00Z', '2009-01-01 00:00:00Z', '2008-09-09 00:00:00Z'], dtype='datetime64[ns]') if isinstance(s, DatetimeIndex): expected = DatetimeIndex(expected) self.assertTrue(s.unique().equals(expected)) else: self.assert_numpy_array_equal(s.unique(), expected) self.assertEqual(s.nunique(), 3) # with NaT s = df['dt'].copy() s = klass([v for v in s.values] + [pd.NaT]) result = s.value_counts() self.assertEqual(result.index.dtype, 'datetime64[ns]') tm.assert_series_equal(result, expected_s) result = s.value_counts(dropna=False) expected_s[pd.NaT] = 1 tm.assert_series_equal(result, expected_s) unique = s.unique() self.assertEqual(unique.dtype, 'datetime64[ns]') # numpy_array_equal cannot compare pd.NaT self.assert_numpy_array_equal(unique[:3], expected) self.assertTrue(unique[3] is pd.NaT or unique[3].astype('int64') == pd.tslib.iNaT) self.assertEqual(s.nunique(), 3) self.assertEqual(s.nunique(dropna=False), 4) # timedelta64[ns] td = df.dt - df.dt + timedelta(1) td = klass(td, name='dt') result = td.value_counts() expected_s = Series([6], index=[Timedelta('1day')], name='dt') tm.assert_series_equal(result, expected_s) expected = TimedeltaIndex(['1 days']) if isinstance(td, TimedeltaIndex): self.assertTrue(td.unique().equals(expected)) else: self.assert_numpy_array_equal(td.unique(), expected.values) td2 = timedelta(1) + (df.dt - df.dt) td2 = klass(td2, name='dt') result2 = td2.value_counts() tm.assert_series_equal(result2, expected_s)
def test_value_counts_inferred(self): klasses = [Index, Series] for klass in klasses: s_values = ['a', 'b', 'b', 'b', 'b', 'c', 'd', 'd', 'a', 'a'] s = klass(s_values) expected = Series([4, 3, 2, 1], index=['b', 'a', 'd', 'c']) tm.assert_series_equal(s.value_counts(), expected) self.assert_numpy_array_equal(s.unique(), np.unique(s_values)) self.assertEqual(s.nunique(), 4) # don't sort, have to sort after the fact as not sorting is # platform-dep hist = s.value_counts(sort=False).sort_values() expected = Series([3, 1, 4, 2], index=list('acbd')).sort_values() tm.assert_series_equal(hist, expected) # sort ascending hist = s.value_counts(ascending=True) expected = Series([1, 2, 3, 4], index=list('cdab')) tm.assert_series_equal(hist, expected) # relative histogram. hist = s.value_counts(normalize=True) expected = Series([.4, .3, .2, .1], index=['b', 'a', 'd', 'c']) tm.assert_series_equal(hist, expected) # bins self.assertRaises(TypeError, lambda bins: s.value_counts(bins=bins), 1) s1 = Series([1, 1, 2, 3]) res1 = s1.value_counts(bins=1) exp1 = Series({0.998: 4}) tm.assert_series_equal(res1, exp1) res1n = s1.value_counts(bins=1, normalize=True) exp1n = Series({0.998: 1.0}) tm.assert_series_equal(res1n, exp1n) self.assert_numpy_array_equal(s1.unique(), np.array([1, 2, 3])) self.assertEqual(s1.nunique(), 3) res4 = s1.value_counts(bins=4) exp4 = Series({ 0.998: 2, 1.5: 1, 2.0: 0, 2.5: 1 }, index=[0.998, 2.5, 1.5, 2.0]) tm.assert_series_equal(res4, exp4) res4n = s1.value_counts(bins=4, normalize=True) exp4n = Series({ 0.998: 0.5, 1.5: 0.25, 2.0: 0.0, 2.5: 0.25 }, index=[0.998, 2.5, 1.5, 2.0]) tm.assert_series_equal(res4n, exp4n) # handle NA's properly s_values = [ 'a', 'b', 'b', 'b', np.nan, np.nan, 'd', 'd', 'a', 'a', 'b' ] s = klass(s_values) expected = Series([4, 3, 2], index=['b', 'a', 'd']) tm.assert_series_equal(s.value_counts(), expected) self.assert_numpy_array_equal( s.unique(), np.array(['a', 'b', np.nan, 'd'], dtype='O')) self.assertEqual(s.nunique(), 3) s = klass({}) expected = Series([], dtype=np.int64) tm.assert_series_equal(s.value_counts(), expected, check_index_type=False) self.assert_numpy_array_equal(s.unique(), np.array([])) self.assertEqual(s.nunique(), 0) # GH 3002, datetime64[ns] # don't test names though txt = "\n".join([ 'xxyyzz20100101PIE', 'xxyyzz20100101GUM', 'xxyyzz20100101EGG', 'xxyyww20090101EGG', 'foofoo20080909PIE', 'foofoo20080909GUM' ]) f = StringIO(txt) df = pd.read_fwf(f, widths=[6, 8, 3], names=["person_id", "dt", "food"], parse_dates=["dt"]) s = klass(df['dt'].copy()) s.name = None idx = pd.to_datetime([ '2010-01-01 00:00:00Z', '2008-09-09 00:00:00Z', '2009-01-01 00:00:00X' ]) expected_s = Series([3, 2, 1], index=idx) tm.assert_series_equal(s.value_counts(), expected_s) expected = np_array_datetime64_compat([ '2010-01-01 00:00:00Z', '2009-01-01 00:00:00Z', '2008-09-09 00:00:00Z' ], dtype='datetime64[ns]') if isinstance(s, DatetimeIndex): expected = DatetimeIndex(expected) self.assertTrue(s.unique().equals(expected)) else: self.assert_numpy_array_equal(s.unique(), expected) self.assertEqual(s.nunique(), 3) # with NaT s = df['dt'].copy() s = klass([v for v in s.values] + [pd.NaT]) result = s.value_counts() self.assertEqual(result.index.dtype, 'datetime64[ns]') tm.assert_series_equal(result, expected_s) result = s.value_counts(dropna=False) expected_s[pd.NaT] = 1 tm.assert_series_equal(result, expected_s) unique = s.unique() self.assertEqual(unique.dtype, 'datetime64[ns]') # numpy_array_equal cannot compare pd.NaT self.assert_numpy_array_equal(unique[:3], expected) self.assertTrue(unique[3] is pd.NaT or unique[3].astype('int64') == pd.tslib.iNaT) self.assertEqual(s.nunique(), 3) self.assertEqual(s.nunique(dropna=False), 4) # timedelta64[ns] td = df.dt - df.dt + timedelta(1) td = klass(td, name='dt') result = td.value_counts() expected_s = Series([6], index=[Timedelta('1day')], name='dt') tm.assert_series_equal(result, expected_s) expected = TimedeltaIndex(['1 days']) if isinstance(td, TimedeltaIndex): self.assertTrue(td.unique().equals(expected)) else: self.assert_numpy_array_equal(td.unique(), expected.values) td2 = timedelta(1) + (df.dt - df.dt) td2 = klass(td2, name='dt') result2 = td2.value_counts() tm.assert_series_equal(result2, expected_s)