def test_tab_completion(self):
    # GH 9910
    s = Series(list('abcd'))
    # Series of str values should have .str but not .dt/.cat in __dir__
    assert 'str' in dir(s)
    assert 'dt' not in dir(s)
    assert 'cat' not in dir(s)

    # similarly for .dt
    s = Series(date_range('1/1/2015', periods=5))
    assert 'dt' in dir(s)
    assert 'str' not in dir(s)
    assert 'cat' not in dir(s)

    # Similarly for .cat, but with the twist that str and dt should be
    # there if the categories are of that type first cat and str.
    s = Series(list('abbcd'), dtype="category")
    assert 'cat' in dir(s)
    assert 'str' in dir(s)  # as it is a string categorical
    assert 'dt' not in dir(s)

    # similar to cat and str
    s = Series(date_range('1/1/2015', periods=5)).astype("category")
    assert 'cat' in dir(s)
    assert 'str' not in dir(s)
    assert 'dt' in dir(s)  # as it is a datetime categorical
def test_series(self):
    # GH6407
    # inferring series

    # invalid type of Series
    for s in [Series(np.arange(10)),
              Series(np.arange(10.))]:
        self.assertRaises(TypeError, lambda: infer_freq(s))

    # a non-convertible string
    self.assertRaises(ValueError,
                      lambda: infer_freq(Series(['foo', 'bar'])))

    # cannot infer on PeriodIndex
    for freq in [None, 'L', 'Y']:
        s = Series(period_range('2013', periods=10, freq=freq))
        self.assertRaises(TypeError, lambda: infer_freq(s))

    # DatetimeIndex
    for freq in ['M', 'L', 'S']:
        s = Series(date_range('20130101', periods=10, freq=freq))
        inferred = infer_freq(s)
        self.assertEqual(inferred, freq)

    s = Series(date_range('20130101', '20130110'))
    inferred = infer_freq(s)
    self.assertEqual(inferred, 'D')
def test_astype_str(self):
    # test astype string - #10442
    result = date_range('2012-01-01', periods=4,
                        name='test_name').astype(str)
    expected = Index(['2012-01-01', '2012-01-02', '2012-01-03',
                      '2012-01-04'], name='test_name', dtype=object)
    tm.assert_index_equal(result, expected)

    # test astype string with tz and name
    result = date_range('2012-01-01', periods=3, name='test_name',
                        tz='US/Eastern').astype(str)
    expected = Index(['2012-01-01 00:00:00-05:00',
                      '2012-01-02 00:00:00-05:00',
                      '2012-01-03 00:00:00-05:00'],
                     name='test_name', dtype=object)
    tm.assert_index_equal(result, expected)

    # test astype string with freqH and name
    result = date_range('1/1/2011', periods=3, freq='H',
                        name='test_name').astype(str)
    expected = Index(['2011-01-01 00:00:00', '2011-01-01 01:00:00',
                      '2011-01-01 02:00:00'], name='test_name',
                     dtype=object)
    tm.assert_index_equal(result, expected)

    # test astype string with freqH and timezone
    result = date_range('3/6/2012 00:00', periods=2, freq='H',
                        tz='Europe/London', name='test_name').astype(str)
    expected = Index(['2012-03-06 00:00:00+00:00',
                      '2012-03-06 01:00:00+00:00'],
                     dtype=object, name='test_name')
    tm.assert_index_equal(result, expected)
def test_tz_range_is_utc(self):
    exp = '["2013-01-01T05:00:00.000Z","2013-01-02T05:00:00.000Z"]'
    dfexp = ('{"DT":{'
             '"0":"2013-01-01T05:00:00.000Z",'
             '"1":"2013-01-02T05:00:00.000Z"}}')

    tz_range = pd.date_range('2013-01-01 05:00:00Z', periods=2)
    self.assertEqual(exp, pd.json.dumps(tz_range, iso_dates=True))
    dti = pd.DatetimeIndex(tz_range)
    self.assertEqual(exp, pd.json.dumps(dti, iso_dates=True))
    df = DataFrame({'DT': dti})
    self.assertEqual(dfexp, pd.json.dumps(df, iso_dates=True))

    tz_range = pd.date_range('2013-01-01 00:00:00', periods=2,
                             tz='US/Eastern')
    self.assertEqual(exp, pd.json.dumps(tz_range, iso_dates=True))
    dti = pd.DatetimeIndex(tz_range)
    self.assertEqual(exp, pd.json.dumps(dti, iso_dates=True))
    df = DataFrame({'DT': dti})
    self.assertEqual(dfexp, pd.json.dumps(df, iso_dates=True))

    tz_range = pd.date_range('2013-01-01 00:00:00-0500', periods=2)
    self.assertEqual(exp, pd.json.dumps(tz_range, iso_dates=True))
    dti = pd.DatetimeIndex(tz_range)
    self.assertEqual(exp, pd.json.dumps(dti, iso_dates=True))
    df = DataFrame({'DT': dti})
    self.assertEqual(dfexp, pd.json.dumps(df, iso_dates=True))
def _check_generated_range(self, start, freq):
    freq = freq.upper()

    gen = date_range(start, periods=7, freq=freq)
    index = _dti(gen.values)
    if not freq.startswith('Q-'):
        self.assertEqual(infer_freq(index), gen.freqstr)
    else:
        inf_freq = infer_freq(index)
        self.assertTrue((inf_freq == 'Q-DEC' and
                         gen.freqstr in ('Q', 'Q-DEC', 'Q-SEP', 'Q-JUN',
                                         'Q-MAR')) or
                        (inf_freq == 'Q-NOV' and
                         gen.freqstr in ('Q-NOV', 'Q-AUG', 'Q-MAY',
                                         'Q-FEB')) or
                        (inf_freq == 'Q-OCT' and
                         gen.freqstr in ('Q-OCT', 'Q-JUL', 'Q-APR',
                                         'Q-JAN')))

    gen = date_range(start, periods=5, freq=freq)
    index = _dti(gen.values)
    if not freq.startswith('Q-'):
        self.assertEqual(infer_freq(index), gen.freqstr)
    else:
        inf_freq = infer_freq(index)
        self.assertTrue((inf_freq == 'Q-DEC' and
                         gen.freqstr in ('Q', 'Q-DEC', 'Q-SEP', 'Q-JUN',
                                         'Q-MAR')) or
                        (inf_freq == 'Q-NOV' and
                         gen.freqstr in ('Q-NOV', 'Q-AUG', 'Q-MAY',
                                         'Q-FEB')) or
                        (inf_freq == 'Q-OCT' and
                         gen.freqstr in ('Q-OCT', 'Q-JUL', 'Q-APR',
                                         'Q-JAN')))
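# Note on the disjunction above: anchored quarterly aliases whose anchor
# months differ by multiples of three (e.g. Q-DEC/Q-SEP/Q-JUN/Q-MAR) hit the
# same set of month-end dates, so infer_freq can only recover the anchor up
# to that equivalence class. A minimal sketch of the behavior the check
# relies on (assuming a pandas version that exposes pd.infer_freq):
import pandas as pd

idx = pd.date_range('2000-09-30', periods=7, freq='Q-SEP')
print(pd.infer_freq(idx))  # 'Q-DEC': same quarter-ends as a Q-DEC range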
def test_to_period_tz_explicit_pytz(self):
    xp = date_range('1/1/2000', '4/1/2000').to_period()

    ts = date_range('1/1/2000', '4/1/2000', tz=pytz.timezone('US/Eastern'))

    result = ts.to_period()[0]
    expected = ts[0].to_period()

    assert result == expected
    tm.assert_index_equal(ts.to_period(), xp)

    ts = date_range('1/1/2000', '4/1/2000', tz=pytz.utc)

    result = ts.to_period()[0]
    expected = ts[0].to_period()

    assert result == expected
    tm.assert_index_equal(ts.to_period(), xp)

    ts = date_range('1/1/2000', '4/1/2000', tz=tzlocal())

    result = ts.to_period()[0]
    expected = ts[0].to_period()

    assert result == expected
    tm.assert_index_equal(ts.to_period(), xp)
def test_to_period_tz_dateutil(self):
    xp = date_range('1/1/2000', '4/1/2000').to_period()

    ts = date_range('1/1/2000', '4/1/2000', tz='dateutil/US/Eastern')

    result = ts.to_period()[0]
    expected = ts[0].to_period()

    assert result == expected
    tm.assert_index_equal(ts.to_period(), xp)

    ts = date_range('1/1/2000', '4/1/2000', tz=dateutil.tz.tzutc())

    result = ts.to_period()[0]
    expected = ts[0].to_period()

    assert result == expected
    tm.assert_index_equal(ts.to_period(), xp)

    ts = date_range('1/1/2000', '4/1/2000', tz=tzlocal())

    result = ts.to_period()[0]
    expected = ts[0].to_period()

    assert result == expected
    tm.assert_index_equal(ts.to_period(), xp)
def test_groupby_groups_datetimeindex(self):
    # GH#1430
    periods = 1000
    ind = pd.date_range(start='2012/1/1', freq='5min', periods=periods)
    df = DataFrame({'high': np.arange(periods),
                    'low': np.arange(periods)}, index=ind)
    grouped = df.groupby(lambda x: datetime(x.year, x.month, x.day))

    # it works!
    groups = grouped.groups
    assert isinstance(list(groups.keys())[0], datetime)

    # GH#11442
    index = pd.date_range('2015/01/01', periods=5, name='date')
    df = pd.DataFrame({'A': [5, 6, 7, 8, 9],
                       'B': [1, 2, 3, 4, 5]}, index=index)
    result = df.groupby(level='date').groups
    dates = ['2015-01-05', '2015-01-04', '2015-01-03',
             '2015-01-02', '2015-01-01']
    expected = {pd.Timestamp(date): pd.DatetimeIndex([date], name='date')
                for date in dates}
    tm.assert_dict_equal(result, expected)

    grouped = df.groupby(level='date')
    for date in dates:
        result = grouped.get_group(date)
        data = [[df.loc[date, 'A'], df.loc[date, 'B']]]
        expected_index = pd.DatetimeIndex([date], name='date')
        expected = pd.DataFrame(data,
                                columns=list('AB'),
                                index=expected_index)
        tm.assert_frame_equal(result, expected)
def test_to_period_tz_pytz(self):
    from pytz import utc as UTC

    xp = date_range('1/1/2000', '4/1/2000').to_period()

    ts = date_range('1/1/2000', '4/1/2000', tz='US/Eastern')

    result = ts.to_period()[0]
    expected = ts[0].to_period()

    assert result == expected
    tm.assert_index_equal(ts.to_period(), xp)

    ts = date_range('1/1/2000', '4/1/2000', tz=UTC)

    result = ts.to_period()[0]
    expected = ts[0].to_period()

    assert result == expected
    tm.assert_index_equal(ts.to_period(), xp)

    ts = date_range('1/1/2000', '4/1/2000', tz=tzlocal())

    result = ts.to_period()[0]
    expected = ts[0].to_period()

    assert result == expected
    tm.assert_index_equal(ts.to_period(), xp)
def test_iteration_preserves_tz(self):
    # see gh-8890
    index = date_range("2012-01-01", periods=3, freq='H', tz='US/Eastern')

    for i, ts in enumerate(index):
        result = ts
        expected = index[i]
        assert result == expected

    index = date_range("2012-01-01", periods=3, freq='H',
                       tz=dateutil.tz.tzoffset(None, -28800))

    for i, ts in enumerate(index):
        result = ts
        expected = index[i]
        assert result._repr_base == expected._repr_base
        assert result == expected

    # 9100
    index = pd.DatetimeIndex(['2014-12-01 03:32:39.987000-08:00',
                              '2014-12-01 04:12:34.987000-08:00'])
    for i, ts in enumerate(index):
        result = ts
        expected = index[i]
        assert result._repr_base == expected._repr_base
        assert result == expected
def test_categorical_series_repr_datetime_ordered(self):
    idx = date_range('2011-01-01 09:00', freq='H', periods=5)
    s = Series(Categorical(idx, ordered=True))
    exp = """0   2011-01-01 09:00:00
1   2011-01-01 10:00:00
2   2011-01-01 11:00:00
3   2011-01-01 12:00:00
4   2011-01-01 13:00:00
dtype: category
Categories (5, datetime64[ns]): [2011-01-01 09:00:00 < 2011-01-01 10:00:00 < 2011-01-01 11:00:00 <
                                 2011-01-01 12:00:00 < 2011-01-01 13:00:00]"""  # noqa

    assert repr(s) == exp

    idx = date_range('2011-01-01 09:00', freq='H', periods=5,
                     tz='US/Eastern')
    s = Series(Categorical(idx, ordered=True))
    exp = """0   2011-01-01 09:00:00-05:00
1   2011-01-01 10:00:00-05:00
2   2011-01-01 11:00:00-05:00
3   2011-01-01 12:00:00-05:00
4   2011-01-01 13:00:00-05:00
dtype: category
Categories (5, datetime64[ns, US/Eastern]): [2011-01-01 09:00:00-05:00 < 2011-01-01 10:00:00-05:00 <
                                             2011-01-01 11:00:00-05:00 < 2011-01-01 12:00:00-05:00 <
                                             2011-01-01 13:00:00-05:00]"""  # noqa

    assert repr(s) == exp
def test_join_aware(self):
    rng = date_range('1/1/2011', periods=10, freq='H')
    ts = Series(np.random.randn(len(rng)), index=rng)

    ts_utc = ts.tz_localize('utc')

    self.assertRaises(Exception, ts.__add__, ts_utc)
    self.assertRaises(Exception, ts_utc.__add__, ts)

    test1 = DataFrame(np.zeros((6, 3)),
                      index=date_range("2012-11-15 00:00:00", periods=6,
                                       freq="100L", tz="US/Central"))
    test2 = DataFrame(np.zeros((3, 3)),
                      index=date_range("2012-11-15 00:00:00", periods=3,
                                       freq="250L", tz="US/Central"),
                      columns=range(3, 6))

    result = test1.join(test2, how='outer')
    ex_index = test1.index.union(test2.index)

    self.assertTrue(result.index.equals(ex_index))
    self.assertTrue(result.index.tz.zone == 'US/Central')

    # non-overlapping
    rng = date_range("2012-11-15 00:00:00", periods=6, freq="H",
                     tz="US/Central")
    rng2 = date_range("2012-11-15 12:00:00", periods=6, freq="H",
                      tz="US/Eastern")

    result = rng.union(rng2)
    self.assertTrue(result.tz.zone == 'UTC')
def test_ufunc_coercions(self):
    idx = date_range('2011-01-01', periods=3, freq='2D', name='x')

    delta = np.timedelta64(1, 'D')
    for result in [idx + delta, np.add(idx, delta)]:
        assert isinstance(result, DatetimeIndex)
        exp = date_range('2011-01-02', periods=3, freq='2D', name='x')
        tm.assert_index_equal(result, exp)
        assert result.freq == '2D'

    for result in [idx - delta, np.subtract(idx, delta)]:
        assert isinstance(result, DatetimeIndex)
        exp = date_range('2010-12-31', periods=3, freq='2D', name='x')
        tm.assert_index_equal(result, exp)
        assert result.freq == '2D'

    delta = np.array([np.timedelta64(1, 'D'), np.timedelta64(2, 'D'),
                      np.timedelta64(3, 'D')])
    for result in [idx + delta, np.add(idx, delta)]:
        assert isinstance(result, DatetimeIndex)
        exp = DatetimeIndex(['2011-01-02', '2011-01-05', '2011-01-08'],
                            freq='3D', name='x')
        tm.assert_index_equal(result, exp)
        assert result.freq == '3D'

    for result in [idx - delta, np.subtract(idx, delta)]:
        assert isinstance(result, DatetimeIndex)
        exp = DatetimeIndex(['2010-12-31', '2011-01-01', '2011-01-02'],
                            freq='D', name='x')
        tm.assert_index_equal(result, exp)
        assert result.freq == 'D'
def test_date_range_localize(self):
    rng = date_range('3/11/2012 03:00', periods=15, freq='H',
                     tz='US/Eastern')
    rng2 = DatetimeIndex(['3/11/2012 03:00', '3/11/2012 04:00'],
                         tz='US/Eastern')
    rng3 = date_range('3/11/2012 03:00', periods=15, freq='H')
    rng3 = rng3.tz_localize('US/Eastern')

    self.assert_(rng.equals(rng3))

    # DST transition time
    val = rng[0]
    exp = Timestamp('3/11/2012 03:00', tz='US/Eastern')

    self.assertEquals(val.hour, 3)
    self.assertEquals(exp.hour, 3)
    self.assertEquals(val, exp)  # same UTC value
    self.assert_(rng[:2].equals(rng2))

    # Right before the DST transition
    rng = date_range('3/11/2012 00:00', periods=2, freq='H',
                     tz='US/Eastern')
    rng2 = DatetimeIndex(['3/11/2012 00:00', '3/11/2012 01:00'],
                         tz='US/Eastern')
    self.assert_(rng.equals(rng2))
    exp = Timestamp('3/11/2012 00:00', tz='US/Eastern')
    self.assertEquals(exp.hour, 0)
    self.assertEquals(rng[0], exp)
    exp = Timestamp('3/11/2012 01:00', tz='US/Eastern')
    self.assertEquals(exp.hour, 1)
    self.assertEquals(rng[1], exp)

    rng = date_range('3/11/2012 00:00', periods=10, freq='H',
                     tz='US/Eastern')
    self.assert_(rng[2].hour == 3)
def test_tz_localize_naive(self):
    rng = date_range('1/1/2011', periods=100, freq='H')

    conv = rng.tz_localize('US/Pacific')
    exp = date_range('1/1/2011', periods=100, freq='H', tz='US/Pacific')

    self.assert_(conv.equals(exp))
def test_categorical_repr_datetime_ordered(self):
    idx = date_range('2011-01-01 09:00', freq='H', periods=5)
    c = Categorical(idx, ordered=True)
    exp = """[2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, 2011-01-01 12:00:00, 2011-01-01 13:00:00]
Categories (5, datetime64[ns]): [2011-01-01 09:00:00 < 2011-01-01 10:00:00 < 2011-01-01 11:00:00 <
                                 2011-01-01 12:00:00 < 2011-01-01 13:00:00]"""  # noqa

    assert repr(c) == exp

    c = Categorical(idx.append(idx), categories=idx, ordered=True)
    exp = """[2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, 2011-01-01 12:00:00, 2011-01-01 13:00:00, 2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, 2011-01-01 12:00:00, 2011-01-01 13:00:00]
Categories (5, datetime64[ns]): [2011-01-01 09:00:00 < 2011-01-01 10:00:00 < 2011-01-01 11:00:00 <
                                 2011-01-01 12:00:00 < 2011-01-01 13:00:00]"""  # noqa

    assert repr(c) == exp

    idx = date_range('2011-01-01 09:00', freq='H', periods=5,
                     tz='US/Eastern')
    c = Categorical(idx, ordered=True)
    exp = """[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00]
Categories (5, datetime64[ns, US/Eastern]): [2011-01-01 09:00:00-05:00 < 2011-01-01 10:00:00-05:00 <
                                             2011-01-01 11:00:00-05:00 < 2011-01-01 12:00:00-05:00 <
                                             2011-01-01 13:00:00-05:00]"""  # noqa

    assert repr(c) == exp

    c = Categorical(idx.append(idx), categories=idx, ordered=True)
    exp = """[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00, 2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00]
Categories (5, datetime64[ns, US/Eastern]): [2011-01-01 09:00:00-05:00 < 2011-01-01 10:00:00-05:00 <
                                             2011-01-01 11:00:00-05:00 < 2011-01-01 12:00:00-05:00 <
                                             2011-01-01 13:00:00-05:00]"""  # noqa

    assert repr(c) == exp
def test_categorical_index_repr_datetime_ordered(self):
    idx = date_range('2011-01-01 09:00', freq='H', periods=5)
    i = CategoricalIndex(Categorical(idx, ordered=True))
    exp = """CategoricalIndex(['2011-01-01 09:00:00', '2011-01-01 10:00:00',
                  '2011-01-01 11:00:00', '2011-01-01 12:00:00',
                  '2011-01-01 13:00:00'],
                 categories=[2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, 2011-01-01 12:00:00, 2011-01-01 13:00:00], ordered=True, dtype='category')"""  # noqa

    assert repr(i) == exp

    idx = date_range('2011-01-01 09:00', freq='H', periods=5,
                     tz='US/Eastern')
    i = CategoricalIndex(Categorical(idx, ordered=True))
    exp = """CategoricalIndex(['2011-01-01 09:00:00-05:00', '2011-01-01 10:00:00-05:00',
                  '2011-01-01 11:00:00-05:00', '2011-01-01 12:00:00-05:00',
                  '2011-01-01 13:00:00-05:00'],
                 categories=[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00], ordered=True, dtype='category')"""  # noqa

    assert repr(i) == exp

    i = CategoricalIndex(Categorical(idx.append(idx), ordered=True))
    exp = """CategoricalIndex(['2011-01-01 09:00:00-05:00', '2011-01-01 10:00:00-05:00',
                  '2011-01-01 11:00:00-05:00', '2011-01-01 12:00:00-05:00',
                  '2011-01-01 13:00:00-05:00', '2011-01-01 09:00:00-05:00',
                  '2011-01-01 10:00:00-05:00', '2011-01-01 11:00:00-05:00',
                  '2011-01-01 12:00:00-05:00', '2011-01-01 13:00:00-05:00'],
                 categories=[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00], ordered=True, dtype='category')"""  # noqa

    assert repr(i) == exp
def test_constructor_with_datetimelike(self):
    # 12077
    # constructor with a datetimelike and NaT

    for dtl in [date_range('1995-01-01 00:00:00', periods=5, freq='s'),
                date_range('1995-01-01 00:00:00', periods=5, freq='s',
                           tz='US/Eastern'),
                timedelta_range('1 day', periods=5, freq='s')]:

        s = Series(dtl)
        c = Categorical(s)

        expected = type(dtl)(s)
        expected.freq = None

        tm.assert_index_equal(c.categories, expected)
        tm.assert_numpy_array_equal(c.codes, np.arange(5, dtype='int8'))

        # with NaT
        s2 = s.copy()
        s2.iloc[-1] = NaT
        c = Categorical(s2)

        expected = type(dtl)(s2.dropna())
        expected.freq = None

        tm.assert_index_equal(c.categories, expected)

        exp = np.array([0, 1, 2, 3, -1], dtype=np.int8)
        tm.assert_numpy_array_equal(c.codes, exp)

        result = repr(c)
        assert 'NaT' in result
def test_date_range_businesshour(self):
    idx = DatetimeIndex(['2014-07-04 09:00', '2014-07-04 10:00',
                         '2014-07-04 11:00', '2014-07-04 12:00',
                         '2014-07-04 13:00', '2014-07-04 14:00',
                         '2014-07-04 15:00', '2014-07-04 16:00'],
                        freq='BH')
    rng = date_range('2014-07-04 09:00', '2014-07-04 16:00', freq='BH')
    tm.assert_index_equal(idx, rng)

    idx = DatetimeIndex(['2014-07-04 16:00', '2014-07-07 09:00'],
                        freq='BH')
    rng = date_range('2014-07-04 16:00', '2014-07-07 09:00', freq='BH')
    tm.assert_index_equal(idx, rng)

    idx = DatetimeIndex(['2014-07-04 09:00', '2014-07-04 10:00',
                         '2014-07-04 11:00', '2014-07-04 12:00',
                         '2014-07-04 13:00', '2014-07-04 14:00',
                         '2014-07-04 15:00', '2014-07-04 16:00',
                         '2014-07-07 09:00', '2014-07-07 10:00',
                         '2014-07-07 11:00', '2014-07-07 12:00',
                         '2014-07-07 13:00', '2014-07-07 14:00',
                         '2014-07-07 15:00', '2014-07-07 16:00',
                         '2014-07-08 09:00', '2014-07-08 10:00',
                         '2014-07-08 11:00', '2014-07-08 12:00',
                         '2014-07-08 13:00', '2014-07-08 14:00',
                         '2014-07-08 15:00', '2014-07-08 16:00'],
                        freq='BH')
    rng = date_range('2014-07-04 09:00', '2014-07-08 16:00', freq='BH')
    tm.assert_index_equal(idx, rng)
def test_select_dtypes_include_exclude_mixed_scalars_lists(self):
    df = DataFrame({'a': list('abc'),
                    'b': list(range(1, 4)),
                    'c': np.arange(3, 6).astype('u1'),
                    'd': np.arange(4.0, 7.0, dtype='float64'),
                    'e': [True, False, True],
                    'f': pd.Categorical(list('abc')),
                    'g': pd.date_range('20130101', periods=3),
                    'h': pd.date_range('20130101', periods=3,
                                       tz='US/Eastern'),
                    'i': pd.date_range('20130101', periods=3, tz='CET'),
                    'j': pd.period_range('2013-01', periods=3, freq='M'),
                    'k': pd.timedelta_range('1 day', periods=3)})

    ri = df.select_dtypes(include=np.number,
                          exclude=['floating', 'timedelta'])
    ei = df[['b', 'c']]
    assert_frame_equal(ri, ei)

    ri = df.select_dtypes(include=[np.number, 'category'],
                          exclude='floating')
    ei = df[['b', 'c', 'f', 'k']]
    assert_frame_equal(ri, ei)
def test_astype(self):
    # astype
    expected = np.array([[Timestamp('2013-01-01 00:00:00'),
                          Timestamp('2013-01-02 00:00:00'),
                          Timestamp('2013-01-03 00:00:00')],
                         [Timestamp('2013-01-01 00:00:00-0500',
                                    tz='US/Eastern'),
                          pd.NaT,
                          Timestamp('2013-01-03 00:00:00-0500',
                                    tz='US/Eastern')],
                         [Timestamp('2013-01-01 00:00:00+0100', tz='CET'),
                          pd.NaT,
                          Timestamp('2013-01-03 00:00:00+0100',
                                    tz='CET')]],
                        dtype=object).T
    result = self.tzframe.astype(object)
    assert_frame_equal(result, DataFrame(
        expected, index=self.tzframe.index, columns=self.tzframe.columns))

    result = self.tzframe.astype('datetime64[ns]')
    expected = DataFrame({'A': date_range('20130101', periods=3),
                          'B': (date_range('20130101', periods=3,
                                           tz='US/Eastern')
                                .tz_convert('UTC')
                                .tz_localize(None)),
                          'C': (date_range('20130101', periods=3,
                                           tz='CET')
                                .tz_convert('UTC')
                                .tz_localize(None))})
    expected.iloc[1, 1] = pd.NaT
    expected.iloc[1, 2] = pd.NaT
    assert_frame_equal(result, expected)
def test_at_time_frame(self):
    rng = date_range('1/1/2000', '1/5/2000', freq='5min')
    ts = DataFrame(np.random.randn(len(rng), 2), index=rng)
    rs = ts.at_time(rng[1])
    self.assertTrue((rs.index.hour == rng[1].hour).all())
    self.assertTrue((rs.index.minute == rng[1].minute).all())
    self.assertTrue((rs.index.second == rng[1].second).all())

    result = ts.at_time('9:30')
    expected = ts.at_time(time(9, 30))
    assert_frame_equal(result, expected)

    result = ts.loc[time(9, 30)]
    expected = ts.loc[(rng.hour == 9) & (rng.minute == 30)]
    assert_frame_equal(result, expected)

    # midnight, everything
    rng = date_range('1/1/2000', '1/31/2000')
    ts = DataFrame(np.random.randn(len(rng), 3), index=rng)

    result = ts.at_time(time(0, 0))
    assert_frame_equal(result, ts)

    # time doesn't exist
    rng = date_range('1/1/2012', freq='23Min', periods=384)
    ts = DataFrame(np.random.randn(len(rng), 2), rng)
    rs = ts.at_time('16:00')
    self.assertEqual(len(rs), 0)
def test_date_range_nat(self):
    # GH#11587
    msg = "Neither `start` nor `end` can be NaT"
    with pytest.raises(ValueError, match=msg):
        date_range(start='2016-01-01', end=pd.NaT, freq='D')
    with pytest.raises(ValueError, match=msg):
        date_range(start=pd.NaT, end='2016-01-01', freq='D')
def test_select_x():
    assert utils.select_x(None) is None

    def _check(d, expected):
        x = utils.select_x(d)
        assert x == expected

    data = dict(col1=[1.0, 2.0, 3.0],                             # Q
                col2=['A', 'B', 'C'],                             # N
                col3=pd.date_range('2012', periods=3, freq='A'))  # T
    _check(data, 'col3')

    data = dict(col1=[1.0, 2.0, 3.0],  # Q
                col2=['A', 'B', 'C'])  # N
    _check(data, 'col2')

    data = dict(col1=[1.0, 2.0, 3.0])  # Q
    _check(data, 'col1')

    # Custom order
    data = dict(col1=[1.0, 2.0, 3.0],                             # Q
                col2=['A', 'B', 'C'],                             # N
                col3=pd.date_range('2012', periods=3, freq='A'),  # T
                col4=pd.date_range('2012', periods=3, freq='A'))  # T
    selected_x = utils.select_x(data, ['N', 'T', 'Q', 'O'])
    assert selected_x == "col2"

    # Len < 1
    assert utils.select_x(dict()) is None
def test_CalendarDay_range_with_dst_crossing(self):
    # GH 20596
    result = date_range('2018-10-23', '2018-11-06', freq='7CD',
                        tz='Europe/Paris')
    expected = date_range('2018-10-23', '2018-11-06',
                          freq=pd.DateOffset(days=7), tz='Europe/Paris')
    tm.assert_index_equal(result, expected)
def test_is_datetime_dtypes(self):
    ts = pd.date_range('20130101', periods=3)
    tsa = pd.date_range('20130101', periods=3, tz='US/Eastern')

    self.assertTrue(is_datetime64_dtype('datetime64'))
    self.assertTrue(is_datetime64_dtype('datetime64[ns]'))
    self.assertTrue(is_datetime64_dtype(ts))
    self.assertFalse(is_datetime64_dtype(tsa))

    self.assertFalse(is_datetime64_ns_dtype('datetime64'))
    self.assertTrue(is_datetime64_ns_dtype('datetime64[ns]'))
    self.assertTrue(is_datetime64_ns_dtype(ts))
    self.assertTrue(is_datetime64_ns_dtype(tsa))

    self.assertTrue(is_datetime64_any_dtype('datetime64'))
    self.assertTrue(is_datetime64_any_dtype('datetime64[ns]'))
    self.assertTrue(is_datetime64_any_dtype(ts))
    self.assertTrue(is_datetime64_any_dtype(tsa))

    self.assertFalse(is_datetime64tz_dtype('datetime64'))
    self.assertFalse(is_datetime64tz_dtype('datetime64[ns]'))
    self.assertFalse(is_datetime64tz_dtype(ts))
    self.assertTrue(is_datetime64tz_dtype(tsa))

    for tz in ['US/Eastern', 'UTC']:
        dtype = 'datetime64[ns, {}]'.format(tz)
        self.assertFalse(is_datetime64_dtype(dtype))
        self.assertTrue(is_datetime64tz_dtype(dtype))
        self.assertTrue(is_datetime64_ns_dtype(dtype))
        self.assertTrue(is_datetime64_any_dtype(dtype))
def setup(self):
    N = 25000
    index = tm.makeStringIndex(N)
    self.df = DataFrame({'float1': np.random.randn(N),
                         'float2': np.random.randn(N)},
                        index=index)
    self.df_mixed = DataFrame({'float1': np.random.randn(N),
                               'float2': np.random.randn(N),
                               'string1': ['foo'] * N,
                               'bool1': [True] * N,
                               'int1': np.random.randint(0, N, size=N)},
                              index=index)
    self.df_wide = DataFrame(np.random.randn(N, 100))
    self.start_wide = self.df_wide.index[10000]
    self.stop_wide = self.df_wide.index[15000]
    self.df2 = DataFrame({'float1': np.random.randn(N),
                          'float2': np.random.randn(N)},
                         index=date_range('1/1/2000', periods=N))
    self.start = self.df2.index[10000]
    self.stop = self.df2.index[15000]
    self.df_wide2 = DataFrame(np.random.randn(N, 100),
                              index=date_range('1/1/2000', periods=N))
    self.df_dc = DataFrame(np.random.randn(N, 10),
                           columns=['C%03d' % i for i in range(10)])

    self.fname = '__test__.h5'

    self.store = HDFStore(self.fname)
    self.store.put('fixed', self.df)
    self.store.put('fixed_mixed', self.df_mixed)
    self.store.append('table', self.df2)
    self.store.append('table_mixed', self.df_mixed)
    self.store.append('table_wide', self.df_wide)
    self.store.append('table_wide2', self.df_wide2)
def test_with_tz_ambiguous_times(self):
    tz = pytz.timezone('US/Eastern')

    rng = bdate_range(datetime(2009, 1, 1), datetime(2010, 1, 1))

    # regular no problem
    self.assert_(rng.tz_validate())

    # March 13, 2011, spring forward, skip from 2 AM to 3 AM
    dr = date_range(datetime(2011, 3, 13, 1, 30), periods=3,
                    freq=datetools.Hour())
    self.assertRaises(pytz.AmbiguousTimeError, dr.tz_localize, tz)

    # after dst transition, it works
    dr = date_range(datetime(2011, 3, 13, 3, 30), periods=3,
                    freq=datetools.Hour(), tz=tz)

    # November 6, 2011, fall back, repeat 2 AM hour
    dr = date_range(datetime(2011, 11, 6, 1, 30), periods=3,
                    freq=datetools.Hour())
    self.assertRaises(pytz.AmbiguousTimeError, dr.tz_localize, tz)

    # UTC is OK
    dr = date_range(datetime(2011, 3, 13), periods=48,
                    freq=datetools.Minute(30), tz=pytz.utc)
def setUp(self):
    date_index = date_range(datetime(2009, 12, 11), periods=3,
                            freq=datetools.bday)
    ts = Series([3, 1, 4], index=date_index)
    self.TS1 = ts

    date_index = date_range(datetime(2009, 12, 11), periods=5,
                            freq=datetools.bday)
    ts = Series([1, 5, 9, 2, 6], index=date_index)
    self.TS2 = ts

    date_index = date_range(datetime(2009, 12, 11), periods=3,
                            freq=datetools.bday)
    ts = Series([5, np.nan, 3], index=date_index)
    self.TS3 = ts

    date_index = date_range(datetime(2009, 12, 11), periods=5,
                            freq=datetools.bday)
    ts = Series([np.nan, 5, 8, 9, 7], index=date_index)
    self.TS4 = ts

    data = {'x1': self.TS2, 'x2': self.TS4}
    self.DF1 = DataFrame(data=data)

    data = {'x1': self.TS2, 'x2': self.TS4}
    self.DICT1 = data
def test_select_dtypes_exclude_using_scalars(self):
    df = DataFrame({'a': list('abc'),
                    'b': list(range(1, 4)),
                    'c': np.arange(3, 6).astype('u1'),
                    'd': np.arange(4.0, 7.0, dtype='float64'),
                    'e': [True, False, True],
                    'f': pd.Categorical(list('abc')),
                    'g': pd.date_range('20130101', periods=3),
                    'h': pd.date_range('20130101', periods=3,
                                       tz='US/Eastern'),
                    'i': pd.date_range('20130101', periods=3, tz='CET'),
                    'j': pd.period_range('2013-01', periods=3, freq='M'),
                    'k': pd.timedelta_range('1 day', periods=3)})

    ri = df.select_dtypes(exclude=np.number)
    ei = df[['a', 'e', 'f', 'g', 'h', 'i', 'j']]
    assert_frame_equal(ri, ei)

    ri = df.select_dtypes(exclude='category')
    ei = df[['a', 'b', 'c', 'd', 'e', 'g', 'h', 'i', 'j', 'k']]
    assert_frame_equal(ri, ei)

    pytest.raises(NotImplementedError,
                  lambda: df.select_dtypes(exclude='period'))
def test_TimeSeries_deprecation(self):
    # deprecation TimeSeries, #10890
    with tm.assert_produces_warning(FutureWarning):
        pd.SparseTimeSeries(1, index=pd.date_range('20130101', periods=3))
def ugh(exp):
    file = open_ncfile('../../' + exp + '/landCoverFrac/LPX-Bern_' + exp +
                       '_landCoverFrac_original.nc')

    lat = file.variables['latitude'][92:160]
    lon = file.variables['longitude'][584:668]
    time = pd.date_range(start='1/1/2001', end='1/01/2019', freq='M')

    # read the 20 PFT slices of landCoverFrac (lon, lat, time, pft);
    # vegs[0] is TrBE and seeds the dataset, the rest are added in the
    # loop below
    vegs = [file.variables['landCoverFrac'][584:668, 92:160, 3612:, i]
            for i in range(20)]

    pft_short = ['TrBR', 'TeNE', 'TeBE', 'TeBS', 'BNE', 'BNS', 'BBS',
                 'C3G', 'C4G', 'PeatGr', 'PeatSM', 'PeatTrBE', 'PeatTrBR',
                 'PeatHerb', 'C3Crop', 'C4Crop', 'C3Past', 'C4Past',
                 'UrbanBare']
    pfts = ['Tropical broad raingreen', 'Temperate needle evergreen',
            'Temperate broad evergreen', 'Temperate broad summergreen',
            'Boreal needle evergreen', 'Boreal needle summergreen',
            'Boreal broad summergreen', 'C3 herbaceous', 'C4 herbaceous',
            'Peat graminoid', 'Peat sphagnum moss',
            'Peat flood tolerant tropical broad evergreen',
            'Peat flood tolerant tropical broad raingreen',
            'Peat flood tolerant herbaceous', 'Cropland C3 herbaceous',
            'Cropland C4 herbaceous', 'Pasture C3 herbaceous',
            'Pasture C4 herbaceous', 'Urban Bare']

    # create dataset
    ds = xr.Dataset(
        {'TrBE': xr.DataArray(data=np.transpose(vegs[0]),
                              dims=['time', 'latitude', 'longitude'],
                              coords=[time, lat, lon],
                              attrs={'long_name': 'Tropical broad evergreen',
                                     'units': '%'})},
        attrs={'Conventions': 'CF-1.6',
               'Institution': 'Climate and Environmental Physics, '
                              'University of Bern',
               'Source': 'Extracted from LPX-Bern_S3_01 at '
                         '2019-08-13T17:58:26.478921',
               'Title': 'Fractional Land Cover of PFT output from '
                        'LPX-Bern for GCP201',
               'Contact': '*****@*****.**'})

    for v, ps, p in zip(vegs[1:], pft_short, pfts):
        ds[ps] = xr.DataArray(data=np.transpose(v),
                              dims=['time', 'latitude', 'longitude'],
                              coords=[time, lat, lon],
                              attrs={'long_name': p, 'units': '%'})

    ds['latitude'].attrs = {'units': 'degrees_north',
                            'long_name': 'latitude',
                            'standard_name': 'latitude', 'axis': 'Y'}
    ds['longitude'].attrs = {'units': 'degrees_east',
                             'long_name': 'longitude',
                             'standard_name': 'longitude', 'axis': 'X'}

    # coordinates as doubles, every PFT variable as float32
    encoding = {'latitude': {'dtype': 'double'},
                'longitude': {'dtype': 'double'},
                'time': {'dtype': 'double'}}
    encoding.update({name: {'dtype': 'float32'}
                     for name in ['TrBE'] + pft_short})
    ds.to_netcdf('LPX-Bern_' + exp + '_landCoverFrac.nc', encoding=encoding)
def lag():
    os.chdir(dir_in)

    # get names
    tg_list_name = sorted(os.listdir())

    x = 91
    y = 92

    for tg in range(x, y):
        os.chdir(dir_in)

        tg_name = tg_list_name[tg]
        print(tg_name, '\n')

        pred = pd.read_csv(tg_name)

        # create a daily time series - date_range
        # get only the ymd of the start and end times
        start_time = pred['date'][0].split(' ')[0]
        end_time = pred['date'].iloc[-1].split(' ')[0]
        print(start_time, ' - ', end_time, '\n')

        date_range = pd.date_range(start_time, end_time, freq='D')

        # defining time changing lambda functions
        time_str = lambda x: str(x)
        time_converted_str = pd.DataFrame(map(time_str, date_range),
                                          columns=['date'])
        time_converted_stamp = pd.DataFrame(date_range,
                                            columns=['timestamp'])

        """
        first prepare the time lagging dataframes, then use the merge
        function to merge the original predictor with the lagging
        dataframes
        """

        # prepare lagged time series for time only
        # note here that since MERRA has 3hrly data,
        # the lag_hrs is increased from 6 (eraint) to 31 (MERRA)
        time_lagged = pd.DataFrame()
        lag_hrs = list(range(0, 31))
        for lag in lag_hrs:
            lag_name = 'lag' + str(lag)
            lam_delta = lambda x: str(x - dt.timedelta(hours=lag))
            lag_new = pd.DataFrame(map(lam_delta,
                                       time_converted_stamp['timestamp']),
                                   columns=[lag_name])
            time_lagged = pd.concat([time_lagged, lag_new], axis=1)

        # dataframe that contains all lagged time series (just time)
        time_all = pd.concat([time_converted_str, time_lagged], axis=1)

        pred_lagged = pd.DataFrame()
        # loop through the lagged time series
        for ii in range(1, time_all.shape[1]):
            print(time_all.columns[ii])
            # extracting the corresponding lagged time series
            lag_ts = pd.DataFrame(time_all.iloc[:, ii])
            lag_ts.columns = ['date']
            # merge the selected lagged time with the predictor on "date"
            pred_new = pd.merge(pred, lag_ts, on=['date'], how='right')
            pred_new.drop('Unnamed: 0', axis=1, inplace=True)

            # sometimes nan values go to the bottom of the dataframe
            # sort df by date -> reset the index -> remove old index
            pred_new.sort_values(by='date', inplace=True)
            pred_new.reset_index(inplace=True)
            pred_new.drop('index', axis=1, inplace=True)

            # concatenate lagged dataframe
            if ii == 1:
                pred_lagged = pred_new
            else:
                pred_lagged = pd.concat([pred_lagged,
                                         pred_new.iloc[:, 1:]], axis=1)

        # cd to saving directory
        os.chdir(dir_out)
        pred_lagged.to_csv(tg_name)
        os.chdir(dir_in)
def obtain_data(config_file_path, metadata_file_path=None):
    """
    Uses read_config() to acquire a full dictionary of the config file and
    then uses the values contained within it to direct how data is processed
    and what variables are obtained. If a metadata file is provided, the
    config file will still be used for data organization, but the metadata
    will be pulled from the metadata file.

    Args:
        config_file_path : string of path to config file, should work with
            absolute or relative path
        metadata_file_path : string of path to metadata file if provided

    Returns:
        extracted_data : pandas dataframe of entire dataset, with the
            variables being organized into columns
        col_df : pandas series of what variables are stored in what columns,
            used to track which vars are provided
        station_name : string of file, including path, that was provided to
            dataset
        log_file : string of log file, including path, that was provided to
            dataset
        station_lat : station latitude in decimal degrees
        station_elev : station elevation in meters
        anemom_height : height of anemometer in meters
        fill_value : value pulled from config file that indicates missing
            data in output file
        script_mode : boolean flag for if user wants to correct data or not
        gen_bokeh : boolean flag for if user wants to plot graphs or not
    """
    # Open config file
    validate_file(config_file_path, ['ini'])
    config_dict = read_config(config_file_path)
    print('\nSuccessfully opened config file at %s' % config_file_path)

    # Open metadata file
    # If a metadata file is provided we will open it and overwrite values in
    # config_dict with its values
    if metadata_file_path is not None:
        # Validate file to make sure it exists and is the right type
        validate_file(metadata_file_path, 'xlsx')
        metadata_df = pd.read_excel(metadata_file_path, sheet_name=0,
                                    index_col=0, engine='openpyxl',
                                    keep_default_na=True, na_filter=True,
                                    verbose=True)
        print('\nSuccessfully opened metadata file at %s'
              % metadata_file_path)

        current_row = metadata_df.run_count.ne(2).idxmax() - 1
        metadata_series = metadata_df.iloc[current_row]

        config_dict['data_file_path'] = metadata_series.input_path
        config_dict['station_latitude'] = metadata_series.latitude
        config_dict['station_longitude'] = metadata_series.longitude
        config_dict['station_elevation'] = metadata_series.elev_m
        config_dict['anemometer_height'] = metadata_series.anemom_height_m
        config_dict['corr_flag'] = metadata_series.run_count

        # split file string on extension
        (file_name, station_extension) = \
            os.path.splitext(config_dict['data_file_path'])

        # check to see if file is in a subdirectory in the same folder as
        # the script
        if '/' in file_name:
            (folder_path, delimiter, _station_name) = \
                file_name.rpartition('/')
        elif '\\' in file_name:
            (folder_path, delimiter, _station_name) = \
                file_name.rpartition('\\')
        else:
            folder_path = os.getcwd()

        # Add new keys to config_dict for directory and file information to
        # save files later on
        config_dict['station_name'] = str(metadata_series.id)
        config_dict['file_name'] = file_name
        config_dict['station_extension'] = station_extension
        config_dict['folder_path'] = folder_path
    else:
        # No metadata file was provided, use the path info of the data file
        # to construct path variables
        metadata_df = None
        metadata_series = None
        (file_name, station_extension) = \
            os.path.splitext(config_dict['data_file_path'])

        # check to see if file is in a subdirectory or by itself
        if '/' in file_name:
            (folder_path, delimiter, station_name) = \
                file_name.rpartition('/')
        elif '\\' in file_name:
            (folder_path, delimiter, station_name) = \
                file_name.rpartition('\\')
        else:
            station_name = file_name
            folder_path = os.getcwd()

        # Add new keys to config_dict for directory and file information to
        # save files later on
        config_dict['station_name'] = station_name
        config_dict['file_name'] = file_name
        config_dict['station_extension'] = station_extension
        config_dict['folder_path'] = folder_path

    # Check lines_of_header value; if 0 change it to None, if nonzero
    # subtract one from it
    if config_dict['lines_of_header'] == 0:
        config_dict['lines_of_header'] = None
    else:
        config_dict['lines_of_header'] = config_dict['lines_of_header'] - 1

    # Open data file
    validate_file(config_dict['data_file_path'], ['csv', 'xls', 'xlsx'])

    if station_extension == '.csv':  # csv file provided
        raw_data = pd.read_csv(config_dict['data_file_path'], delimiter=',',
                               header=config_dict['lines_of_header'],
                               index_col=None, engine='python',
                               skipfooter=config_dict['lines_of_footer'],
                               na_values=config_dict['missing_data_value'],
                               keep_default_na=True, na_filter=True,
                               verbose=True, skip_blank_lines=True)
    elif station_extension == '.xlsx':
        raw_data = pd.read_excel(config_dict['data_file_path'],
                                 sheet_name=0,
                                 header=config_dict['lines_of_header'],
                                 index_col=None, engine='openpyxl',
                                 skipfooter=config_dict['lines_of_footer'],
                                 na_values=config_dict['missing_data_value'],
                                 keep_default_na=True, na_filter=True,
                                 verbose=True)
    elif station_extension == '.xls':
        raw_data = pd.read_excel(config_dict['data_file_path'],
                                 sheet_name=0,
                                 header=config_dict['lines_of_header'],
                                 index_col=None, engine='xlrd',
                                 skipfooter=config_dict['lines_of_footer'],
                                 na_values=config_dict['missing_data_value'],
                                 keep_default_na=True, na_filter=True,
                                 verbose=True)
    else:
        # This script only handles csv and excel files. validate_file()
        # already catches this case
        raise IOError('\n\nProvided file was of type \'{}\' but script was '
                      'expecting type \'{}\'.'
                      .format(station_extension, ['csv', 'xls', 'xlsx']))

    print('\nSuccessfully opened data file at %s'
          % config_dict['data_file_path'])

    # Handle any network-specific oddities that may have slipped through
    # catch for whitespaces on agrimet
    raw_data = raw_data.replace(to_replace='NO RECORD ', value=np.nan)

    # check for the existence of 'correction_files' folder and if not
    # present make one
    if not os.path.exists(folder_path + '/correction_files'):
        os.makedirs(folder_path + '/correction_files')
        os.makedirs(folder_path + '/correction_files/before_graphs/')
        os.makedirs(folder_path + '/correction_files/after_graphs/')
        os.makedirs(folder_path + '/correction_files/histograms/')
    else:
        pass

    # Create log file for this new data file
    config_dict['log_file_path'] = config_dict['folder_path'] + \
        '/correction_files/' + config_dict['station_name'] + \
        '_changes_log' + '.txt'
    log.basicConfig()
    logger = open(config_dict['log_file_path'], 'w')
    logger.write('The raw data for %s has been successfully read in '
                 'at %s. \n \n' %
                 (config_dict['station_name'],
                  dt.datetime.now().strftime('%Y-%m-%d %H:%M:%S')))
    logger.close()
    print('\nSuccessfully created log file at %s.'
          % config_dict['log_file_path'])

    # Date handling, figures out the date format and extracts from string
    # if needed
    if config_dict['date_format'] == 1:
        # Date is provided as a string, expected format is MM/DD/YYYY;
        # time can be included as well.
        if config_dict['string_date_col'] != -1:
            data_date = np.array(
                raw_data.iloc[:, config_dict['string_date_col']])
            dt_date = pd.to_datetime(data_date, errors='raise')
            data_day = np.array(dt_date.day.astype('int'))
            data_month = np.array(dt_date.month.astype('int'))
            data_year = np.array(dt_date.year.astype('int'))
        else:
            # date format was provided as a string date but no string date
            # was given
            raise ValueError('Date format parameter indicated a string '
                             'date but none was provided')
    elif config_dict['date_format'] == 2:
        if (config_dict['month_col'] != -1 and config_dict['day_col'] != -1
                and config_dict['year_col'] != -1):
            data_month = np.array(
                raw_data.iloc[:, config_dict['month_col']].astype('int'))
            data_day = np.array(
                raw_data.iloc[:, config_dict['day_col']].astype('int'))
            data_year = np.array(
                raw_data.iloc[:, config_dict['year_col']].astype('int'))
        else:
            # date format was provided as separate columns but some were
            # missing
            raise ValueError('Date format parameter indicated separate '
                             'y/m/d columns but some or all were missing')
    elif config_dict['date_format'] == 3:
        # Date is pre-split between year column and DOY column
        if (config_dict['day_of_year_col'] != -1
                and config_dict['year_col'] != -1):
            data_doy = np.array(
                raw_data.iloc[:, config_dict['day_of_year_col']]
                .astype('int'))
            data_year = np.array(
                raw_data.iloc[:, config_dict['year_col']].astype('int'))
        else:
            # date format was provided as separate year and doy columns
            # but some were missing
            raise ValueError('Date format parameter indicated year and '
                             'DOY columns but some or all were missing')

        dt_date = pd.to_datetime(data_year * 1000 + data_doy,
                                 format='%Y%j', errors='raise')
        data_day = np.array(dt_date.day.astype('int'))
        data_month = np.array(dt_date.month.astype('int'))
        data_year = np.array(dt_date.year.astype('int'))
    else:
        # Script cannot function without a time variable
        raise ValueError('Parameter error: date_format is set to an '
                         'unexpected value.')

    #########################
    # Variable processing
    # Imports all weather variables, converts them into the correct units,
    # and filters them to remove impossible values
    (data_tmax, tmax_col) = process_variable(config_dict, raw_data,
                                             'maximum_temperature')
    (data_tmin, tmin_col) = process_variable(config_dict, raw_data,
                                             'minimum_temperature')
    (data_tavg, tavg_col) = process_variable(config_dict, raw_data,
                                             'average_temperature')
    (data_tdew, tdew_col) = process_variable(config_dict, raw_data,
                                             'dewpoint_temperature')
    (data_ea, ea_col) = process_variable(config_dict, raw_data,
                                         'vapor_pressure')
    (data_rhmax, rhmax_col) = process_variable(config_dict, raw_data,
                                               'maximum_relative_humidity')
    (data_rhmin, rhmin_col) = process_variable(config_dict, raw_data,
                                               'minimum_relative_humidity')
    (data_rhavg, rhavg_col) = process_variable(config_dict, raw_data,
                                               'average_relative_humidity')
    (data_rs, rs_col) = process_variable(config_dict, raw_data,
                                         'solar_radiation')
    (data_ws, ws_col) = process_variable(config_dict, raw_data,
                                         'wind_speed')
    (data_precip, precip_col) = process_variable(config_dict, raw_data,
                                                 'precipitation')

    # HPRCC data reports '0' for missing observations as well as a text
    # column, but this script doesn't interpret text columns, so instead we
    # see if both tmax and tmin have the same value (0, or -17.7778
    # depending on units) and if so mark that row as missing.
    # Realistically tmax should never equal tmin, so this is an okay check
    # to have in general.
    for i in range(len(data_tmax)):
        if data_tmax[i] == data_tmin[i]:
            data_tmax[i] = np.nan
            data_tmin[i] = np.nan
            data_tavg[i] = np.nan
            data_tdew[i] = np.nan
            data_ea[i] = np.nan
            data_rhmax[i] = np.nan
            data_rhmin[i] = np.nan
            data_rhavg[i] = np.nan
            data_rs[i] = np.nan
            data_ws[i] = np.nan
            data_precip[i] = np.nan
        else:
            pass

    #########################
    # Dataframe Construction
    # In this section we convert the individual numpy arrays into a pandas
    # dataframe to accomplish several goals:
    # 1. Make use of the pandas reindexing function to cover literal gaps
    #    in the dataset (not just missing values)
    # 2. Resample data to remove any duplicate records (same day appears
    #    twice in dataset, first instance is kept)
    # 3. Cleanly pass extracted data to the main script function

    # Create Datetime dataframe for reindexing
    datetime_df = pd.DataFrame({'year': data_year, 'month': data_month,
                                'day': data_day})
    datetime_df = pd.to_datetime(datetime_df)

    # Create a series of all dates in time series
    date_reindex = pd.date_range(datetime_df.iloc[0], datetime_df.iloc[-1])
    reindexing_additions = np.setdiff1d(np.array(date_reindex),
                                        np.array(datetime_df),
                                        assume_unique=False)

    logger = open(config_dict['log_file_path'], 'w')
    logger.write('The raw data file had %s missing date entries from its '
                 'time record. \n \n' % reindexing_additions.size)
    logger.close()
    print('\nSystem: The input data file had %s missing dates in its time '
          'record.' % reindexing_additions.size)

    # Create dataframe of data
    data_df = pd.DataFrame({'year': data_year, 'month': data_month,
                            'day': data_day, 'tavg': data_tavg,
                            'tmax': data_tmax, 'tmin': data_tmin,
                            'tdew': data_tdew, 'ea': data_ea,
                            'rhavg': data_rhavg, 'rhmax': data_rhmax,
                            'rhmin': data_rhmin, 'rs': data_rs,
                            'ws': data_ws, 'precip': data_precip},
                           index=datetime_df)

    # Create dataframe of column indices for weather variables, to track
    # which ones were provided vs calculated
    col_df = pd.Series({'tmax': tmax_col, 'tmin': tmin_col,
                        'tavg': tavg_col, 'tdew': tdew_col, 'ea': ea_col,
                        'rhmax': rhmax_col, 'rhmin': rhmin_col,
                        'rhavg': rhavg_col, 'rs': rs_col, 'ws': ws_col,
                        'precip': precip_col})

    # Check for the existence of duplicate indexes;
    # if found, since it cannot be determined which value is true, we
    # default to first instance and remove all following
    data_df = data_df[~data_df.index.duplicated(keep='first')]

    # Reindex data with filled date series in case there are gaps in the
    # data
    data_df = data_df.reindex(date_reindex, fill_value=np.nan)

    # Now replace M/D/Y columns with reindexed dates so there are no
    # missing days
    data_df.year = date_reindex.year
    data_df.month = date_reindex.month
    data_df.day = date_reindex.day

    return data_df, col_df, metadata_df, metadata_series, config_dict
def test_getitem_unrecognized_scalar():
    # GH#32684 a scalar key that is not recognized by lib.is_scalar

    # a series that might be produced via `frame.dtypes`
    ser = Series([1, 2], index=[np.dtype("O"), np.dtype("i8")])

    key = ser.index[1]

    result = ser[key]
    assert result == 2


@pytest.mark.parametrize(
    "index",
    [
        date_range("2014-01-01", periods=20, freq="MS"),
        period_range("2014-01", periods=20, freq="M"),
        timedelta_range("0", periods=20, freq="H"),
    ],
)
def test_slice_with_zero_step_raises(index):
    ts = Series(np.arange(20), index)

    with pytest.raises(ValueError, match="slice step cannot be zero"):
        ts[::0]
    with pytest.raises(ValueError, match="slice step cannot be zero"):
        ts.loc[::0]
    with pytest.raises(ValueError, match="slice step cannot be zero"):
        ts.iloc[::0]
def test_subexpr_datetime():
    data = pd.date_range(start='01/01/2010', end='01/04/2010',
                         freq='D').values
    s = symbol('s', discover(data))
    result = compute(s.truncate(days=2).day, data)
    expected = np.array([31, 2, 2, 4])
    np.testing.assert_array_equal(result, expected)
def index(self) -> pd.DatetimeIndex:
    if self._index is None:
        self._index = pd.date_range(self.start_date,
                                    periods=self.prediction_length,
                                    freq=self.freq)
    return self._index
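# A minimal, self-contained sketch of the lazily-cached index above; the
# WindowedForecast container and its fields are assumptions for
# illustration, not taken from the source.
import pandas as pd


class WindowedForecast:
    """Holds forecast metadata; builds its DatetimeIndex on first access."""

    def __init__(self, start_date, prediction_length, freq):
        self.start_date = start_date
        self.prediction_length = prediction_length
        self.freq = freq
        self._index = None  # cache filled lazily by `index`

    @property
    def index(self) -> pd.DatetimeIndex:
        if self._index is None:
            self._index = pd.date_range(self.start_date,
                                        periods=self.prediction_length,
                                        freq=self.freq)
        return self._index


fc = WindowedForecast(pd.Timestamp('2021-01-01'), 3, 'D')
print(fc.index)  # built once here, reused on later accesses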
import wooldridge as woo
import pandas as pd
import numpy as np
import statsmodels.api as sm
import patsy as pt

barium = woo.dataWoo('barium')
T = len(barium)

# monthly time series starting Feb. 1978:
barium.index = pd.date_range(start='1978-02', periods=T, freq='M')

# perform the Cochrane-Orcutt estimation (iterative procedure):
y, X = pt.dmatrices('np.log(chnimp) ~ np.log(chempi) + np.log(gas) +'
                    'np.log(rtwex) + befile6 + affile6 + afdec6',
                    data=barium, return_type='dataframe')
reg = sm.GLSAR(y, X)
CORC_results = reg.iterative_fit(maxiter=100)
table = pd.DataFrame({'b_CORC': CORC_results.params,
                      'se_CORC': CORC_results.bse})
print(f'reg.rho: {reg.rho}\n')
print(f'table: \n{table}\n')
def test_interp_datetime64(self):
    _skip_if_no_scipy()
    df = Series([1, np.nan, 3],
                index=date_range('1/1/2000', periods=3))
    result = df.interpolate(method='nearest')
    expected = Series([1, 1, 3],
                      index=date_range('1/1/2000', periods=3))
    assert_series_equal(result, expected)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Date    : 2018-09-14 21:57:25
# @Author  : cdl ([email protected])
# @Link    : https://github.com/cdlwhm1217096231/python3_spider
# @Version : $Id$

from pyecharts import Bar
import pandas as pd
import numpy as np

title = "bar chart"
index = pd.date_range('14/9/2018', periods=6, freq='M')
df1 = pd.DataFrame(np.random.randn(6), index=index)
df2 = pd.DataFrame(np.random.randn(6), index=index)

dtvalue1 = [i[0] for i in df1.values]
dtvalue2 = [i[0] for i in df2.values]
_index = [i for i in df1.index.format()]

bar = Bar(title, 'Profit and loss situation')
bar.add('profit', _index, dtvalue1)
bar.add('loss', _index, dtvalue2)
def add_datetime_index(self, start_date, feature_added_df: pd.DataFrame):
    # 48 half-hour steps per day over nine days; the frame must have
    # exactly 48 * 9 rows for the index assignment to succeed
    date_range = pd.date_range(start_date, periods=48 * 9, freq='30min')
    feature_added_df.index = date_range
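# A hedged usage sketch for add_datetime_index; the host class is not shown
# in the source, so a bare-bones stand-in is assumed here.
import numpy as np
import pandas as pd


class FeatureBuilder:  # hypothetical host class for the method above
    def add_datetime_index(self, start_date,
                           feature_added_df: pd.DataFrame):
        date_range = pd.date_range(start_date, periods=48 * 9,
                                   freq='30min')
        feature_added_df.index = date_range


df = pd.DataFrame({'load': np.zeros(48 * 9)})  # row count must match 48 * 9
FeatureBuilder().add_datetime_index('2021-01-01', df)
print(df.index[:3])  # half-hourly timestamps starting at midnight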
def test_equals(self):
    s1 = pd.Series([1, 2, 3], index=[0, 2, 1])
    s2 = s1.copy()
    self.assert_(s1.equals(s2))

    s1[1] = 99
    self.assert_(not s1.equals(s2))

    # NaNs compare as equal
    s1 = pd.Series([1, np.nan, 3, np.nan], index=[0, 2, 1, 3])
    s2 = s1.copy()
    self.assert_(s1.equals(s2))

    s2[0] = 9.9
    self.assert_(not s1.equals(s2))

    idx = MultiIndex.from_tuples([(0, 'a'), (1, 'b'), (2, 'c')])
    s1 = Series([1, 2, np.nan], index=idx)
    s2 = s1.copy()
    self.assert_(s1.equals(s2))

    # Add object dtype column with nans
    index = np.random.random(10)
    df1 = DataFrame(np.random.random(10,), index=index,
                    columns=['floats'])
    df1['text'] = 'the sky is so blue. we could use more chocolate.'.split()
    df1['start'] = date_range('2000-1-1', periods=10, freq='T')
    df1['end'] = date_range('2000-1-1', periods=10, freq='D')
    df1['diff'] = df1['end'] - df1['start']
    df1['bool'] = (np.arange(10) % 3 == 0)
    df1.ix[::2] = nan
    df2 = df1.copy()
    self.assert_(df1['text'].equals(df2['text']))
    self.assert_(df1['start'].equals(df2['start']))
    self.assert_(df1['end'].equals(df2['end']))
    self.assert_(df1['diff'].equals(df2['diff']))
    self.assert_(df1['bool'].equals(df2['bool']))
    self.assert_(df1.equals(df2))
    self.assert_(not df1.equals(object))

    # different dtype
    different = df1.copy()
    different['floats'] = different['floats'].astype('float32')
    self.assert_(not df1.equals(different))

    # different index
    different_index = -index
    different = df2.set_index(different_index)
    self.assert_(not df1.equals(different))

    # different columns
    different = df2.copy()
    different.columns = df2.columns[::-1]
    self.assert_(not df1.equals(different))

    # DatetimeIndex
    index = pd.date_range('2000-1-1', periods=10, freq='T')
    df1 = df1.set_index(index)
    df2 = df1.copy()
    self.assert_(df1.equals(df2))

    # MultiIndex
    df3 = df1.set_index(['text'], append=True)
    df2 = df1.set_index(['text'], append=True)
    self.assert_(df3.equals(df2))

    df2 = df1.set_index(['floats'], append=True)
    self.assert_(not df3.equals(df2))

    # NaN in index
    df3 = df1.set_index(['floats'], append=True)
    df2 = df1.set_index(['floats'], append=True)
    self.assert_(df3.equals(df2))
def gen_all_quarters(start: datetime.datetime, end: datetime.datetime):
    idx = pd.date_range(start=start, end=end, freq="Q")
    # dt_list = [dt.strftime("%Y-%m-%d %H:%M:%S") for dt in idx]
    dt_list = [dt.to_pydatetime() for dt in idx]
    return dt_list
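# Usage sketch for gen_all_quarters: pandas' "Q" alias yields quarter-end
# stamps, so the helper returns the last day of each quarter in the window.
import datetime

quarters = gen_all_quarters(datetime.datetime(2020, 1, 1),
                            datetime.datetime(2020, 12, 31))
print(quarters)
# [datetime.datetime(2020, 3, 31, 0, 0), datetime.datetime(2020, 6, 30, 0, 0),
#  datetime.datetime(2020, 9, 30, 0, 0), datetime.datetime(2020, 12, 31, 0, 0)]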
def main():
    # Calendar-Spread implementation example

    # default market environment
    market_env = MarketEnvironment()
    print(market_env)

    # options expirations
    T_short = "31-05-2020"
    T_long = "30-08-2020"

    # current underlying level
    S_t = market_env.get_S()

    # calendar-spread portfolio initialized (as empty portfolio)
    calendar_spread_ptf = Portfolio(name="Calendar Spread Strategy")
    print(calendar_spread_ptf)

    # T_long-call
    Vanilla_Call_long = PlainVanillaOption(market_env, T=T_long, K=S_t)
    print(Vanilla_Call_long)

    # T_short-call
    Vanilla_Call_short = PlainVanillaOption(market_env, T=T_short, K=S_t)
    print(Vanilla_Call_short)

    # creation of Calendar-Spread portfolio strategy
    calendar_spread_ptf.add_instrument(Vanilla_Call_long, 1)
    calendar_spread_ptf.add_instrument(Vanilla_Call_short, -1)
    print(calendar_spread_ptf)

    # portfolio plotter instance
    calendar_spread_ptf_plotter = PortfolioPlotter(calendar_spread_ptf)

    # valuation date of the portfolio
    valuation_date = calendar_spread_ptf.get_t()
    print(valuation_date)

    # select metrics to plot
    for plot_metrics in ["price", "PnL", "delta", "theta", "gamma",
                         "vega", "rho"]:

        plot_details_flag = True if plot_metrics == "price" else False

        # time-parameter as a date-range of 5 valuation dates between t
        # and T_short
        last_date = (T_short if plot_metrics in ["price", "PnL"]
                     else date_string_to_datetime_obj(T_short) -
                     pd.Timedelta(days=1))
        multiple_valuation_dates = pd.date_range(start=valuation_date,
                                                 end=last_date, periods=5)
        print(multiple_valuation_dates)

        # Calendar-Spread price plot
        calendar_spread_ptf_plotter.plot(t=last_date,
                                         plot_metrics=plot_metrics,
                                         plot_details=plot_details_flag)

        # Plot at multiple dates
        calendar_spread_ptf_plotter.plot(t=multiple_valuation_dates,
                                         plot_metrics=plot_metrics)

        # Surface plot
        calendar_spread_ptf_plotter.plot(t=multiple_valuation_dates,
                                         plot_metrics=plot_metrics,
                                         surf_plot=True)

        # Surface plot (rotate) - Underlying value side
        calendar_spread_ptf_plotter.plot(t=multiple_valuation_dates,
                                         plot_metrics=plot_metrics,
                                         surf_plot=True, view=(0, 180))

        # Price surface plot (rotate) - Date side
        calendar_spread_ptf_plotter.plot(t=multiple_valuation_dates,
                                         plot_metrics=plot_metrics,
                                         surf_plot=True, view=(0, -90))
def data_read(date_range, crd, utc_offset):
    # Number of ASOS observations per hour
    interval = 12
    freq = ['5min', 'H']

    # Adjust datetimes to timezone corresponding to location of interest
    date_range = pd.date_range(start=date_range[0], end=date_range[-1],
                               freq=freq[0])
    start_date, end_date = [date_range[0], date_range[-1]]

    # Generate strings from dates for comparison purposes
    date_str = [datetime.datetime.strftime(start_date, '%Y%m%d%H%M'),
                datetime.datetime.strftime(end_date, '%Y%m%d%H%M')]

    # Initialize DataFrame here to return nan DataFrame in case of failed
    # FTP connection
    df = pd.DataFrame(np.nan, index=date_range,
                      columns=['T_air', 'T_dew', 'u_r', 'p_air'])

    # Set up URL to appropriate data file
    data_url = wfile(date_str[0][0:6] + '.dat', crd)

    # Import data to DataFrame 'df'
    try:
        asos_data = pd.read_table(data_url, header=None)
    except:
        print('FTP connection failed. Exiting program...')
        sys.exit()

    ## Regex patterns for data mining through the .dat file(s)
    # Air temperature regex: string of 6 characters
    # "(0-9)(0-9)/(0-9)(0-9)" bounded by 2 spaces
    T_pattern = r'\s.?\d\d[+-/].?\d\d\s'
    # Wind speed regex: string of 6 characters "(0-9)(0-9)KT " bounded by
    # 2 numbers and a space
    # Note: This definition ignores gusts
    u_pattern = r"\s\d\d\d\d\d\D"
    # Note: This definition lets the gust become the effective wind speed
    # u_pattern = r"\d\d[K][T]\s\d"
    # Air pressure regex: string of 6 characters "SLP(0-9)(0-9)"
    p_pattern = r"[S][L][P]\d\d\d"

    # Iterate through all rows in ASOS data file. For dates in file that
    # are within date range, extract data.
    for row in asos_data.iloc[:, 0]:
        if datetime.datetime.strptime(row[13:23], '%Y%m%d%H') in df.index:
            # If temperature pattern is found, extract data.
            if re.findall(T_pattern, row):
                date = datetime.datetime.strptime(row[13:25],
                                                  '%Y%m%d%H%M')
                # Extract air temperature
                # ('M' prefix indicates a negative temperature)
                T_air_str = re.findall(T_pattern, row)[0]
                if T_air_str[1] == 'M':
                    df.loc[date, 'T_air'] = CtoK(-int(T_air_str[2:4]))
                else:
                    df.loc[date, 'T_air'] = CtoK(int(T_air_str[1:3]))
                # Extract dew point temperature
                # ('M' prefix indicates a negative temperature)
                if T_air_str[-4] == 'M':
                    df.loc[date, 'T_dew'] = CtoK(-int(T_air_str[-3:-1]))
                else:
                    df.loc[date, 'T_dew'] = CtoK(int(T_air_str[-3:-1]))
                # Extract wind speed
                if re.findall(u_pattern, row):
                    u_str = re.findall(u_pattern, row)[0]
                    df.loc[date, 'u_r'] = int(u_str[4:6])
                else:
                    df.loc[date, 'u_r'] = 0
                # Extract air pressure
                if re.findall(p_pattern, row):
                    # Convert p_str to pressure in hPa
                    p_temp = (1000 +
                              int(re.findall(p_pattern, row)[0][-3:]) / 10)
                    df.loc[date, 'p_air'] = int(p_temp)
                else:
                    df.loc[date, 'p_air'] = 1013.25

    # Average over all observations to produce hourly values, then
    # re-index to set dates to proper indices.
    df = pd.DataFrame(df.values.reshape(-1, interval,
                                        df.shape[1]).mean(1),
                      columns=df.columns)
    df['date'] = pd.date_range(start=date_range[0], end=date_range[-1],
                               freq=freq[1])
    df = df.set_index('date')

    # Delete ASOS data folder created locally
    shutil.rmtree(os.path.join(os.path.dirname(__file__),
                               data_url.split('/')[-2]))

    return df['u_r'], df['T_dew'], df['p_air']
# Dividing the dataset into train and test
y_train = np.array(df['Prediction'][:-15])
X_train = np.array(X[:-15])
X_test = np.array(X[-15:])

# Training the model
model = LinearRegression()
model.fit(X_train, y_train)
forecast_prediction = model.predict(X_test)  # prediction made

# Create a DataFrame of predicted prices; prepend the last observed close so
# the forecast curve connects to the historical curve in the plot
forecast_prediction = [df['Adj. Close'][-1]] + forecast_prediction.tolist()
Pred_data = {"Forecast": forecast_prediction}
dates = pd.date_range('20180327', periods=16)
Pred_df = pd.DataFrame(data=Pred_data, index=dates)
print("Done.\n")
print(Pred_df[-15:])  # display predicted prices

# For plotting the graph of Date vs Price
df['Adj. Close'].plot()
Pred_df['Forecast'].plot()
plt.legend(loc=4)
plt.title("Google Stock Price")
plt.xlabel('Date')
plt.ylabel('Price')
plt.show()  # display the graph to the user
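# The forecast index above hardcodes '20180327'; a sketch of deriving it from
# the data instead, assuming df has a DatetimeIndex (last_day is a stand-in
# for df.index[-1]):
import pandas as pd

last_day = pd.Timestamp('2018-03-26')
dates = pd.date_range(last_day, periods=16)  # last actual day + 15 forecast days
print(dates[0], dates[-1])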
def test_strftime(self):
    # GH 10086
    s = Series(date_range("20130101", periods=5))
    result = s.dt.strftime("%Y/%m/%d")
    expected = Series(
        ["2013/01/01", "2013/01/02", "2013/01/03", "2013/01/04", "2013/01/05"]
    )
    tm.assert_series_equal(result, expected)

    s = Series(date_range("2015-02-03 11:22:33.4567", periods=5))
    result = s.dt.strftime("%Y/%m/%d %H-%M-%S")
    expected = Series(
        [
            "2015/02/03 11-22-33",
            "2015/02/04 11-22-33",
            "2015/02/05 11-22-33",
            "2015/02/06 11-22-33",
            "2015/02/07 11-22-33",
        ]
    )
    tm.assert_series_equal(result, expected)

    s = Series(period_range("20130101", periods=5))
    result = s.dt.strftime("%Y/%m/%d")
    expected = Series(
        ["2013/01/01", "2013/01/02", "2013/01/03", "2013/01/04", "2013/01/05"]
    )
    tm.assert_series_equal(result, expected)

    s = Series(period_range("2015-02-03 11:22:33.4567", periods=5, freq="s"))
    result = s.dt.strftime("%Y/%m/%d %H-%M-%S")
    expected = Series(
        [
            "2015/02/03 11-22-33",
            "2015/02/03 11-22-34",
            "2015/02/03 11-22-35",
            "2015/02/03 11-22-36",
            "2015/02/03 11-22-37",
        ]
    )
    tm.assert_series_equal(result, expected)

    s = Series(date_range("20130101", periods=5))
    s.iloc[0] = pd.NaT
    result = s.dt.strftime("%Y/%m/%d")
    expected = Series(
        ["NaT", "2013/01/02", "2013/01/03", "2013/01/04", "2013/01/05"]
    )
    tm.assert_series_equal(result, expected)

    datetime_index = date_range("20150301", periods=5)
    result = datetime_index.strftime("%Y/%m/%d")
    expected = Index(
        ["2015/03/01", "2015/03/02", "2015/03/03", "2015/03/04", "2015/03/05"],
        dtype=np.object_,
    )
    # dtype may be S10 or U10 depending on python version
    tm.assert_index_equal(result, expected)

    period_index = period_range("20150301", periods=5)
    result = period_index.strftime("%Y/%m/%d")
    expected = Index(
        ["2015/03/01", "2015/03/02", "2015/03/03", "2015/03/04", "2015/03/05"],
        dtype="=U10",
    )
    tm.assert_index_equal(result, expected)

    s = Series([datetime(2013, 1, 1, 2, 32, 59), datetime(2013, 1, 2, 14, 32, 1)])
    result = s.dt.strftime("%Y-%m-%d %H:%M:%S")
    expected = Series(["2013-01-01 02:32:59", "2013-01-02 14:32:01"])
    tm.assert_series_equal(result, expected)

    s = Series(period_range("20130101", periods=4, freq="H"))
    result = s.dt.strftime("%Y/%m/%d %H:%M:%S")
    expected = Series(
        [
            "2013/01/01 00:00:00",
            "2013/01/01 01:00:00",
            "2013/01/01 02:00:00",
            "2013/01/01 03:00:00",
        ]
    )
    tm.assert_series_equal(result, expected)

    s = Series(period_range("20130101", periods=4, freq="L"))
    result = s.dt.strftime("%Y/%m/%d %H:%M:%S.%l")
    expected = Series(
        [
            "2013/01/01 00:00:00.000",
            "2013/01/01 00:00:00.001",
            "2013/01/01 00:00:00.002",
            "2013/01/01 00:00:00.003",
        ]
    )
    tm.assert_series_equal(result, expected)
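# Minimal illustration of the .dt.strftime behavior exercised above; in the
# pandas version this test targets, NaT formats to the string "NaT".
import pandas as pd

s = pd.Series(pd.date_range("2013-01-01", periods=3))
s.iloc[0] = pd.NaT
print(s.dt.strftime("%Y/%m/%d").tolist())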
# %%
download_dir = 'output.csv'  # where you want the file to be downloaded to
csv_file = open(download_dir, "w")  # "w" indicates that you're writing strings to the file

columnTitleRow = "datetime, WindSpeed, WindDir\n"
csv_file.write(columnTitleRow)

for i in range(len(time)):
    row = str(time[i]) + "," + str(speed[i]) + "," + str(direction[i]) + "\n"
    print(row)
    csv_file.write(row)

csv_file.close()

# %%
# date_range needs a scalar start, so anchor the index at the first timestamp
idx = pd.date_range(time[0], periods=10, freq='min')
ts = pd.Series(range(len(idx)), index=idx)
print(ts)
print(ts.resample('10min').mean())

# %%
time[1]

# %%
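# The manual CSV loop above can also be written as a DataFrame round-trip; a
# sketch assuming time, speed and direction are equal-length sequences (the
# values below are made up):
import pandas as pd

time = pd.date_range("2021-06-01", periods=3, freq="min")
speed, direction = [5.0, 6.2, 4.8], [180, 190, 170]

out = pd.DataFrame({"datetime": time, "WindSpeed": speed, "WindDir": direction})
out.to_csv("output.csv", index=False)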
def test_dt_accessor_no_new_attributes(self): # https://github.com/pandas-dev/pandas/issues/10673 s = Series(date_range("20130101", periods=5, freq="D")) with pytest.raises(AttributeError, match="You cannot add any new attribute"): s.dt.xlabel = "a"
def test_dt_accessor_updates_on_inplace(self): s = Series(pd.date_range("2018-01-01", periods=10)) s[2] = None s.fillna(pd.Timestamp("2018-01-01"), inplace=True) result = s.dt.date assert result[0] == result[2]
import numpy as np
import pandas as pd

s = pd.Series([1, 3, 5, np.nan, 6, 8])
dates = pd.date_range('20200405', periods=13)
df = pd.DataFrame(np.random.randn(13, 4), index=dates, columns=list('ABCD'))
df2 = pd.DataFrame({
    'A': 1.,
    'B': pd.Timestamp('20200405'),
    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
    'D': np.array([3] * 4, dtype='int32'),
    'E': pd.Categorical(['test', 'train', 'test', 'train']),
    'F': 'foo'})

print(df.iloc[3])
print(df.iloc[3:5, 0:2])
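# Label-based counterparts of the positional selections above: since df is
# indexed by dates, .loc accepts the date labels themselves.
import numpy as np
import pandas as pd

dates = pd.date_range('20200405', periods=13)
df = pd.DataFrame(np.random.randn(13, 4), index=dates, columns=list('ABCD'))

print(df.loc['2020-04-08'])                            # same row as df.iloc[3]
print(df.loc['2020-04-08':'2020-04-09', ['A', 'B']])   # same block as df.iloc[3:5, 0:2]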
def test_dt_accessor_datetime_name_accessors(self, time_locale): # Test Monday -> Sunday and January -> December, in that sequence if time_locale is None: # If the time_locale is None, day-name and month_name should # return the english attributes expected_days = [ "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday", ] expected_months = [ "January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December", ] else: with tm.set_locale(time_locale, locale.LC_TIME): expected_days = calendar.day_name[:] expected_months = calendar.month_name[1:] s = Series(date_range(freq="D", start=datetime(1998, 1, 1), periods=365)) english_days = [ "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday", ] for day, name, eng_name in zip(range(4, 11), expected_days, english_days): name = name.capitalize() assert s.dt.weekday_name[day] == eng_name assert s.dt.day_name(locale=time_locale)[day] == name s = s.append(Series([pd.NaT])) assert np.isnan(s.dt.day_name(locale=time_locale).iloc[-1]) s = Series(date_range(freq="M", start="2012", end="2013")) result = s.dt.month_name(locale=time_locale) expected = Series([month.capitalize() for month in expected_months]) # work around https://github.com/pandas-dev/pandas/issues/22342 result = result.str.normalize("NFD") expected = expected.str.normalize("NFD") tm.assert_series_equal(result, expected) for s_date, expected in zip(s, expected_months): result = s_date.month_name(locale=time_locale) expected = expected.capitalize() result = unicodedata.normalize("NFD", result) expected = unicodedata.normalize("NFD", expected) assert result == expected s = s.append(Series([pd.NaT])) assert np.isnan(s.dt.month_name(locale=time_locale).iloc[-1])
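# Quick illustration of the accessors exercised above; with no locale
# argument, day_name() and month_name() return the English names.
import pandas as pd

s = pd.Series(pd.date_range("1998-01-01", periods=3, freq="D"))
print(s.dt.day_name().tolist())    # ['Thursday', 'Friday', 'Saturday']
print(s.dt.month_name().tolist())  # ['January', 'January', 'January']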
import pandas as pd
import numpy as np

dates = pd.date_range("20191031", periods=6)
df = pd.DataFrame(np.arange(24).reshape(6, 4), index=dates, columns=['A', 'B', 'C', 'D'])
print(df)
'''
             A   B   C   D
2019-10-31   0   1   2   3
2019-11-01   4   5   6   7
2019-11-02   8   9  10  11
2019-11-03  12  13  14  15
2019-11-04  16  17  18  19
2019-11-05  20  21  22  23
'''

df.iloc[2, 2] = 1111
print(df)
'''
             A   B     C   D
2019-10-31   0   1     2   3
2019-11-01   4   5     6   7
2019-11-02   8   9  1111  11
2019-11-03  12  13    14  15
2019-11-04  16  17    18  19
2019-11-05  20  21    22  23
'''

df.loc['2019-11-02', 'B'] = 2222
print(df)
'''
             A     B     C   D
2019-10-31   0     1     2   3
2019-11-01   4     5     6   7
2019-11-02   8  2222  1111  11
2019-11-03  12    13    14  15
2019-11-04  16    17    18  19
2019-11-05  20    21    22  23
'''
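# For single-cell access like the assignments above, pandas also provides the
# scalar-optimized .iat/.at accessors; a short sketch on the same frame:
import numpy as np
import pandas as pd

dates = pd.date_range("20191031", periods=6)
df = pd.DataFrame(np.arange(24).reshape(6, 4), index=dates, columns=['A', 'B', 'C', 'D'])

df.iat[2, 2] = 1111                            # positional, like df.iloc[2, 2]
df.at[pd.Timestamp("2019-11-02"), "B"] = 2222  # label-based, like df.loc[...]
print(df.loc["2019-11-02"])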
def test_dt_namespace_accessor(self):
    # GH 7207, 11128
    # test .dt namespace accessor

    ok_for_period = PeriodArray._datetimelike_ops
    ok_for_period_methods = ["strftime", "to_timestamp", "asfreq"]
    ok_for_dt = DatetimeIndex._datetimelike_ops
    ok_for_dt_methods = [
        "to_period",
        "to_pydatetime",
        "tz_localize",
        "tz_convert",
        "normalize",
        "strftime",
        "round",
        "floor",
        "ceil",
        "day_name",
        "month_name",
    ]
    ok_for_td = TimedeltaIndex._datetimelike_ops
    ok_for_td_methods = [
        "components",
        "to_pytimedelta",
        "total_seconds",
        "round",
        "floor",
        "ceil",
    ]

    def get_expected(s, name):
        result = getattr(Index(s._values), prop)
        if isinstance(result, np.ndarray):
            if is_integer_dtype(result):
                result = result.astype("int64")
        elif not is_list_like(result):
            return result
        return Series(result, index=s.index, name=s.name)

    def compare(s, name):
        a = getattr(s.dt, prop)
        b = get_expected(s, prop)
        if not (is_list_like(a) and is_list_like(b)):
            assert a == b
        else:
            tm.assert_series_equal(a, b)

    # datetimeindex
    cases = [
        Series(date_range("20130101", periods=5), name="xxx"),
        Series(date_range("20130101", periods=5, freq="s"), name="xxx"),
        Series(date_range("20130101 00:00:00", periods=5, freq="ms"), name="xxx"),
    ]
    for s in cases:
        for prop in ok_for_dt:
            # we test freq below
            if prop != "freq":
                compare(s, prop)

        for prop in ok_for_dt_methods:
            getattr(s.dt, prop)

        result = s.dt.to_pydatetime()
        assert isinstance(result, np.ndarray)
        assert result.dtype == object

        result = s.dt.tz_localize("US/Eastern")
        exp_values = DatetimeIndex(s.values).tz_localize("US/Eastern")
        expected = Series(exp_values, index=s.index, name="xxx")
        tm.assert_series_equal(result, expected)

        tz_result = result.dt.tz
        assert str(tz_result) == "US/Eastern"
        freq_result = s.dt.freq
        assert freq_result == DatetimeIndex(s.values, freq="infer").freq

        # let's localize, then convert
        result = s.dt.tz_localize("UTC").dt.tz_convert("US/Eastern")
        exp_values = (
            DatetimeIndex(s.values).tz_localize("UTC").tz_convert("US/Eastern")
        )
        expected = Series(exp_values, index=s.index, name="xxx")
        tm.assert_series_equal(result, expected)

    # datetimeindex with tz
    s = Series(date_range("20130101", periods=5, tz="US/Eastern"), name="xxx")
    for prop in ok_for_dt:
        # we test freq below
        if prop != "freq":
            compare(s, prop)

    for prop in ok_for_dt_methods:
        getattr(s.dt, prop)

    result = s.dt.to_pydatetime()
    assert isinstance(result, np.ndarray)
    assert result.dtype == object

    result = s.dt.tz_convert("CET")
    expected = Series(s._values.tz_convert("CET"), index=s.index, name="xxx")
    tm.assert_series_equal(result, expected)

    tz_result = result.dt.tz
    assert str(tz_result) == "CET"
    freq_result = s.dt.freq
    assert freq_result == DatetimeIndex(s.values, freq="infer").freq

    # timedelta index
    cases = [
        Series(
            timedelta_range("1 day", periods=5), index=list("abcde"), name="xxx"
        ),
        Series(timedelta_range("1 day 01:23:45", periods=5, freq="s"), name="xxx"),
        Series(
            timedelta_range("2 days 01:23:45.012345", periods=5, freq="ms"),
            name="xxx",
        ),
    ]
    for s in cases:
        for prop in ok_for_td:
            # we test freq below
            if prop != "freq":
                compare(s, prop)

        for prop in ok_for_td_methods:
            getattr(s.dt, prop)

        result = s.dt.components
        assert isinstance(result, DataFrame)
        tm.assert_index_equal(result.index, s.index)

        result = s.dt.to_pytimedelta()
        assert isinstance(result, np.ndarray)
        assert result.dtype == object

        result = s.dt.total_seconds()
        assert isinstance(result, pd.Series)
        assert result.dtype == "float64"

        freq_result = s.dt.freq
        assert freq_result == TimedeltaIndex(s.values, freq="infer").freq

    # both
    index = date_range("20130101", periods=3, freq="D")
    s = Series(date_range("20140204", periods=3, freq="s"), index=index, name="xxx")
    exp = Series(
        np.array([2014, 2014, 2014], dtype="int64"), index=index, name="xxx"
    )
    tm.assert_series_equal(s.dt.year, exp)

    exp = Series(np.array([2, 2, 2], dtype="int64"), index=index, name="xxx")
    tm.assert_series_equal(s.dt.month, exp)

    exp = Series(np.array([0, 1, 2], dtype="int64"), index=index, name="xxx")
    tm.assert_series_equal(s.dt.second, exp)

    exp = pd.Series([s[0]] * 3, index=index, name="xxx")
    tm.assert_series_equal(s.dt.normalize(), exp)

    # periodindex
    cases = [Series(period_range("20130101", periods=5, freq="D"), name="xxx")]
    for s in cases:
        for prop in ok_for_period:
            # we test freq below
            if prop != "freq":
                compare(s, prop)

        for prop in ok_for_period_methods:
            getattr(s.dt, prop)

        freq_result = s.dt.freq
        assert freq_result == PeriodIndex(s.values).freq

    # test limited display api
    def get_dir(s):
        results = [r for r in s.dt.__dir__() if not r.startswith("_")]
        return list(sorted(set(results)))

    s = Series(date_range("20130101", periods=5, freq="D"), name="xxx")
    results = get_dir(s)
    tm.assert_almost_equal(
        results, list(sorted(set(ok_for_dt + ok_for_dt_methods)))
    )

    s = Series(
        period_range("20130101", periods=5, freq="D", name="xxx").astype(object)
    )
    results = get_dir(s)
    tm.assert_almost_equal(
        results, list(sorted(set(ok_for_period + ok_for_period_methods)))
    )

    # 11295
    # ambiguous time error on the conversions
    s = Series(pd.date_range("2015-01-01", "2016-01-01", freq="T"), name="xxx")
    s = s.dt.tz_localize("UTC").dt.tz_convert("America/Chicago")
    results = get_dir(s)
    tm.assert_almost_equal(
        results, list(sorted(set(ok_for_dt + ok_for_dt_methods)))
    )
    exp_values = pd.date_range(
        "2015-01-01", "2016-01-01", freq="T", tz="UTC"
    ).tz_convert("America/Chicago")
    expected = Series(exp_values, name="xxx")
    tm.assert_series_equal(s, expected)

    # no setting allowed
    s = Series(date_range("20130101", periods=5, freq="D"), name="xxx")
    with pytest.raises(ValueError, match="modifications"):
        s.dt.hour = 5

    # trying to set a copy
    with pd.option_context("chained_assignment", "raise"):
        with pytest.raises(com.SettingWithCopyError):
            s.dt.hour[0] = 5
def main():
    subject_file = open("subjects.txt", "r")
    n = int(subject_file.readline())
    sub = []
    subcode = {}
    colors = [0] * n
    for i in range(n):
        x = subject_file.readline().split(" -> ")
        sub.append(x)
        subcode[x[0]] = [i, x[1]]
    subject_file.close()

    # adjacency matrix: two subjects clash if any student takes both
    graph = [None] * n
    for i in range(n):
        graph[i] = [0] * n

    student_file = open("students.txt", "r")
    n2 = int(student_file.readline())
    for i in range(n2):
        x = student_file.readline().split()
        x[1] = x[1].split(",")
        for j in x[1]:
            for k in x[1]:
                if j == k:
                    continue
                graph[subcode[j][0]][subcode[k][0]] = 1
                graph[subcode[k][0]][subcode[j][0]] = 1
    student_file.close()

    # greedy graph coloring: each color is one exam slot
    colors[0] = 1
    colorsused = 1
    for i in range(1, n):
        c = []
        for j in range(n):
            if graph[i][j] == 1 and colors[j] not in c:
                c.append(colors[j])
        for col in range(1, colorsused + 2):
            if col not in c:
                colors[i] = col
                colorsused = max(colorsused, col)
                break

    import pandas as pd
    import datetime
    print("Enter starting date of exam in YYYY-MM-DD format: ")
    d1, d2 = "", ""
    week = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    while True:
        try:
            d1 = input()
            d1 = pd.to_datetime(d1)
            break
        except Exception:
            print("Enter Valid Date!")
    d2 = d1 + datetime.timedelta(days=2 * colorsused)
    daterange = pd.date_range(d1, d2)
    currcol = 1
    for single_date in daterange:
        if single_date.weekday() == 6:  # skip Sundays
            continue
        else:
            s = str(single_date).split()[0]
            print("\n-------------------------------------------------")
            print(s, '(' + week[single_date.weekday()] + ')', ":")
            print("-------------------------------------------------\n")
            for i in range(n):
                if currcol == colors[i]:
                    print(sub[i][0], sub[i][1], end="")
            if currcol >= colorsused:
                break
            else:
                currcol += 1
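# Sketch of the Sunday-skipping logic applied to the index itself:
# DatetimeIndex exposes .weekday (Monday=0 ... Sunday=6), so the range can be
# filtered once instead of checking each date inside the loop.
import pandas as pd

days = pd.date_range("2021-06-01", "2021-06-14")
exam_days = days[days.weekday != 6]  # drop Sundays
print(exam_days)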
class TestCategoricalDtypeParametrized:
    @pytest.mark.parametrize(
        "categories",
        [
            list("abcd"),
            np.arange(1000),
            ["a", "b", 10, 2, 1.3, True],
            [True, False],
            pd.date_range("2017", periods=4),
        ],
    )
    def test_basic(self, categories, ordered):
        c1 = CategoricalDtype(categories, ordered=ordered)
        tm.assert_index_equal(c1.categories, pd.Index(categories))
        assert c1.ordered is ordered

    def test_order_matters(self):
        categories = ["a", "b"]
        c1 = CategoricalDtype(categories, ordered=True)
        c2 = CategoricalDtype(categories, ordered=False)
        c3 = CategoricalDtype(categories, ordered=None)
        assert c1 is not c2
        assert c1 is not c3

    @pytest.mark.parametrize("ordered", [False, None])
    def test_unordered_same(self, ordered):
        c1 = CategoricalDtype(["a", "b"], ordered=ordered)
        c2 = CategoricalDtype(["b", "a"], ordered=ordered)
        assert hash(c1) == hash(c2)

    def test_categories(self):
        result = CategoricalDtype(["a", "b", "c"])
        tm.assert_index_equal(result.categories, pd.Index(["a", "b", "c"]))
        assert result.ordered is False

    def test_equal_but_different(self, ordered):
        c1 = CategoricalDtype([1, 2, 3])
        c2 = CategoricalDtype([1.0, 2.0, 3.0])
        assert c1 is not c2
        assert c1 != c2

    @pytest.mark.parametrize("v1, v2", [([1, 2, 3], [1, 2, 3]), ([1, 2, 3], [3, 2, 1])])
    def test_order_hashes_different(self, v1, v2):
        c1 = CategoricalDtype(v1, ordered=False)
        c2 = CategoricalDtype(v2, ordered=True)
        c3 = CategoricalDtype(v1, ordered=None)
        assert c1 is not c2
        assert c1 is not c3

    def test_nan_invalid(self):
        msg = "Categorical categories cannot be null"
        with pytest.raises(ValueError, match=msg):
            CategoricalDtype([1, 2, np.nan])

    def test_non_unique_invalid(self):
        msg = "Categorical categories must be unique"
        with pytest.raises(ValueError, match=msg):
            CategoricalDtype([1, 2, 1])

    def test_same_categories_different_order(self):
        c1 = CategoricalDtype(["a", "b"], ordered=True)
        c2 = CategoricalDtype(["b", "a"], ordered=True)
        assert c1 is not c2

    @pytest.mark.parametrize("ordered1", [True, False, None])
    @pytest.mark.parametrize("ordered2", [True, False, None])
    def test_categorical_equality(self, ordered1, ordered2):
        # same categories, same order
        # any combination of None/False are equal
        # True/True is the only combination with True that are equal
        c1 = CategoricalDtype(list("abc"), ordered1)
        c2 = CategoricalDtype(list("abc"), ordered2)
        result = c1 == c2
        expected = bool(ordered1) is bool(ordered2)
        assert result is expected

        # same categories, different order
        # any combination of None/False are equal (order doesn't matter)
        # any combination with True are not equal (different order of cats)
        c1 = CategoricalDtype(list("abc"), ordered1)
        c2 = CategoricalDtype(list("cab"), ordered2)
        result = c1 == c2
        expected = (bool(ordered1) is False) and (bool(ordered2) is False)
        assert result is expected

        # different categories
        c2 = CategoricalDtype([1, 2, 3], ordered2)
        assert c1 != c2

        # none categories
        c1 = CategoricalDtype(list("abc"), ordered1)
        c2 = CategoricalDtype(None, ordered2)
        c3 = CategoricalDtype(None, ordered1)
        assert c1 == c2
        assert c2 == c1
        assert c2 == c3

    @pytest.mark.parametrize("categories", [list("abc"), None])
    @pytest.mark.parametrize("other", ["category", "not a category"])
    def test_categorical_equality_strings(self, categories, ordered, other):
        c1 = CategoricalDtype(categories, ordered)
        result = c1 == other
        expected = other == "category"
        assert result is expected

    def test_invalid_raises(self):
        with pytest.raises(TypeError, match="ordered"):
            CategoricalDtype(["a", "b"], ordered="foo")

        with pytest.raises(TypeError, match="'categories' must be list-like"):
            CategoricalDtype("category")

    def test_mixed(self):
        a = CategoricalDtype(["a", "b", 1, 2])
        b = CategoricalDtype(["a", "b", "1", "2"])
        assert hash(a) != hash(b)

    def test_from_categorical_dtype_identity(self):
        c1 = Categorical([1, 2], categories=[1, 2, 3], ordered=True)
        # Identity test for no changes
        c2 = CategoricalDtype._from_categorical_dtype(c1)
        assert c2 is c1

    def test_from_categorical_dtype_categories(self):
        c1 = Categorical([1, 2], categories=[1, 2, 3], ordered=True)
        # override categories
        result = CategoricalDtype._from_categorical_dtype(c1, categories=[2, 3])
        assert result == CategoricalDtype([2, 3], ordered=True)

    def test_from_categorical_dtype_ordered(self):
        c1 = Categorical([1, 2], categories=[1, 2, 3], ordered=True)
        # override ordered
        result = CategoricalDtype._from_categorical_dtype(c1, ordered=False)
        assert result == CategoricalDtype([1, 2, 3], ordered=False)

    def test_from_categorical_dtype_both(self):
        c1 = Categorical([1, 2], categories=[1, 2, 3], ordered=True)
        # override both categories and ordered
        result = CategoricalDtype._from_categorical_dtype(
            c1, categories=[1, 2], ordered=False
        )
        assert result == CategoricalDtype([1, 2], ordered=False)

    def test_str_vs_repr(self, ordered):
        c1 = CategoricalDtype(["a", "b"], ordered=ordered)
        assert str(c1) == "category"
        # Py2 will have unicode prefixes
        pat = r"CategoricalDtype\(categories=\[.*\], ordered={ordered}\)"
        assert re.match(pat.format(ordered=ordered), repr(c1))

    def test_categorical_categories(self):
        # GH17884
        c1 = CategoricalDtype(Categorical(["a", "b"]))
        tm.assert_index_equal(c1.categories, pd.Index(["a", "b"]))
        c1 = CategoricalDtype(CategoricalIndex(["a", "b"]))
        tm.assert_index_equal(c1.categories, pd.Index(["a", "b"]))

    @pytest.mark.parametrize(
        "new_categories", [list("abc"), list("cba"), list("wxyz"), None]
    )
    @pytest.mark.parametrize("new_ordered", [True, False, None])
    def test_update_dtype(self, ordered, new_categories, new_ordered):
        original_categories = list("abc")
        dtype = CategoricalDtype(original_categories, ordered)
        new_dtype = CategoricalDtype(new_categories, new_ordered)

        result = dtype.update_dtype(new_dtype)
        expected_categories = pd.Index(new_categories or original_categories)
        expected_ordered = new_ordered if new_ordered is not None else dtype.ordered

        tm.assert_index_equal(result.categories, expected_categories)
        assert result.ordered is expected_ordered

    def test_update_dtype_string(self, ordered):
        dtype = CategoricalDtype(list("abc"), ordered)
        expected_categories = dtype.categories
        expected_ordered = dtype.ordered
        result = dtype.update_dtype("category")
        tm.assert_index_equal(result.categories, expected_categories)
        assert result.ordered is expected_ordered

    @pytest.mark.parametrize("bad_dtype", ["foo", object, np.int64, PeriodDtype("Q")])
    def test_update_dtype_errors(self, bad_dtype):
        dtype = CategoricalDtype(list("abc"), False)
        msg = "a CategoricalDtype must be passed to perform an update, "
        with pytest.raises(ValueError, match=msg):
            dtype.update_dtype(bad_dtype)
def apple_calendar(apple_start_date, apple_end_date):
    # csv_path is assumed to be defined at module scope
    apple_fiscal = pd.read_csv(csv_path, parse_dates=True, index_col=None)

    # convert date, start and end date range, pay date and month duration to datetime
    apple_fiscal['date'] = pd.to_datetime(apple_fiscal['date'])
    apple_fiscal['start_date'] = pd.to_datetime(apple_fiscal['start_date'])
    apple_starts = apple_fiscal[['start_date', 'date']]
    apple_fiscal['end_date'] = pd.to_datetime(apple_fiscal['end_date'])
    apple_ends = apple_fiscal[['end_date', 'date']]
    apple_fiscal['pay_date'] = pd.to_datetime(apple_fiscal['pay_date'])
    apple_pays = apple_fiscal['pay_date']
    apple_duration = apple_fiscal[['date', 'next_month_duration']]

    # parse the date-entry strings
    year, month, day = map(int, apple_start_date.split('-'))
    apple_start = datetime.date(year, month, day)
    year, month, day = map(int, apple_end_date.split('-'))
    apple_end = datetime.date(year, month, day)

    # create a list of daily dates between the start and end dates and convert it to a data frame
    apple_range = apple_end - apple_start
    apple_range = apple_range.days + 1
    apple_revenue = list(range(0, apple_range))  # placeholder daily revenue used as sample sales
    start_datelist = pd.date_range(apple_start, apple_end).tolist()
    start_apple_table = pd.DataFrame(start_datelist)

    # name index 'days' and column 'date'
    start_apple_table.index.name = 'days'
    start_apple_table.columns = ['date']

    # merge csv columns 'start_date', 'end_date', and 'next_month_duration' with the data frame
    start_apple_table = start_apple_table.merge(apple_starts, how='left', on='date')
    start_apple_table = start_apple_table.merge(apple_ends, how='left', on='date')
    start_apple_table = start_apple_table.merge(apple_duration, how='left', on='date')

    # add 'pay_date' column listing only the days when a payment from Apple is received
    start_apple_table.loc[start_apple_table['date'].isin(apple_pays), 'pay_date'] = start_apple_table['date']

    # assign 'sample_sales' column to the placeholder revenue
    start_apple_table['sample_sales'] = apple_revenue

    # change index to column 'date'
    start_apple_table = start_apple_table.set_index('date')

    # convert 'next_month_duration' from integer to timedelta days
    start_apple_table['next_month_duration'] = pd.to_timedelta(start_apple_table['next_month_duration'], unit='D')

    # create 'monthly_sales' column
    start_apple_table['monthly_sales'] = start_apple_table.apply(lambda x: start_apple_table.loc[(start_apple_table['start_date'] <= x.name) & (x.name <= start_apple_table['end_date']), ['sample_sales']].sum(), axis=1)

    # create 'monthly_adj' column to move the sales up by the next month's fiscal duration period
    start_apple_table['monthly_adj'] = start_apple_table.apply(lambda x: start_apple_table.loc[(start_apple_table['start_date'] + start_apple_table['next_month_duration'] <= x.name) & (x.name <= start_apple_table['end_date'] + start_apple_table['next_month_duration']), ['sample_sales']].sum(), axis=1)

    # shift 'monthly_adj' by 7 rows to be captured by 'pay_date'
    start_apple_table['monthly_shift'] = start_apple_table['monthly_adj'].shift(7)

    # add 'monthly_payment' and show it only on 'pay_date' dates
    start_apple_table['monthly_payment'] = start_apple_table['monthly_shift'].loc[start_apple_table['pay_date'].notnull()]

    # add 'cumulative_payment' column
    start_apple_table['cumulative_payment'] = start_apple_table['monthly_payment'].cumsum()

    return start_apple_table
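# Reduced sketch of the calendar-merge pattern above: build a daily frame from
# date_range, then left-merge per-date reference columns onto it. Column names
# follow the function above; the data is made up.
import pandas as pd

calendar = pd.DataFrame({"date": pd.date_range("2020-01-01", "2020-01-10")})
starts = pd.DataFrame({"date": [pd.Timestamp("2020-01-03")],
                       "start_date": [pd.Timestamp("2020-01-03")]})

calendar = calendar.merge(starts, how="left", on="date")
print(calendar)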
# Learning pandas in about 10 minutes # https://pandas.pydata.org/pandas-docs/stable/10min.html # CNA 330 # Mustafa Musa, [email protected] import pandas as pd import numpy as np import matplotlib.pyplot as plt s = pd.Series([1, 3, 5, np.nan, 6, 8]) print(s) dates = pd.date_range('20130101', periods=6) print(dates) df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD')) print(df) df2 = pd.DataFrame({ 'A': 1., 'B': pd.Timestamp('20130102'), 'C': pd.Series(1, index=list(range(4)), dtype='float32'), 'D': np.array([3] * 4, dtype='int32'), 'E': pd.Categorical(["test", "train", "test", "train"]), 'F': 'foo' }) print(df2) print(df2.dtypes) print(df.head()) print(df.tail(3)) print(df.index)
    thumbnail_url="caching-example.png",
    code_url="caching_example/caching_example.py",
    mp4_url="caching-example.mp4",
    tags=["Panel", "Caching"],
)
ACCENT_COLOR = "#C01754"
CACHE_EXPIRY = 60 * 60 * 24  # seconds, i.e. one day
np.random.seed([3, 1415])
PERIODS = 1 * 24 * 60  # minutes, i.e. one day
DATA = pd.DataFrame(
    {
        "time": pd.date_range("2020-01-01", periods=PERIODS, freq="T"),
        "price": np.random.randn(PERIODS) + 98,
    }
)


def _load_data(frac=0.1):
    time.sleep(0.5 + frac * 0.5)
    return DATA.sample(frac=frac)


def _plot_data(frac=0.1):
    time.sleep(0.5)
    data = _load_data(frac)
    return data.hvplot(x="time", y="price")
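# Stand-alone version of the synthetic price series above; "T" is the minute
# offset alias, so one day of data is 24 * 60 rows.
import numpy as np
import pandas as pd

PERIODS = 24 * 60
data = pd.DataFrame({
    "time": pd.date_range("2020-01-01", periods=PERIODS, freq="T"),
    "price": np.random.randn(PERIODS) + 98,
})
print(data.head())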
def forecast(self, start_date: date = None, end_date: date = None, fq: str = None,
             econ_limit: float = None, np_limit: float = None, npi: float = 0,
             fluid_rate: float = None, show_water: bool = False, **kwargs):
    """
    Forecast curve from the declination object.

    Input:
        start_date -> (datetime.date) Initial date of the forecast
        end_date -> (datetime.date) End date of the forecast
        fq -> (str) Frequency for the time table, given as a pandas offset alias:
              https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#timeseries-offset-aliases
        econ_limit -> (int, float, np.ndarray) Economic limit rate. When given, the
              forecast runs until the rate declines to this limit
        np_limit -> (int, float, np.ndarray) Cap on cumulative production; rows
              beyond it are dropped
        npi -> (float) Initial cumulative production
        fluid_rate -> (float) Total fluid rate, used to derive the water columns
        show_water -> (bool) If True and fluid_rate is given, add qw, bsw, wor and
              wor_1 columns

    Return:
        f: DataFrame with t column and curve column
        np: Cumulative production
    """
    if econ_limit is None:
        econ_limit = self.econ_limit
    else:
        assert isinstance(econ_limit, (int, float, np.ndarray)), 'econ_limit must be a number'

    if fq is None:
        fq = self.fq
    else:
        assert isinstance(fq, str), 'fq must be str'

    if start_date is None:
        if self.start_date is None:
            start_date = self.ti
        else:
            start_date = self.start_date
    else:
        assert isinstance(start_date, date), 'start_date must be date'

    if end_date is None:
        if self.end_date is None:
            end_date = self.ti + timedelta(days=365) if econ_limit is None else None
        else:
            end_date = self.end_date
    else:
        assert isinstance(end_date, date), 'end_date must be date'

    if np_limit is None:
        np_limit = self.np_limit
    else:
        assert isinstance(np_limit, (int, float, np.ndarray)), 'np_limit must be a number'

    if fluid_rate is None:
        fluid_rate = self.fluid_rate

    if econ_limit is None:
        time_range = pd.Series(pd.date_range(start=start_date, end=end_date, freq=fq, **kwargs))
        f, Np = forecast_curve(time_range, self.qi, self.di, self.ti, self.b, npi=npi, gas=self.gas)
    else:
        f, Np = forecast_econlimit(start_date, econ_limit, self.qi, self.di, self.ti, self.b,
                                   fr=fq, end_date=end_date, npi=npi, gas=self.gas)

    if np_limit is not None:
        if Np > np_limit:
            f = f.loc[f['np'] < np_limit, :]
            Np = f.iloc[-1, -1]

    if show_water and fluid_rate is not None:
        f['qw'] = fluid_rate - f['qo']
        f['bsw'] = f['qw'] / (f['qw'] + f['qo'])
        f['wor'] = f['qw'] / f['qo']
        f['wor_1'] = f['wor'] + 1

    return f, Np
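# Sketch of the time grid built by the econ_limit-is-None branch above: a
# Series wrapping date_range with a pandas offset alias such as "M" (month end).
import pandas as pd
from datetime import date

time_range = pd.Series(pd.date_range(start=date(2021, 1, 1),
                                     end=date(2021, 12, 31), freq="M"))
print(time_range.head())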