def test_time(self): t = datetime(1, 1, 1, 3, 30, 0) deltas = np.random.randint(1, 20, 3).cumsum() ts = np.array([(t + timedelta(minutes=int(x))).time() for x in deltas]) df = DataFrame({'a': np.random.randn(len(ts)), 'b': np.random.randn(len(ts))}, index=ts) ax = df.plot() # verify tick labels ticks = ax.get_xticks() labels = ax.get_xticklabels() for t, l in zip(ticks, labels): m, s = divmod(int(t), 60) h, m = divmod(m, 60) xp = l.get_text() if len(xp) > 0: rs = time(h, m, s).strftime('%H:%M:%S') self.assertEqual(xp, rs) # change xlim ax.set_xlim('1:30', '5:00') # check tick labels again ticks = ax.get_xticks() labels = ax.get_xticklabels() for t, l in zip(ticks, labels): m, s = divmod(int(t), 60) h, m = divmod(m, 60) xp = l.get_text() if len(xp) > 0: rs = time(h, m, s).strftime('%H:%M:%S') self.assertEqual(xp, rs)
def components(self): """ Return a dataframe of the components (days, hours, minutes, seconds, milliseconds, microseconds, nanoseconds) of the Timedeltas. Returns ------- a DataFrame """ from pandas import DataFrame columns = ['days', 'hours', 'minutes', 'seconds', 'milliseconds', 'microseconds', 'nanoseconds'] hasnans = self.hasnans if hasnans: def f(x): if isnull(x): return [np.nan] * len(columns) return x.components else: def f(x): return x.components result = DataFrame([f(x) for x in self]) result.columns = columns if not hasnans: result = result.astype('int64') return result
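# A minimal usage sketch for the components accessor above, assuming a
# TimedeltaIndex built via pd.to_timedelta (values are illustrative only):
import pandas as pd

tdi = pd.to_timedelta(['1 days 02:30:45', '3 days 00:00:00.000123'])
print(tdi.components)
#    days  hours  minutes  seconds  milliseconds  microseconds  nanoseconds
# 0     1      2       30       45             0             0            0
# 1     3      0        0        0             0           123            0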
def test_query_single_element_booleans(self, parser, engine): columns = 'bid', 'bidsize', 'ask', 'asksize' data = np.random.randint(2, size=(1, len(columns))).astype(bool) df = DataFrame(data, columns=columns) res = df.query('bid & ask', engine=engine, parser=parser) expected = df[df.bid & df.ask] assert_frame_equal(res, expected)
def test_append_concat(self): rng = date_range('5/8/2012 1:45', periods=10, freq='5T') ts = Series(np.random.randn(len(rng)), rng) df = DataFrame(np.random.randn(len(rng), 4), index=rng) result = ts.append(ts) result_df = df.append(df) ex_index = DatetimeIndex(np.tile(rng.values, 2)) tm.assert_index_equal(result.index, ex_index) tm.assert_index_equal(result_df.index, ex_index) appended = rng.append(rng) tm.assert_index_equal(appended, ex_index) appended = rng.append([rng, rng]) ex_index = DatetimeIndex(np.tile(rng.values, 3)) tm.assert_index_equal(appended, ex_index) # different index names rng1 = rng.copy() rng2 = rng.copy() rng1.name = 'foo' rng2.name = 'bar' assert rng1.append(rng1).name == 'foo' assert rng1.append(rng2).name is None
def compute_confusion_matrix(target, predicted, normalize=True, sort=True):
    """
    Return a confusion matrix as a DataFrame with labels.

    Parameters:
        target (array): The true (ground-truth) values.
        predicted (array): The predicted values.
        normalize (bool): If True, normalize each row to sum to 1.
        sort (bool): If True, sort rows/columns by their maximum value.

    Returns (DataFrame): df with the confusion matrix.
    """
    # Determine the unique values in the target list, sort them and assign as labels.
    labels = np.unique(list(target))
    labels.sort()

    # Compute the confusion matrix, place into a DataFrame and normalize if desired.
    confusion = metrics.confusion_matrix(target, predicted, labels=labels)
    confusion = DataFrame(confusion, index=labels, columns=labels)
    if normalize:
        confusion = confusion.apply(lambda x: x / np.sum(x), axis=1)

    # If sort is True: find the max value for each row, then use that order
    # to reorder the confusion matrix on both axes.
    if sort:
        max_values = confusion.max(axis=1).sort_values(ascending=False)
        order = max_values.index
        confusion = confusion.loc[order, order]
    return confusion
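# A minimal usage sketch for compute_confusion_matrix above, assuming the
# imports the function relies on (`from sklearn import metrics`,
# `from pandas import DataFrame`, `import numpy as np`); the labels and
# values are illustrative only.
target = ['cat', 'dog', 'dog', 'cat', 'bird']
predicted = ['cat', 'dog', 'cat', 'cat', 'bird']
cm = compute_confusion_matrix(target, predicted, normalize=True, sort=False)
print(cm)  # rows: true labels, columns: predicted labels, each row sums to 1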
def test_nested_scope(self): from pandas.core.computation.ops import UndefinedVariableError engine = self.engine parser = self.parser # smoke test x = 1 # noqa result = pd.eval('x + 1', engine=engine, parser=parser) assert result == 2 df = DataFrame(np.random.randn(5, 3)) df2 = DataFrame(np.random.randn(5, 3)) # don't have the pandas parser with pytest.raises(SyntaxError): df.query('(@df>0) & (@df2>0)', engine=engine, parser=parser) with pytest.raises(UndefinedVariableError): df.query('(df>0) & (df2>0)', engine=engine, parser=parser) expected = df[(df > 0) & (df2 > 0)] result = pd.eval('df[(df > 0) & (df2 > 0)]', engine=engine, parser=parser) assert_frame_equal(expected, result) expected = df[(df > 0) & (df2 > 0) & (df[df > 0] > 0)] result = pd.eval('df[(df > 0) & (df2 > 0) & (df[df > 0] > 0)]', engine=engine, parser=parser) assert_frame_equal(expected, result)
def test_query_with_nested_special_character(self, parser, engine): skip_if_no_pandas_parser(parser) df = DataFrame({'a': ['a', 'b', 'test & test'], 'b': [1, 2, 3]}) res = df.query('a == "test & test"', parser=parser, engine=engine) expec = df[df.a == 'test & test'] assert_frame_equal(res, expec)
def test_query_with_partially_named_multiindex(self, parser, engine): skip_if_no_pandas_parser(parser) a = np.random.choice(['red', 'green'], size=10) b = np.arange(10) index = MultiIndex.from_arrays([a, b]) index.names = [None, 'rating'] df = DataFrame(np.random.randn(10, 2), index=index) res = df.query('rating == 1', parser=parser, engine=engine) ind = Series(df.index.get_level_values('rating').values, index=index, name='rating') exp = df[ind == 1] assert_frame_equal(res, exp) res = df.query('rating != 1', parser=parser, engine=engine) ind = Series(df.index.get_level_values('rating').values, index=index, name='rating') exp = df[ind != 1] assert_frame_equal(res, exp) res = df.query('ilevel_0 == "red"', parser=parser, engine=engine) ind = Series(df.index.get_level_values(0).values, index=index) exp = df[ind == "red"] assert_frame_equal(res, exp) res = df.query('ilevel_0 != "red"', parser=parser, engine=engine) ind = Series(df.index.get_level_values(0).values, index=index) exp = df[ind != "red"] assert_frame_equal(res, exp)
def test_nested_scope(self): engine = self.engine parser = self.parser skip_if_no_pandas_parser(parser) df = DataFrame(np.random.randn(5, 3)) df2 = DataFrame(np.random.randn(5, 3)) expected = df[(df > 0) & (df2 > 0)] result = df.query('(@df > 0) & (@df2 > 0)', engine=engine, parser=parser) assert_frame_equal(result, expected) result = pd.eval('df[df > 0 and df2 > 0]', engine=engine, parser=parser) assert_frame_equal(result, expected) result = pd.eval('df[df > 0 and df2 > 0 and df[df > 0] > 0]', engine=engine, parser=parser) expected = df[(df > 0) & (df2 > 0) & (df[df > 0] > 0)] assert_frame_equal(result, expected) result = pd.eval('df[(df>0) & (df2>0)]', engine=engine, parser=parser) expected = df.query('(@df>0) & (@df2>0)', engine=engine, parser=parser) assert_frame_equal(result, expected)
def test_stable_descending_sort(self): # GH #6399 df = DataFrame([[2, 'first'], [2, 'second'], [1, 'a'], [1, 'b']], columns=['sort_col', 'order']) sorted_df = df.sort_values(by='sort_col', kind='mergesort', ascending=False) assert_frame_equal(df, sorted_df)
class TestDataFrameEvalWithFrame(object): def setup_method(self, method): self.frame = DataFrame(np.random.randn(10, 3), columns=list('abc')) def teardown_method(self, method): del self.frame def test_simple_expr(self, parser, engine): res = self.frame.eval('a + b', engine=engine, parser=parser) expect = self.frame.a + self.frame.b assert_series_equal(res, expect) def test_bool_arith_expr(self, parser, engine): res = self.frame.eval('a[a < 1] + b', engine=engine, parser=parser) expect = self.frame.a[self.frame.a < 1] + self.frame.b assert_series_equal(res, expect) @pytest.mark.parametrize('op', ['+', '-', '*', '/']) def test_invalid_type_for_operator_raises(self, parser, engine, op): df = DataFrame({'a': [1, 2], 'b': ['c', 'd']}) msg = r"unsupported operand type\(s\) for .+: '.+' and '.+'" with pytest.raises(TypeError, match=msg): df.eval('a {0} b'.format(op), engine=engine, parser=parser)
def test_frame_datetime64_handling_groupby(self): # it works! df = DataFrame([(3, np.datetime64('2012-07-03')), (3, np.datetime64('2012-07-04'))], columns=['a', 'date']) result = df.groupby('a').first() assert result['date'][3] == Timestamp('2012-07-03')
def test_groupby_max_datetime64(self): # GH 5869 # datetimelike dtype conversion from int df = DataFrame(dict(A=Timestamp('20130101'), B=np.arange(5))) expected = df.groupby('A')['A'].apply(lambda x: x.max()) result = df.groupby('A')['A'].max() assert_series_equal(result, expected)
def test_timegrouper_with_reg_groups_freq(self, freq): # GH 6764 multiple grouping with/without sort df = DataFrame({ 'date': pd.to_datetime([ '20121002', '20121007', '20130130', '20130202', '20130305', '20121002', '20121207', '20130130', '20130202', '20130305', '20130202', '20130305' ]), 'user_id': [1, 1, 1, 1, 1, 3, 3, 3, 5, 5, 5, 5], 'whole_cost': [1790, 364, 280, 259, 201, 623, 90, 312, 359, 301, 359, 801], 'cost1': [12, 15, 10, 24, 39, 1, 0, 90, 45, 34, 1, 12] }).set_index('date') expected = ( df.groupby('user_id')['whole_cost'] .resample(freq) .sum(min_count=1) # XXX .dropna() .reorder_levels(['date', 'user_id']) .sort_index() .astype('int64') ) expected.name = 'whole_cost' result1 = df.sort_index().groupby([pd.Grouper(freq=freq), 'user_id'])['whole_cost'].sum() assert_series_equal(result1, expected) result2 = df.groupby([pd.Grouper(freq=freq), 'user_id'])[ 'whole_cost'].sum() assert_series_equal(result2, expected)
def test_groupby_groups_datetimeindex(self): # GH#1430 periods = 1000 ind = pd.date_range(start='2012/1/1', freq='5min', periods=periods) df = DataFrame({'high': np.arange(periods), 'low': np.arange(periods)}, index=ind) grouped = df.groupby(lambda x: datetime(x.year, x.month, x.day)) # it works! groups = grouped.groups assert isinstance(list(groups.keys())[0], datetime) # GH#11442 index = pd.date_range('2015/01/01', periods=5, name='date') df = pd.DataFrame({'A': [5, 6, 7, 8, 9], 'B': [1, 2, 3, 4, 5]}, index=index) result = df.groupby(level='date').groups dates = ['2015-01-05', '2015-01-04', '2015-01-03', '2015-01-02', '2015-01-01'] expected = {pd.Timestamp(date): pd.DatetimeIndex([date], name='date') for date in dates} tm.assert_dict_equal(result, expected) grouped = df.groupby(level='date') for date in dates: result = grouped.get_group(date) data = [[df.loc[date, 'A'], df.loc[date, 'B']]] expected_index = pd.DatetimeIndex([date], name='date') expected = pd.DataFrame(data, columns=list('AB'), index=expected_index) tm.assert_frame_equal(result, expected)
def test_crosstab_margins(self):
    a = np.random.randint(0, 7, size=100)
    b = np.random.randint(0, 3, size=100)
    c = np.random.randint(0, 5, size=100)

    df = DataFrame({'a': a, 'b': b, 'c': c})

    result = crosstab(a, [b, c], rownames=['a'], colnames=('b', 'c'),
                      margins=True)

    self.assertEqual(result.index.names, ('a',))
    self.assertEqual(result.columns.names, ['b', 'c'])

    all_cols = result['All', '']
    exp_cols = df.groupby(['a']).size().astype('i8')
    exp_cols = exp_cols.append(Series([len(df)], index=['All']))

    tm.assert_series_equal(all_cols, exp_cols)

    all_rows = result.loc['All']
    exp_rows = df.groupby(['b', 'c']).size().astype('i8')
    exp_rows = exp_rows.append(Series([len(df)], index=[('All', '')]))

    exp_rows = exp_rows.reindex(all_rows.index)
    exp_rows = exp_rows.fillna(0).astype(np.int64)
    tm.assert_series_equal(all_rows, exp_rows)
def test_coercion_with_loc(self):
    for start_data, expected_result in self.EXPECTED_SINGLE_ROW_RESULTS:
        start_dataframe = DataFrame({'foo': start_data})
        start_dataframe.loc[0, ['foo']] = None

        expected_dataframe = DataFrame({'foo': expected_result})
        tm.assert_frame_equal(start_dataframe, expected_dataframe)
def test_parse_dates_noconvert_thousands(self): # see gh-14066 data = 'a\n04.15.2016' expected = DataFrame([datetime(2016, 4, 15)], columns=['a']) result = self.read_csv(StringIO(data), parse_dates=['a'], thousands='.') tm.assert_frame_equal(result, expected) exp_index = DatetimeIndex(['2016-04-15'], name='a') expected = DataFrame(index=exp_index) result = self.read_csv(StringIO(data), index_col=0, parse_dates=True, thousands='.') tm.assert_frame_equal(result, expected) data = 'a,b\n04.15.2016,09.16.2013' expected = DataFrame([[datetime(2016, 4, 15), datetime(2013, 9, 16)]], columns=['a', 'b']) result = self.read_csv(StringIO(data), parse_dates=['a', 'b'], thousands='.') tm.assert_frame_equal(result, expected) expected = DataFrame([[datetime(2016, 4, 15), datetime(2013, 9, 16)]], columns=['a', 'b']) expected = expected.set_index(['a', 'b']) result = self.read_csv(StringIO(data), index_col=[0, 1], parse_dates=True, thousands='.') tm.assert_frame_equal(result, expected)
def test_groupby_function_tuple_1677(self): df = DataFrame(np.random.rand(100), index=date_range("1/1/2000", periods=100)) monthly_group = df.groupby(lambda x: (x.year, x.month)) result = monthly_group.mean() assert isinstance(result.index[0], tuple)
def test_frame_reset_index(self):
    dr = date_range('2012-06-02', periods=10, tz='US/Eastern')
    df = DataFrame(np.random.randn(len(dr)), dr)
    roundtripped = df.reset_index().set_index('index')
    xp = df.index.tz
    rs = roundtripped.index.tz
    self.assertEqual(xp, rs)
def test_join_aware(self): rng = date_range('1/1/2011', periods=10, freq='H') ts = Series(np.random.randn(len(rng)), index=rng) ts_utc = ts.tz_localize('utc') self.assertRaises(Exception, ts.__add__, ts_utc) self.assertRaises(Exception, ts_utc.__add__, ts) test1 = DataFrame(np.zeros((6,3)), index=date_range("2012-11-15 00:00:00", periods=6, freq="100L", tz="US/Central")) test2 = DataFrame(np.zeros((3,3)), index=date_range("2012-11-15 00:00:00", periods=3, freq="250L", tz="US/Central"), columns=range(3,6)) result = test1.join(test2, how='outer') ex_index = test1.index.union(test2.index) self.assertTrue(result.index.equals(ex_index)) self.assertTrue(result.index.tz.zone == 'US/Central') # non-overlapping rng = date_range("2012-11-15 00:00:00", periods=6, freq="H", tz="US/Central") rng2 = date_range("2012-11-15 12:00:00", periods=6, freq="H", tz="US/Eastern") result = rng.union(rng2) self.assertTrue(result.tz.zone == 'UTC')
def test_loc_setitem_frame_multiples(self): # multiple setting df = DataFrame({'A': ['foo', 'bar', 'baz'], 'B': Series( range(3), dtype=np.int64)}) rhs = df.loc[1:2] rhs.index = df.index[0:2] df.loc[0:1] = rhs expected = DataFrame({'A': ['bar', 'baz', 'baz'], 'B': Series( [1, 2, 2], dtype=np.int64)}) tm.assert_frame_equal(df, expected) # multiple setting with frame on rhs (with M8) df = DataFrame({'date': date_range('2000-01-01', '2000-01-5'), 'val': Series( range(5), dtype=np.int64)}) expected = DataFrame({'date': [Timestamp('20000101'), Timestamp( '20000102'), Timestamp('20000101'), Timestamp('20000102'), Timestamp('20000103')], 'val': Series( [0, 1, 0, 1, 2], dtype=np.int64)}) rhs = df.loc[0:2] rhs.index = df.index[2:5] df.loc[2:4] = rhs tm.assert_frame_equal(df, expected)
def test_to_string_format_na(self): fmt.reset_printoptions() df = DataFrame({'A' : [np.nan, -1, -2.1234, 3, 4], 'B' : [np.nan, 'foo', 'foooo', 'fooooo', 'bar']}) result = df.to_string() expected = (' A B\n' '0 NaN NaN\n' '1 -1.0000 foo\n' '2 -2.1234 foooo\n' '3 3.0000 fooooo\n' '4 4.0000 bar') self.assertEqual(result, expected) df = DataFrame({'A' : [np.nan, -1., -2., 3., 4.], 'B' : [np.nan, 'foo', 'foooo', 'fooooo', 'bar']}) result = df.to_string() expected = (' A B\n' '0 NaN NaN\n' '1 -1 foo\n' '2 -2 foooo\n' '3 3 fooooo\n' '4 4 bar') self.assertEqual(result, expected)
def test_to_string_repr_unicode(self):
    buf = StringIO()

    unicode_values = [u'\u03c3'] * 10
    unicode_values = np.array(unicode_values, dtype=object)
    df = DataFrame({'unicode': unicode_values})
    df.to_string(col_space=10, buf=buf)

    # it works!
    repr(df)

    idx = Index(['abc', u'\u03c3a', 'aegdvg'])
    ser = Series(np.random.randn(len(idx)), idx)
    rs = repr(ser).split('\n')
    line_len = len(rs[0])
    for line in rs[1:]:
        try:
            line = line.decode('utf-8')
        except (AttributeError, UnicodeDecodeError):
            # str has no decode on Python 3
            pass
        self.assertEqual(len(line), line_len)

    # it works even if sys.stdin is None
    sys.stdin = None
    repr(df)
    sys.stdin = sys.__stdin__
def test_to_string_with_formatters_unicode(self): df = DataFrame({u'c/\u03c3':[1,2,3]}) result = df.to_string(formatters={u'c/\u03c3': lambda x: '%s' % x}) self.assertEqual(result, (u' c/\u03c3\n' '0 1\n' '1 2\n' '2 3'))
def test_resample_anchored_intraday(self):
    # #1471, #1458
    rng = date_range('1/1/2012', '4/1/2012', freq='100min')
    df = DataFrame(rng.month, index=rng)

    result = df.resample('M')
    expected = df.resample('M', kind='period').to_timestamp(how='end')
    tm.assert_frame_equal(result, expected)

    result = df.resample('M', closed='left')
    exp = df.tshift(1, freq='D').resample('M', kind='period')
    exp = exp.to_timestamp(how='end')
    tm.assert_frame_equal(result, exp)

    rng = date_range('1/1/2012', '4/1/2012', freq='100min')
    df = DataFrame(rng.month, index=rng)

    result = df.resample('Q')
    expected = df.resample('Q', kind='period').to_timestamp(how='end')
    tm.assert_frame_equal(result, expected)

    result = df.resample('Q', closed='left')
    expected = df.tshift(1, freq='D').resample('Q', kind='period',
                                               closed='left')
    expected = expected.to_timestamp(how='end')
    tm.assert_frame_equal(result, expected)

    ts = _simple_ts('2012-04-29 23:00', '2012-04-30 5:00', freq='h')
    resampled = ts.resample('M')
    self.assertEqual(len(resampled), 1)
def test_eng_float_formatter(self): df = DataFrame({'A' : [1.41, 141., 14100, 1410000.]}) fmt.set_eng_float_format() result = df.to_string() expected = (' A\n' '0 1.410E+00\n' '1 141.000E+00\n' '2 14.100E+03\n' '3 1.410E+06') self.assertEqual(result, expected) fmt.set_eng_float_format(use_eng_prefix=True) result = df.to_string() expected = (' A\n' '0 1.410\n' '1 141.000\n' '2 14.100k\n' '3 1.410M') self.assertEqual(result, expected) fmt.set_eng_float_format(accuracy=0) result = df.to_string() expected = (' A\n' '0 1E+00\n' '1 141E+00\n' '2 14E+03\n' '3 1E+06') self.assertEqual(result, expected) fmt.reset_printoptions()
def test_groupby_categorical_index_and_columns(self, observed): # GH18432 columns = ['A', 'B', 'A', 'B'] categories = ['B', 'A'] data = np.ones((5, 4), int) cat_columns = CategoricalIndex(columns, categories=categories, ordered=True) df = DataFrame(data=data, columns=cat_columns) result = df.groupby(axis=1, level=0, observed=observed).sum() expected_data = 2 * np.ones((5, 2), int) if observed: # if we are not-observed we undergo a reindex # so need to adjust the output as our expected sets us up # to be non-observed expected_columns = CategoricalIndex(['A', 'B'], categories=categories, ordered=True) else: expected_columns = CategoricalIndex(categories, categories=categories, ordered=True) expected = DataFrame(data=expected_data, columns=expected_columns) assert_frame_equal(result, expected) # test transposed version df = DataFrame(data.T, index=cat_columns) result = df.groupby(axis=0, level=0, observed=observed).sum() expected = DataFrame(data=expected_data.T, index=expected_columns) assert_frame_equal(result, expected)
def test_group_selection_cache(): # GH 12839 nth, head, and tail should return same result consistently df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=['A', 'B']) expected = df.iloc[[0, 2]].set_index('A') g = df.groupby('A') result1 = g.head(n=2) result2 = g.nth(0) assert_frame_equal(result1, df) assert_frame_equal(result2, expected) g = df.groupby('A') result1 = g.tail(n=2) result2 = g.nth(0) assert_frame_equal(result1, df) assert_frame_equal(result2, expected) g = df.groupby('A') result1 = g.nth(0) result2 = g.head(n=2) assert_frame_equal(result1, expected) assert_frame_equal(result2, df) g = df.groupby('A') result1 = g.nth(0) result2 = g.tail(n=2) assert_frame_equal(result1, expected) assert_frame_equal(result2, df)
def test_frame_reset_index(self): dr = date_range("2012-06-02", periods=10, tz=self.tzstr("US/Eastern")) df = DataFrame(np.random.randn(len(dr)), dr) roundtripped = df.reset_index().set_index("index") xp = df.index.tz rs = roundtripped.index.tz self.assertEqual(xp, rs)
from pandas import DataFrame
from test_folder.my_mod import enlarge

print("HELLO!")

df = DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})
print(df.head())

x = 11
print(enlarge(x))
def _apply_query_metadata(df: pd.DataFrame, query_metadata: _QueryMetadata) -> pd.DataFrame: with warnings.catch_warnings(): warnings.simplefilter("ignore", category=UserWarning) df.query_metadata = query_metadata.raw_payload return df
def heston_calibration(df_option, ival=None): """ calibrate heston model """ # extract rates and div yields from the data set df_tmp = DataFrame.filter(df_option, items=['dtExpiry', 'iRate', 'iDiv']) grouped = df_tmp.groupby('dtExpiry') def aggregate(serie): return serie[serie.index[0]] df_rates = grouped.agg(aggregate) # Get first index: first_index = 0 dtTrade = df_option['dtTrade'][first_index] # back out the spot from any forward iRate = df_option['iRate'][first_index] iDiv = df_option['iDiv'][first_index] TTM = df_option['TTM'][first_index] Fwd = df_option['Fwd'][first_index] spot = SimpleQuote(Fwd*np.exp(-(iRate-iDiv)*TTM)) print('Spot: %f risk-free rate: %f div. yield: %f' % (spot.value, iRate, iDiv)) # build array of option helpers hh = heston_helpers(spot, df_option, dtTrade, df_rates) options = hh['options'] spot = hh['spot'] risk_free_ts = dfToZeroCurve(df_rates['iRate'], dtTrade) dividend_ts = dfToZeroCurve(df_rates['iDiv'], dtTrade) # initial values for parameters if ival is None: ival = {'v0': 0.1, 'kappa': 1.0, 'theta': 0.1, 'sigma': 0.5, 'rho': -.5} process = HestonProcess( risk_free_ts, dividend_ts, spot, ival['v0'], ival['kappa'], ival['theta'], ival['sigma'], ival['rho']) model = HestonModel(process) engine = AnalyticHestonEngine(model, 64) for option in options: option.set_pricing_engine(engine) om = LevenbergMarquardt(1e-8, 1e-8, 1e-8) model.calibrate( options, om, EndCriteria(400, 40, 1.0e-8, 1.0e-8, 1.0e-8) ) print('model calibration results:') print('v0: %f kappa: %f theta: %f sigma: %f rho: %f' % (model.v0, model.kappa, model.theta, model.sigma, model.rho)) calib_error = (1.0/len(options)) * sum( [pow(o.calibration_error()*100.0,2) for o in options]) print('SSE: %f' % calib_error) # merge the fitted volatility and the input data set return merge_df(df_option, options, 'Heston')
def test_plot_submethod_works(self): df = DataFrame({"x": [1, 2, 3, 4, 5], "y": [1, 2, 3, 2, 1], "z": list("ababa")}) df.groupby("z").plot.scatter("x", "y") tm.close() df.groupby("z")["x"].plot.line() tm.close()
def batch_map_cnot_circuits(source, modes, architectures, n_qubits=None, populations=30, iterations=15, crossover_probs=0.8, mutation_probs=0.5, dest_folder=None, metrics_file=None, n_compile=1): modes = make_into_list(modes) architectures = make_into_list(architectures) populations = make_into_list(populations) iterations = make_into_list(iterations) crossover_probs = make_into_list(crossover_probs) mutation_probs = make_into_list(mutation_probs) if os.path.isfile(source): source, file = os.path.split(source) files = [file] else: files = [f for f in os.listdir(source) if os.path.isfile(os.path.join(source, f))] if not os.path.exists(source): raise IOError("Folder does not exist: " + source) if dest_folder is None: dest_folder = source else: os.makedirs(dest_folder, exist_ok=True) arch_iter = [] circuits = {} metrics = [] for architecture in architectures: if architecture in dynamic_size_architectures: if n_qubits is None: raise KeyError("Number of qubits not specified for architecture" + architecture) else: n_qubits = make_into_list(n_qubits) arch_iter.extend([create_architecture(architecture, n_qubits=q) for q in n_qubits]) else: arch_iter.append(create_architecture(architecture)) for architecture in arch_iter: circuits[architecture.name] = {} for mode in modes: if mode == QUIL_COMPILER: n_compile_list = range(n_compile) else: n_compile_list = [None] new_dest_folder = os.path.join(dest_folder, architecture.name, mode) os.makedirs(new_dest_folder, exist_ok=True) if mode in genetic_elim_modes: pop_iter = populations iter_iter = iterations crossover_iter = crossover_probs mutation_iter = mutation_probs circuits[architecture.name][mode] = {} else: if mode == QUIL_COMPILER: circuits[architecture.name][mode] = [] pop_iter = [None] iter_iter = [None] crossover_iter = [None] mutation_iter = [None] for population in pop_iter: for iteration in iter_iter: for crossover_prob in crossover_iter: for mutation_prob in mutation_iter: for file in files: if os.path.splitext(file)[1].lower() == ".qasm": origin_file = os.path.join(source, file) for i in n_compile_list: dest_filename = create_dest_filename(origin_file, population, iteration, crossover_prob, mutation_prob, i) dest_file = os.path.join(dest_folder, architecture.name, mode, dest_filename) try: start_time = time.time() circuit = map_cnot_circuit(origin_file, architecture, mode=mode, dest_file=dest_file, population=population, iterations=iteration, crossover_prob=crossover_prob, mutation_prob=mutation_prob) end_time = time.time() if metrics_file is not None: metrics.append(make_metrics(circuit, origin_file, architecture.name, mode, dest_file, population, iteration, crossover_prob, mutation_prob, end_time-start_time, i)) if mode in genetic_elim_modes: circuits[architecture.name][mode][(population, iteration, crossover_prob, mutation_prob)] = circuit elif mode == QUIL_COMPILER: circuits[architecture.name][mode].append(circuit) else: circuits[architecture.name][mode] = circuit except KeyError as e: # Should only happen with quilc if mode == QUIL_COMPILER: print("\033[31mCould not compile", origin_file, "into", dest_file, end="\033[0m\n") else: raise e if len(metrics) > 0 and DataFrame != None: df = DataFrame(metrics) if os.path.exists(metrics_file): # append to the file - do not overwrite! df.to_csv(metrics_file, columns=get_metric_header(), header=False, index=False, mode='a') else: df.to_csv(metrics_file, columns=get_metric_header(), index=False) return circuits
def table(coords_src, coords_dest=None, ids_origin=None, ids_dest=None, output='np', minutes=False, url_config=RequestConfig, send_as_polyline=True): """ Function wrapping OSRM 'table' function in order to get a matrix of time distance as a numpy array or as a DataFrame Parameters ---------- coords_src : list A list of coord as (lat, long) , like : list_coords = [(21.3224, 45.2358), (21.3856, 42.0094), (20.9574, 41.5286)] (coords have to be float) coords_dest : list, optional A list of coord as (lat, long) , like : list_coords = [(21.3224, 45.2358), (21.3856, 42.0094), (20.9574, 41.5286)] (coords have to be float) ids_origin : list, optional A list of name/id to use to label the source axis of the result `DataFrame` (default: None). ids_dest : list, optional A list of name/id to use to label the destination axis of the result `DataFrame` (default: None). output : str, optional The type of durations matrice to return (DataFrame or numpy array) 'raw' for the (parsed) json response from OSRM 'pandas', 'df' or 'DataFrame' for a DataFrame 'numpy', 'array' or 'np' for a numpy array (default is "np") url_config: osrm.RequestConfig, optional Parameters regarding the host, version and profile to use Returns ------- - if output=='raw' : a dict, the parsed json response. - if output=='np' : a numpy.ndarray containing the time in minutes, a list of snapped origin coordinates, a list of snapped destination coordinates. - if output=='pandas' : a labeled DataFrame containing the time matrix in minutes, a list of snapped origin coordinates, a list of snapped destination coordinates. """ if output.lower() in ('numpy', 'array', 'np'): output = 1 elif output.lower() in ('pandas', 'dataframe', 'df'): output = 2 else: output = 3 host = check_host(url_config.host) url = ''.join( [host, '/table/', url_config.version, '/', url_config.profile, '/']) if not send_as_polyline: if not coords_dest: url = ''.join([ url, ';'.join([ ','.join([str(coord[0]), str(coord[1])]) for coord in coords_src ]) ]) else: src_end = len(coords_src) dest_end = src_end + len(coords_dest) url = ''.join([ url, ';'.join([ ','.join([str(coord[0]), str(coord[1])]) for coord in _chain(coords_src, coords_dest) ]), '?sources=', ';'.join([str(i) for i in range(src_end)]), '&destinations=', ';'.join([str(j) for j in range(src_end, dest_end)]) ]) else: if not coords_dest: url = ''.join([ url, "polyline(", polyline_encode([(c[1], c[0]) for c in coords_src]), ")" ]) else: src_end = len(coords_src) dest_end = src_end + len(coords_dest) url = ''.join([ url, "polyline(", polyline_encode([ (c[1], c[0]) for c in _chain(coords_src, coords_dest) ]), ")", '?sources=', ';'.join([str(i) for i in range(src_end)]), '&destinations=', ';'.join([str(j) for j in range(src_end, dest_end)]) ]) rep = urlopen(url) parsed_json = json.loads(rep.read().decode('utf-8')) if "code" not in parsed_json or "Ok" not in parsed_json["code"]: raise ValueError('No distance table return by OSRM instance') elif output == 3: return parsed_json else: durations = np.array(parsed_json["durations"], dtype=float) new_src_coords = [ft["location"] for ft in parsed_json["sources"]] new_dest_coords = None if not coords_dest \ else [ft["location"] for ft in parsed_json["destinations"]] if minutes: # Conversion in minutes with 2 decimals: durations = np.around((durations / 60), 2) if output == 2: if not ids_origin: ids_origin = [i for i in range(len(coords_src))] if not ids_dest: ids_dest = ids_origin if not coords_dest \ else [i for i in range(len(coords_dest))] durations = DataFrame(durations, 
index=ids_origin, columns=ids_dest, dtype=float) return durations, new_src_coords, new_dest_coords
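# A hypothetical call sketch for table() above; the coordinates are
# placeholders, and the default RequestConfig host (the public OSRM demo
# server) may rate-limit requests.
list_coords = [(21.3224, 45.2358), (21.3856, 42.0094), (20.9574, 41.5286)]
durations, snapped_src, snapped_dest = table(list_coords, output='df',
                                             minutes=True)
print(durations)  # labeled DataFrame of pairwise travel times in minutes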
def test_astype_categorical_to_other(self): value = np.random.RandomState(0).randint(0, 10000, 100) df = DataFrame({"value": value}) labels = [f"{i} - {i + 499}" for i in range(0, 10000, 500)] cat_labels = Categorical(labels, labels) df = df.sort_values(by=["value"], ascending=True) df["value_group"] = pd.cut(df.value, range(0, 10500, 500), right=False, labels=cat_labels) s = df["value_group"] expected = s tm.assert_series_equal(s.astype("category"), expected) tm.assert_series_equal(s.astype(CategoricalDtype()), expected) msg = r"could not convert string to float|invalid literal for float\(\)" with pytest.raises(ValueError, match=msg): s.astype("float64") cat = Series(Categorical(["a", "b", "b", "a", "a", "c", "c", "c"])) exp = Series(["a", "b", "b", "a", "a", "c", "c", "c"]) tm.assert_series_equal(cat.astype("str"), exp) s2 = Series(Categorical(["1", "2", "3", "4"])) exp2 = Series([1, 2, 3, 4]).astype(int) tm.assert_series_equal(s2.astype("int"), exp2) # object don't sort correctly, so just compare that we have the same # values def cmp(a, b): tm.assert_almost_equal(np.sort(np.unique(a)), np.sort(np.unique(b))) expected = Series(np.array(s.values), name="value_group") cmp(s.astype("object"), expected) cmp(s.astype(np.object_), expected) # array conversion tm.assert_almost_equal(np.array(s), np.array(s.values)) tm.assert_series_equal(s.astype("category"), s) tm.assert_series_equal(s.astype(CategoricalDtype()), s) roundtrip_expected = s.cat.set_categories( s.cat.categories.sort_values()).cat.remove_unused_categories() tm.assert_series_equal( s.astype("object").astype("category"), roundtrip_expected) tm.assert_series_equal( s.astype("object").astype(CategoricalDtype()), roundtrip_expected) # invalid conversion (these are NOT a dtype) msg = ("dtype '<class 'pandas.core.arrays.categorical.Categorical'>' " "not understood") for invalid in [ lambda x: x.astype(Categorical), lambda x: x.astype("object").astype(Categorical), ]: with pytest.raises(TypeError, match=msg): invalid(s)
def test_cython_transform_frame(op, args, targop): s = Series(np.random.randn(1000)) s_missing = s.copy() s_missing.iloc[2:10] = np.nan labels = np.random.randint(0, 50, size=1000).astype(float) strings = list("qwertyuiopasdfghjklz") strings_missing = strings[:] strings_missing[5] = np.nan df = DataFrame( { "float": s, "float_missing": s_missing, "int": [1, 1, 1, 1, 2] * 200, "datetime": pd.date_range("1990-1-1", periods=1000), "timedelta": pd.timedelta_range(1, freq="s", periods=1000), "string": strings * 50, "string_missing": strings_missing * 50, }, columns=[ "float", "float_missing", "int", "datetime", "timedelta", "string", "string_missing", ], ) df["cat"] = df["string"].astype("category") df2 = df.copy() df2.index = pd.MultiIndex.from_product([range(100), range(10)]) # DataFrame - Single and MultiIndex, # group by values, index level, columns for df in [df, df2]: for gb_target in [ dict(by=labels), dict(level=0), dict(by="string"), ]: # dict(by='string_missing')]: # dict(by=['int','string'])]: gb = df.groupby(**gb_target) # whitelisted methods set the selection before applying # bit a of hack to make sure the cythonized shift # is equivalent to pre 0.17.1 behavior if op == "shift": gb._set_group_selection() if op != "shift" and "int" not in gb_target: # numeric apply fastpath promotes dtype so have # to apply separately and concat i = gb[["int"]].apply(targop) f = gb[["float", "float_missing"]].apply(targop) expected = pd.concat([f, i], axis=1) else: expected = gb.apply(targop) expected = expected.sort_index(axis=1) tm.assert_frame_equal(expected, gb.transform(op, *args).sort_index(axis=1)) tm.assert_frame_equal(expected, getattr(gb, op)(*args).sort_index(axis=1)) # individual columns for c in df: if c not in ["float", "int", "float_missing"] and op != "shift": msg = "No numeric types to aggregate" with pytest.raises(DataError, match=msg): gb[c].transform(op) with pytest.raises(DataError, match=msg): getattr(gb[c], op)() else: expected = gb[c].apply(targop) expected.name = c tm.assert_series_equal(expected, gb[c].transform(op, *args)) tm.assert_series_equal(expected, getattr(gb[c], op)(*args))
def test_transform(): data = Series(np.arange(9) // 3, index=np.arange(9)) index = np.arange(9) np.random.shuffle(index) data = data.reindex(index) grouped = data.groupby(lambda x: x // 3) transformed = grouped.transform(lambda x: x * x.sum()) assert transformed[7] == 12 # GH 8046 # make sure that we preserve the input order df = DataFrame( np.arange(6, dtype="int64").reshape(3, 2), columns=["a", "b"], index=[0, 2, 1] ) key = [0, 0, 1] expected = ( df.sort_index() .groupby(key) .transform(lambda x: x - x.mean()) .groupby(key) .mean() ) result = df.groupby(key).transform(lambda x: x - x.mean()).groupby(key).mean() assert_frame_equal(result, expected) def demean(arr): return arr - arr.mean() people = DataFrame( np.random.randn(5, 5), columns=["a", "b", "c", "d", "e"], index=["Joe", "Steve", "Wes", "Jim", "Travis"], ) key = ["one", "two", "one", "two", "one"] result = people.groupby(key).transform(demean).groupby(key).mean() expected = people.groupby(key).apply(demean).groupby(key).mean() assert_frame_equal(result, expected) # GH 8430 df = tm.makeTimeDataFrame() g = df.groupby(pd.Grouper(freq="M")) g.transform(lambda x: x - 1) # GH 9700 df = DataFrame({"a": range(5, 10), "b": range(5)}) result = df.groupby("a").transform(max) expected = DataFrame({"b": range(5)}) tm.assert_frame_equal(result, expected)
__author__ = 'farhan' from pandas import DataFrame, Series import pandas as pd import numpy as np import gov_data_fetcher import matplotlib.pyplot as plt records = gov_data_fetcher.fetch_records() frame = DataFrame(records) results = Series([x.split()[0] for x in frame.a.dropna()]) cframe = frame[frame.a.notnull()] operating_system = np.where(cframe['a'].str.contains('Windows'), 'Windows', 'Not Windows') by_tz_os = cframe.groupby(['tz', operating_system]) agg_counts = by_tz_os.size().unstack().fillna(0) indexer = agg_counts.sum(1).argsort() count_subset = agg_counts.take(indexer)[-10:] #count_subset.plot(kind='barh', stacked=True) normed_subset = count_subset.div(count_subset.sum(1), axis=0) normed_subset.plot(kind='barh', stacked=True) plt.show() #print(count_subset)
def decompose(other_args: List[str], ticker: str, stock: pd.DataFrame): """Decompose time series as: - Additive Time Series = Level + CyclicTrend + Residual + Seasonality - Multiplicative Time Series = Level * CyclicTrend * Residual * Seasonality Parameters ---------- other_args : str Command line arguments to be processed with argparse ticker : str Ticker of the stock stock : pd.DataFrame Stock data """ parser = argparse.ArgumentParser( add_help=False, prog="decompose", description=""" Decompose time series as: - Additive Time Series = Level + CyclicTrend + Residual + Seasonality - Multiplicative Time Series = Level * CyclicTrend * Residual * Seasonality """, ) parser.add_argument( "-m", "--multiplicative", action="store_true", default=False, dest="multiplicative", help="decompose using multiplicative model instead of additive", ) try: ns_parser = parse_known_args_and_warn(parser, other_args) if not ns_parser: return stock = stock["5. adjusted close"] seasonal_periods = 5 # Hodrick-Prescott filter # See Ravn and Uhlig: http://home.uchicago.edu/~huhlig/papers/uhlig.ravn.res.2002.pdf lamb = 107360000000 fig = plt.figure(figsize=plot_autoscale(), dpi=PLOT_DPI, constrained_layout=True) spec = gridspec.GridSpec(ncols=4, nrows=5, figure=fig) fig.add_subplot(spec[0, :]) plt.plot(stock) plt.title(ticker + " (Time-Series)") if ns_parser.multiplicative: resultMul = seasonal_decompose(stock, model="multiplicative", period=seasonal_periods) cycleMul, trendMul = sm.tsa.filters.hpfilter( resultMul.trend[resultMul.trend.notna().values], lamb=lamb) # Multiplicative model fig.add_subplot(spec[1, :4]) plt.plot(resultMul.trend, lw=2, c="purple") plt.xlim([stock.index[0], stock.index[-1]]) plt.title("Multiplicative Cyclic-Trend") fig.add_subplot(spec[2, 0:2]) plt.plot(trendMul, lw=2, c="tab:blue") plt.xlim([stock.index[0], stock.index[-1]]) plt.title("Multiplicative Trend component") fig.add_subplot(spec[2, 2:]) plt.plot(cycleMul, lw=2, c="green") plt.xlim([stock.index[0], stock.index[-1]]) plt.title("Multiplicative Cycle component") fig.add_subplot(spec[3, :]) plt.plot(resultMul.seasonal, lw=2, c="orange") plt.xlim([stock.index[0], stock.index[-1]]) plt.title("Multiplicative Seasonal effect") fig.add_subplot(spec[4, :]) plt.plot(resultMul.resid, lw=2, c="red") plt.xlim([stock.index[0], stock.index[-1]]) plt.title("Multiplicative Residuals") else: resultAdd = seasonal_decompose(stock, model="additive", period=seasonal_periods) cycleAdd, trendAdd = sm.tsa.filters.hpfilter( resultAdd.trend[resultAdd.trend.notna().values], lamb=lamb) # Additive model fig.add_subplot(spec[1, :4]) plt.plot(resultAdd.trend, lw=2, c="purple") plt.xlim([stock.index[0], stock.index[-1]]) plt.title("Additive Cyclic-Trend") fig.add_subplot(spec[2, 0:2]) plt.plot(trendAdd, lw=2, c="tab:blue") plt.xlim([stock.index[0], stock.index[-1]]) plt.title("Additive Trend component") fig.add_subplot(spec[2, 2:]) plt.plot(cycleAdd, lw=2, c="green") plt.xlim([stock.index[0], stock.index[-1]]) plt.title("Additive Cycle component") fig.add_subplot(spec[3, :]) plt.plot(resultAdd.seasonal, lw=2, c="orange") plt.xlim([stock.index[0], stock.index[-1]]) plt.title("Additive Seasonal effect") fig.add_subplot(spec[4, :]) plt.plot(resultAdd.resid, lw=2, c="red") plt.xlim([stock.index[0], stock.index[-1]]) plt.title("Additive Residuals") if gtff.USE_ION: plt.ion() plt.show() print("") # From # https://otexts.com/fpp2/seasonal-strength.html print("Time-Series Level is " + str(round(stock.mean(), 2))) if ns_parser.multiplicative: FtMul = max(0, 1 - np.var( 
resultMul.resid) / np.var(resultMul.trend + resultMul.resid))
            print("Strength of Trend: %.4f" % FtMul)
            FsMul = max(
                0,
                1 - np.var(resultMul.resid)
                / np.var(resultMul.seasonal + resultMul.resid),
            )
            print("Strength of Seasonality: %.4f" % FsMul)
        else:
            FtAdd = max(
                0,
                1 - np.var(resultAdd.resid)
                / np.var(resultAdd.trend + resultAdd.resid),
            )
            print("Strength of Trend: %.4f" % FtAdd)
            FsAdd = max(
                0,
                1 - np.var(resultAdd.resid)
                / np.var(resultAdd.seasonal + resultAdd.resid),
            )
            print("Strength of Seasonality: %.4f" % FsAdd)
        print("")

    except Exception as e:
        print(e, "\n")
        return
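# A minimal sketch of the seasonal/trend "strength" statistics printed above
# (see https://otexts.com/fpp2/seasonal-strength.html), assuming statsmodels
# and numpy are available; the toy series is illustrative only.
import numpy as np
from statsmodels.tsa.seasonal import seasonal_decompose

t = np.arange(200)
toy = 10 + 0.05 * t + np.sin(2 * np.pi * t / 5) \
    + np.random.default_rng(0).normal(scale=0.1, size=t.size)
res = seasonal_decompose(toy, model="additive", period=5)
mask = ~np.isnan(res.resid)  # trend/resid are NaN at the series edges
resid, trend, seasonal = res.resid[mask], res.trend[mask], res.seasonal[mask]
Ft = max(0, 1 - np.var(resid) / np.var(trend + resid))     # trend strength
Fs = max(0, 1 - np.var(resid) / np.var(seasonal + resid))  # seasonal strength
print("Ft=%.4f Fs=%.4f" % (Ft, Fs))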
def test_groupby_transform_with_int(): # GH 3740, make sure that we might upcast on item-by-item transform # floats df = DataFrame( dict( A=[1, 1, 1, 2, 2, 2], B=Series(1, dtype="float64"), C=Series([1, 2, 3, 1, 2, 3], dtype="float64"), D="foo", ) ) with np.errstate(all="ignore"): result = df.groupby("A").transform(lambda x: (x - x.mean()) / x.std()) expected = DataFrame( dict(B=np.nan, C=Series([-1, 0, 1, -1, 0, 1], dtype="float64")) ) assert_frame_equal(result, expected) # int case df = DataFrame(dict(A=[1, 1, 1, 2, 2, 2], B=1, C=[1, 2, 3, 1, 2, 3], D="foo")) with np.errstate(all="ignore"): result = df.groupby("A").transform(lambda x: (x - x.mean()) / x.std()) expected = DataFrame(dict(B=np.nan, C=[-1, 0, 1, -1, 0, 1])) assert_frame_equal(result, expected) # int that needs float conversion s = Series([2, 3, 4, 10, 5, -1]) df = DataFrame(dict(A=[1, 1, 1, 2, 2, 2], B=1, C=s, D="foo")) with np.errstate(all="ignore"): result = df.groupby("A").transform(lambda x: (x - x.mean()) / x.std()) s1 = s.iloc[0:3] s1 = (s1 - s1.mean()) / s1.std() s2 = s.iloc[3:6] s2 = (s2 - s2.mean()) / s2.std() expected = DataFrame(dict(B=np.nan, C=concat([s1, s2]))) assert_frame_equal(result, expected) # int downcasting result = df.groupby("A").transform(lambda x: x * 2 / 2) expected = DataFrame(dict(B=1, C=[2, 3, 4, 10, 5, -1])) assert_frame_equal(result, expected)
def cdf(other_args: List[str], ticker: str, stock: pd.DataFrame, start: datetime): """Plot cumulative distribution function Parameters ---------- other_args : str Command line arguments to be processed with argparse ticker : str Ticker of the stock stock : pd.DataFrame Stock data """ parser = argparse.ArgumentParser( add_help=False, prog="cdf", description=""" Cumulative distribution function """, ) try: ns_parser = parse_known_args_and_warn(parser, other_args) if not ns_parser: return plt.figure(figsize=plot_autoscale(), dpi=PLOT_DPI) stock = stock["5. adjusted close"] cdf = stock.value_counts().sort_index().div(len(stock)).cumsum() cdf.plot(lw=2) plt.title( f"Cumulative Distribution Function of {ticker} from {start.strftime('%Y-%m-%d')}" ) plt.ylabel("Probability") plt.xlabel("Share Price") minVal = stock.values.min() q25 = np.quantile(stock.values, 0.25) medianVal = np.quantile(stock.values, 0.5) q75 = np.quantile(stock.values, 0.75) data = [ (minVal, q25), (0.25, 0.25), "r", (q25, q25), (0, 0.25), "r", (minVal, medianVal), (0.5, 0.5), "r", (medianVal, medianVal), (0, 0.5), "r", (minVal, q75), (0.75, 0.75), "r", (q75, q75), (0, 0.75), "r", ] plt.plot(*data, ls="--") plt.text(minVal + (q25 - minVal) / 2, 0.27, "Q1", color="r", fontweight="bold") plt.text( minVal + (medianVal - minVal) / 2, 0.52, "Median", color="r", fontweight="bold", ) plt.text(minVal + (q75 - minVal) / 2, 0.77, "Q3", color="r", fontweight="bold") plt.xlim(cdf.index[0], cdf.index[-1]) plt.grid(True) if gtff.USE_ION: plt.ion() plt.show() print("") except Exception as e: print(e, "\n") return
def test_values_with_duplicate_columns(self): df = DataFrame([[1, 2.5], [3, 4.5]], index=[1, 2], columns=["x", "x"]) result = df.values expected = np.array([[1, 2.5], [3, 4.5]]) assert (result == expected).all().all()
def print_predictions(predictions: pd.DataFrame) -> None: print('Predictions:') print('============') for sample_id, pred in predictions.to_numpy(): print('{}: {}'.format(sample_id, pred))
def rolling(other_args: List[str], ticker: str, stock: pd.DataFrame): """Rolling mean and std deviation Parameters ---------- other_args : str Command line arguments to be processed with argparse ticker : str Ticker of the stock stock : pd.DataFrame Stock data """ parser = argparse.ArgumentParser( add_help=False, prog="rolling", description=""" Rolling mean and std deviation """, ) parser.add_argument( "-w", "--window", dest="rolling_window", type=check_positive, default=100, help="rolling window", ) try: ns_parser = parse_known_args_and_warn(parser, other_args) if not ns_parser: return stock = stock["5. adjusted close"] rolling_mean = stock.rolling(ns_parser.rolling_window, center=True, min_periods=1).mean() rolling_std = stock.rolling(ns_parser.rolling_window, center=True, min_periods=1).std() fig, axMean = plt.subplots(figsize=plot_autoscale(), dpi=PLOT_DPI) axMean.plot(stock.index, stock.values, label=ticker, linewidth=2, color="black") axMean.plot(rolling_mean, linestyle="--", linewidth=3, color="blue") axMean.set_xlabel("Time") axMean.set_ylabel("Share Price", color="blue") axMean.legend(["Real values", "Rolling Mean"], loc=2) axMean.tick_params(axis="y", labelcolor="blue") axStd = axMean.twinx() axStd.plot(rolling_std, label="Rolling std", linestyle="--", color="green", linewidth=3) axStd.set_ylabel("Std Deviation") axStd.legend(["Rolling std"], loc=1) axStd.set_ylabel("Share Price standard deviation", color="green") axStd.tick_params(axis="y", labelcolor="green") axMean.set_title("Rolling mean and std with window " + str(ns_parser.rolling_window) + " applied to " + ticker) plt.xlim([stock.index[0], stock.index[-1]]) plt.grid(b=True, which="major", color="#666666", linestyle="-") plt.minorticks_on() plt.grid(b=True, which="minor", color="#999999", linestyle="-", alpha=0.2) if gtff.USE_ION: plt.ion() plt.show() print("") except Exception as e: print(e, "\n") return
    # State pool
    states = ['GA', 'FL', 'fl', 'NY', 'NJ', 'TX']

    # Make a random list of states
    random_states = [states[np.random.randint(low=0, high=len(states))]
                     for i in range(len(rng))]

    Output.extend(zip(random_states, random_status, data, rng))

    return Output


if __name__ == "__main__":
    ### Setting up Data
    np.random.seed(500)  # Set seed so we can reproduce results

    dataset = CreateDataSet(4)
    df = DataFrame(data=dataset,
                   columns=['State', 'Status', 'CustomerCount', 'StatusDate'])

    df.info()
    # <class 'pandas.core.frame.DataFrame'>
    # Int64Index: 836 entries, 0 to 835
    # Data columns (total 4 columns):
    # State            836 non-null object
    # Status           836 non-null int64
    # CustomerCount    836 non-null int64
    # StatusDate       836 non-null datetime64[ns]
    # dtypes: datetime64[ns](1), int64(2), object(1)

    print(df.head())

    # How to write the data to an Excel file
    # df.to_excel('Lesson3.xlsx', index=False)
def store_predictions_as_csv(predictions: pd.DataFrame, file_path: str) -> None:
    print('\nWriting predictions to file "{}".'.format(file_path))
    predictions.to_csv(file_path, index=False)
def featurize_site(df: pd.DataFrame, site_stats=("mean", "std_dev")) -> pd.DataFrame: """ Decorate input `pandas.DataFrame` of structures with site features from matminer. Currently creates the set of all matminer structure features with the `matminer.featurizers.structure.SiteStatsFingerprint`. Args: df (pandas.DataFrame): the input dataframe with `"structure"` column containing `pymatgen.Structure` objects. site_stats (Tuple[str]): the matminer site stats to use in the `SiteStatsFingerprint` for all features. Returns: pandas.DataFrame: the decorated DataFrame. """ logging.info("Applying site featurizers...") df = df.copy() df.columns = ["Input data|" + x for x in df.columns] site_fingerprints = ( AGNIFingerprints(), GeneralizedRadialDistributionFunction.from_preset("gaussian"), OPSiteFingerprint(), CrystalNNFingerprint.from_preset("ops"), VoronoiFingerprint(), GaussianSymmFunc(), ChemEnvSiteFingerprint.from_preset("simple"), CoordinationNumber(), LocalPropertyDifference(), BondOrientationalParameter(), AverageBondLength(VoronoiNN()), AverageBondAngle(VoronoiNN()) ) for fingerprint in site_fingerprints: site_stats_fingerprint = SiteStatsFingerprint( fingerprint, stats=site_stats ) df = site_stats_fingerprint.featurize_dataframe( df, "Input data|structure", multiindex=False, ignore_errors=True ) fingerprint_name = fingerprint.__class__.__name__ # rename some features for backwards compatibility with pretrained models if fingerprint_name == "GeneralizedRadialDistributionFunction": fingerprint_name = "GeneralizedRDF" elif fingerprint_name == "AGNIFingerprints": fingerprint_name = "AGNIFingerPrint" elif fingerprint_name == "BondOrientationalParameter": fingerprint_name = "BondOrientationParameter" elif fingerprint_name == "GaussianSymmFunc": fingerprint_name = "ChemEnvSiteFingerprint|GaussianSymmFunc" if "|" not in fingerprint_name: fingerprint_name += "|" df.columns = [f"{fingerprint_name}{x}" if "|" not in x else x for x in df.columns] df = df.loc[:, (df != 0).any(axis=0)] return clean_df(df)
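# A hypothetical usage sketch for featurize_site above; assumes matminer and
# pymatgen are installed, and "POSCAR" is a placeholder structure file.
import pandas as pd
from pymatgen.core import Structure

df = pd.DataFrame({"structure": [Structure.from_file("POSCAR")]})
df_featurized = featurize_site(df)
print(df_featurized.columns[:5])  # e.g. "AGNIFingerPrint|..." feature columns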
def backtest(self, args: Dict) -> DataFrame:
    """
    Implements backtesting functionality

    NOTE: This method is used by Hyperopt at each iteration, so keep it
    optimized: avoid ugly code, prefer plain functions over the sometimes
    slower accessors, and do not log inside this method.

    :param args: a dict containing:
        stake_amount: btc amount to use for each trade
        processed: a processed dictionary with format {pair, data}
        max_open_trades: maximum number of concurrent trades (default: 0, disabled)
        realistic: do we try to simulate realistic trades? (default: True)
    :return: DataFrame
    """
    headers = ['date', 'buy', 'open', 'close', 'sell']
    processed = args['processed']
    max_open_trades = args.get('max_open_trades', 0)
    realistic = args.get('realistic', False)
    trades = []
    trade_count_lock: Dict = {}
    for pair, pair_data in processed.items():
        pair_data['buy'], pair_data['sell'] = 0, 0  # cleanup from previous run

        ticker_data = self.populate_sell_trend(
            self.populate_buy_trend(pair_data))[headers].copy()

        # to avoid using data from the future, we buy/sell with the signal
        # from the previous candle
        ticker_data.loc[:, 'buy'] = ticker_data['buy'].shift(1)
        ticker_data.loc[:, 'sell'] = ticker_data['sell'].shift(1)

        ticker_data.drop(ticker_data.head(1).index, inplace=True)

        # Convert from Pandas to list for performance reasons
        # (looping over a DataFrame is slow)
        ticker = [x for x in ticker_data.itertuples()]

        lock_pair_until = None
        for index, row in enumerate(ticker):
            if row.buy == 0 or row.sell == 1:
                continue  # skip rows with no buy signal or that would immediately sell off

            if realistic:
                if lock_pair_until is not None and row.date <= lock_pair_until:
                    continue
            if max_open_trades > 0:
                # Check if max_open_trades has already been reached for the given date
                if not trade_count_lock.get(row.date, 0) < max_open_trades:
                    continue

                trade_count_lock[row.date] = trade_count_lock.get(row.date, 0) + 1

            trade_entry = self._get_sell_trade_entry(pair, row, ticker[index + 1:],
                                                     trade_count_lock, args)

            if trade_entry:
                lock_pair_until = trade_entry.close_time
                trades.append(trade_entry)
            else:
                # Set lock_pair_until to end of testing period if trade could not be closed
                # This happens only if the buy-signal was with the last candle
                lock_pair_until = ticker_data.iloc[-1].date

    return DataFrame.from_records(trades, columns=BacktestResult._fields)
from pandas import DataFrame, concat


def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    """
    Frame a time series as a supervised learning dataset.

    Arguments:
        data: Sequence of observations as a list or 2D NumPy array.
        n_in: Number of lag observations as input (X).
        n_out: Number of observations as output (y).
        dropnan: Whether to drop rows with NaN values.

    Returns:
        Pandas DataFrame of series framed for supervised learning.
    """
    n_vars = 1 if type(data) is list else data.shape[1]
    df = DataFrame(data)
    cols, names = list(), list()
    # input sequence (t-n, ... t-1)
    for i in range(n_in, 0, -1):
        cols.append(df.shift(i))
        names += [('var%d(t-%d)' % (j + 1, i)) for j in range(n_vars)]
    # forecast sequence (t, t+1, ... t+n)
    for i in range(0, n_out):
        cols.append(df.shift(-i))
        if i == 0:
            names += [('var%d(t)' % (j + 1)) for j in range(n_vars)]
        else:
            names += [('var%d(t+%d)' % (j + 1, i)) for j in range(n_vars)]
    # put it all together
    agg = concat(cols, axis=1)
    agg.columns = names
    # drop rows with NaN values
    if dropnan:
        agg.dropna(inplace=True)
    return agg


raw = DataFrame()
raw['ob1'] = [x for x in range(10)]
raw['ob2'] = [x for x in range(50, 60)]
values = raw.values
data = series_to_supervised(values, 3, 3)
print(data)
def test_partial_slicing_dataframe(self): # GH14856 # Test various combinations of string slicing resolution vs. # index resolution # - If string resolution is less precise than index resolution, # string is considered a slice # - If string resolution is equal to or more precise than index # resolution, string is considered an exact match formats = ['%Y', '%Y-%m', '%Y-%m-%d', '%Y-%m-%d %H', '%Y-%m-%d %H:%M', '%Y-%m-%d %H:%M:%S'] resolutions = ['year', 'month', 'day', 'hour', 'minute', 'second'] for rnum, resolution in enumerate(resolutions[2:], 2): # we check only 'day', 'hour', 'minute' and 'second' unit = Timedelta("1 " + resolution) middate = datetime(2012, 1, 1, 0, 0, 0) index = DatetimeIndex([middate - unit, middate, middate + unit]) values = [1, 2, 3] df = DataFrame({'a': values}, index, dtype=np.int64) assert df.index.resolution == resolution # Timestamp with the same resolution as index # Should be exact match for Series (return scalar) # and raise KeyError for Frame for timestamp, expected in zip(index, values): ts_string = timestamp.strftime(formats[rnum]) # make ts_string as precise as index result = df['a'][ts_string] assert isinstance(result, np.int64) assert result == expected pytest.raises(KeyError, df.__getitem__, ts_string) # Timestamp with resolution less precise than index for fmt in formats[:rnum]: for element, theslice in [[0, slice(None, 1)], [1, slice(1, None)]]: ts_string = index[element].strftime(fmt) # Series should return slice result = df['a'][ts_string] expected = df['a'][theslice] tm.assert_series_equal(result, expected) # Frame should return slice as well result = df[ts_string] expected = df[theslice] tm.assert_frame_equal(result, expected) # Timestamp with resolution more precise than index # Compatible with existing key # Should return scalar for Series # and raise KeyError for Frame for fmt in formats[rnum + 1:]: ts_string = index[1].strftime(fmt) result = df['a'][ts_string] assert isinstance(result, np.int64) assert result == 2 pytest.raises(KeyError, df.__getitem__, ts_string) # Not compatible with existing key # Should raise KeyError for fmt, res in list(zip(formats, resolutions))[rnum + 1:]: ts = index[1] + Timedelta("1 " + res) ts_string = ts.strftime(fmt) pytest.raises(KeyError, df['a'].__getitem__, ts_string) pytest.raises(KeyError, df.__getitem__, ts_string)
def find_and_set_winnings(
    wagers: pd.DataFrame,
    numbers_wagered: pd.DataFrame,
    drawings: pd.DataFrame,
    wagers_table_name: str,
    conn: sqla.engine.Connection,
) -> pd.DataFrame:
    """
    Find the prize amount of each item in the 'wagers' DataFrame.

    @param wagers: DataFrame containing keno wagers data.
    @param numbers_wagered: DataFrame containing numbers_wagered data.
    @param drawings: DataFrame containing keno drawings data.
    @param wagers_table_name: name of the wagers table to insert rows into.
    @param conn: SQLAlchemy connection used for the inserts.
    @returns wagers: modified 'wagers' DataFrame.
    """
    metadata = sqla.MetaData(bind=conn)
    wagers_table = sqla.Table(wagers_table_name, metadata, autoload=True)

    def calculate_prize(row: pd.Series) -> pd.Series:
        """
        Applied to each row of the 'wagers' DataFrame via pd.apply.
        Principally, this function is responsible for the bit-wise AND'ing
        of two lottery numbers, allowing for fast matching of those numbers.

        @param row: row holding the 'numbers_wagered_id' and 'draw_number_id'
            elements of the 'wagers' DataFrame.
        @returns row: the row with the high and low bits of the match, the
            number of matched spots and the prize amount filled in.
        """
        try:
            numbers_wagered_id = row["numbers_wagered_id"]
            draw_number_id = row["draw_number_id"]
            high_bits1 = numbers_wagered.at[numbers_wagered_id, "high_bits"]
            low_bits1 = numbers_wagered.at[numbers_wagered_id, "low_bits"]
            number_played = numbers_wagered.at[numbers_wagered_id, "numbers_played"]
            high_bits2 = drawings.at[draw_number_id, "high_bits"]
            low_bits2 = drawings.at[draw_number_id, "low_bits"]

            match_mask = [low_bits1 & low_bits2, high_bits1 & high_bits2]
            numbers_matched = sum(map(popcount64d, match_mask))

            row["low_match_mask"] = match_mask[0]
            row["high_match_mask"] = match_mask[1]
            row["numbers_matched"] = numbers_matched
            row["prize"] = PRIZE_DICT.get(number_played, {}).get(numbers_matched, 0)

            conn.execute(wagers_table.insert(), **row)
        except Exception as e:
            print(e)
        return row

    return wagers.assign(
        low_match_mask=0, high_match_mask=0, numbers_matched=0, prize=0
    ).apply(calculate_prize, axis=1)
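# A small sketch of the bitmask matching used by calculate_prize above: each
# keno board is encoded as an 80-bit mask (shown here as one Python int for
# simplicity; the function splits it into 64-bit high/low halves), and the
# number of matched spots is the popcount of the AND. popcount64d is assumed
# to be a bit-counting helper; bin(x).count('1') is a simple stand-in.
def encode(numbers):
    mask = 0
    for n in numbers:
        mask |= 1 << (n - 1)
    return mask

wagered = encode([4, 17, 23, 42, 60])  # the player's picks
drawn = encode([2, 4, 8, 11, 23, 30, 33, 42, 45, 51,
                52, 58, 60, 61, 64, 70, 71, 75, 77, 79])  # 20 drawn numbers
print(bin(wagered & drawn).count('1'))  # -> 4 matched spots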
def get_features_relevance_redundancy(
    target_nmi: pd.DataFrame,
    cross_nmi: pd.DataFrame,
    n_feat: Optional[int] = None,
    rr_parameters: Optional[Dict[str, Union[float, Callable[[int], float]]]] = None,
    return_pc: bool = False
) -> List:
    """
    Select features from the Relevance Redundancy (RR) score between the
    input features and the target output.

    The RR is defined following Equation 2 of De Breuck et al.,
    arXiv:2004.14766, with default values,

    .. math:: p = \\max(0.1, 4.5 - n^{0.4}),

    and

    .. math:: c = 10^{-6} n^{3},

    where :math:`n` is the number of features in the "chosen" subset for
    that iteration. These values can be overridden with the
    `rr_parameters` dictionary argument.

    Args:
        target_nmi (pandas.DataFrame): dataframe containing the Normalized
            Mutual Information (NMI) between a list of input features and a
            target variable, as computed from :py:func:`nmi_target`.
        cross_nmi (pandas.DataFrame): dataframe containing the NMI between
            the input features, as computed from :py:func:`get_cross_nmi`.
        n_feat (int): Number of features for which the RR score needs to be
            computed (default: all features).
        rr_parameters (dict): Allows tuning of the p and c parameters.
            Currently allows fixing p and c to constant values instead of
            using the dynamic evaluation. Expects to find keys `"p"` and
            `"c"`, each containing either a callable that takes `n` as an
            argument and returns the desired `p` or `c`, or a dictionary
            with `"function": "constant"` and a key `"value"` that stores
            the constant value of `p` or `c`.
        return_pc: Whether to return the p and c values in the output
            dictionaries.

    Returns:
        list: List of dictionaries containing the results of the
        relevance-redundancy selection algorithm.

    """
    # Initial checks
    if set(cross_nmi.index) != set(cross_nmi.columns):
        raise ValueError('The cross_nmi DataFrame should have identical indices and columns.')
    if not set(target_nmi.index).issubset(set(cross_nmi.index)):
        raise ValueError('The indices of the target DataFrame should be included in the cross_nmi DataFrame indices.')

    # Define the functions for the parameters
    if rr_parameters is None:
        get_p = get_rr_p_parameter_default
        get_c = get_rr_c_parameter_default
    else:
        if 'p' not in rr_parameters or 'c' not in rr_parameters:
            raise ValueError('When tuning p and c with rr_parameters in get_features_relevance_redundancy, '
                             'both parameters should be tuned')
        # Set up p
        if callable(rr_parameters["p"]):
            get_p = rr_parameters["p"]
        elif rr_parameters['p'].get('function') == 'constant':
            def get_p(_):
                return rr_parameters['p']['value']
        else:
            raise ValueError(
                'If not passing a callable, "p" dict must contain keys "function" and "value".'
            )
        # Set up c
        if callable(rr_parameters["c"]):
            get_c = rr_parameters["c"]
        elif rr_parameters['c'].get('function') == 'constant':
            def get_c(_):
                return rr_parameters['c']['value']
        else:
            raise ValueError(
                'If not passing a callable, "c" dict must contain keys "function" and "value".'
            )

    # Set up the output list
    out = []

    # The first feature is the one with the largest target NMI
    target_column = target_nmi.columns[0]
    first_feature = target_nmi.nlargest(1, columns=target_column).index[0]
    feature_set = [first_feature]
    feat_out = {'feature': first_feature, 'RR_score': None,
                'NMI_target': target_nmi[target_column][first_feature]}
    if return_pc:
        feat_out['RR_p'] = None
        feat_out['RR_c'] = None
    out.append(feat_out)

    # Default is to compute the RR score for all features
    if n_feat is None:
        n_feat = len(target_nmi.index)

    missing = [x for x in cross_nmi.index if x not in target_nmi.index]
    cross_nmi = cross_nmi.drop(missing, axis=0).drop(missing, axis=1)

    # Loop on the number of features
    for n in range(1, n_feat):
        logging.debug("Selecting feature {}/{}...".format(n + 1, n_feat))
        if (n + 1) % 50 == 0:
            logging.info("Selected {}/{} features...".format(n, n_feat))

        p = get_p(n)
        c = get_c(n)

        # Compute the RR score
        score = cross_nmi.copy()
        # Remove the features already selected from the index
        score = score.drop(feature_set, axis=0)
        # Use the features already selected to compute the maximum NMI
        # between the remaining features and those already selected
        score = score[feature_set]

        # Get the scores of the remaining features
        for i in score.index:
            row = score.loc[i, :]
            score.loc[i, :] = target_nmi.loc[i, target_column] / (row ** p + c)

        # Get the next feature (the one with the highest score)
        scores_remaining_features = score.min(axis=1)
        next_feature = scores_remaining_features.idxmax(axis=0)
        feature_set.append(next_feature)

        # Add the results for the next feature to the list
        feat_out = {'feature': next_feature,
                    'RR_score': scores_remaining_features[next_feature],
                    'NMI_target': target_nmi[target_column][next_feature]}
        if return_pc:
            feat_out['RR_p'] = p
            feat_out['RR_c'] = c
        out.append(feat_out)

    return out
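As a usage sketch, here is the constant-parameter path exercised on a tiny, made-up pair of NMI tables; the feature names and values are hypothetical, and real inputs would come from :py:func:`nmi_target` and :py:func:`get_cross_nmi`:

import pandas as pd

features = ["x", "y", "z"]
target_nmi = pd.DataFrame({"T": [0.9, 0.6, 0.5]}, index=features)
cross_nmi = pd.DataFrame([[1.0, 0.8, 0.1],
                          [0.8, 1.0, 0.2],
                          [0.1, 0.2, 1.0]],
                         index=features, columns=features)

# Fix p and c to constants instead of using the dynamic defaults.
selected = get_features_relevance_redundancy(
    target_nmi, cross_nmi,
    rr_parameters={"p": {"function": "constant", "value": 1.0},
                   "c": {"function": "constant", "value": 1e-6}},
    return_pc=True,
)

# "x" is picked first (largest NMI with the target); "z" follows because
# it is far less redundant with "x" than "y" is, despite its lower
# target NMI.
print([d["feature"] for d in selected])  # ['x', 'z', 'y']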
def test_to_csv_na_rep(self):
    # see gh-11553
    #
    # Test that NaN values in the index are rendered using na_rep.
    df = DataFrame({'a': [0, np.NaN], 'b': [0, 1], 'c': [2, 3]})
    expected_rows = ['a,b,c', '0.0,0,2', '_,1,3']
    expected = tm.convert_rows_list_to_csv_str(expected_rows)

    assert df.set_index('a').to_csv(na_rep='_') == expected
    assert df.set_index(['a', 'b']).to_csv(na_rep='_') == expected

    # now with an index containing only NaNs
    df = DataFrame({'a': np.NaN, 'b': [0, 1], 'c': [2, 3]})
    expected_rows = ['a,b,c', '_,0,2', '_,1,3']
    expected = tm.convert_rows_list_to_csv_str(expected_rows)

    assert df.set_index('a').to_csv(na_rep='_') == expected
    assert df.set_index(['a', 'b']).to_csv(na_rep='_') == expected

    # check that the na_rep parameter does not break anything
    # when there are no NaNs
    df = DataFrame({'a': 0, 'b': [0, 1], 'c': [2, 3]})
    expected_rows = ['a,b,c', '0,0,2', '0,1,3']
    expected = tm.convert_rows_list_to_csv_str(expected_rows)

    assert df.set_index('a').to_csv(na_rep='_') == expected
    assert df.set_index(['a', 'b']).to_csv(na_rep='_') == expected
def feature_pre_processor(path):
    # Bind the ground-truth labels file so that prepare_feature can be
    # used as a single-argument callback over the image directory.
    prepare_feature_groundtruth = partial(prepare_feature, f'{path}/labels.csv')
    ftrs = return_from_path(prepare_feature_groundtruth, f'{path}/img', '.jpg')
    return DataFrame(ftrs)
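prepare_feature and return_from_path are defined elsewhere. Purely as an illustration of the assumed contract (not the actual implementation), return_from_path is taken here to apply a one-argument callback to every file with a given extension and collect the results:

import os

def return_from_path(func, directory, extension):
    # Hypothetical helper: run `func` on each matching file in a
    # deterministic (sorted) order and gather the per-file results.
    files = sorted(name for name in os.listdir(directory)
                   if name.endswith(extension))
    return [func(os.path.join(directory, name)) for name in files]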
def test_infer_output_shape_listlike_columns(self):
    # GH 16353
    df = DataFrame(np.random.randn(6, 3), columns=['A', 'B', 'C'])

    result = df.apply(lambda x: [1, 2, 3], axis=1)
    expected = Series([[1, 2, 3] for t in df.itertuples()])
    assert_series_equal(result, expected)

    result = df.apply(lambda x: [1, 2], axis=1)
    expected = Series([[1, 2] for t in df.itertuples()])
    assert_series_equal(result, expected)

    # GH 17970
    df = DataFrame({"a": [1, 2, 3]}, index=list('abc'))

    result = df.apply(lambda row: np.ones(1), axis=1)
    expected = Series([np.ones(1) for t in df.itertuples()],
                      index=df.index)
    assert_series_equal(result, expected)

    result = df.apply(lambda row: np.ones(2), axis=1)
    expected = Series([np.ones(2) for t in df.itertuples()],
                      index=df.index)
    assert_series_equal(result, expected)

    # GH 17892
    df = pd.DataFrame({'a': [pd.Timestamp('2010-02-01'),
                             pd.Timestamp('2010-02-04'),
                             pd.Timestamp('2010-02-05'),
                             pd.Timestamp('2010-02-06')],
                       'b': [9, 5, 4, 3],
                       'c': [5, 3, 4, 2],
                       'd': [1, 2, 3, 4]})

    def fun(x):
        return (1, 2)

    result = df.apply(fun, axis=1)
    expected = Series([(1, 2) for t in df.itertuples()])
    assert_series_equal(result, expected)
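The shape inference tested above can also be controlled explicitly through the result_type argument of DataFrame.apply (added in pandas 0.23); a short illustration:

import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.randn(3, 3), columns=['A', 'B', 'C'])

# Default inference: a returned list whose length differs from the
# number of columns is kept as a Series of lists.
reduced = df.apply(lambda x: [1, 2], axis=1)

# result_type='expand' turns each returned list-like into columns instead.
expanded = df.apply(lambda x: [1, 2], axis=1, result_type='expand')

print(reduced.shape, expanded.shape)  # (3,) (3, 2)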
def test_to_csv_multi_index(self):
    # see gh-6618
    df = DataFrame([1], columns=pd.MultiIndex.from_arrays([[1], [2]]))

    exp_rows = [',1', ',2', '0,1']
    exp = tm.convert_rows_list_to_csv_str(exp_rows)
    assert df.to_csv() == exp

    exp_rows = ['1', '2', '1']
    exp = tm.convert_rows_list_to_csv_str(exp_rows)
    assert df.to_csv(index=False) == exp

    df = DataFrame([1], columns=pd.MultiIndex.from_arrays([[1], [2]]),
                   index=pd.MultiIndex.from_arrays([[1], [2]]))

    exp_rows = [',,1', ',,2', '1,2,1']
    exp = tm.convert_rows_list_to_csv_str(exp_rows)
    assert df.to_csv() == exp

    exp_rows = ['1', '2', '1']
    exp = tm.convert_rows_list_to_csv_str(exp_rows)
    assert df.to_csv(index=False) == exp

    df = DataFrame(
        [1], columns=pd.MultiIndex.from_arrays([['foo'], ['bar']]))

    exp_rows = [',foo', ',bar', '0,1']
    exp = tm.convert_rows_list_to_csv_str(exp_rows)
    assert df.to_csv() == exp

    exp_rows = ['foo', 'bar', '1']
    exp = tm.convert_rows_list_to_csv_str(exp_rows)
    assert df.to_csv(index=False) == exp
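For reference, a frame written this way can be read back by telling the parser about both header rows; a small round-trip sketch:

import io
import pandas as pd

df = pd.DataFrame([1], columns=pd.MultiIndex.from_arrays([['foo'], ['bar']]))
csv = df.to_csv()

# header=[0, 1] rebuilds the two column levels; index_col=0 restores
# the index written by to_csv().
roundtrip = pd.read_csv(io.StringIO(csv), header=[0, 1], index_col=0)
assert list(roundtrip.columns) == [('foo', 'bar')]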
def test_applymap(self, float_frame):
    applied = float_frame.applymap(lambda x: x * 2)
    tm.assert_frame_equal(applied, float_frame * 2)
    float_frame.applymap(type)

    # GH 465: function returning tuples
    result = float_frame.applymap(lambda x: (x, x))
    assert isinstance(result['A'][0], tuple)

    # GH 2909: object conversion to float in constructor?
    df = DataFrame(data=[1, 'a'])
    result = df.applymap(lambda x: x)
    assert result.dtypes[0] == object

    df = DataFrame(data=[1., 'a'])
    result = df.applymap(lambda x: x)
    assert result.dtypes[0] == object

    # GH 2786
    df = DataFrame(np.random.random((3, 4)))
    df2 = df.copy()
    cols = ['a', 'a', 'a', 'a']
    df.columns = cols

    expected = df2.applymap(str)
    expected.columns = cols
    result = df.applymap(str)
    tm.assert_frame_equal(result, expected)

    # datetime/timedelta
    df['datetime'] = Timestamp('20130101')
    df['timedelta'] = pd.Timedelta('1 min')
    result = df.applymap(str)
    for f in ['datetime', 'timedelta']:
        assert result.loc[0, f] == str(df.loc[0, f])

    # GH 8222
    empty_frames = [pd.DataFrame(),
                    pd.DataFrame(columns=list('ABC')),
                    pd.DataFrame(index=list('ABC')),
                    pd.DataFrame({'A': [], 'B': [], 'C': []})]
    for frame in empty_frames:
        for func in [round, lambda x: x]:
            result = frame.applymap(func)
            tm.assert_frame_equal(result, frame)
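The elementwise contract checked here is what separates applymap from apply: the former receives scalars, the latter whole Series. A quick comparison:

import pandas as pd

df = pd.DataFrame({'a': [1.0, 2.5], 'b': [3.25, 4.0]})

# applymap: the callable sees each scalar in turn.
formatted = df.applymap(lambda x: '{:.1f}'.format(x))

# apply: the callable sees a whole column (Series) at a time.
maxima = df.apply(lambda col: col.max())

print(formatted)
print(maxima)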
class TestDataFrameAggregate():

    def test_agg_transform(self, axis, float_frame):
        other_axis = 1 if axis in {0, 'index'} else 0

        with np.errstate(all='ignore'):
            f_abs = np.abs(float_frame)
            f_sqrt = np.sqrt(float_frame)

            # ufunc
            result = float_frame.transform(np.sqrt, axis=axis)
            expected = f_sqrt.copy()
            assert_frame_equal(result, expected)

            result = float_frame.apply(np.sqrt, axis=axis)
            assert_frame_equal(result, expected)

            result = float_frame.transform(np.sqrt, axis=axis)
            assert_frame_equal(result, expected)

            # list-like
            result = float_frame.apply([np.sqrt], axis=axis)
            expected = f_sqrt.copy()
            if axis in {0, 'index'}:
                expected.columns = pd.MultiIndex.from_product(
                    [float_frame.columns, ['sqrt']])
            else:
                expected.index = pd.MultiIndex.from_product(
                    [float_frame.index, ['sqrt']])
            assert_frame_equal(result, expected)

            result = float_frame.transform([np.sqrt], axis=axis)
            assert_frame_equal(result, expected)

            # multiple items in list
            # these are in the order as if we are applying both
            # functions per series and then concatting
            result = float_frame.apply([np.abs, np.sqrt], axis=axis)
            expected = zip_frames([f_abs, f_sqrt], axis=other_axis)
            if axis in {0, 'index'}:
                expected.columns = pd.MultiIndex.from_product(
                    [float_frame.columns, ['absolute', 'sqrt']])
            else:
                expected.index = pd.MultiIndex.from_product(
                    [float_frame.index, ['absolute', 'sqrt']])
            assert_frame_equal(result, expected)

            result = float_frame.transform([np.abs, 'sqrt'], axis=axis)
            assert_frame_equal(result, expected)

    def test_transform_and_agg_err(self, axis, float_frame):
        # cannot both transform and agg
        with pytest.raises(ValueError):
            float_frame.transform(['max', 'min'], axis=axis)

        with pytest.raises(ValueError):
            with np.errstate(all='ignore'):
                float_frame.agg(['max', 'sqrt'], axis=axis)

        with pytest.raises(ValueError):
            with np.errstate(all='ignore'):
                float_frame.transform(['max', 'sqrt'], axis=axis)

        df = pd.DataFrame({'A': range(5), 'B': 5})

        def f():
            with np.errstate(all='ignore'):
                df.agg({'A': ['abs', 'sum'], 'B': ['mean', 'max']}, axis=axis)

    @pytest.mark.parametrize('method', [
        'abs', 'shift', 'pct_change', 'cumsum', 'rank',
    ])
    def test_transform_method_name(self, method):
        # GH 19760
        df = pd.DataFrame({"A": [-1, 2]})
        result = df.transform(method)
        expected = operator.methodcaller(method)(df)
        tm.assert_frame_equal(result, expected)

    def test_demo(self):
        # demonstration tests
        df = pd.DataFrame({'A': range(5), 'B': 5})

        result = df.agg(['min', 'max'])
        expected = DataFrame({'A': [0, 4], 'B': [5, 5]},
                             columns=['A', 'B'],
                             index=['min', 'max'])
        tm.assert_frame_equal(result, expected)

        result = df.agg({'A': ['min', 'max'], 'B': ['sum', 'max']})
        expected = DataFrame({'A': [4.0, 0.0, np.nan],
                              'B': [5.0, np.nan, 25.0]},
                             columns=['A', 'B'],
                             index=['max', 'min', 'sum'])
        tm.assert_frame_equal(result.reindex_like(expected), expected)

    def test_agg_multiple_mixed_no_warning(self):
        # GH 20909
        mdf = pd.DataFrame({'A': [1, 2, 3],
                            'B': [1., 2., 3.],
                            'C': ['foo', 'bar', 'baz'],
                            'D': pd.date_range('20130101', periods=3)})
        expected = pd.DataFrame({"A": [1, 6], 'B': [1.0, 6.0],
                                 "C": ['bar', 'foobarbaz'],
                                 "D": [pd.Timestamp('2013-01-01'), pd.NaT]},
                                index=['min', 'sum'])
        # sorted index
        with tm.assert_produces_warning(None):
            result = mdf.agg(['min', 'sum'])

        tm.assert_frame_equal(result, expected)

        with tm.assert_produces_warning(None):
            result = mdf[['D', 'C', 'B', 'A']].agg(['sum', 'min'])

        # For backwards compatibility, the result's index is
        # still sorted by function name, so it's ['min', 'sum']
        # not ['sum', 'min'].
        expected = expected[['D', 'C', 'B', 'A']]
        tm.assert_frame_equal(result, expected)

    def test_agg_dict_nested_renaming_depr(self):
        df = pd.DataFrame({'A': range(5), 'B': 5})

        # nested renaming
        with tm.assert_produces_warning(FutureWarning,
                                        check_stacklevel=False):
            df.agg({'A': {'foo': 'min'}, 'B': {'bar': 'max'}})

    def test_agg_reduce(self, axis, float_frame):
        other_axis = 1 if axis in {0, 'index'} else 0
        name1, name2 = float_frame.axes[other_axis].unique()[:2].sort_values()

        # all reducers
        expected = pd.concat([float_frame.mean(axis=axis),
                              float_frame.max(axis=axis),
                              float_frame.sum(axis=axis),
                              ], axis=1)
        expected.columns = ['mean', 'max', 'sum']
        expected = expected.T if axis in {0, 'index'} else expected

        result = float_frame.agg(['mean', 'max', 'sum'], axis=axis)
        assert_frame_equal(result, expected)

        # dict input with scalars
        func = OrderedDict([(name1, 'mean'), (name2, 'sum')])
        result = float_frame.agg(func, axis=axis)
        expected = Series([float_frame.loc(other_axis)[name1].mean(),
                           float_frame.loc(other_axis)[name2].sum()],
                          index=[name1, name2])
        assert_series_equal(result, expected)

        # dict input with lists
        func = OrderedDict([(name1, ['mean']), (name2, ['sum'])])
        result = float_frame.agg(func, axis=axis)
        expected = DataFrame({
            name1: Series([float_frame.loc(other_axis)[name1].mean()],
                          index=['mean']),
            name2: Series([float_frame.loc(other_axis)[name2].sum()],
                          index=['sum'])})
        expected = expected.T if axis in {1, 'columns'} else expected
        assert_frame_equal(result, expected)

        # dict input with lists with multiple
        func = OrderedDict([(name1, ['mean', 'sum']),
                            (name2, ['sum', 'max'])])
        result = float_frame.agg(func, axis=axis)
        expected = DataFrame(OrderedDict([
            (name1, Series([float_frame.loc(other_axis)[name1].mean(),
                            float_frame.loc(other_axis)[name1].sum()],
                           index=['mean', 'sum'])),
            (name2, Series([float_frame.loc(other_axis)[name2].sum(),
                            float_frame.loc(other_axis)[name2].max()],
                           index=['sum', 'max'])),
        ]))
        expected = expected.T if axis in {1, 'columns'} else expected
        assert_frame_equal(result, expected)

    def test_nuisance_columns(self):
        # GH 15015
        df = DataFrame({'A': [1, 2, 3],
                        'B': [1., 2., 3.],
                        'C': ['foo', 'bar', 'baz'],
                        'D': pd.date_range('20130101', periods=3)})

        result = df.agg('min')
        expected = Series([1, 1., 'bar', pd.Timestamp('20130101')],
                          index=df.columns)
        assert_series_equal(result, expected)

        result = df.agg(['min'])
        expected = DataFrame([[1, 1., 'bar', pd.Timestamp('20130101')]],
                             index=['min'], columns=df.columns)
        assert_frame_equal(result, expected)

        result = df.agg('sum')
        expected = Series([6, 6., 'foobarbaz'],
                          index=['A', 'B', 'C'])
        assert_series_equal(result, expected)

        result = df.agg(['sum'])
        expected = DataFrame([[6, 6., 'foobarbaz']],
                             index=['sum'], columns=['A', 'B', 'C'])
        assert_frame_equal(result, expected)

    def test_non_callable_aggregates(self):
        # GH 16405
        # 'size' is a property of frame/series
        # validate that this is working
        df = DataFrame({'A': [None, 2, 3],
                        'B': [1.0, np.nan, 3.0],
                        'C': ['foo', None, 'bar']})

        # Function aggregate
        result = df.agg({'A': 'count'})
        expected = Series({'A': 2})
        assert_series_equal(result, expected)

        # Non-function aggregate
        result = df.agg({'A': 'size'})
        expected = Series({'A': 3})
        assert_series_equal(result, expected)

        # Mix function and non-function aggs
        result1 = df.agg(['count', 'size'])
        result2 = df.agg({'A': ['count', 'size'],
                          'B': ['count', 'size'],
                          'C': ['count', 'size']})
        expected = pd.DataFrame({'A': {'count': 2, 'size': 3},
                                 'B': {'count': 2, 'size': 3},
                                 'C': {'count': 2, 'size': 3}})

        assert_frame_equal(result1, result2, check_like=True)
        assert_frame_equal(result2, expected, check_like=True)

        # Just functional string arg is same as calling df.arg()
        result = df.agg('count')
        expected = df.count()
        assert_series_equal(result, expected)

        # Just a string attribute arg same as calling df.arg
        result = df.agg('size')
        expected = df.size
        assert result == expected

    @pytest.mark.parametrize("df, func, expected", chain(
        _get_cython_table_params(
            DataFrame(), [
                ('sum', Series()),
                ('max', Series()),
                ('min', Series()),
                ('all', Series(dtype=bool)),
                ('any', Series(dtype=bool)),
                ('mean', Series()),
                ('prod', Series()),
                ('std', Series()),
                ('var', Series()),
                ('median', Series()),
            ]),
        _get_cython_table_params(
            DataFrame([[np.nan, 1], [1, 2]]), [
                ('sum', Series([1., 3])),
                ('max', Series([1., 2])),
                ('min', Series([1., 1])),
                ('all', Series([True, True])),
                ('any', Series([True, True])),
                ('mean', Series([1, 1.5])),
                ('prod', Series([1., 2])),
                ('std', Series([np.nan, 0.707107])),
                ('var', Series([np.nan, 0.5])),
                ('median', Series([1, 1.5])),
            ]),
    ))
    def test_agg_cython_table(self, df, func, expected, axis):
        # GH 21224
        # test reducing functions in
        # pandas.core.base.SelectionMixin._cython_table
        result = df.agg(func, axis=axis)
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize("df, func, expected", chain(
        _get_cython_table_params(
            DataFrame(), [
                ('cumprod', DataFrame()),
                ('cumsum', DataFrame()),
            ]),
        _get_cython_table_params(
            DataFrame([[np.nan, 1], [1, 2]]), [
                ('cumprod', DataFrame([[np.nan, 1], [1., 2.]])),
                ('cumsum', DataFrame([[np.nan, 1], [1., 3.]])),
            ]),
    ))
    def test_agg_cython_table_transform(self, df, func, expected, axis):
        # GH 21224
        # test transforming functions in
        # pandas.core.base.SelectionMixin._cython_table (cumprod, cumsum)
        result = df.agg(func, axis=axis)
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("df, func, expected", _get_cython_table_params(
        DataFrame([['a', 'b'], ['b', 'a']]), [
            ['cumprod', TypeError],
        ]),
    )
    def test_agg_cython_table_raises(self, df, func, expected, axis):
        # GH 21224
        with pytest.raises(expected):
            df.agg(func, axis=axis)

    @pytest.mark.parametrize("num_cols", [2, 3, 5])
    def test_frequency_is_original(self, num_cols):
        # GH 22150
        index = pd.DatetimeIndex(["1950-06-30",
                                  "1952-10-24",
                                  "1953-05-29"])
        original = index.copy()
        df = DataFrame(1, index=index, columns=range(num_cols))
        df.apply(lambda x: x)
        assert index.freq == original.freq
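_get_cython_table_params comes from the pandas test-suite fixtures; in spirit it pairs the same input frame with every (function, expected) tuple so that pytest.mark.parametrize can unpack them. A simplified sketch (the real helper also expands each name to its callable aliases from _cython_table):

def _get_cython_table_params(ndframe, func_names_and_expected):
    # Simplified: yield one (frame, func, expected) triple per entry.
    return [(ndframe, func, expected)
            for func, expected in func_names_and_expected]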