def test_isnull():
    assert not isnull(1.)
    assert isnull(None)
    assert isnull(np.NaN)
    assert not isnull(np.inf)
    assert not isnull(-np.inf)

    # series
    for s in [tm.makeFloatSeries(), tm.makeStringSeries(),
              tm.makeObjectSeries(), tm.makeTimeSeries(),
              tm.makePeriodSeries()]:
        assert isinstance(isnull(s), Series)

    # frame
    for df in [tm.makeTimeDataFrame(), tm.makePeriodFrame(),
               tm.makeMixedDataFrame()]:
        result = isnull(df)
        expected = df.apply(isnull)
        tm.assert_frame_equal(result, expected)

    # panel
    for p in [tm.makePanel(), tm.makePeriodPanel(),
              tm.add_nans(tm.makePanel())]:
        result = isnull(p)
        expected = p.apply(isnull)
        tm.assert_panel_equal(result, expected)

    # panel 4d
    for p in [tm.makePanel4D(), tm.add_nans_panel4d(tm.makePanel4D())]:
        result = isnull(p)
        expected = p.apply(isnull)
        tm.assert_panel4d_equal(result, expected)
def test_hash_pandas_object(self):
    for obj in [Series([1, 2, 3]),
                Series([1.0, 1.5, 3.2]),
                Series([1.0, 1.5, np.nan]),
                Series([1.0, 1.5, 3.2], index=[1.5, 1.1, 3.3]),
                Series(['a', 'b', 'c']),
                Series(['a', np.nan, 'c']),
                Series([True, False, True]),
                Index([1, 2, 3]),
                Index([True, False, True]),
                DataFrame({'x': ['a', 'b', 'c'], 'y': [1, 2, 3]}),
                tm.makeMissingDataframe(),
                tm.makeMixedDataFrame(),
                tm.makeTimeDataFrame(),
                tm.makeTimeSeries(),
                tm.makeTimedeltaIndex(),
                Series([1, 2, 3], index=pd.MultiIndex.from_tuples(
                    [('a', 1), ('a', 2), ('b', 1)]))]:
        self.check_equal(obj)
        self.check_not_equal_with_index(obj)
def test_isna_isnull(self, isna_f):
    assert not isna_f(1.)
    assert isna_f(None)
    assert isna_f(np.NaN)
    assert isna_f(float('nan'))
    assert not isna_f(np.inf)
    assert not isna_f(-np.inf)

    # series
    for s in [tm.makeFloatSeries(), tm.makeStringSeries(),
              tm.makeObjectSeries(), tm.makeTimeSeries(),
              tm.makePeriodSeries()]:
        assert isinstance(isna_f(s), Series)

    # frame
    for df in [tm.makeTimeDataFrame(), tm.makePeriodFrame(),
               tm.makeMixedDataFrame()]:
        result = isna_f(df)
        expected = df.apply(isna_f)
        tm.assert_frame_equal(result, expected)

    # panel
    with catch_warnings(record=True):
        simplefilter("ignore", FutureWarning)
        for p in [tm.makePanel(), tm.makePeriodPanel(),
                  tm.add_nans(tm.makePanel())]:
            result = isna_f(p)
            expected = p.apply(isna_f)
            tm.assert_panel_equal(result, expected)
def test_upload_data_if_table_exists_replace(self):
    raise nose.SkipTest("buggy test")

    destination_table = DESTINATION_TABLE + "4"
    test_size = 10
    df = make_mixed_dataframe_v2(test_size)
    df_different_schema = tm.makeMixedDataFrame()

    # Initialize table with sample data
    gbq.to_gbq(df, destination_table, _get_project_id(), chunksize=10000,
               private_key=_get_private_key_path())

    # Test the if_exists parameter with the value 'replace'.
    gbq.to_gbq(df_different_schema, destination_table,
               _get_project_id(), if_exists='replace',
               private_key=_get_private_key_path())

    sleep(30)  # <- Curses Google!!!

    result = gbq.read_gbq("SELECT COUNT(*) as NUM_ROWS FROM {0}"
                          .format(destination_table),
                          project_id=_get_project_id(),
                          private_key=_get_private_key_path())
    self.assertEqual(result['NUM_ROWS'][0], 5)
def test_hash_pandas_object(self):
    for obj in [Series([1, 2, 3]),
                Series([1.0, 1.5, 3.2]),
                Series([1.0, 1.5, np.nan]),
                Series([1.0, 1.5, 3.2], index=[1.5, 1.1, 3.3]),
                Series(['a', 'b', 'c']),
                Series(['a', np.nan, 'c']),
                Series(['a', None, 'c']),
                Series([True, False, True]),
                Series(),
                Index([1, 2, 3]),
                Index([True, False, True]),
                DataFrame({'x': ['a', 'b', 'c'], 'y': [1, 2, 3]}),
                DataFrame(),
                tm.makeMissingDataframe(),
                tm.makeMixedDataFrame(),
                tm.makeTimeDataFrame(),
                tm.makeTimeSeries(),
                tm.makeTimedeltaIndex(),
                tm.makePeriodIndex(),
                Series(tm.makePeriodIndex()),
                Series(pd.date_range('20130101', periods=3, tz='US/Eastern')),
                MultiIndex.from_product(
                    [range(5), ['foo', 'bar', 'baz'],
                     pd.date_range('20130101', periods=2)]),
                MultiIndex.from_product(
                    [pd.CategoricalIndex(list('aabc')), range(3)])]:
        self.check_equal(obj)
        self.check_not_equal_with_index(obj)
def test_upload_data_if_table_exists_append(self):
    destination_table = DESTINATION_TABLE + "3"
    test_size = 10
    df = make_mixed_dataframe_v2(test_size)
    df_different_schema = tm.makeMixedDataFrame()

    # Initialize table with sample data
    gbq.to_gbq(df, destination_table, _get_project_id(), chunksize=10000,
               private_key=_get_private_key_path())

    # Test the if_exists parameter with value 'append'
    gbq.to_gbq(df, destination_table, _get_project_id(),
               if_exists='append', private_key=_get_private_key_path())

    sleep(30)  # <- Curses Google!!!

    result = gbq.read_gbq("SELECT COUNT(*) as NUM_ROWS FROM {0}"
                          .format(destination_table),
                          project_id=_get_project_id(),
                          private_key=_get_private_key_path())
    self.assertEqual(result['NUM_ROWS'][0], test_size * 2)

    # Try inserting with a different schema, confirm failure
    with tm.assertRaises(gbq.InvalidSchema):
        gbq.to_gbq(df_different_schema, destination_table,
                   _get_project_id(), if_exists='append',
                   private_key=_get_private_key_path())
def test_isnull(self):
    self.assertFalse(isnull(1.))
    self.assertTrue(isnull(None))
    self.assertTrue(isnull(np.NaN))
    self.assertTrue(isnull(float('nan')))
    self.assertFalse(isnull(np.inf))
    self.assertFalse(isnull(-np.inf))

    # series
    for s in [tm.makeFloatSeries(), tm.makeStringSeries(),
              tm.makeObjectSeries(), tm.makeTimeSeries(),
              tm.makePeriodSeries()]:
        assert isinstance(isnull(s), Series)

    # frame
    for df in [tm.makeTimeDataFrame(), tm.makePeriodFrame(),
               tm.makeMixedDataFrame()]:
        result = isnull(df)
        expected = df.apply(isnull)
        tm.assert_frame_equal(result, expected)

    # panel
    with catch_warnings(record=True):
        for p in [tm.makePanel(), tm.makePeriodPanel(),
                  tm.add_nans(tm.makePanel())]:
            result = isnull(p)
            expected = p.apply(isnull)
            tm.assert_panel_equal(result, expected)

    # panel 4d
    with catch_warnings(record=True):
        for p in [tm.makePanel4D(), tm.add_nans_panel4d(tm.makePanel4D())]:
            result = isnull(p)
            expected = p.apply(isnull)
            tm.assert_panel4d_equal(result, expected)
def test_csv_to_s3_into():
    df = tm.makeMixedDataFrame()
    with tmpfile('.csv') as fn:
        with s3_bucket('.csv') as b:
            df.to_csv(fn, index=False)
            s3 = into(b, CSV(fn))
            result = into(pd.DataFrame, s3)
            tm.assert_frame_equal(df, result)
def test_csv_to_s3_append():
    df = tm.makeMixedDataFrame()
    with tmpfile(".csv") as fn:
        with s3_bucket(".csv") as b:
            s3 = resource(b)
            df.to_csv(fn, index=False)
            append(s3, CSV(fn))
            result = into(pd.DataFrame, s3)
            tm.assert_frame_equal(df, result)
def test_generate_bq_schema(self):
    df = tm.makeMixedDataFrame()
    schema = gbq.generate_bq_schema(df)

    test_schema = {'fields': [{'name': 'A', 'type': 'FLOAT'},
                              {'name': 'B', 'type': 'FLOAT'},
                              {'name': 'C', 'type': 'STRING'},
                              {'name': 'D', 'type': 'TIMESTAMP'}]}

    self.assertEqual(schema, test_schema)
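# Several snippets in this collection (the gbq schema tests here and the
# Tabulator tests further down) rely on the fixture returned by
# pandas.util.testing.makeMixedDataFrame(). A minimal hand-built sketch of
# that frame, with values inferred from those tests, for readers whose
# pandas version no longer ships the helper:
import datetime as dt
import pandas as pd

mixed_df = pd.DataFrame({
    'A': [0.0, 1.0, 2.0, 3.0, 4.0],                 # -> FLOAT in the BQ schema
    'B': [0.0, 1.0, 0.0, 1.0, 0.0],                 # -> FLOAT
    'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'],  # -> STRING
    'D': [dt.datetime(2009, 1, 1), dt.datetime(2009, 1, 2),
          dt.datetime(2009, 1, 5), dt.datetime(2009, 1, 6),
          dt.datetime(2009, 1, 7)],                 # -> TIMESTAMP
})
print(mixed_df.dtypes)  # float64, float64, object, datetime64[ns]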
def test_s3_encrypted_multipart_upload(s3_encryption_bucket):
    s3_connection = boto.connect_s3()
    df = tm.makeMixedDataFrame()
    with tmpfile('.csv') as fn:
        df.to_csv(fn, index=False)
        s3_uri = 's3://{bucket}/{fn}'.format(bucket=s3_encryption_bucket,
                                             fn=os.path.basename(fn))
        odo(fn, s3_uri, s3=s3_connection, encrypt_key=True, multipart=True)
        result = odo(s3_uri, pd.DataFrame, s3=s3_connection)
        tm.assert_frame_equal(df, result)
def test_head_compute():
    data = tm.makeMixedDataFrame()
    t = symbol('t', discover(data))
    db = into('sqlite:///:memory:::t', data, dshape=t.dshape)
    n = 2
    d = Data(db)

    # skip the header and the ... at the end of the repr
    expr = d.head(n)
    s = repr(expr)
    assert '...' not in s
    result = s.split('\n')[1:]
    assert len(result) == n
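# The odo/into tests above push the mixed frame through S3 and SQLite; the
# round trip is the same without any remote resource. A minimal local
# sketch, assuming the odo package is installed (whether every dtype
# survives the CSV round trip depends on the odo CSV backend):
import pandas as pd
import pandas.util.testing as tm
from odo import odo

df = tm.makeMixedDataFrame()
df.to_csv('mixed.csv', index=False)          # write to a local CSV
result = odo('mixed.csv', pd.DataFrame)      # CSV -> DataFrame, mirroring the tests above
print(result.head())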
def test_generate_schema(self):
    df = tm.makeMixedDataFrame()
    schema = gbq._generate_bq_schema(df)

    test_schema = {"fields": [{"name": "A", "type": "FLOAT"},
                              {"name": "B", "type": "FLOAT"},
                              {"name": "C", "type": "STRING"},
                              {"name": "D", "type": "TIMESTAMP"}]}

    self.assertEqual(schema, test_schema)
def test_upload_data_if_table_exists_replace(self):
    table_name = 'new_test4'
    test_size = 10
    df = make_mixed_dataframe_v2(test_size)
    df_different_schema = tm.makeMixedDataFrame()

    # Initialize table with sample data
    gbq.to_gbq(df, "pydata_pandas_bq_testing." + table_name, PROJECT_ID,
               chunksize=10000)

    # Test the if_exists parameter with the value 'replace'.
    gbq.to_gbq(df_different_schema,
               "pydata_pandas_bq_testing." + table_name, PROJECT_ID,
               if_exists='replace')

    sleep(60)  # <- Curses Google!!!

    result = gbq.read_gbq("SELECT COUNT(*) as NUM_ROWS FROM pydata_pandas_bq_testing." + table_name,
                          project_id=PROJECT_ID)
    self.assertEqual(result['NUM_ROWS'][0], 5)
def test_upload_data_if_table_exists_replace(self):
    destination_table = DESTINATION_TABLE + "4"
    test_size = 10
    df = make_mixed_dataframe_v2(test_size)
    df_different_schema = tm.makeMixedDataFrame()

    # Initialize table with sample data
    gbq.to_gbq(df, destination_table, PROJECT_ID, chunksize=10000)

    # Test the if_exists parameter with the value 'replace'.
    gbq.to_gbq(df_different_schema, destination_table, PROJECT_ID,
               if_exists='replace')

    sleep(60)  # <- Curses Google!!!

    result = gbq.read_gbq("SELECT COUNT(*) as NUM_ROWS FROM {0}".format(destination_table),
                          project_id=PROJECT_ID)
    self.assertEqual(result['NUM_ROWS'][0], 5)
def test_isnull(self):
    self.assertFalse(isnull(1.))
    self.assertTrue(isnull(None))
    self.assertTrue(isnull(np.NaN))
    self.assertTrue(isnull(float('nan')))
    self.assertFalse(isnull(np.inf))
    self.assertFalse(isnull(-np.inf))

    # series
    for s in [tm.makeFloatSeries(), tm.makeStringSeries(),
              tm.makeObjectSeries(), tm.makeTimeSeries(),
              tm.makePeriodSeries()]:
        self.assertIsInstance(isnull(s), Series)

    # frame
    for df in [tm.makeTimeDataFrame(), tm.makePeriodFrame(),
               tm.makeMixedDataFrame()]:
        result = isnull(df)
        expected = df.apply(isnull)
        tm.assert_frame_equal(result, expected)

    # panel
    for p in [tm.makePanel(), tm.makePeriodPanel(),
              tm.add_nans(tm.makePanel())]:
        result = isnull(p)
        expected = p.apply(isnull)
        tm.assert_panel_equal(result, expected)

    # panel 4d
    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        for p in [tm.makePanel4D(), tm.add_nans_panel4d(tm.makePanel4D())]:
            result = isnull(p)
            expected = p.apply(isnull)
            tm.assert_panel4d_equal(result, expected)
def test_dataframe():
    import numpy
    from pandas import Timestamp
    from pandas.util import testing
    from pandas.tslib import NaTType
    from ..dataframe import dataframe_to_rows

    df = testing.makeMixedDataFrame()
    df.iloc[0] = numpy.nan
    rows = tuple(dataframe_to_rows(df))
    assert isnan(rows[1][1])
    assert type(rows[1][-1]) == NaTType
    assert rows[2:] == (
        [1, 1.0, 1.0, 'foo2', Timestamp('2009-01-02 00:00:00')],
        [2, 2.0, 0.0, 'foo3', Timestamp('2009-01-05 00:00:00')],
        [3, 3.0, 1.0, 'foo4', Timestamp('2009-01-06 00:00:00')],
        [4, 4.0, 0.0, 'foo5', Timestamp('2009-01-07 00:00:00')],
    )
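# The test above imports the helper relatively from inside the openpyxl
# source tree; in released openpyxl it is exposed as
# openpyxl.utils.dataframe.dataframe_to_rows. A short usage sketch writing
# the same mixed frame into a worksheet:
from openpyxl import Workbook
from openpyxl.utils.dataframe import dataframe_to_rows
import pandas.util.testing as testing

wb = Workbook()
ws = wb.active
for row in dataframe_to_rows(testing.makeMixedDataFrame(),
                             index=True, header=True):
    ws.append(row)   # each yielded row is a plain list openpyxl can append
wb.save('mixed.xlsx')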
def test_isnull():
    assert not isnull(1.)
    assert isnull(None)
    assert isnull(np.NaN)
    assert not isnull(np.inf)
    assert not isnull(-np.inf)

    # series
    for s in [tm.makeFloatSeries(), tm.makeStringSeries(),
              tm.makeObjectSeries(), tm.makeTimeSeries(),
              tm.makePeriodSeries()]:
        assert isinstance(isnull(s), Series)

    # frame
    for df in [tm.makeTimeDataFrame(), tm.makePeriodFrame(),
               tm.makeMixedDataFrame()]:
        result = isnull(df)
        expected = df.apply(isnull)
        tm.assert_frame_equal(result, expected)

    # panel
    for p in [tm.makePanel(), tm.makePeriodPanel(),
              tm.add_nans(tm.makePanel())]:
        result = isnull(p)
        expected = p.apply(isnull)
        tm.assert_panel_equal(result, expected)

    # panel 4d
    for p in [tm.makePanel4D(), tm.add_nans_panel4d(tm.makePanel4D())]:
        result = isnull(p)
        expected = p.apply(isnull)
        tm.assert_panel4d_equal(result, expected)
def test_isna_isnull(self, isna_f):
    assert not isna_f(1.)
    assert isna_f(None)
    assert isna_f(np.NaN)
    assert isna_f(float('nan'))
    assert not isna_f(np.inf)
    assert not isna_f(-np.inf)

    # series
    for s in [tm.makeFloatSeries(), tm.makeStringSeries(),
              tm.makeObjectSeries(), tm.makeTimeSeries(),
              tm.makePeriodSeries()]:
        assert isinstance(isna_f(s), Series)

    # frame
    for df in [tm.makeTimeDataFrame(), tm.makePeriodFrame(),
               tm.makeMixedDataFrame()]:
        result = isna_f(df)
        expected = df.apply(isna_f)
        tm.assert_frame_equal(result, expected)
def test_hash_pandas_object(self):
    for obj in [Series([1, 2, 3]),
                Series([1.0, 1.5, 3.2]),
                Series([1.0, 1.5, np.nan]),
                Series([1.0, 1.5, 3.2], index=[1.5, 1.1, 3.3]),
                Series(['a', 'b', 'c']),
                Series(['a', np.nan, 'c']),
                Series(['a', None, 'c']),
                Series([True, False, True]),
                Index([1, 2, 3]),
                Index([True, False, True]),
                DataFrame({'x': ['a', 'b', 'c'], 'y': [1, 2, 3]}),
                tm.makeMissingDataframe(),
                tm.makeMixedDataFrame(),
                tm.makeTimeDataFrame(),
                tm.makeTimeSeries(),
                tm.makeTimedeltaIndex()]:
        self.check_equal(obj)
        self.check_not_equal_with_index(obj)
def test_upload_data_if_table_exists_replace(self):
    destination_table = DESTINATION_TABLE + "4"
    test_size = 10
    df = make_mixed_dataframe_v2(test_size)
    df_different_schema = tm.makeMixedDataFrame()

    # Initialize table with sample data
    gbq.to_gbq(df, destination_table, PROJECT_ID, chunksize=10000)

    # Test the if_exists parameter with the value 'replace'.
    gbq.to_gbq(df_different_schema, destination_table, PROJECT_ID,
               if_exists='replace')

    sleep(30)  # <- Curses Google!!!

    result = gbq.read_gbq("SELECT COUNT(*) as NUM_ROWS FROM {0}"
                          .format(destination_table),
                          project_id=PROJECT_ID)
    self.assertEqual(result['NUM_ROWS'][0], 5)
def test_upload_data_if_table_exists_append(self):
    destination_table = DESTINATION_TABLE + "3"
    test_size = 10
    df = make_mixed_dataframe_v2(test_size)
    df_different_schema = tm.makeMixedDataFrame()

    # Initialize table with sample data
    gbq.to_gbq(df, destination_table, PROJECT_ID, chunksize=10000)

    # Test the if_exists parameter with value 'append'
    gbq.to_gbq(df, destination_table, PROJECT_ID, if_exists='append')

    sleep(60)  # <- Curses Google!!!

    result = gbq.read_gbq("SELECT COUNT(*) as NUM_ROWS FROM {0}".format(destination_table),
                          project_id=PROJECT_ID)
    self.assertEqual(result['NUM_ROWS'][0], test_size * 2)

    # Try inserting with a different schema, confirm failure
    with tm.assertRaises(gbq.InvalidSchema):
        gbq.to_gbq(df_different_schema, destination_table, PROJECT_ID,
                   if_exists='append')
def test_upload_data_if_table_exists_append(self):
    table_name = 'new_test3'
    test_size = 10
    df = make_mixed_dataframe_v2(test_size)
    df_different_schema = tm.makeMixedDataFrame()

    # Initialize table with sample data
    gbq.to_gbq(df, "pydata_pandas_bq_testing." + table_name, PROJECT_ID,
               chunksize=10000)

    # Test the if_exists parameter with value 'append'
    gbq.to_gbq(df, "pydata_pandas_bq_testing." + table_name, PROJECT_ID,
               if_exists='append')

    sleep(60)  # <- Curses Google!!!

    result = gbq.read_gbq("SELECT COUNT(*) as NUM_ROWS FROM pydata_pandas_bq_testing." + table_name,
                          project_id=PROJECT_ID)
    self.assertEqual(result['NUM_ROWS'][0], test_size * 2)

    # Try inserting with a different schema, confirm failure
    with tm.assertRaises(gbq.InvalidSchema):
        gbq.to_gbq(df_different_schema,
                   "pydata_pandas_bq_testing." + table_name, PROJECT_ID,
                   if_exists='append')
def test_generate_schema(self):
    df = tm.makeMixedDataFrame()
    schema = gbq._generate_bq_schema(df)

    test_schema = {'fields': [{'name': 'A', 'type': 'FLOAT'},
                              {'name': 'B', 'type': 'FLOAT'},
                              {'name': 'C', 'type': 'STRING'},
                              {'name': 'D', 'type': 'TIMESTAMP'}]}

    self.assertEqual(schema, test_schema)
def test_isna_isnull(self, isna_f):
    assert not isna_f(1.)
    assert isna_f(None)
    assert isna_f(np.NaN)
    assert isna_f(float('nan'))
    assert not isna_f(np.inf)
    assert not isna_f(-np.inf)

    # series
    for s in [tm.makeFloatSeries(), tm.makeStringSeries(),
              tm.makeObjectSeries(), tm.makeTimeSeries(),
              tm.makePeriodSeries()]:
        assert isinstance(isna_f(s), Series)

    # frame
    for df in [tm.makeTimeDataFrame(), tm.makePeriodFrame(),
               tm.makeMixedDataFrame()]:
        result = isna_f(df)
        expected = df.apply(isna_f)
        tm.assert_frame_equal(result, expected)

    # panel
    with catch_warnings(record=True):
        simplefilter("ignore", FutureWarning)
        for p in [tm.makePanel(), tm.makePeriodPanel(),
                  tm.add_nans(tm.makePanel())]:
            result = isna_f(p)
            expected = p.apply(isna_f)
            tm.assert_panel_equal(result, expected)
def test_tabulator_stream_dataframe(document, comm):
    df = makeMixedDataFrame()
    table = Tabulator(df)
    model = table.get_root(document, comm)

    stream_value = pd.DataFrame({
        'A': [5, 6],
        'B': [1, 0],
        'C': ['foo6', 'foo7'],
        'D': [dt.datetime(2009, 1, 8), dt.datetime(2009, 1, 9)]
    })

    table.stream(stream_value)
    assert len(table.value) == 7

    expected = {
        'index': np.array([0, 1, 2, 3, 4, 5, 6]),
        'A': np.array([0, 1, 2, 3, 4, 5, 6]),
        'B': np.array([0, 1, 0, 1, 0, 1, 0]),
        'C': np.array(['foo1', 'foo2', 'foo3', 'foo4', 'foo5', 'foo6', 'foo7']),
        'D': np.array(['2009-01-01T00:00:00.000000000',
                       '2009-01-02T00:00:00.000000000',
                       '2009-01-05T00:00:00.000000000',
                       '2009-01-06T00:00:00.000000000',
                       '2009-01-07T00:00:00.000000000',
                       '2009-01-08T00:00:00.000000000',
                       '2009-01-09T00:00:00.000000000'],
                      dtype='datetime64[ns]')
    }
    for col, values in model.source.data.items():
        np.testing.assert_array_equal(values, expected[col])
def test_hash_pandas_object(self):
    for obj in [Series([1, 2, 3]),
                Series([1.0, 1.5, 3.2]),
                Series([1.0, 1.5, np.nan]),
                Series([1.0, 1.5, 3.2], index=[1.5, 1.1, 3.3]),
                Series(['a', 'b', 'c']),
                Series(['a', np.nan, 'c']),
                Series(['a', None, 'c']),
                Series([True, False, True]),
                Series(),
                Index([1, 2, 3]),
                Index([True, False, True]),
                DataFrame({'x': ['a', 'b', 'c'], 'y': [1, 2, 3]}),
                DataFrame(),
                tm.makeMissingDataframe(),
                tm.makeMixedDataFrame(),
                tm.makeTimeDataFrame(),
                tm.makeTimeSeries(),
                tm.makeTimedeltaIndex(),
                tm.makePeriodIndex(),
                Series(tm.makePeriodIndex()),
                Series(pd.date_range('20130101', periods=3, tz='US/Eastern')),
                MultiIndex.from_product([
                    range(5), ['foo', 'bar', 'baz'],
                    pd.date_range('20130101', periods=2)]),
                MultiIndex.from_product(
                    [pd.CategoricalIndex(list('aabc')), range(3)])]:
        self.check_equal(obj)
        self.check_not_equal_with_index(obj)
def test_tabulator_stream_df_rollover(document, comm):
    df = makeMixedDataFrame()
    table = Tabulator(df)
    model = table.get_root(document, comm)

    stream_value = pd.Series({
        'A': 5,
        'B': 1,
        'C': 'foo6',
        'D': dt.datetime(2009, 1, 8)
    }).to_frame().T

    table.stream(stream_value, rollover=5)
    assert len(table.value) == 5

    expected = {
        'index': np.array([1, 2, 3, 4, 5]),
        'A': np.array([1, 2, 3, 4, 5]),
        'B': np.array([1, 0, 1, 0, 1]),
        'C': np.array(['foo2', 'foo3', 'foo4', 'foo5', 'foo6']),
        'D': np.array(['2009-01-02T00:00:00.000000000',
                       '2009-01-05T00:00:00.000000000',
                       '2009-01-06T00:00:00.000000000',
                       '2009-01-07T00:00:00.000000000',
                       '2009-01-08T00:00:00.000000000'],
                      dtype='datetime64[ns]')
    }
    for col, values in model.source.data.items():
        np.testing.assert_array_equal(values, expected[col])
def test_tabulator_function_filter(document, comm):
    df = makeMixedDataFrame()
    table = Tabulator(df)
    model = table.get_root(document, comm)

    widget = TextInput(value='foo3')

    def filter_c(df, value):
        return df[df.C.str.contains(value)]

    table.add_filter(bind(filter_c, value=widget), 'C')

    expected = {
        'index': np.array([2]),
        'A': np.array([2]),
        'B': np.array([0]),
        'C': np.array(['foo3']),
        'D': np.array(['2009-01-05T00:00:00.000000000'],
                      dtype='datetime64[ns]')
    }
    for col, values in model.source.data.items():
        np.testing.assert_array_equal(values, expected[col])

    widget.value = 'foo1'

    expected = {
        'index': np.array([0]),
        'A': np.array([0]),
        'B': np.array([0]),
        'C': np.array(['foo1']),
        'D': np.array(['2009-01-01T00:00:00.000000000'],
                      dtype='datetime64[ns]')
    }
    for col, values in model.source.data.items():
        np.testing.assert_array_equal(values, expected[col])
def test_tabulator_constant_tuple_filter(document, comm):
    df = makeMixedDataFrame()
    table = Tabulator(df)
    model = table.get_root(document, comm)

    table.add_filter((2, 3), 'A')

    expected = {
        'index': np.array([2, 3]),
        'A': np.array([2, 3]),
        'B': np.array([0, 1]),
        'C': np.array(['foo3', 'foo4']),
        'D': np.array(['2009-01-05T00:00:00.000000000',
                       '2009-01-06T00:00:00.000000000'],
                      dtype='datetime64[ns]')
    }
    for col, values in model.source.data.items():
        np.testing.assert_array_equal(values, expected[col])
def test_equal_dataframes_compare():
    df1 = makeMixedDataFrame()
    output = Compare(left=df1, right=df1, key_columns='A').diff
    assert output is None
dfs = [  # fixtures consumed by test_dumps_serialize_numpy below
    pd.Series([1., 2., 3.]),
    pd.Series([1., 2., 3.], name='foo'),
    pd.Series([1., 2., 3.], name='foo', index=[4, 5, 6]),
    pd.Series([1., 2., 3.], name='foo',
              index=pd.Index([4, 5, 6], name='bar')),
    pd.DataFrame({'x': ['a', 'b', 'c']}),
    pd.DataFrame({'x': [b'a', b'b', b'c']}),
    pd.DataFrame({'x': pd.Categorical(['a', 'b', 'a'], ordered=True)}),
    pd.DataFrame({'x': pd.Categorical(['a', 'b', 'a'], ordered=False)}),
    tm.makeCategoricalIndex(),
    tm.makeCustomDataframe(5, 3),
    tm.makeDataFrame(),
    tm.makeDateIndex(),
    tm.makeMissingDataframe(),
    tm.makeMixedDataFrame(),
    tm.makeObjectSeries(),
    tm.makePeriodFrame(),
    tm.makeRangeIndex(),
    tm.makeTimeDataFrame(),
    tm.makeTimeSeries(),
    tm.makeUnicodeIndex(),
]


@pytest.mark.parametrize('df', dfs)
def test_dumps_serialize_numpy(df):
    header, frames = serialize(df)
    if 'compression' in header:
        frames = decompress(header, frames)
    df2 = deserialize(header, frames)
# create a dataframe
PRICEDOMSIZE = 5    # domain size of prices
SIZEDOMSIZE = 100

def createTable(N):
    return pd.DataFrame({
        'pA': np.random.randint(0, PRICEDOMSIZE, N),
        'pB': np.random.randint(0, PRICEDOMSIZE, N),
        'sA': np.random.randint(0, SIZEDOMSIZE, N),
        'sB': np.random.randint(0, SIZEDOMSIZE, N)})

createTable(5)

# quickly create a dataframe for testing
import pandas.util.testing as tm
tm.N, tm.K = 5, 3
tm.makeDataFrame(), tm.makeMixedDataFrame(), tm.makeTimeDataFrame(freq="W")

lst = [40, 10, 20, 30]
names = ['AAA', 'Adfsdf', 'dfwef', 'fwefw']
temp_df = pd.DataFrame(lst)
temp_df = pd.DataFrame(list(zip(names, lst)), columns=["Name", "Age"])

# Create blank dataframe: could be useful if we want to append data row by
# row to a DataFrame. In that case it’s better to have predefined columns.
blank_df = pd.DataFrame(columns=["Name", "Age"])

# Create rows for values separated by commas in a cell
d = {"Team": ["FC Barcelona", "FC Real Madrid"],
     "Players": ["Ter Stegen, Semedo, Piqué, Lenglet, Alba, Rakitic, De Jong, Sergi Roberto, Messi, Suárez, Griezmann",
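# A runnable sketch of the two shortcuts mentioned above: the tm.N / tm.K
# module globals that size the make* helpers (in the older pandas versions
# these snippets target), and row-by-row appends to a frame with
# predeclared columns. The names and values here are illustrative only.
import pandas as pd
import pandas.util.testing as tm

tm.N, tm.K = 5, 3
print(tm.makeDataFrame().shape)             # (5, 3) frame of random floats

blank_df = pd.DataFrame(columns=["Name", "Age"])
blank_df.loc[len(blank_df)] = ["AAA", 40]   # append one row at a time
blank_df.loc[len(blank_df)] = ["Adfsdf", 10]
print(blank_df)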
def test_describe(self):
    desc = tm.makeDataFrame().describe()
    desc = tm.makeMixedDataFrame().describe()
    desc = tm.makeTimeDataFrame().describe()
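# The smoke test above only checks that describe() runs without raising.
# For reference, describe() on numeric data summarises each column with
# count, mean, std, min, the quartiles and max:
import pandas as pd

desc = pd.DataFrame({'A': [0.0, 1.0, 2.0, 3.0, 4.0]}).describe()
print(list(desc.index))       # ['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max']
print(desc.loc['mean', 'A'])  # 2.0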
dfs = [  # fixtures consumed by test_dumps_serialize_numpy below
    pd.DataFrame({"x": [1.0, 2.0, 3.0], "y": [4.0, 5.0, 6.0]}),
    pd.DataFrame({"x": [1.0, 2.0, 3.0]}, index=pd.Index([4, 5, 6], name="bar")),
    pd.Series([1.0, 2.0, 3.0]),
    pd.Series([1.0, 2.0, 3.0], name="foo"),
    pd.Series([1.0, 2.0, 3.0], name="foo", index=[4, 5, 6]),
    pd.Series([1.0, 2.0, 3.0], name="foo", index=pd.Index([4, 5, 6], name="bar")),
    pd.DataFrame({"x": ["a", "b", "c"]}),
    pd.DataFrame({"x": [b"a", b"b", b"c"]}),
    pd.DataFrame({"x": pd.Categorical(["a", "b", "a"], ordered=True)}),
    pd.DataFrame({"x": pd.Categorical(["a", "b", "a"], ordered=False)}),
    tm.makeCategoricalIndex(),
    tm.makeCustomDataframe(5, 3),
    tm.makeDataFrame(),
    tm.makeDateIndex(),
    tm.makeMissingDataframe(),
    tm.makeMixedDataFrame(),
    tm.makeObjectSeries(),
    tm.makePeriodFrame(),
    tm.makeRangeIndex(),
    tm.makeTimeDataFrame(),
    tm.makeTimeSeries(),
    tm.makeUnicodeIndex(),
]


@pytest.mark.parametrize("df", dfs)
def test_dumps_serialize_numpy(df):
    header, frames = serialize(df)
    if "compression" in header:
        frames = decompress(header, frames)
    df2 = deserialize(header, frames)
def test_tabulator_pagination(document, comm):
    df = makeMixedDataFrame()
    table = Tabulator(df, pagination='remote', page_size=2)
    model = table.get_root(document, comm)

    assert model.max_page == 3
    assert model.page_size == 2
    assert model.page == 1

    expected = {
        'index': np.array([0, 1]),
        'A': np.array([0, 1]),
        'B': np.array([0, 1]),
        'C': np.array(['foo1', 'foo2']),
        'D': np.array(['2009-01-01T00:00:00.000000000',
                       '2009-01-02T00:00:00.000000000'],
                      dtype='datetime64[ns]')
    }
    for col, values in model.source.data.items():
        np.testing.assert_array_equal(values, expected[col])

    table.page = 2

    expected = {
        'index': np.array([2, 3]),
        'A': np.array([2, 3]),
        'B': np.array([0., 1.]),
        'C': np.array(['foo3', 'foo4']),
        'D': np.array(['2009-01-05T00:00:00.000000000',
                       '2009-01-06T00:00:00.000000000'],
                      dtype='datetime64[ns]')
    }
    for col, values in model.source.data.items():
        np.testing.assert_array_equal(values, expected[col])

    table.page_size = 3
    table.page = 1

    assert model.max_page == 2

    expected = {
        'index': np.array([0, 1, 2]),
        'A': np.array([0, 1, 2]),
        'B': np.array([0, 1, 0]),
        'C': np.array(['foo1', 'foo2', 'foo3']),
        'D': np.array(['2009-01-01T00:00:00.000000000',
                       '2009-01-02T00:00:00.000000000',
                       '2009-01-05T00:00:00.000000000'],
                      dtype='datetime64[ns]')
    }
    for col, values in model.source.data.items():
        np.testing.assert_array_equal(values, expected[col])
class TestHashing(object):

    @pytest.fixture(params=[
        Series([1, 2, 3] * 3, dtype='int32'),
        Series([None, 2.5, 3.5] * 3, dtype='float32'),
        Series(['a', 'b', 'c'] * 3, dtype='category'),
        Series(['d', 'e', 'f'] * 3),
        Series([True, False, True] * 3),
        Series(pd.date_range('20130101', periods=9)),
        Series(pd.date_range('20130101', periods=9, tz='US/Eastern')),
        Series(pd.timedelta_range('2000', periods=9))])
    def series(self, request):
        return request.param

    def test_consistency(self):
        # check that our hash doesn't change because of a mistake
        # in the actual code; this is the ground truth
        result = hash_pandas_object(Index(['foo', 'bar', 'baz']))
        expected = Series(np.array([3600424527151052760,
                                    1374399572096150070,
                                    477881037637427054],
                                   dtype='uint64'),
                          index=['foo', 'bar', 'baz'])
        tm.assert_series_equal(result, expected)

    def test_hash_array(self, series):
        a = series.values
        tm.assert_numpy_array_equal(hash_array(a), hash_array(a))

    def test_hash_array_mixed(self):
        result1 = hash_array(np.array([3, 4, 'All']))
        result2 = hash_array(np.array(['3', '4', 'All']))
        result3 = hash_array(np.array([3, 4, 'All'], dtype=object))
        tm.assert_numpy_array_equal(result1, result2)
        tm.assert_numpy_array_equal(result1, result3)

    @pytest.mark.parametrize('val', [5, 'foo', pd.Timestamp('20130101')])
    def test_hash_array_errors(self, val):
        msg = 'must pass a ndarray-like'
        with tm.assert_raises_regex(TypeError, msg):
            hash_array(val)

    def check_equal(self, obj, **kwargs):
        a = hash_pandas_object(obj, **kwargs)
        b = hash_pandas_object(obj, **kwargs)
        tm.assert_series_equal(a, b)

        kwargs.pop('index', None)
        a = hash_pandas_object(obj, **kwargs)
        b = hash_pandas_object(obj, **kwargs)
        tm.assert_series_equal(a, b)

    def check_not_equal_with_index(self, obj):
        # check that we are not hashing the same if
        # we include the index
        if not isinstance(obj, Index):
            a = hash_pandas_object(obj, index=True)
            b = hash_pandas_object(obj, index=False)
            if len(obj):
                assert not (a == b).all()

    def test_hash_tuples(self):
        tups = [(1, 'one'), (1, 'two'), (2, 'one')]
        result = hash_tuples(tups)
        expected = hash_pandas_object(MultiIndex.from_tuples(tups)).values
        tm.assert_numpy_array_equal(result, expected)

        result = hash_tuples(tups[0])
        assert result == expected[0]

    @pytest.mark.parametrize('tup', [
        (1, 'one'), (1, np.nan), (1.0, pd.NaT, 'A'),
        ('A', pd.Timestamp("2012-01-01"))])
    def test_hash_tuple(self, tup):
        # test equivalence between hash_tuples and hash_tuple
        result = hash_tuple(tup)
        expected = hash_tuples([tup])[0]
        assert result == expected

    @pytest.mark.parametrize('val', [
        1, 1.4, 'A', b'A', u'A', pd.Timestamp("2012-01-01"),
        pd.Timestamp("2012-01-01", tz='Europe/Brussels'),
        datetime.datetime(2012, 1, 1),
        pd.Timestamp("2012-01-01", tz='EST').to_pydatetime(),
        pd.Timedelta('1 days'), datetime.timedelta(1),
        pd.Period('2012-01-01', freq='D'), pd.Interval(0, 1),
        np.nan, pd.NaT, None])
    def test_hash_scalar(self, val):
        result = _hash_scalar(val)
        expected = hash_array(np.array([val], dtype=object), categorize=True)
        assert result[0] == expected[0]

    @pytest.mark.parametrize('val', [5, 'foo', pd.Timestamp('20130101')])
    def test_hash_tuples_err(self, val):
        msg = 'must be convertible to a list-of-tuples'
        with tm.assert_raises_regex(TypeError, msg):
            hash_tuples(val)

    def test_multiindex_unique(self):
        mi = MultiIndex.from_tuples([(118, 472), (236, 118),
                                     (51, 204), (102, 51)])
        assert mi.is_unique is True
        result = hash_pandas_object(mi)
        assert result.is_unique is True

    def test_multiindex_objects(self):
        mi = MultiIndex(levels=[['b', 'd', 'a'], [1, 2, 3]],
                        labels=[[0, 1, 0, 2], [2, 0, 0, 1]],
                        names=['col1', 'col2'])
        recons = mi._sort_levels_monotonic()

        # these are equal
        assert mi.equals(recons)
        assert Index(mi.values).equals(Index(recons.values))

        # _hashed_values and hash_pandas_object(..., index=False)
        # equivalency
        expected = hash_pandas_object(mi, index=False).values
        result = mi._hashed_values
        tm.assert_numpy_array_equal(result, expected)

        expected = hash_pandas_object(recons, index=False).values
        result = recons._hashed_values
        tm.assert_numpy_array_equal(result, expected)

        expected = mi._hashed_values
        result = recons._hashed_values

        # values should match, but in different order
        tm.assert_numpy_array_equal(np.sort(result), np.sort(expected))

    @pytest.mark.parametrize('obj', [
        Series([1, 2, 3]),
        Series([1.0, 1.5, 3.2]),
        Series([1.0, 1.5, np.nan]),
        Series([1.0, 1.5, 3.2], index=[1.5, 1.1, 3.3]),
        Series(['a', 'b', 'c']),
        Series(['a', np.nan, 'c']),
        Series(['a', None, 'c']),
        Series([True, False, True]),
        Series(),
        Index([1, 2, 3]),
        Index([True, False, True]),
        DataFrame({'x': ['a', 'b', 'c'], 'y': [1, 2, 3]}),
        DataFrame(),
        tm.makeMissingDataframe(),
        tm.makeMixedDataFrame(),
        tm.makeTimeDataFrame(),
        tm.makeTimeSeries(),
        tm.makeTimedeltaIndex(),
        tm.makePeriodIndex(),
        Series(tm.makePeriodIndex()),
        Series(pd.date_range('20130101', periods=3, tz='US/Eastern')),
        MultiIndex.from_product([
            range(5), ['foo', 'bar', 'baz'],
            pd.date_range('20130101', periods=2)]),
        MultiIndex.from_product([pd.CategoricalIndex(list('aabc')),
                                 range(3)])])
    def test_hash_pandas_object(self, obj):
        self.check_equal(obj)
        self.check_not_equal_with_index(obj)

    def test_hash_pandas_object2(self, series):
        self.check_equal(series)
        self.check_not_equal_with_index(series)

    @pytest.mark.parametrize('obj', [
        Series([], dtype='float64'), Series([], dtype='object'), Index([])])
    def test_hash_pandas_empty_object(self, obj):
        # these are by-definition the same with
        # or w/o the index as the data is empty
        self.check_equal(obj)

    @pytest.mark.parametrize('s1', [
        Series(['a', 'b', 'c', 'd']),
        Series([1000, 2000, 3000, 4000]),
        Series(pd.date_range(0, periods=4))])
    @pytest.mark.parametrize('categorize', [True, False])
    def test_categorical_consistency(self, s1, categorize):
        # GH15143
        # Check that categoricals hash consistent with their values, not codes
        # This should work for categoricals of any dtype
        s2 = s1.astype('category').cat.set_categories(s1)
        s3 = s2.cat.set_categories(list(reversed(s1)))

        # These should all hash identically
        h1 = hash_pandas_object(s1, categorize=categorize)
        h2 = hash_pandas_object(s2, categorize=categorize)
        h3 = hash_pandas_object(s3, categorize=categorize)
        tm.assert_series_equal(h1, h2)
        tm.assert_series_equal(h1, h3)

    def test_categorical_with_nan_consistency(self):
        c = pd.Categorical.from_codes(
            [-1, 0, 1, 2, 3, 4],
            categories=pd.date_range('2012-01-01', periods=5, name='B'))
        expected = hash_array(c, categorize=False)

        c = pd.Categorical.from_codes(
            [-1, 0],
            categories=[pd.Timestamp('2012-01-01')])
        result = hash_array(c, categorize=False)

        assert result[0] in expected
        assert result[1] in expected

    @pytest.mark.filterwarnings("ignore:\\nPanel:FutureWarning")
    def test_pandas_errors(self):
        with pytest.raises(TypeError):
            hash_pandas_object(pd.Timestamp('20130101'))

        obj = tm.makePanel()
        with pytest.raises(TypeError):
            hash_pandas_object(obj)

    def test_hash_keys(self):
        # using different hash keys, should have different hashes
        # for the same data

        # this only matters for object dtypes
        obj = Series(list('abc'))
        a = hash_pandas_object(obj, hash_key='9876543210123456')
        b = hash_pandas_object(obj, hash_key='9876543210123465')
        assert (a != b).all()

    def test_invalid_key(self):
        # this only matters for object dtypes
        msg = 'key should be a 16-byte string encoded'
        with tm.assert_raises_regex(ValueError, msg):
            hash_pandas_object(Series(list('abc')), hash_key='foo')

    def test_alread_encoded(self):
        # if already encoded then ok
        obj = Series(list('abc')).str.encode('utf8')
        self.check_equal(obj)

    def test_alternate_encoding(self):
        obj = Series(list('abc'))
        self.check_equal(obj, encoding='ascii')

    @pytest.mark.parametrize('l_exp', range(8))
    @pytest.mark.parametrize('l_add', [0, 1])
    def test_same_len_hash_collisions(self, l_exp, l_add):
        length = 2**(l_exp + 8) + l_add
        s = tm.rands_array(length, 2)
        result = hash_array(s, 'utf8')
        assert not result[0] == result[1]

    def test_hash_collisions(self):
        # hash collisions are bad
        # https://github.com/pandas-dev/pandas/issues/14711#issuecomment-264885726
        L = [
            'Ingrid-9Z9fKIZmkO7i7Cn51Li34pJm44fgX6DYGBNj3VPlOH50m7HnBlPxfIwFMrcNJNMP6PSgLmwWnInciMWrCSAlLEvt7JkJl4IxiMrVbXSa8ZQoVaq5xoQPjltuJEfwdNlO6jo8qRRHvD8sBEBMQASrRa6TsdaPTPCBo3nwIBpE7YzzmyH0vMBhjQZLx1aCT7faSEx7PgFxQhHdKFWROcysamgy9iVj8DO2Fmwg1NNl93rIAqC3mdqfrCxrzfvIY8aJdzin2cHVzy3QUJxZgHvtUtOLxoqnUHsYbNTeq0xcLXpTZEZCxD4PGubIuCNf32c33M7HFsnjWSEjE2yVdWKhmSVodyF8hFYVmhYnMCztQnJrt3O8ZvVRXd5IKwlLexiSp4h888w7SzAIcKgc3g5XQJf6MlSMftDXm9lIsE1mJNiJEv6uY6pgvC3fUPhatlR5JPpVAHNSbSEE73MBzJrhCAbOLXQumyOXigZuPoME7QgJcBalliQol7YZ9',  # noqa
            'Tim-b9MddTxOWW2AT1Py6vtVbZwGAmYCjbp89p8mxsiFoVX4FyDOF3wFiAkyQTUgwg9sVqVYOZo09Dh1AzhFHbgij52ylF0SEwgzjzHH8TGY8Lypart4p4onnDoDvVMBa0kdthVGKl6K0BDVGzyOXPXKpmnMF1H6rJzqHJ0HywfwS4XYpVwlAkoeNsiicHkJUFdUAhG229INzvIAiJuAHeJDUoyO4DCBqtoZ5TDend6TK7Y914yHlfH3g1WZu5LksKv68VQHJriWFYusW5e6ZZ6dKaMjTwEGuRgdT66iU5nqWTHRH8WSzpXoCFwGcTOwyuqPSe0fTe21DVtJn1FKj9F9nEnR9xOvJUO7E0piCIF4Ad9yAIDY4DBimpsTfKXCu1vdHpKYerzbndfuFe5AhfMduLYZJi5iAw8qKSwR5h86ttXV0Mc0QmXz8dsRvDgxjXSmupPxBggdlqUlC828hXiTPD7am0yETBV0F3bEtvPiNJfremszcV8NcqAoARMe'  # noqa
        ]

        # these should be different!
        result1 = hash_array(np.asarray(L[0:1], dtype=object), 'utf8')
        expected1 = np.array([14963968704024874985], dtype=np.uint64)
        tm.assert_numpy_array_equal(result1, expected1)

        result2 = hash_array(np.asarray(L[1:2], dtype=object), 'utf8')
        expected2 = np.array([16428432627716348016], dtype=np.uint64)
        tm.assert_numpy_array_equal(result2, expected2)

        result = hash_array(np.asarray(L, dtype=object), 'utf8')
        tm.assert_numpy_array_equal(
            result, np.concatenate([expected1, expected2], axis=0))
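# The hashing helpers exercised by TestHashing are available publicly as
# pandas.util.hash_pandas_object / pandas.util.hash_array in modern pandas.
# A minimal usage sketch showing the deterministic uint64 hashes the tests
# rely on, and that including the index changes the result:
import pandas as pd

s = pd.Series(['foo', 'bar', 'baz'])
h_with_index = pd.util.hash_pandas_object(s, index=True)
h_no_index = pd.util.hash_pandas_object(s, index=False)
print(h_with_index.dtype)                  # uint64
print((h_with_index == h_no_index).all())  # False: the index contributes to the hash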
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import pandas.util.testing as pd_samples
from pandas.testing import assert_frame_equal, assert_series_equal
import pytest

from pandasbikeshed.plot import (
    robust_hist, robust_scatter, robust_info, robust_kde,
    robust_pairplot, corr_heatmap, dist_catplot)

nan_df = pd_samples.makeMissingDataframe()
nan_a = nan_df['A']
nan_b = nan_df['B']
mixed_df = pd_samples.makeMixedDataFrame()


def test_robust_hist():
    assert isinstance(robust_hist(nan_a), plt.Axes)
    assert isinstance(robust_hist(nan_a.values), plt.Axes)


def test_robust_scatter():
    assert isinstance(robust_scatter(nan_a, nan_b), plt.Axes)
    assert isinstance(robust_scatter(nan_a.values, nan_b.values), plt.Axes)


def test_robust_kde():
    assert isinstance(robust_kde(nan_a), plt.Axes)
    assert isinstance(robust_kde(nan_a, nan_b), plt.Axes)