def testRechunk(self):
    """Verify chunk layout and metadata produced by ``rechunk``.

    Exercises a DataFrame with default labels, a DataFrame with custom
    index/columns, a Series (coarser, finer and unchanged chunk sizes),
    and finally a DataFrame whose chunk shapes are unknown, for which
    rechunking is expected to raise ``TilesError``.
    """

    def check_frame_chunk(chunk, shape, index, columns, dtypes):
        # One DataFrame chunk: shape first, then index, columns and dtypes.
        self.assertEqual(chunk.shape, shape)
        pd.testing.assert_index_equal(chunk.index_value.to_pandas(), index)
        pd.testing.assert_index_equal(chunk.columns_value.to_pandas(), columns)
        pd.testing.assert_series_equal(chunk.dtypes, dtypes)

    def check_column_metadata(chunks):
        # Column count, dtypes and columns_value must agree on every chunk.
        for chunk in chunks:
            self.assertEqual(chunk.shape[1], len(chunk.dtypes))
            self.assertEqual(len(chunk.columns_value.to_pandas()), len(chunk.dtypes))

    def check_series_chunks(tiled_series, expectations):
        # expectations: one (shape, index) pair per leading chunk, in order.
        for position, (shape, index) in enumerate(expectations):
            chunk = tiled_series.chunks[position]
            self.assertEqual(chunk.shape, shape)
            pd.testing.assert_index_equal(chunk.index_value.to_pandas(), index)

    # DataFrame with default RangeIndex labels: 10x10, chunk size 3 -> 4
    raw = pd.DataFrame(np.random.rand(10, 10))
    frame = from_pandas_df(raw, chunk_size=3)
    rechunked = frame.rechunk(4).tiles()

    self.assertEqual(rechunked.shape, (10, 10))
    self.assertEqual(len(rechunked.chunks), 9)

    check_frame_chunk(rechunked.chunks[0], (4, 4),
                      pd.RangeIndex(4), pd.RangeIndex(4), raw.dtypes[:4])
    check_frame_chunk(rechunked.chunks[2], (4, 2),
                      pd.RangeIndex(4), pd.RangeIndex(8, 10), raw.dtypes[-2:])
    check_frame_chunk(rechunked.chunks[-1], (2, 2),
                      pd.RangeIndex(8, 10), pd.RangeIndex(8, 10), raw.dtypes[-2:])
    check_column_metadata(rechunked.chunks)

    # DataFrame with random byte-string columns and a random integer index
    columns = [np.random.bytes(10) for _ in range(10)]
    index = np.random.randint(-100, 100, size=(4,))
    raw = pd.DataFrame(np.random.rand(4, 10), index=index, columns=columns)
    frame = from_pandas_df(raw, chunk_size=3)
    rechunked = frame.rechunk(6).tiles()

    self.assertEqual(rechunked.shape, (4, 10))
    self.assertEqual(len(rechunked.chunks), 2)

    check_frame_chunk(rechunked.chunks[0], (4, 6),
                      frame.index_value.to_pandas(), pd.Index(columns[:6]),
                      raw.dtypes[:6])
    check_frame_chunk(rechunked.chunks[1], (4, 4),
                      frame.index_value.to_pandas(), pd.Index(columns[6:]),
                      raw.dtypes[-4:])
    check_column_metadata(rechunked.chunks)

    # test Series rechunk
    series = from_pandas_series(pd.Series(np.random.rand(10,)), chunk_size=3)

    coarser = series.rechunk(4).tiles()
    self.assertEqual(coarser.shape, (10,))
    self.assertEqual(len(coarser.chunks), 3)
    pd.testing.assert_index_equal(coarser.index_value.to_pandas(), pd.RangeIndex(10))
    self.assertEqual(coarser.chunk_shape, (3,))
    self.assertEqual(coarser.nsplits, ((4, 4, 2), ))
    check_series_chunks(coarser, [
        ((4,), pd.RangeIndex(4)),
        ((4,), pd.RangeIndex(4, 8)),
        ((2,), pd.RangeIndex(8, 10)),
    ])

    finer = series.rechunk(1).tiles()
    self.assertEqual(finer.shape, (10,))
    self.assertEqual(len(finer.chunks), 10)
    pd.testing.assert_index_equal(finer.index_value.to_pandas(), pd.RangeIndex(10))
    self.assertEqual(finer.chunk_shape, (10,))
    self.assertEqual(finer.nsplits, ((1,) * 10, ))
    check_series_chunks(finer, [((1,), pd.RangeIndex(1))])

    # no need to rechunk
    unchanged = series.rechunk(3).tiles()
    series = get_tiled(series)
    self.assertEqual(unchanged.chunk_shape, series.chunk_shape)
    self.assertEqual(unchanged.nsplits, series.nsplits)

    # test rechunk on DataFrame has known shape, but chunk's shape is unknown
    data = pd.DataFrame({0: [1, 2], 1: [3, 4], 'a': [5, 6]})
    filtered = from_pandas_df(data)
    filtered = filtered[filtered[0] < 3]
    with self.assertRaises(TilesError):
        filtered.tiles().rechunk((np.nan, 3)).tiles()
def testConcat(self):
    """Verify shape, dtypes, index metadata and tiling of ``concat``.

    Covers row-wise concatenation (matching, contiguous and shuffled
    indexes), column-wise concatenation with differing chunk sizes, and
    an inner join over frames with different column sets.
    """
    letters = list('ABCD')
    left_raw = pd.DataFrame(np.random.rand(10, 4), columns=letters)
    right_raw = pd.DataFrame(np.random.rand(10, 4), columns=list('ABCD'))
    left = from_pandas(left_raw, chunk_size=4)
    right = from_pandas(right_raw, chunk_size=4)

    # Row-wise concat: chunk row splits of both inputs are laid end to end.
    result = concat([left, right], axis='index')
    self.assertEqual(result.shape, (20, 4))
    pd.testing.assert_series_equal(result.dtypes, left_raw.dtypes)
    tiled = result.tiles()
    self.assertEqual(tiled.nsplits, ((4, 4, 2, 4, 4, 2), (4,)))
    for row, chunk in enumerate(tiled.chunks):
        self.assertEqual(chunk.index, (row, 0))

    # Second frame's RangeIndex continues the first one's.
    contiguous_raw = pd.DataFrame(np.random.rand(10, 4), columns=list('ABCD'),
                                  index=pd.RangeIndex(10, 20))
    contiguous = from_pandas(contiguous_raw, chunk_size=4)
    result = concat([left, contiguous], axis='index')
    self.assertEqual(result.shape, (20, 4))
    pd.testing.assert_series_equal(result.dtypes, left_raw.dtypes)
    pd.testing.assert_index_equal(result.index_value.to_pandas(), pd.RangeIndex(20))

    # A shuffled integer index on either side: the result's index_value
    # is expected to convert to an empty int64 Index.
    shuffled_raw = pd.DataFrame(np.random.rand(10, 4), columns=list('ABCD'),
                                index=np.random.permutation(np.arange(10)))
    shuffled = from_pandas(shuffled_raw, chunk_size=4)
    for operands in ([left, shuffled], [shuffled, left], [shuffled, shuffled]):
        result = concat(operands, axis='index')
        self.assertEqual(result.shape, (20, 4))
        pd.testing.assert_series_equal(result.dtypes, left_raw.dtypes)
        pd.testing.assert_index_equal(result.index_value.to_pandas(),
                                      pd.Index([], dtype=np.int64))

    # Column-wise concat with different chunk sizes on each side.
    left = from_pandas(left_raw, chunk_size=3)
    right = from_pandas(right_raw, chunk_size=4)
    result = concat([left, right], axis='columns')
    self.assertEqual(result.shape, (10, 8))
    expected_dtypes = pd.concat([left_raw, right_raw], axis='columns').dtypes
    pd.testing.assert_series_equal(result.dtypes, expected_dtypes)
    tiled = result.tiles()
    self.assertEqual(tiled.nsplits, ((3, 3, 3, 1), (3, 1, 4)))
    for position, chunk in enumerate(tiled.chunks):
        # Chunks are listed row-major over a grid with three column chunks.
        self.assertEqual(chunk.index, divmod(position, 3))

    # Inner join keeps only the columns present in both frames.
    left_raw = pd.DataFrame(np.random.rand(10, 4), columns=list('ABCD'))
    right_raw = pd.DataFrame(np.random.rand(10, 3), columns=list('ABC'))
    left = from_pandas(left_raw, chunk_size=3)
    right = from_pandas(right_raw, chunk_size=3)
    result = concat([left, right], join='inner')
    self.assertEqual(result.shape, (20, 3))
    tiled = result.tiles()
    self.assertEqual(tiled.nsplits, ((3, 3, 3, 1, 3, 3, 3, 1), (3, )))
class TestSeriesConstructors(TestData): def test_invalid_dtype(self): # GH15520 msg = 'not understood' invalid_list = [pd.Timestamp, 'pd.Timestamp', list] for dtype in invalid_list: with tm.assert_raises_regex(TypeError, msg): Series([], name='time', dtype=dtype) def test_scalar_conversion(self): # Pass in scalar is disabled scalar = Series(0.5) assert not isinstance(scalar, float) # Coercion assert float(Series([1.])) == 1.0 assert int(Series([1.])) == 1 assert long(Series([1.])) == 1 def test_constructor(self): assert self.ts.index.is_all_dates # Pass in Series derived = Series(self.ts) assert derived.index.is_all_dates assert tm.equalContents(derived.index, self.ts.index) # Ensure new index is not created assert id(self.ts.index) == id(derived.index) # Mixed type Series mixed = Series(['hello', np.NaN], index=[0, 1]) assert mixed.dtype == np.object_ assert mixed[1] is np.NaN assert not self.empty.index.is_all_dates assert not Series({}).index.is_all_dates pytest.raises(Exception, Series, np.random.randn(3, 3), index=np.arange(3)) mixed.name = 'Series' rs = Series(mixed).name xp = 'Series' assert rs == xp # raise on MultiIndex GH4187 m = MultiIndex.from_arrays([[1, 2], [3, 4]]) pytest.raises(NotImplementedError, Series, m) @pytest.mark.parametrize('input_class', [list, dict, OrderedDict]) def test_constructor_empty(self, input_class): empty = Series() empty2 = Series(input_class()) # these are Index() and RangeIndex() which don't compare type equal # but are just .equals assert_series_equal(empty, empty2, check_index_type=False) # With explicit dtype: empty = Series(dtype='float64') empty2 = Series(input_class(), dtype='float64') assert_series_equal(empty, empty2, check_index_type=False) # GH 18515 : with dtype=category: empty = Series(dtype='category') empty2 = Series(input_class(), dtype='category') assert_series_equal(empty, empty2, check_index_type=False) if input_class is not list: # With index: empty = Series(index=lrange(10)) empty2 = Series(input_class(), 
index=lrange(10)) assert_series_equal(empty, empty2) # With index and dtype float64: empty = Series(np.nan, index=lrange(10)) empty2 = Series(input_class(), index=lrange(10), dtype='float64') assert_series_equal(empty, empty2) # GH 19853 : with empty string, index and dtype str empty = Series('', dtype=str, index=range(3)) empty2 = Series('', index=range(3)) assert_series_equal(empty, empty2) @pytest.mark.parametrize('input_arg', [np.nan, float('nan')]) def test_constructor_nan(self, input_arg): empty = Series(dtype='float64', index=lrange(10)) empty2 = Series(input_arg, index=lrange(10)) assert_series_equal(empty, empty2, check_index_type=False) @pytest.mark.parametrize('dtype', [ 'f8', 'i8', 'M8[ns]', 'm8[ns]', 'category', 'object', 'datetime64[ns, UTC]', ]) @pytest.mark.parametrize('index', [None, pd.Index([])]) def test_constructor_dtype_only(self, dtype, index): # GH-20865 result = pd.Series(dtype=dtype, index=index) assert result.dtype == dtype assert len(result) == 0 def test_constructor_no_data_index_order(self): result = pd.Series(index=['b', 'a', 'c']) assert result.index.tolist() == ['b', 'a', 'c'] def test_constructor_series(self): index1 = ['d', 'b', 'a', 'c'] index2 = sorted(index1) s1 = Series([4, 7, -5, 3], index=index1) s2 = Series(s1, index=index2) assert_series_equal(s2, s1.sort_index()) def test_constructor_iterator(self): expected = Series(list(range(10)), dtype='int64') result = Series(range(10), dtype='int64') assert_series_equal(result, expected) def test_constructor_list_like(self): # make sure that we are coercing different # list-likes to standard dtypes and not # platform specific expected = Series([1, 2, 3], dtype='int64') for obj in [[1, 2, 3], (1, 2, 3), np.array([1, 2, 3], dtype='int64')]: result = Series(obj, index=[0, 1, 2]) assert_series_equal(result, expected) @pytest.mark.parametrize('input_vals', [ ([1, 2]), ([1.0, 2.0, np.nan]), (['1', '2']), (list(pd.date_range('1/1/2011', periods=2, freq='H'))), 
(list(pd.date_range('1/1/2011', periods=2, freq='H', tz='US/Eastern'))), ([pd.Interval(left=0, right=5)]), ]) def test_constructor_list_str(self, input_vals): # GH 16605 # Ensure that data elements from a list are converted to strings # when dtype is str, 'str', or 'U' for dtype in ['str', str, 'U']: result = Series(input_vals, dtype=dtype) expected = Series(input_vals).astype(dtype) assert_series_equal(result, expected) def test_constructor_generator(self): gen = (i for i in range(10)) result = Series(gen) exp = Series(lrange(10)) assert_series_equal(result, exp) gen = (i for i in range(10)) result = Series(gen, index=lrange(10, 20)) exp.index = lrange(10, 20) assert_series_equal(result, exp) def test_constructor_map(self): # GH8909 m = map(lambda x: x, range(10)) result = Series(m) exp = Series(lrange(10)) assert_series_equal(result, exp) m = map(lambda x: x, range(10)) result = Series(m, index=lrange(10, 20)) exp.index = lrange(10, 20) assert_series_equal(result, exp) def test_constructor_categorical(self): cat = pd.Categorical([0, 1, 2, 0, 1, 2], ['a', 'b', 'c'], fastpath=True) res = Series(cat) tm.assert_categorical_equal(res.values, cat) # GH12574 pytest.raises(ValueError, lambda: Series(pd.Categorical([1, 2, 3]), dtype='int64')) cat = Series(pd.Categorical([1, 2, 3]), dtype='category') assert is_categorical_dtype(cat) assert is_categorical_dtype(cat.dtype) s = Series([1, 2, 3], dtype='category') assert is_categorical_dtype(s) assert is_categorical_dtype(s.dtype) def test_constructor_categorical_with_coercion(self): factor = Categorical(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']) # test basic creation / coercion of categoricals s = Series(factor, name='A') assert s.dtype == 'category' assert len(s) == len(factor) str(s.values) str(s) # in a frame df = DataFrame({'A': factor}) result = df['A'] tm.assert_series_equal(result, s) result = df.iloc[:, 0] tm.assert_series_equal(result, s) assert len(df) == len(factor) str(df.values) str(df) df = DataFrame({'A': s}) 
result = df['A'] tm.assert_series_equal(result, s) assert len(df) == len(factor) str(df.values) str(df) # multiples df = DataFrame({'A': s, 'B': s, 'C': 1}) result1 = df['A'] result2 = df['B'] tm.assert_series_equal(result1, s) tm.assert_series_equal(result2, s, check_names=False) assert result2.name == 'B' assert len(df) == len(factor) str(df.values) str(df) # GH8623 x = DataFrame( [[1, 'John P. Doe'], [2, 'Jane Dove'], [1, 'John P. Doe']], columns=['person_id', 'person_name']) x['person_name'] = Categorical( x.person_name) # doing this breaks transform expected = x.iloc[0].person_name result = x.person_name.iloc[0] assert result == expected result = x.person_name[0] assert result == expected result = x.person_name.loc[0] assert result == expected def test_constructor_categorical_dtype(self): result = pd.Series(['a', 'b'], dtype=CategoricalDtype(['a', 'b', 'c'], ordered=True)) assert is_categorical_dtype(result) is True tm.assert_index_equal(result.cat.categories, pd.Index(['a', 'b', 'c'])) assert result.cat.ordered result = pd.Series(['a', 'b'], dtype=CategoricalDtype(['b', 'a'])) assert is_categorical_dtype(result) tm.assert_index_equal(result.cat.categories, pd.Index(['b', 'a'])) assert result.cat.ordered is False # GH 19565 - Check broadcasting of scalar with Categorical dtype result = Series('a', index=[0, 1], dtype=CategoricalDtype(['a', 'b'], ordered=True)) expected = Series(['a', 'a'], index=[0, 1], dtype=CategoricalDtype(['a', 'b'], ordered=True)) tm.assert_series_equal(result, expected, check_categorical=True) def test_categorical_sideeffects_free(self): # Passing a categorical to a Series and then changing values in either # the series or the categorical should not change the values in the # other one, IF you specify copy! 
cat = Categorical(["a", "b", "c", "a"]) s = Series(cat, copy=True) assert s.cat is not cat s.cat.categories = [1, 2, 3] exp_s = np.array([1, 2, 3, 1], dtype=np.int64) exp_cat = np.array(["a", "b", "c", "a"], dtype=np.object_) tm.assert_numpy_array_equal(s.__array__(), exp_s) tm.assert_numpy_array_equal(cat.__array__(), exp_cat) # setting s[0] = 2 exp_s2 = np.array([2, 2, 3, 1], dtype=np.int64) tm.assert_numpy_array_equal(s.__array__(), exp_s2) tm.assert_numpy_array_equal(cat.__array__(), exp_cat) # however, copy is False by default # so this WILL change values cat = Categorical(["a", "b", "c", "a"]) s = Series(cat) assert s.values is cat s.cat.categories = [1, 2, 3] exp_s = np.array([1, 2, 3, 1], dtype=np.int64) tm.assert_numpy_array_equal(s.__array__(), exp_s) tm.assert_numpy_array_equal(cat.__array__(), exp_s) s[0] = 2 exp_s2 = np.array([2, 2, 3, 1], dtype=np.int64) tm.assert_numpy_array_equal(s.__array__(), exp_s2) tm.assert_numpy_array_equal(cat.__array__(), exp_s2) def test_unordered_compare_equal(self): left = pd.Series(['a', 'b', 'c'], dtype=CategoricalDtype(['a', 'b'])) right = pd.Series( pd.Categorical(['a', 'b', np.nan], categories=['a', 'b'])) tm.assert_series_equal(left, right) def test_constructor_maskedarray(self): data = ma.masked_all((3, ), dtype=float) result = Series(data) expected = Series([nan, nan, nan]) assert_series_equal(result, expected) data[0] = 0.0 data[2] = 2.0 index = ['a', 'b', 'c'] result = Series(data, index=index) expected = Series([0.0, nan, 2.0], index=index) assert_series_equal(result, expected) data[1] = 1.0 result = Series(data, index=index) expected = Series([0.0, 1.0, 2.0], index=index) assert_series_equal(result, expected) data = ma.masked_all((3, ), dtype=int) result = Series(data) expected = Series([nan, nan, nan], dtype=float) assert_series_equal(result, expected) data[0] = 0 data[2] = 2 index = ['a', 'b', 'c'] result = Series(data, index=index) expected = Series([0, nan, 2], index=index, dtype=float) 
assert_series_equal(result, expected) data[1] = 1 result = Series(data, index=index) expected = Series([0, 1, 2], index=index, dtype=int) assert_series_equal(result, expected) data = ma.masked_all((3, ), dtype=bool) result = Series(data) expected = Series([nan, nan, nan], dtype=object) assert_series_equal(result, expected) data[0] = True data[2] = False index = ['a', 'b', 'c'] result = Series(data, index=index) expected = Series([True, nan, False], index=index, dtype=object) assert_series_equal(result, expected) data[1] = True result = Series(data, index=index) expected = Series([True, True, False], index=index, dtype=bool) assert_series_equal(result, expected) data = ma.masked_all((3, ), dtype='M8[ns]') result = Series(data) expected = Series([iNaT, iNaT, iNaT], dtype='M8[ns]') assert_series_equal(result, expected) data[0] = datetime(2001, 1, 1) data[2] = datetime(2001, 1, 3) index = ['a', 'b', 'c'] result = Series(data, index=index) expected = Series([datetime(2001, 1, 1), iNaT, datetime(2001, 1, 3)], index=index, dtype='M8[ns]') assert_series_equal(result, expected) data[1] = datetime(2001, 1, 2) result = Series(data, index=index) expected = Series( [datetime(2001, 1, 1), datetime(2001, 1, 2), datetime(2001, 1, 3)], index=index, dtype='M8[ns]') assert_series_equal(result, expected) def test_series_ctor_plus_datetimeindex(self): rng = date_range('20090415', '20090519', freq='B') data = {k: 1 for k in rng} result = Series(data, index=rng) assert result.index is rng def test_constructor_default_index(self): s = Series([0, 1, 2]) tm.assert_index_equal(s.index, pd.Index(np.arange(3))) @pytest.mark.parametrize('input', [[1, 2, 3], (1, 2, 3), list(range(3)), pd.Categorical(['a', 'b', 'a']), (i for i in range(3)), map(lambda x: x, range(3))]) def test_constructor_index_mismatch(self, input): # GH 19342 # test that construction of a Series with an index of different length # raises an error msg = 'Length of passed values is 3, index implies 4' with 
pytest.raises(ValueError, message=msg): Series(input, index=np.arange(4)) def test_constructor_numpy_scalar(self): # GH 19342 # construction with a numpy scalar # should not raise result = Series(np.array(100), index=np.arange(4), dtype='int64') expected = Series(100, index=np.arange(4), dtype='int64') tm.assert_series_equal(result, expected) def test_constructor_broadcast_list(self): # GH 19342 # construction with single-element container and index # should raise pytest.raises(ValueError, Series, ['foo'], index=['a', 'b', 'c']) def test_constructor_corner(self): df = tm.makeTimeDataFrame() objs = [df, df] s = Series(objs, index=[0, 1]) assert isinstance(s, Series) def test_constructor_sanitize(self): s = Series(np.array([1., 1., 8.]), dtype='i8') assert s.dtype == np.dtype('i8') s = Series(np.array([1., 1., np.nan]), copy=True, dtype='i8') assert s.dtype == np.dtype('f8') def test_constructor_copy(self): # GH15125 # test dtype parameter has no side effects on copy=True for data in [[1.], np.array([1.])]: x = Series(data) y = pd.Series(x, copy=True, dtype=float) # copy=True maintains original data in Series tm.assert_series_equal(x, y) # changes to origin of copy does not affect the copy x[0] = 2. assert not x.equals(y) assert x[0] == 2. assert y[0] == 1. 
@pytest.mark.parametrize("index", [ pd.date_range('20170101', periods=3, tz='US/Eastern'), pd.date_range('20170101', periods=3), pd.timedelta_range('1 day', periods=3), pd.period_range('2012Q1', periods=3, freq='Q'), pd.Index(list('abc')), pd.Int64Index([1, 2, 3]), pd.RangeIndex(0, 3) ], ids=lambda x: type(x).__name__) def test_constructor_limit_copies(self, index): # GH 17449 # limit copies of input s = pd.Series(index) # we make 1 copy; this is just a smoke test here assert s._data.blocks[0].values is not index def test_constructor_pass_none(self): s = Series(None, index=lrange(5)) assert s.dtype == np.float64 s = Series(None, index=lrange(5), dtype=object) assert s.dtype == np.object_ # GH 7431 # inference on the index s = Series(index=np.array([None])) expected = Series(index=Index([None])) assert_series_equal(s, expected) def test_constructor_pass_nan_nat(self): # GH 13467 exp = Series([np.nan, np.nan], dtype=np.float64) assert exp.dtype == np.float64 tm.assert_series_equal(Series([np.nan, np.nan]), exp) tm.assert_series_equal(Series(np.array([np.nan, np.nan])), exp) exp = Series([pd.NaT, pd.NaT]) assert exp.dtype == 'datetime64[ns]' tm.assert_series_equal(Series([pd.NaT, pd.NaT]), exp) tm.assert_series_equal(Series(np.array([pd.NaT, pd.NaT])), exp) tm.assert_series_equal(Series([pd.NaT, np.nan]), exp) tm.assert_series_equal(Series(np.array([pd.NaT, np.nan])), exp) tm.assert_series_equal(Series([np.nan, pd.NaT]), exp) tm.assert_series_equal(Series(np.array([np.nan, pd.NaT])), exp) def test_constructor_cast(self): pytest.raises(ValueError, Series, ['a', 'b', 'c'], dtype=float) def test_constructor_dtype_nocast(self): # 1572 s = Series([1, 2, 3]) s2 = Series(s, dtype=np.int64) s2[1] = 5 assert s[1] == 5 def test_constructor_datelike_coercion(self): # GH 9477 # incorrectly inferring on dateimelike looking when object dtype is # specified s = Series([Timestamp('20130101'), 'NOV'], dtype=object) assert s.iloc[0] == Timestamp('20130101') assert s.iloc[1] == 'NOV' 
assert s.dtype == object # the dtype was being reset on the slicing and re-inferred to datetime # even thought the blocks are mixed belly = '216 3T19'.split() wing1 = '2T15 4H19'.split() wing2 = '416 4T20'.split() mat = pd.to_datetime('2016-01-22 2019-09-07'.split()) df = pd.DataFrame({ 'wing1': wing1, 'wing2': wing2, 'mat': mat }, index=belly) result = df.loc['3T19'] assert result.dtype == object result = df.loc['216'] assert result.dtype == object def test_constructor_datetimes_with_nulls(self): # gh-15869 for arr in [ np.array([None, None, None, None, datetime.now(), None]), np.array([None, None, datetime.now(), None]) ]: result = Series(arr) assert result.dtype == 'M8[ns]' def test_constructor_dtype_datetime64(self): s = Series(iNaT, dtype='M8[ns]', index=lrange(5)) assert isna(s).all() # in theory this should be all nulls, but since # we are not specifying a dtype is ambiguous s = Series(iNaT, index=lrange(5)) assert not isna(s).all() s = Series(nan, dtype='M8[ns]', index=lrange(5)) assert isna(s).all() s = Series([datetime(2001, 1, 2, 0, 0), iNaT], dtype='M8[ns]') assert isna(s[1]) assert s.dtype == 'M8[ns]' s = Series([datetime(2001, 1, 2, 0, 0), nan], dtype='M8[ns]') assert isna(s[1]) assert s.dtype == 'M8[ns]' # GH3416 dates = [ np.datetime64(datetime(2013, 1, 1)), np.datetime64(datetime(2013, 1, 2)), np.datetime64(datetime(2013, 1, 3)), ] s = Series(dates) assert s.dtype == 'M8[ns]' s.iloc[0] = np.nan assert s.dtype == 'M8[ns]' # GH3414 related pytest.raises( TypeError, lambda x: Series(Series(dates).astype('int') / 1000000, dtype='M8[ms]')) pytest.raises(TypeError, lambda x: Series(dates, dtype='datetime64')) # invalid dates can be help as object result = Series([datetime(2, 1, 1)]) assert result[0] == datetime(2, 1, 1, 0, 0) result = Series([datetime(3000, 1, 1)]) assert result[0] == datetime(3000, 1, 1, 0, 0) # don't mix types result = Series([Timestamp('20130101'), 1], index=['a', 'b']) assert result['a'] == Timestamp('20130101') assert result['b'] == 
1 # GH6529 # coerce datetime64 non-ns properly dates = date_range('01-Jan-2015', '01-Dec-2015', freq='M') values2 = dates.view(np.ndarray).astype('datetime64[ns]') expected = Series(values2, index=dates) for dtype in ['s', 'D', 'ms', 'us', 'ns']: values1 = dates.view(np.ndarray).astype('M8[{0}]'.format(dtype)) result = Series(values1, dates) assert_series_equal(result, expected) # GH 13876 # coerce to non-ns to object properly expected = Series(values2, index=dates, dtype=object) for dtype in ['s', 'D', 'ms', 'us', 'ns']: values1 = dates.view(np.ndarray).astype('M8[{0}]'.format(dtype)) result = Series(values1, index=dates, dtype=object) assert_series_equal(result, expected) # leave datetime.date alone dates2 = np.array([d.date() for d in dates.to_pydatetime()], dtype=object) series1 = Series(dates2, dates) tm.assert_numpy_array_equal(series1.values, dates2) assert series1.dtype == object # these will correctly infer a datetime s = Series([None, pd.NaT, '2013-08-05 15:30:00.000001']) assert s.dtype == 'datetime64[ns]' s = Series([np.nan, pd.NaT, '2013-08-05 15:30:00.000001']) assert s.dtype == 'datetime64[ns]' s = Series([pd.NaT, None, '2013-08-05 15:30:00.000001']) assert s.dtype == 'datetime64[ns]' s = Series([pd.NaT, np.nan, '2013-08-05 15:30:00.000001']) assert s.dtype == 'datetime64[ns]' # tz-aware (UTC and other tz's) # GH 8411 dr = date_range('20130101', periods=3) assert Series(dr).iloc[0].tz is None dr = date_range('20130101', periods=3, tz='UTC') assert str(Series(dr).iloc[0].tz) == 'UTC' dr = date_range('20130101', periods=3, tz='US/Eastern') assert str(Series(dr).iloc[0].tz) == 'US/Eastern' # non-convertible s = Series([1479596223000, -1479590, pd.NaT]) assert s.dtype == 'object' assert s[2] is pd.NaT assert 'NaT' in str(s) # if we passed a NaT it remains s = Series([datetime(2010, 1, 1), datetime(2, 1, 1), pd.NaT]) assert s.dtype == 'object' assert s[2] is pd.NaT assert 'NaT' in str(s) # if we passed a nan it remains s = Series([datetime(2010, 1, 1), 
datetime(2, 1, 1), np.nan]) assert s.dtype == 'object' assert s[2] is np.nan assert 'NaN' in str(s) def test_constructor_with_datetime_tz(self): # 8260 # support datetime64 with tz dr = date_range('20130101', periods=3, tz='US/Eastern') s = Series(dr) assert s.dtype.name == 'datetime64[ns, US/Eastern]' assert s.dtype == 'datetime64[ns, US/Eastern]' assert is_datetime64tz_dtype(s.dtype) assert 'datetime64[ns, US/Eastern]' in str(s) # export result = s.values assert isinstance(result, np.ndarray) assert result.dtype == 'datetime64[ns]' exp = pd.DatetimeIndex(result) exp = exp.tz_localize('UTC').tz_convert(tz=s.dt.tz) tm.assert_index_equal(dr, exp) # indexing result = s.iloc[0] assert result == Timestamp('2013-01-01 00:00:00-0500', tz='US/Eastern', freq='D') result = s[0] assert result == Timestamp('2013-01-01 00:00:00-0500', tz='US/Eastern', freq='D') result = s[Series([True, True, False], index=s.index)] assert_series_equal(result, s[0:2]) result = s.iloc[0:1] assert_series_equal(result, Series(dr[0:1])) # concat result = pd.concat([s.iloc[0:1], s.iloc[1:]]) assert_series_equal(result, s) # short str assert 'datetime64[ns, US/Eastern]' in str(s) # formatting with NaT result = s.shift() assert 'datetime64[ns, US/Eastern]' in str(result) assert 'NaT' in str(result) # long str t = Series(date_range('20130101', periods=1000, tz='US/Eastern')) assert 'datetime64[ns, US/Eastern]' in str(t) result = pd.DatetimeIndex(s, freq='infer') tm.assert_index_equal(result, dr) # inference s = Series([ pd.Timestamp('2013-01-01 13:00:00-0800', tz='US/Pacific'), pd.Timestamp('2013-01-02 14:00:00-0800', tz='US/Pacific') ]) assert s.dtype == 'datetime64[ns, US/Pacific]' assert lib.infer_dtype(s) == 'datetime64' s = Series([ pd.Timestamp('2013-01-01 13:00:00-0800', tz='US/Pacific'), pd.Timestamp('2013-01-02 14:00:00-0800', tz='US/Eastern') ]) assert s.dtype == 'object' assert lib.infer_dtype(s) == 'datetime' # with all NaT s = Series(pd.NaT, index=[0, 1], dtype='datetime64[ns, 
US/Eastern]') expected = Series(pd.DatetimeIndex(['NaT', 'NaT'], tz='US/Eastern')) assert_series_equal(s, expected) @pytest.mark.parametrize("arr_dtype", [np.int64, np.float64]) @pytest.mark.parametrize("dtype", ["M8", "m8"]) @pytest.mark.parametrize("unit", ['ns', 'us', 'ms', 's', 'h', 'm', 'D']) def test_construction_to_datetimelike_unit(self, arr_dtype, dtype, unit): # tests all units # gh-19223 dtype = "{}[{}]".format(dtype, unit) arr = np.array([1, 2, 3], dtype=arr_dtype) s = Series(arr) result = s.astype(dtype) expected = Series(arr.astype(dtype)) tm.assert_series_equal(result, expected) @pytest.mark.parametrize('arg', ['2013-01-01 00:00:00', pd.NaT, np.nan, None]) def test_constructor_with_naive_string_and_datetimetz_dtype(self, arg): # GH 17415: With naive string result = Series([arg], dtype='datetime64[ns, CET]') expected = Series(pd.Timestamp(arg)).dt.tz_localize('CET') assert_series_equal(result, expected) def test_construction_interval(self): # construction from interval & array of intervals index = IntervalIndex.from_breaks(np.arange(3), closed='right') result = Series(index) repr(result) str(result) tm.assert_index_equal(Index(result.values), index) result = Series(index.values) tm.assert_index_equal(Index(result.values), index) def test_construction_consistency(self): # make sure that we are not re-localizing upon construction # GH 14928 s = Series(pd.date_range('20130101', periods=3, tz='US/Eastern')) result = Series(s, dtype=s.dtype) tm.assert_series_equal(result, s) result = Series(s.dt.tz_convert('UTC'), dtype=s.dtype) tm.assert_series_equal(result, s) result = Series(s.values, dtype=s.dtype) tm.assert_series_equal(result, s) def test_constructor_periodindex(self): # GH7932 # converting a PeriodIndex when put in a Series pi = period_range('20130101', periods=5, freq='D') s = Series(pi) expected = Series(pi.astype(object)) assert_series_equal(s, expected) assert s.dtype == 'object' def test_constructor_dict(self): d = {'a': 0., 'b': 1., 'c': 2.} 
result = Series(d, index=['b', 'c', 'd', 'a']) expected = Series([1, 2, nan, 0], index=['b', 'c', 'd', 'a']) assert_series_equal(result, expected) pidx = tm.makePeriodIndex(100) d = {pidx[0]: 0, pidx[1]: 1} result = Series(d, index=pidx) expected = Series(np.nan, pidx) expected.iloc[0] = 0 expected.iloc[1] = 1 assert_series_equal(result, expected) def test_constructor_dict_order(self): # GH19018 # initialization ordering: by insertion order if python>= 3.6, else # order by value d = {'b': 1, 'a': 0, 'c': 2} result = Series(d) if PY36: expected = Series([1, 0, 2], index=list('bac')) else: expected = Series([0, 1, 2], index=list('abc')) tm.assert_series_equal(result, expected) @pytest.mark.parametrize("value", [2, np.nan, None, float('nan')]) def test_constructor_dict_nan_key(self, value): # GH 18480 d = {1: 'a', value: 'b', float('nan'): 'c', 4: 'd'} result = Series(d).sort_values() expected = Series(['a', 'b', 'c', 'd'], index=[1, value, np.nan, 4]) assert_series_equal(result, expected) # MultiIndex: d = {(1, 1): 'a', (2, np.nan): 'b', (3, value): 'c'} result = Series(d).sort_values() expected = Series(['a', 'b', 'c'], index=Index([(1, 1), (2, np.nan), (3, value)])) assert_series_equal(result, expected) def test_constructor_dict_datetime64_index(self): # GH 9456 dates_as_str = ['1984-02-19', '1988-11-06', '1989-12-03', '1990-03-15'] values = [42544017.198965244, 1234565, 40512335.181958228, -1] def create_data(constructor): return dict(zip((constructor(x) for x in dates_as_str), values)) data_datetime64 = create_data(np.datetime64) data_datetime = create_data(lambda x: datetime.strptime(x, '%Y-%m-%d')) data_Timestamp = create_data(Timestamp) expected = Series(values, (Timestamp(x) for x in dates_as_str)) result_datetime64 = Series(data_datetime64) result_datetime = Series(data_datetime) result_Timestamp = Series(data_Timestamp) assert_series_equal(result_datetime64, expected) assert_series_equal(result_datetime, expected) assert_series_equal(result_Timestamp, 
expected) def test_constructor_list_of_tuples(self): data = [(1, 1), (2, 2), (2, 3)] s = Series(data) assert list(s) == data def test_constructor_tuple_of_tuples(self): data = ((1, 1), (2, 2), (2, 3)) s = Series(data) assert tuple(s) == data def test_constructor_dict_of_tuples(self): data = {(1, 2): 3, (None, 5): 6} result = Series(data).sort_values() expected = Series([3, 6], index=MultiIndex.from_tuples([(1, 2), (None, 5)])) tm.assert_series_equal(result, expected) def test_constructor_set(self): values = set([1, 2, 3, 4, 5]) pytest.raises(TypeError, Series, values) values = frozenset(values) pytest.raises(TypeError, Series, values) def test_fromDict(self): data = {'a': 0, 'b': 1, 'c': 2, 'd': 3} series = Series(data) assert tm.is_sorted(series.index) data = {'a': 0, 'b': '1', 'c': '2', 'd': datetime.now()} series = Series(data) assert series.dtype == np.object_ data = {'a': 0, 'b': '1', 'c': '2', 'd': '3'} series = Series(data) assert series.dtype == np.object_ data = {'a': '0', 'b': '1'} series = Series(data, dtype=float) assert series.dtype == np.float64 def test_fromValue(self): nans = Series(np.NaN, index=self.ts.index) assert nans.dtype == np.float_ assert len(nans) == len(self.ts) strings = Series('foo', index=self.ts.index) assert strings.dtype == np.object_ assert len(strings) == len(self.ts) d = datetime.now() dates = Series(d, index=self.ts.index) assert dates.dtype == 'M8[ns]' assert len(dates) == len(self.ts) # GH12336 # Test construction of categorical series from value categorical = Series(0, index=self.ts.index, dtype="category") expected = Series(0, index=self.ts.index).astype("category") assert categorical.dtype == 'category' assert len(categorical) == len(self.ts) tm.assert_series_equal(categorical, expected) def test_constructor_dtype_timedelta64(self): # basic td = Series([timedelta(days=i) for i in range(3)]) assert td.dtype == 'timedelta64[ns]' td = Series([timedelta(days=1)]) assert td.dtype == 'timedelta64[ns]' td = Series( 
[timedelta(days=1), timedelta(days=2), np.timedelta64(1, 's')]) assert td.dtype == 'timedelta64[ns]' # mixed with NaT td = Series([timedelta(days=1), NaT], dtype='m8[ns]') assert td.dtype == 'timedelta64[ns]' td = Series([timedelta(days=1), np.nan], dtype='m8[ns]') assert td.dtype == 'timedelta64[ns]' td = Series([np.timedelta64(300000000), pd.NaT], dtype='m8[ns]') assert td.dtype == 'timedelta64[ns]' # improved inference # GH5689 td = Series([np.timedelta64(300000000), NaT]) assert td.dtype == 'timedelta64[ns]' # because iNaT is int, not coerced to timedelta td = Series([np.timedelta64(300000000), iNaT]) assert td.dtype == 'object' td = Series([np.timedelta64(300000000), np.nan]) assert td.dtype == 'timedelta64[ns]' td = Series([pd.NaT, np.timedelta64(300000000)]) assert td.dtype == 'timedelta64[ns]' td = Series([np.timedelta64(1, 's')]) assert td.dtype == 'timedelta64[ns]' # these are frequency conversion astypes # for t in ['s', 'D', 'us', 'ms']: # pytest.raises(TypeError, td.astype, 'm8[%s]' % t) # valid astype td.astype('int64') # invalid casting pytest.raises(TypeError, td.astype, 'int32') # this is an invalid casting def f(): Series([timedelta(days=1), 'foo'], dtype='m8[ns]') pytest.raises(Exception, f) # leave as object here td = Series([timedelta(days=i) for i in range(3)] + ['foo']) assert td.dtype == 'object' # these will correctly infer a timedelta s = Series([None, pd.NaT, '1 Day']) assert s.dtype == 'timedelta64[ns]' s = Series([np.nan, pd.NaT, '1 Day']) assert s.dtype == 'timedelta64[ns]' s = Series([pd.NaT, None, '1 Day']) assert s.dtype == 'timedelta64[ns]' s = Series([pd.NaT, np.nan, '1 Day']) assert s.dtype == 'timedelta64[ns]' # GH 16406 def test_constructor_mixed_tz(self): s = Series( [Timestamp('20130101'), Timestamp('20130101', tz='US/Eastern')]) expected = Series( [Timestamp('20130101'), Timestamp('20130101', tz='US/Eastern')], dtype='object') assert_series_equal(s, expected) def test_NaT_scalar(self): series = Series([0, 1000, 2000, iNaT], 
dtype='M8[ns]') val = series[3] assert isna(val) series[2] = val assert isna(series[2]) def test_NaT_cast(self): # GH10747 result = Series([np.nan]).astype('M8[ns]') expected = Series([NaT]) assert_series_equal(result, expected) def test_constructor_name_hashable(self): for n in [777, 777., 'name', datetime(2001, 11, 11), (1, ), u"\u05D0"]: for data in [[1, 2, 3], np.ones(3), {'a': 0, 'b': 1}]: s = Series(data, name=n) assert s.name == n def test_constructor_name_unhashable(self): for n in [['name_list'], np.ones(2), {1: 2}]: for data in [['name_list'], np.ones(2), {1: 2}]: pytest.raises(TypeError, Series, data, name=n) def test_auto_conversion(self): series = Series(list(date_range('1/1/2000', periods=10))) assert series.dtype == 'M8[ns]' def test_convert_non_ns(self): # convert from a numpy array of non-ns timedelta64 arr = np.array([1, 2, 3], dtype='timedelta64[s]') s = Series(arr) expected = Series(pd.timedelta_range('00:00:01', periods=3, freq='s')) assert_series_equal(s, expected) # convert from a numpy array of non-ns datetime64 # note that creating a numpy datetime64 is in LOCAL time!!!! 
# seems to work for M8[D], but not for M8[s] s = Series( np.array(['2013-01-01', '2013-01-02', '2013-01-03'], dtype='datetime64[D]')) assert_series_equal( s, Series(date_range('20130101', periods=3, freq='D'))) # s = Series(np.array(['2013-01-01 00:00:01','2013-01-01 # 00:00:02','2013-01-01 00:00:03'],dtype='datetime64[s]')) # assert_series_equal(s,date_range('20130101 # 00:00:01',period=3,freq='s')) @pytest.mark.parametrize("index", [ date_range('1/1/2000', periods=10), timedelta_range('1 day', periods=10), period_range('2000-Q1', periods=10, freq='Q') ], ids=lambda x: type(x).__name__) def test_constructor_cant_cast_datetimelike(self, index): # floats are not ok msg = "Cannot cast {} to ".format(type(index).__name__) with tm.assert_raises_regex(TypeError, msg): Series(index, dtype=float) # ints are ok # we test with np.int64 to get similar results on # windows / 32-bit platforms result = Series(index, dtype=np.int64) expected = Series(index.astype(np.int64)) tm.assert_series_equal(result, expected) @pytest.mark.parametrize("index", [ date_range('1/1/2000', periods=10), timedelta_range('1 day', periods=10), period_range('2000-Q1', periods=10, freq='Q') ], ids=lambda x: type(x).__name__) def test_constructor_cast_object(self, index): s = Series(index, dtype=object) exp = Series(index).astype(object) tm.assert_series_equal(s, exp) s = Series(pd.Index(index, dtype=object), dtype=object) exp = Series(index).astype(object) tm.assert_series_equal(s, exp) s = Series(index.astype(object), dtype=object) exp = Series(index).astype(object) tm.assert_series_equal(s, exp) def test_constructor_generic_timestamp_deprecated(self): # see gh-15524 with tm.assert_produces_warning(FutureWarning): dtype = np.timedelta64 s = Series([], dtype=dtype) assert s.empty assert s.dtype == 'm8[ns]' with tm.assert_produces_warning(FutureWarning): dtype = np.datetime64 s = Series([], dtype=dtype) assert s.empty assert s.dtype == 'M8[ns]' # These timestamps have the wrong frequencies, # so an 
Exception should be raised now. msg = "cannot convert timedeltalike" with tm.assert_raises_regex(TypeError, msg): Series([], dtype='m8[ps]') msg = "cannot convert datetimelike" with tm.assert_raises_regex(TypeError, msg): Series([], dtype='M8[ps]') @pytest.mark.parametrize('dtype', [None, 'uint8', 'category']) def test_constructor_range_dtype(self, dtype): # GH 16804 expected = Series([0, 1, 2, 3, 4], dtype=dtype or 'int64') result = Series(range(5), dtype=dtype) tm.assert_series_equal(result, expected)
def test_unique_index(self): cases = [pd.Index([1, 2, 3]), pd.RangeIndex(0, 3)] for case in cases: self.assertTrue(case.is_unique) tm.assert_numpy_array_equal(case.duplicated(), np.array([False, False, False]))
def SWBDAnalysis(swbdCorpus,
                 speakerData,
                 labelWanted=None,
                 numberOfWords=5,
                 clusterKMean=0):
    """
    Build a per-speaker feature dataframe for the corpus and attach a label.

    The per-file dataframe (frequency analysis concatenated with the
    multi-dimension analysis) is cached as a pickle named "swbdDataframe"
    in the directory returned by ``getPathToSerialized()``; it is rebuilt
    only when that file does not exist. Rows are then averaged per speaker
    and a "label" column is added. The result is pickled to
    "swbdDataframeBySpeaker" in the same directory.

    :param swbdCorpus: corpus object; must provide ``getSpeakerByFile()``
    :param speakerData: mapping speaker -> mapping of info name -> value
        (presumably; only iteration and indexing are used here)
    :param labelWanted: info name used as label when ``clusterKMean == 0``
    :param numberOfWords: forwarded to ``freqAnalysis``
    :param clusterKMean: if 0 we will use label, if > 0 the label is the
        KMeans cluster (with that many clusters) of each speaker
    :return: tuple ``(dataframe, eachFilespeakerID)``
    """
    cachePath = path.join(getPathToSerialized(), "swbdDataframe")
    if not os.path.isfile(cachePath):
        frequencyAnalysis = freqAnalysis(swbdCorpus,
                                         numberOfWords=numberOfWords,
                                         printMostCommon=True)
        dimensionAnalysis = analysisInManyDimensions([swbdCorpus])
        dataframe = pd.concat([frequencyAnalysis, dimensionAnalysis],
                              axis=1,
                              sort=False)
        # context manager closes the handle even if pickling fails
        with open(cachePath, "wb") as f:
            pickle.dump(dataframe, f)
    else:
        # NOTE: pickle is only safe because this cache is written by this
        # function itself; never point it at an untrusted file.
        with open(cachePath, "rb") as f:
            dataframe = pickle.load(f)

    # grouping files by speaker
    eachFilespeakerID = swbdCorpus.getSpeakerByFile()
    dataframe["idSpeaker"] = eachFilespeakerID
    dataframe = dataframe.groupby(['idSpeaker']).mean()

    if clusterKMean == 0 and labelWanted is not None:
        # we cluster by labelWanted
        filteredSpeaker = {}
        for speaker in speakerData:
            for info in speakerData[speaker]:
                if info == labelWanted:
                    filteredSpeaker[speaker] = speakerData[speaker][info]
        # aligned on the speaker ids that currently form the index
        dataframe["label"] = pd.Series(filteredSpeaker)
        # dataframe.index = pd.RangeIndex(len(dataframe.index)) does not work
        dataframe.index = np.arange(len(dataframe))
    elif clusterKMean > 0:
        # restarting the indexes is important because the current indices
        # are speakers' identification numbers
        dataframe.index = pd.RangeIndex(len(dataframe.index))
        kmeanModel = KMeans(n_clusters=clusterKMean).fit(dataframe)
        # associate a cluster to each speaker
        clusters = kmeanModel.predict(dataframe)
        # going from a list to a dict and assigning
        dataframe["label"] = pd.Series(dict(enumerate(clusters)))
    else:
        print(
            "need either label wanted or clusterKMean in function SWBDAnalysisSpeakers"
        )

    with open(path.join(getPathToSerialized(), "swbdDataframeBySpeaker"),
              "wb") as f:
        pickle.dump(dataframe, f)
    return dataframe, eachFilespeakerID
def to_pandas(self):
    """Return a ``pandas.RangeIndex`` built from this object's start, stop and dtype."""
    bounds = dict(start=self._start, stop=self._stop)
    return pd.RangeIndex(dtype=self.dtype, **bounds)
def _find_mtx_annotation(directory, basename, kind):
    """Return the path of the ``kind`` ('barcodes' or 'genes') file next to an mtx, or None.

    Candidates are ``<kind>.<tsv|txt>`` and ``<basename><sep><kind>.<tsv|txt>``
    with ``sep`` in '.', '_', '-', each optionally gzip-compressed. When several
    candidates exist, the one found for the last separator/suffix combination
    wins; only that file is read by the caller.
    """
    import itertools

    match = None
    for sep, suffix in itertools.product(['.', '_', '-'], ['tsv', 'txt']):
        for prefix in ['', basename + sep]:
            candidate = os.path.join(directory, prefix + kind + '.' + suffix)
            if os.path.isfile(candidate):
                match = candidate
                break
            if os.path.isfile(candidate + '.gz'):
                match = candidate + '.gz'
                break
    return match


def _read_loom_attrs(group):
    """Load every dataset of an h5py attribute group into a DataFrame.

    Byte-string columns are converted to ``str``. The frame is indexed by the
    'id' column when present, otherwise by its only column if there is one.
    """
    meta = {}
    for key in group:
        values = group[key][()]
        if values.dtype.kind == 'S':
            values = values.astype(str)
        meta[key] = values
    meta = pd.DataFrame(data=meta)
    if meta.get('id') is not None:
        meta.set_index('id', inplace=True)
    elif meta.shape[1] == 1:
        meta.set_index(meta.columns[0], inplace=True)
    return meta


def _read_local_anndata(path, basename, ext, backed):
    """Read the local file ``path`` whose extension is ``ext`` (see ``read_anndata``)."""
    if ext == 'mtx':
        x = scipy.io.mmread(path)
        x = scipy.sparse.csr_matrix(x.T)
        # look for .barcodes.txt and .genes.txt
        directory = os.path.split(path)[0]
        obs_path = _find_mtx_annotation(directory, basename, 'barcodes')
        var_path = _find_mtx_annotation(directory, basename, 'genes')
        obs = None
        if obs_path is not None:
            obs = pd.read_csv(obs_path, index_col=0, sep='\t', header=None)
        var = None
        if var_path is not None:
            var = pd.read_csv(var_path, index_col=0, sep='\t', header=None)

        if var is None:
            print(basename + '.genes.txt not found')
            var = pd.DataFrame(index=pd.RangeIndex(start=0, stop=x.shape[1], step=1))
        if obs is None:
            print(basename + '.barcodes.txt not found')
            obs = pd.DataFrame(index=pd.RangeIndex(start=0, stop=x.shape[0], step=1))

        cell_count, gene_count = x.shape
        if len(obs) != cell_count:
            raise ValueError("Wrong number of cells : matrix has {} cells, barcodes file has {}" \
                             .format(cell_count, len(obs)))
        if len(var) != gene_count:
            raise ValueError("Wrong number of genes : matrix has {} genes, genes file has {}" \
                             .format(gene_count, len(var)))

        return anndata.AnnData(X=x, obs=obs, var=var)
    elif ext == 'h5':
        return sc.read_10x_h5(path, genome=None, gex_only=True)
    elif ext == 'npz':
        # np.load is lazy for .npz: pull the arrays out while the archive is open
        with np.load(path) as obj:
            x, rid, cid = obj['x'], obj['rid'], obj['cid']
        return anndata.AnnData(X=x, obs=pd.DataFrame(index=rid), var=pd.DataFrame(index=cid))
    elif ext == 'npy':
        x = np.load(path)
        return anndata.AnnData(
            X=x,
            obs=pd.DataFrame(index=pd.RangeIndex(start=0, stop=x.shape[0], step=1)),
            var=pd.DataFrame(index=pd.RangeIndex(start=0, stop=x.shape[1], step=1)))
    elif ext == 'loom':
        # in loom file, convention is rows are genes :(
        # return anndata.read_loom(path, X_name='matrix', sparse=True)
        with h5py.File(path, 'r') as f:
            x = f['/matrix']
            if x.attrs.get('sparse'):
                # read in blocks of 1000 rows
                shape = x.shape
                nrows = shape[0]
                blocks = [scipy.sparse.csr_matrix(x[start:min(nrows, start + 1000)])
                          for start in range(0, nrows, 1000)]
                # an empty matrix yields no blocks; vstack([]) would raise
                x = scipy.sparse.vstack(blocks) if blocks else scipy.sparse.csr_matrix(shape)
            else:
                x = x[()]
            row_meta = _read_loom_attrs(f['/row_attrs'])
            col_meta = _read_loom_attrs(f['/col_attrs'])
        return anndata.AnnData(X=x, obs=row_meta, var=col_meta)
    elif ext == 'h5ad':
        return anndata.read_h5ad(path, backed=backed)
    elif ext == 'hdf5':
        # 'h5' is handled above as a 10x file, so only 'hdf5' can reach here
        return anndata.read_hdf(path)
    elif ext == 'gct':
        return wot.io.read_gct(path)
    else:  # txt
        df = pd.read_csv(path, engine='python', header=0, sep=None, index_col=0)
        return anndata.AnnData(X=df.values, obs=pd.DataFrame(index=df.index),
                               var=pd.DataFrame(index=df.columns))


def read_anndata(path, backed=None):
    """Read a dataset into an ``anndata.AnnData`` (or whatever the format reader returns).

    The reader is chosen from the file extension: mtx, h5 (10x), npz, npy,
    loom, h5ad, hdf5, gct; anything else is parsed as delimited text.
    Paths starting with 'gs://' are first downloaded with ``download_gs_url``.

    Parameters
    ----------
    path : str or path-like
        Local path or 'gs://' URL of the dataset
    backed : optional
        Forwarded to ``anndata.read_h5ad``; ignored for other formats

    Returns
    -------
    The object produced by the matching reader

    Raises
    ------
    ValueError
        For mtx input when the barcodes/genes file length does not match
        the matrix shape
    """
    path = str(path)
    tmp_path = None
    if path.startswith('gs://'):
        tmp_path = download_gs_url(path)
        path = tmp_path
    basename_and_extension = get_filename_and_extension(path)
    ext = basename_and_extension[1]
    # a backed h5ad keeps reading from the file, so the download must survive
    keep_download = ext == 'h5ad' and bool(backed)
    try:
        return _read_local_anndata(path, basename_and_extension[0], ext, backed)
    finally:
        # remove the downloaded copy on success and on failure alike
        if tmp_path is not None and not keep_download and os.path.isfile(tmp_path):
            os.remove(tmp_path)
f.write('packageName=%s (%s) %s\n' % (row['title'], tag, cd)) f.write('text=' + ' '.join(urls) + '\n') f.write('comment=' + row['url'] + '\n') if nocds: filename = 'crawljob/hulkpop-%05d-%s.crawljob' % (row.name, tag) with open(filename, mode='w', encoding='utf8') as f: f.write('packageName=' + row['title'] + ' (' + tag + ')\n') f.write('text=' + ' '.join(nocds) + '\n') f.write('comment=' + row['url'] + '\n') error = False pbar.update(1) return bool(error or row['error']) df = dd.read_csv('out/hulkpop-links-*.csv', encoding='UTF-8', dtype='object', keep_default_na=False).compute() df = df[df['error'] != 'True'] df.index = pd.RangeIndex(0, len(df)) pbar = tqdm(total=len(df), ncols=80) errors = df[df.apply(create_createjob, axis=1)] errors.to_csv('errors.csv', encoding='UTF-8', index=False, line_terminator='\n')
This example shows how you can use selections and layers to create a multi-line tooltip that tracks the x position of the cursor. To find the x-position of the cursor, we employ a little trick: we add some transparent points with only an x encoding (no y encoding) and tie a *nearest* selection to these, tied to the "x" field. """ # category: interactive charts import altair as alt import pandas as pd import numpy as np np.random.seed(42) data = pd.DataFrame(np.cumsum(np.random.randn(100, 3), 0).round(2), columns=['A', 'B', 'C'], index=pd.RangeIndex(100, name='x')) data = data.reset_index().melt('x', var_name='category', value_name='y') # Create a selection that chooses the nearest point & selects based on x-value nearest = alt.selection(type='single', nearest=True, on='mouseover', fields=['x'], empty='none') # The basic line line = alt.Chart().mark_line(interpolate='basis').encode(x='x:Q', y='y:Q', color='category:N') # Transparent selectors across the chart. This is what tells us
], ], ) def test_infer_nullable_series_schema_statistics(null_index, series, expectation): """Test nullable series statistics are correctly inferred.""" series.iloc[null_index] = None statistics = schema_statistics.infer_series_statistics(series) assert statistics == expectation @pytest.mark.parametrize( "index, expectation", [ [ pd.RangeIndex(20), [{ "name": None, "pandas_dtype": PandasDtype.Int, "nullable": False, "checks": { "greater_than_or_equal_to": 0, "less_than_or_equal_to": 19, }, }], ], [ pd.Index([1, 2, 3], name="int_index"), [{ "name": "int_index", "pandas_dtype": PandasDtype.Int,
def compute_validation_summary(ot_model, day_triplets=None, interp_size=10000, compute_full_distances=False):
    """
    Compute the validation summary for the given OTModel

    For every day triplet (t0, t0.5, t1) the populations at t0 and t1 are
    used to build candidate populations for t0.5, and the earth mover
    distance between each candidate and the real t0.5 population is
    recorded as one row of the summary.

    The ``name`` column identifies the comparison:
    'P' between real batches, 'I' between interpolated and real batch,
    'F' between first and real, 'L' between last and real,
    'R' between random (no growth) and real,
    'Rg' between random (with growth) and real.
    With ``compute_full_distances`` the rows additionally carry
    ``full=True`` and include 'I1' / 'I2' (interpolated vs. the t0 / t1
    population) and 'A' (t0 vs. t1 population).

    Side effects: sets ``ot_model.day_pairs`` to the (t0, t1) pairs of the
    triplets, and writes a constant ``'covariate'`` column into
    ``ot_model.matrix.obs`` when ``ot_model.covariate_field`` is not
    already a column there.

    Parameters
    ----------
    ot_model : wot.OTModel
        The OTModel to validate
    day_triplets : list of (float, float, float)
        List of day triplets (t0, t0.5, t1) or None to use all consecutive triplets
    interp_size : int, optional
        The number of cells in the interpolated population
    compute_full_distances : bool
        Whether to compute full distances

    Returns
    -------
    validation_summary : pandas.DataFrame
        The validation summary

    Raises
    ------
    ValueError
        If the model has no covariate field and ``compute_full_distances``
        is False (nothing could be computed)
    """
    # NOTE(review): the column written is the literal 'covariate', not
    # ot_model.covariate_field -- confirm this is intended.
    if ot_model.covariate_field not in ot_model.matrix.obs:
        ot_model.matrix.obs['covariate'] = 1

    if day_triplets is None:
        # default: every run of three consecutive timepoints
        day_triplets = []
        unique_times = np.array(ot_model.timepoints)
        for i in range(len(unique_times) - 2):
            t0 = unique_times[i]
            t05 = unique_times[i + 1]
            t1 = unique_times[i + 2]
            day_triplets.append((t0, t05, t1))
    # restrict the model to the (t0, t1) pairs that are actually validated
    day_pairs = {}
    for triplet in day_triplets:
        day_pairs[(triplet[0], triplet[2])] = {}
    ot_model.day_pairs = day_pairs
    has_covariate = ot_model.covariate_field is not None
    if not has_covariate and not compute_full_distances:
        raise ValueError('No covariate specified. Please provide a covariate or compute full distances')
    summary_list = []
    # 'P': ["#e41a1c", "between real batches"],
    # 'I': ["#377eb8", "between interpolated and real batch"],
    # 'F': ["#4daf4a", "between first and real "],
    # 'L': ["#984ea3", "between last and real"],
    # 'R': ["#ff7f00", "between random (no growth) and real"],
    # 'Rg': ["#ffff33", "between random (with growth) and real"]
    local_pca = ot_model.ot_config['local_pca']
    for triplet in day_triplets:
        t0, t05, t1 = triplet
        # relative position of the middle timepoint inside [t0, t1]
        interp_frac = (t05 - t0) / (t1 - t0)

        p0_ds = ot_model.matrix[ot_model.matrix.obs[ot_model.day_field] == float(t0), :]
        p05_ds = ot_model.matrix[ot_model.matrix.obs[ot_model.day_field] == float(t05), :]
        p1_ds = ot_model.matrix[ot_model.matrix.obs[ot_model.day_field] == float(t1), :]

        if local_pca > 0:
            # dense copies of the t0 and t1 matrices, stacked below to build U
            matrices = list()
            matrices.append(p0_ds.X if not scipy.sparse.isspmatrix(p0_ds.X) else p0_ds.X.toarray())
            matrices.append(p1_ds.X if not scipy.sparse.isspmatrix(p1_ds.X) else p1_ds.X.toarray())
            p0_pca, p1_pca, pca, mean_shift = wot.ot.compute_pca(p0_ds.X, p1_ds.X, local_pca)
            p0_ds = anndata.AnnData(p0_pca, obs=p0_ds.obs,
                                    var=pd.DataFrame(index=pd.RangeIndex(start=0, stop=local_pca, step=1)))
            p1_ds = anndata.AnnData(p1_pca, obs=p1_ds.obs,
                                    var=pd.DataFrame(index=pd.RangeIndex(start=0, stop=local_pca, step=1)))
            # eigenvals is only defined on this branch; every later use is
            # guarded by the same `local_pca > 0` test
            eigenvals = np.diag(pca.singular_values_)
            # express the t0.5 population in the coordinates fitted on t0 and t1
            U = np.vstack(matrices).T.dot(pca.components_.T).dot(np.diag(1 / pca.singular_values_))
            y = p05_ds.X - mean_shift
            p05_ds = anndata.AnnData(np.diag(1 / pca.singular_values_).dot(U.T.dot(y.T)).T, obs=p05_ds.obs,
                                     var=pd.DataFrame(index=pd.RangeIndex(start=0, stop=local_pca, step=1)))

        if compute_full_distances:
            tmap_full = ot_model.compute_transport_map(t0, t1)

            # The default for pop2 is bound when the function is defined,
            # i.e. to this triplet's t0.5 population.
            def update_full_summary(pop, t, name, pop2=p05_ds.X):
                dist = wot.ot.earth_mover_distance(pop, pop2, eigenvals if local_pca > 0 else None)
                summary_list.append(
                    {'interval_start': t0,
                     'interval_mid': t05,
                     'interval_end': t1,
                     't0': t,
                     't1': t05,
                     'cv0': '',
                     'cv1': '',
                     'name': name,
                     'distance': dist,
                     'full': True})

            if ot_model.cell_growth_rate_field in ot_model.matrix.obs:
                r05_with_growth = wot.ot.interpolate_randomly_with_growth(p0_ds.X, p1_ds.X, interp_frac, interp_size,
                                                                          p0_ds.obs[
                                                                              ot_model.cell_growth_rate_field].values ** (
                                                                              interp_frac))
                update_full_summary(r05_with_growth, t05, 'Rg')

            r05_no_growth = wot.ot.interpolate_randomly(p0_ds.X, p1_ds.X, interp_frac, interp_size)
            update_full_summary(r05_no_growth, t05, 'R')

            try:
                i05 = wot.ot.interpolate_with_ot(p0_ds.X, p1_ds.X, tmap_full.X, interp_frac, interp_size)
                # TODO handle downsampling cells case
                update_full_summary(i05, t05, 'I')
                update_full_summary(i05, t05, 'I1', p0_ds.X)
                update_full_summary(i05, t05, 'I2', p1_ds.X)
            except ValueError:
                # best effort: a ValueError here silently drops the 'I',
                # 'I1' and 'I2' rows for this triplet
                pass

            update_full_summary(p0_ds.X, t0, 'F')
            update_full_summary(p1_ds.X, t1, 'L')
            update_full_summary(p0_ds.X, t1, 'A', p1_ds.X)

        if not has_covariate:
            continue

        # per-covariate comparisons; indexed by covariate value below
        p0 = wot.split_anndata(p0_ds, ot_model.covariate_field)
        p05 = wot.split_anndata(p05_ds, ot_model.covariate_field)
        p1 = wot.split_anndata(p1_ds, ot_model.covariate_field)
        for cv05 in p05.keys():
            p05_x = p05[cv05].X
            # covariates whose 'F' / 'L' row was already emitted for this cv05
            seen_first = set()
            seen_last = set()

            def distance_to_p05(pop, t, name, cv):
                dist = wot.ot.earth_mover_distance(pop, p05_x, eigenvals if local_pca > 0 else None)
                summary_list.append(
                    {'interval_start': t0,
                     'interval_mid': t05,
                     'interval_end': t1,
                     't0': t,
                     't1': t05,
                     'cv0': cv,
                     'cv1': cv05,
                     'name': name,
                     'distance': dist,
                     'full': False})

            # p05_x = wot.ot.pca_transform(pca, mean, p05[cv05].X)

            for cv05_2 in p05.keys():  # distance between batches
                if cv05_2 != cv05:
                    distance_to_p05(p05[cv05_2].X, t05, 'P', cv05_2)

            for cv0, cv1 in itertools.product(p0.keys(), p1.keys()):
                tmap = ot_model.compute_transport_map(t0, t1, covariate=(cv0, cv1))
                if tmap is None:
                    # no data for combination of day and covariate
                    continue
                # interp_size = (len(p0[cv0]) + len(p1[cv1])) / 2
                # pca, mean = wot.ot.get_pca(local_pca, p0[cv0].X, p1[cv1].X)
                # p0_x = wot.ot.pca_transform(pca, mean, p0[cv0].X)
                # p1_x = wot.ot.pca_transform(pca, mean, p1[cv1].X)
                p0_x = p0[cv0].X
                p1_x = p1[cv1].X
                i05 = wot.ot.interpolate_with_ot(p0_x, p1_x, tmap.X, interp_frac, interp_size)
                if ot_model.cell_growth_rate_field in ot_model.matrix.obs:
                    r05_with_growth = wot.ot.interpolate_randomly_with_growth(p0_x, p1_x, interp_frac, interp_size,
                                                                              p0[cv0].obs[
                                                                                  ot_model.cell_growth_rate_field].values ** (
                                                                                  interp_frac))
                    distance_to_p05(r05_with_growth, t05, 'Rg', (cv0, cv1))
                r05_no_growth = wot.ot.interpolate_randomly(p0_x, p1_x, interp_frac, interp_size)
                distance_to_p05(i05, t05, 'I', (cv0, cv1))
                distance_to_p05(r05_no_growth, t05, 'R', (cv0, cv1))

                # first/last comparisons are emitted once per matching covariate
                if cv0 == cv05 and cv0 not in seen_first:
                    seen_first.add(cv0)
                    distance_to_p05(p0_x, t0, 'F', cv0)
                if cv1 == cv05 and cv1 not in seen_last:
                    seen_last.add(cv1)
                    distance_to_p05(p1_x, t1, 'L', cv1)

                # if save_interpolated:
                #     prefix = os.path.join(tmap_dir, tmap_prefix)
                #     prefix += '_{}_{}_cv{}_cv{}'.format(t0, t1, cv0, cv1)
                #     wot.io.write_dataset(wot.dataset_from_x(i05),
                #                          prefix + '_interp.txt')
                #     wot.io.write_dataset(wot.dataset_from_x(r05),
                #                          prefix + '_random.txt')

    return pd.DataFrame(summary_list)
def test_to_frame_or_series(setup):
    """Check ``to_frame`` / ``to_series`` on Series and Index against pandas."""

    def fetch(tileable):
        # run the lazy object and bring the result back as a pandas object
        return tileable.execute().fetch()

    # --- Series.to_frame ---
    pd_series = pd.Series(np.random.rand(10), name='col')
    series = Series(pd_series)

    actual = fetch(series.to_frame())
    pd.testing.assert_frame_equal(pd_series.to_frame(), actual)

    actual = fetch(series.to_frame(name='new_name'))
    pd.testing.assert_frame_equal(pd_series.to_frame(name='new_name'), actual)

    # same conversion after a boolean filter
    series = series[series > 0.1]
    actual = fetch(series.to_frame(name='new_name'))
    pd.testing.assert_frame_equal(pd_series[pd_series > 0.1].to_frame(name='new_name'), actual)

    # --- flat Index: to_frame / to_series ---
    pd_index = pd.Index(np.random.rand(10), name='col')
    index = Index(pd_index)

    actual = fetch(index.to_frame())
    pd.testing.assert_frame_equal(pd_index.to_frame(), actual)

    actual = fetch(index.to_frame(index=False))
    pd.testing.assert_frame_equal(pd_index.to_frame(index=False), actual)

    actual = fetch(index.to_frame(name='new_name'))
    pd.testing.assert_frame_equal(pd_index.to_frame(name='new_name'), actual)

    actual = fetch(index.to_series())
    pd.testing.assert_series_equal(pd_index.to_series(), actual)

    actual = fetch(index.to_series(index=pd.RangeIndex(0, 10)))
    pd.testing.assert_series_equal(pd_index.to_series(index=pd.RangeIndex(0, 10)), actual)

    actual = fetch(index.to_series(name='new_name'))
    pd.testing.assert_series_equal(pd_index.to_series(name='new_name'), actual)

    # --- MultiIndex: to_frame / to_series ---
    pd_multi = pd.MultiIndex.from_tuples([('A', 'E'), ('B', 'F'), ('C', 'G')])
    index = Index(pd_multi, tupleize_cols=True)

    actual = fetch(index.to_frame())
    pd.testing.assert_frame_equal(pd_multi.to_frame(), actual)

    # a scalar name is rejected for a two-level index ...
    with pytest.raises(TypeError):
        index.to_frame(name='XY')

    # ... and so is a name list of the wrong length
    with pytest.raises(ValueError):
        index.to_frame(name=['X', 'Y', 'Z'])

    actual = fetch(index.to_frame(name=['X', 'Y']))
    pd.testing.assert_frame_equal(pd_multi.to_frame(name=['X', 'Y']), actual)

    actual = fetch(index.to_series(name='new_name'))
    pd.testing.assert_series_equal(pd_multi.to_series(name='new_name'), actual)
def execute(cls, ctx, op: 'DataFrameReadSQL'):
    """Run the SQL query for one output chunk of ``op`` and store the frame in ``ctx``.

    Builds a ``SELECT`` over ``op.columns`` (plus any index columns not already
    selected), restricts it by partition bounds or by limit/offset depending on
    ``op.method``, orders it deterministically, and reads it with
    ``pd.read_sql``. The engine created here is always disposed.
    """
    import sqlalchemy as sa

    def _adapt_datetime(value):
        # hand plain python datetimes to the driver
        if isinstance(value, np.datetime64):
            return value.astype('<M8[ms]').astype(datetime.datetime)
        if isinstance(value, pd.Timestamp):
            return value.to_pydatetime()
        return value

    out = op.outputs[0]

    engine = sa.create_engine(op.con, **(op.engine_kwargs or dict()))
    try:
        selectable, _ = op._get_selectable(engine)

        columns = [selectable.columns[name] for name in op.columns]
        column_names = set(op.columns)
        if op.index_col:
            # index columns are selected too unless already requested
            columns.extend(selectable.columns[name]
                           for name in op.index_col
                           if name not in column_names)

        # convert to python timestamp in case np / pd time types not handled
        op._low_limit = _adapt_datetime(op._low_limit)
        op._high_limit = _adapt_datetime(op._high_limit)

        query = sa.sql.select(columns)
        if op.method == 'partition':
            part_col = selectable.columns[op.partition_col]
            if op.left_end:
                criterion = part_col < op.high_limit
            elif op.right_end:
                criterion = part_col >= op.low_limit
            else:
                criterion = (part_col >= op.low_limit) & (part_col < op.high_limit)
            query = query.where(criterion)

        if hasattr(selectable, 'primary_key') and len(selectable.primary_key) > 0:
            # if table has primary key, sort as the order
            sort_keys = list(selectable.primary_key)
        elif op.index_col:
            # if no primary key, sort as the index_col
            sort_keys = [selectable.columns[name] for name in op.index_col]
        else:
            # at last, we sort by all the columns
            sort_keys = columns
        query = query.order_by(*sort_keys)

        if op.method == 'offset':
            query = query.limit(out.shape[0])
            if op.offset > 0:
                query = query.offset(op.offset)

        df = pd.read_sql(query, engine, index_col=op.index_col,
                         coerce_float=op.coerce_float,
                         parse_dates=op.parse_dates)
        if op.method == 'offset' and op.index_col is None and op.offset > 0:
            # shift the default index so it continues from the chunk's offset
            first = op.offset
            df.index = pd.RangeIndex(first, first + out.shape[0])
        ctx[out.key] = df
    finally:
        engine.dispose()
def __call__(self, test_rows, chunk_size):
    """Resolve the query metadata and create the (untiled) output dataframe.

    Opens a connection, normalizes ``index_col`` and ``columns`` to plain
    column names on the operand (keeping the SQLAlchemy column objects for
    the probing query), fetches sample rows through ``_collect_info`` and
    uses that sample to derive dtypes, index and column metadata.

    :param test_rows: forwarded to ``_collect_info`` for the probing query
    :param chunk_size: stored on the result as ``raw_chunk_size``
    :return: result of ``self.new_dataframe``
    :raises TypeError: if an ``index_col`` entry is neither a string nor a
        SQLAlchemy column/label, or if the partition column is neither
        numeric nor datetime
    """
    import sqlalchemy as sa
    from sqlalchemy.sql import elements

    with create_sa_connection(self._con, **(self._engine_kwargs or dict())) as con:
        self._con = str(con.engine.url)
        selectable, src_columns = self._get_selectable(con)

        # process index_col
        index_col = self._index_col
        if index_col is not None:
            if not isinstance(index_col, (list, tuple)):
                index_col = (index_col,)
            new_index_col = []
            sa_index_col = []
            for col in index_col:
                if isinstance(col, (sa.Column, elements.Label)):
                    new_index_col.append(col.name)
                    sa_index_col.append(col)
                elif isinstance(col, str):
                    sa_index_col.append(selectable.columns[col])
                    new_index_col.append(col)
                elif col is not None:
                    raise TypeError('unknown index_col type: {}'.format(type(col)))
            self._index_col = new_index_col
            index_col = sa_index_col

        # process columns
        columns = self._columns if self._columns is not None else src_columns
        new_columns = []
        sa_columns = []
        for col in columns:
            if isinstance(col, str):
                new_columns.append(col)
                sa_columns.append(selectable.columns[col])
            else:
                new_columns.append(col.name)
                sa_columns.append(col)
        self._columns = new_columns

        if self._index_col is not None:
            for icol in index_col:
                sa_columns.append(icol)

        test_df, shape = self._collect_info(con, selectable, sa_columns, test_rows)

        if self.method == 'partition':
            # the partition column lives in the index when it is an index column
            if not self.index_col or self.partition_col not in self.index_col:
                part_frame = test_df
            else:
                part_frame = test_df.index.to_frame()
            part_dtype = part_frame[self.partition_col].dtype
            if not issubclass(part_dtype.type, (np.number, np.datetime64)):
                # report the dtype from part_frame: looking the column up in
                # test_df raises KeyError when it is part of the index
                raise TypeError('Type of partition column should be numeric or datetime, '
                                'now it is %r' % part_dtype)

        if isinstance(test_df.index, pd.RangeIndex):
            # unknown row count (nan) is encoded as a RangeIndex of -1
            index_value = parse_index(pd.RangeIndex(shape[0] if not np.isnan(shape[0]) else -1),
                                      str(selectable), self._con)
        else:
            index_value = parse_index(test_df.index)
        columns_value = parse_index(test_df.columns, store_data=True)
        return self.new_dataframe(None, shape=shape, dtypes=test_df.dtypes,
                                  index_value=index_value,
                                  columns_value=columns_value,
                                  raw_chunk_size=chunk_size)
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import pandas as pd
from numpy.random import permutation, randn

from legate import pandas as lp
from tests.utils import equals

n = 17
# one contiguous index that does not start at zero, one shuffled index
indices = [pd.RangeIndex(1, n + 1), pd.Index(permutation(n))]

for index in indices:
    print(f"Index: {index}")
    df1 = pd.DataFrame({col: randn(n) for col in (1, 2, 5)}, index=index)
    ldf1 = lp.DataFrame(df1)
    df2 = pd.DataFrame({col: randn(n) for col in (1, 2, 5)}, index=index)

    # frame + frame: result must match pandas
    out_pd = df1 + df2
    out_lp = ldf1 + df2
    assert equals(out_lp, out_pd)

    # frame + raw ndarray: result must match pandas
    out_pd = df1 + df2.values
    out_lp = ldf1 + df2.values
    assert equals(out_lp, out_pd)
log.info(f"{tweet_text_dataframe.columns}\n") # Drop any NaN or empty Tweet rows in dataframe (or else CountVectorizer will blow up). tweet_text_dataframe = tweet_text_dataframe.dropna() # Print shape and column names. log.info( f"\nThe shape of the Tweet text dataframe with NaN (empty) rows dropped:") log.info(f"{tweet_text_dataframe.shape}\n") log.info( f"\nThe columns of the Tweet text dataframe with NaN (empty) rows dropped:" ) log.info(f"{tweet_text_dataframe.columns}\n") # Reindex everything. tweet_text_dataframe.index = pd.RangeIndex(len(tweet_text_dataframe.index)) # Assign column names. tweet_text_dataframe_column_names = [ 'text_derived', 'text_derived_preprocessed', 'text_derived_postprocessed' ] # Rename column in dataframe. tweet_text_dataframe.columns = tweet_text_dataframe_column_names # Create input feature. selected_features = tweet_text_dataframe[['text_derived_postprocessed']] processed_features = selected_features.copy() # Check what we are using as inputs. log.info(f"\nA sample Tweet in our input feature:")
class TestNumericArraylikeArithmeticWithTimedeltaLike(object):
    """Arithmetic between numeric array-likes and timedelta-like operands."""

    # TODO: also check name retention
    @pytest.mark.parametrize('box_cls', [np.array, pd.Index, pd.Series])
    @pytest.mark.parametrize('left', [
        pd.RangeIndex(10, 40, 10)] +
        [cls([10, 20, 30], dtype=dtype)
         for dtype in ['i1', 'i2', 'i4', 'i8',
                       'u1', 'u2', 'u4', 'u8',
                       'f2', 'f4', 'f8']
         for cls in [pd.Series, pd.Index]],
        ids=lambda x: type(x).__name__ + str(x.dtype))
    def test_mul_td64arr(self, left, box_cls):
        # GH#22390
        right = box_cls(np.array([1, 2, 3], dtype='m8[s]'))

        expected = pd.TimedeltaIndex(['10s', '40s', '90s'])
        # a Series on either side makes the result a Series
        series_involved = isinstance(left, pd.Series) or box_cls is pd.Series
        if series_involved:
            expected = pd.Series(expected)

        # multiplication must commute
        tm.assert_equal(left * right, expected)
        tm.assert_equal(right * left, expected)

    # TODO: also check name retention
    @pytest.mark.parametrize('box_cls', [np.array, pd.Index, pd.Series])
    @pytest.mark.parametrize('left', [
        pd.RangeIndex(10, 40, 10)] +
        [cls([10, 20, 30], dtype=dtype)
         for dtype in ['i1', 'i2', 'i4', 'i8',
                       'u1', 'u2', 'u4', 'u8',
                       'f2', 'f4', 'f8']
         for cls in [pd.Series, pd.Index]],
        ids=lambda x: type(x).__name__ + str(x.dtype))
    def test_div_td64arr(self, left, box_cls):
        # GH#22390
        right = box_cls(np.array([10, 40, 90], dtype='m8[s]'))

        expected = pd.TimedeltaIndex(['1s', '2s', '3s'])
        series_involved = isinstance(left, pd.Series) or box_cls is pd.Series
        if series_involved:
            expected = pd.Series(expected)

        tm.assert_equal(right / left, expected)
        tm.assert_equal(right // left, expected)

        # numeric divided by timedelta is not defined
        with pytest.raises(TypeError):
            left / right

        with pytest.raises(TypeError):
            left // right

    # TODO: de-duplicate with test_numeric_arr_mul_tdscalar
    def test_ops_series(self):
        # regression test for GH#8813
        one_day = Timedelta('1 day')
        multipliers = pd.Series([1, 2])
        expected = pd.Series(pd.to_timedelta(['1 day', '2 days']))
        tm.assert_series_equal(expected, one_day * multipliers)
        tm.assert_series_equal(expected, multipliers * one_day)

    # TODO: also test non-nanosecond timedelta64 and Tick objects;
    #  see test_numeric_arr_rdiv_tdscalar for note on these failing
    @pytest.mark.parametrize('scalar_td', [
        Timedelta(days=1),
        Timedelta(days=1).to_timedelta64(),
        Timedelta(days=1).to_pytimedelta()],
        ids=lambda x: type(x).__name__)
    def test_numeric_arr_mul_tdscalar(self, scalar_td, numeric_idx, box):
        # GH#19333
        boxed_index = tm.box_expected(numeric_idx, box)
        expected = tm.box_expected(pd.timedelta_range('0 days', '4 days'), box)

        tm.assert_equal(boxed_index * scalar_td, expected)

        # and with the operands swapped
        tm.assert_equal(scalar_td * boxed_index, expected)

    def test_numeric_arr_rdiv_tdscalar(self, three_days, numeric_idx, box):
        tick_without_index_box = (box is not pd.Index and
                                  isinstance(three_days, pd.offsets.Tick))
        if tick_without_index_box:
            raise pytest.xfail("Tick division not implemented")

        boxed_index = tm.box_expected(numeric_idx[1:3], box)
        expected = tm.box_expected(TimedeltaIndex(['3 Days', '36 Hours']), box)

        tm.assert_equal(three_days / boxed_index, expected)

        # the reverse division is not defined
        with pytest.raises(TypeError):
            boxed_index / three_days

    @pytest.mark.parametrize('other', [
        pd.Timedelta(hours=31),
        pd.Timedelta(hours=31).to_pytimedelta(),
        pd.Timedelta(hours=31).to_timedelta64(),
        pd.Timedelta(hours=31).to_timedelta64().astype('m8[h]'),
        np.timedelta64('NaT'),
        np.timedelta64('NaT', 'D'),
        pd.offsets.Minute(3),
        pd.offsets.Second(0)])
    def test_add_sub_timedeltalike_invalid(self, numeric_idx, other, box):
        left = tm.box_expected(numeric_idx, box)
        # addition and subtraction must fail in both operand orders
        attempts = [
            lambda: left + other,
            lambda: other + left,
            lambda: left - other,
            lambda: other - left,
        ]
        for attempt in attempts:
            with pytest.raises(TypeError):
                attempt()
def testFromTensorExecution(self):
    """Execute DataFrames built from tensors and compare with plain pandas."""

    def run_frame(frame):
        # execute a DataFrame and return the concatenated result
        return self.executor.execute_dataframe(frame, concat=True)[0]

    def run_tensor(t):
        # execute a tensor and return the concatenated result
        return self.executor.execute_tensor(t, concat=True)[0]

    frame_equal = pd.testing.assert_frame_equal
    index_equal = pd.testing.assert_index_equal

    # default index and columns
    tensor = mt.random.rand(10, 10, chunk_size=5)
    df = dataframe_from_tensor(tensor)
    tensor_res = run_tensor(tensor)
    pdf_expected = pd.DataFrame(tensor_res)
    df_result = run_frame(df)
    index_equal(df_result.index, pd.RangeIndex(0, 10))
    index_equal(df_result.columns, pd.RangeIndex(0, 10))
    frame_equal(df_result, pdf_expected)

    # test converted with specified index_value and columns
    tensor2 = mt.random.rand(2, 2, chunk_size=1)
    df2 = dataframe_from_tensor(
        tensor2, index=pd.Index(['a', 'b']), columns=pd.Index([3, 4]))
    df_result = run_frame(df2)
    index_equal(df_result.index, pd.Index(['a', 'b']))
    index_equal(df_result.columns, pd.Index([3, 4]))

    # test converted from 1-d tensor
    tensor3 = mt.array([1, 2, 3])
    df3 = dataframe_from_tensor(tensor3)
    result3 = run_frame(df3)
    pdf_expected = pd.DataFrame(np.array([1, 2, 3]))
    frame_equal(pdf_expected, result3)

    # test converted from identical chunks
    tensor4 = mt.ones((10, 10), chunk_size=3)
    df4 = dataframe_from_tensor(tensor4)
    result4 = run_frame(df4)
    pdf_expected = pd.DataFrame(run_tensor(tensor4))
    frame_equal(pdf_expected, result4)

    # from tensor with given index
    even_index = np.arange(0, 20, 2)
    tensor5 = mt.ones((10, 10), chunk_size=3)
    df5 = dataframe_from_tensor(tensor5, index=np.arange(0, 20, 2))
    result5 = run_frame(df5)
    pdf_expected = pd.DataFrame(run_tensor(tensor5), index=even_index)
    frame_equal(pdf_expected, result5)

    # from tensor with given index that is a tensor
    raw7 = np.random.rand(10, 10)
    tensor7 = mt.tensor(raw7, chunk_size=3)
    index_raw7 = np.random.rand(10)
    index7 = mt.tensor(index_raw7, chunk_size=4)
    df7 = dataframe_from_tensor(tensor7, index=index7)
    result7 = run_frame(df7)
    pdf_expected = pd.DataFrame(raw7, index=index_raw7)
    frame_equal(pdf_expected, result7)

    # from tensor with given index is a md.Index
    raw10 = np.random.rand(10, 10)
    tensor10 = mt.tensor(raw10, chunk_size=3)
    index10 = md.date_range('2020-1-1', periods=10, chunk_size=3)
    df10 = dataframe_from_tensor(tensor10, index=index10)
    result10 = run_frame(df10)
    pdf_expected = pd.DataFrame(
        raw10, index=pd.date_range('2020-1-1', periods=10))
    frame_equal(pdf_expected, result10)

    # from tensor with given columns
    letters = list('abcdefghij')
    tensor6 = mt.ones((10, 10), chunk_size=3)
    df6 = dataframe_from_tensor(tensor6, columns=list('abcdefghij'))
    result6 = run_frame(df6)
    pdf_expected = pd.DataFrame(run_tensor(tensor6), columns=letters)
    frame_equal(pdf_expected, result6)

    # from 1d tensors
    random_words = [
        ''.join(np.random.choice(list(printable), size=6))
        for _ in range(8)
    ]
    raws8 = [('a', np.random.rand(8)),
             ('b', np.random.randint(10, size=8)),
             ('c', random_words)]
    tensors8 = OrderedDict(
        (key, mt.tensor(values, chunk_size=3)) for key, values in raws8)
    # a scalar and a pandas DatetimeIndex are passed through un-tensorized
    raws8.append(('d', 1))
    raws8.append(('e', pd.date_range('2020-1-1', periods=8)))
    tensors8['d'] = 1
    tensors8['e'] = raws8[-1][1]
    df8 = dataframe_from_1d_tileables(
        tensors8, columns=[r[0] for r in raws8])
    result = run_frame(df8)
    pdf_expected = pd.DataFrame(OrderedDict(raws8))
    frame_equal(result, pdf_expected)

    # from 1d tensors and specify index with a tensor
    index_raw9 = np.random.rand(8)
    index9 = mt.tensor(index_raw9, chunk_size=4)
    df9 = dataframe_from_1d_tileables(
        tensors8, columns=[r[0] for r in raws8], index=index9)
    result = run_frame(df9)
    pdf_expected = pd.DataFrame(OrderedDict(raws8), index=index_raw9)
    frame_equal(result, pdf_expected)

    # from 1d tensors and specify index
    df11 = dataframe_from_1d_tileables(
        tensors8, columns=[r[0] for r in raws8],
        index=md.date_range('2020-1-1', periods=8))
    result = run_frame(df11)
    pdf_expected = pd.DataFrame(
        OrderedDict(raws8), index=pd.date_range('2020-1-1', periods=8))
    frame_equal(result, pdf_expected)
class TestAppend:
    """Behavioural tests for ``DataFrame.append``."""

    def test_append(self, sort, float_frame):
        """Row slices appended back together reproduce the source frame."""
        with_foo = float_frame.copy()
        with_foo["foo"] = "bar"

        head = float_frame.reindex(float_frame.index[:5])
        tail = float_frame.reindex(float_frame.index[5:])

        rejoined = head.append(tail)
        tm.assert_almost_equal(rejoined["A"], float_frame["A"])

        # column "A" must be kept when only one operand carries it
        del tail["A"]
        for first, second in ((head, tail), (tail, head)):
            partial_appended = first.append(second, sort=sort)
            assert "A" in partial_appended

        # mixed type handling
        rejoined = with_foo[:5].append(with_foo[5:])
        tm.assert_frame_equal(rejoined, with_foo)

        # what to test here
        mixed_then_float = with_foo[:5].append(float_frame[5:], sort=sort)
        float_then_mixed = float_frame[:5].append(with_foo[5:], sort=sort)

        # all equal except 'foo' column
        shared = ["A", "B", "C", "D"]
        tm.assert_frame_equal(
            mixed_then_float.reindex(columns=shared),
            float_then_mixed.reindex(columns=shared),
        )

    def test_append_empty(self, float_frame):
        """Appending to or from an empty frame returns an equal new object."""
        empty = DataFrame()

        frame_plus_empty = float_frame.append(empty)
        tm.assert_frame_equal(float_frame, frame_plus_empty)
        assert frame_plus_empty is not float_frame

        empty_plus_frame = empty.append(float_frame)
        tm.assert_frame_equal(float_frame, empty_plus_frame)
        assert empty_plus_frame is not float_frame

    def test_append_overlap_raises(self, float_frame):
        """``verify_integrity=True`` rejects overlapping index values."""
        with pytest.raises(ValueError, match="Indexes have overlapping values"):
            float_frame.append(float_frame, verify_integrity=True)

    def test_append_new_columns(self):
        """A Series row with an unseen label introduces a new column."""
        # see gh-6129: new columns
        df = DataFrame({"a": {"x": 1, "y": 2}, "b": {"x": 3, "y": 4}})
        row = Series([5, 6, 7], index=["a", "b", "c"], name="z")
        expected = DataFrame(
            {
                "a": {"x": 1, "y": 2, "z": 5},
                "b": {"x": 3, "y": 4, "z": 6},
                "c": {"z": 7},
            }
        )
        tm.assert_frame_equal(df.append(row), expected)

    def test_append_length0_frame(self, sort):
        """Appending onto a zero-length frame keeps that frame's columns."""
        no_rows = DataFrame(columns=["A", "B", "C"])
        two_rows = DataFrame(index=[0, 1], columns=["A", "B"])

        combined = no_rows.append(two_rows, sort=sort)

        expected = DataFrame(index=[0, 1], columns=["A", "B", "C"])
        tm.assert_frame_equal(combined, expected)

    def test_append_records(self):
        """Frames built from record arrays append like the arrays concatenate."""
        record_dtype = "i4,f4,a10"

        arr1 = np.zeros((2,), dtype=record_dtype)
        arr1[:] = [(1, 2.0, "Hello"), (2, 3.0, "World")]

        arr2 = np.zeros((3,), dtype=record_dtype)
        arr2[:] = [(3, 4.0, "foo"), (5, 6.0, "bar"), (7.0, 8.0, "baz")]

        result = DataFrame(arr1).append(DataFrame(arr2), ignore_index=True)
        expected = DataFrame(np.concatenate((arr1, arr2)))
        tm.assert_frame_equal(result, expected)

    # rewrite sort fixture, since we also want to test default of None
    def test_append_sorts(self, sort):
        """Column order follows ``sort``; no warning is emitted."""
        df1 = DataFrame({"a": [1, 2], "b": [1, 2]}, columns=["b", "a"])
        df2 = DataFrame({"a": [1, 2], "c": [3, 4]}, index=[2, 3])

        with tm.assert_produces_warning(None):
            result = df1.append(df2, sort=sort)

        # for None / True
        expected = DataFrame(
            {
                "b": [1, 2, None, None],
                "a": [1, 2, 1, 2],
                "c": [None, None, 3, 4],
            },
            columns=["a", "b", "c"],
        )
        if sort is False:
            expected = expected[["b", "a", "c"]]
        tm.assert_frame_equal(result, expected)

    def test_append_different_columns(self, sort):
        """Columns missing from one operand are filled with NA for its rows."""
        df = DataFrame(
            {
                "bools": np.random.randn(10) > 0,
                "ints": np.random.randint(0, 10, 10),
                "floats": np.random.randn(10),
                "strings": ["foo", "bar"] * 5,
            }
        )

        top = df[:5].loc[:, ["bools", "ints", "floats"]]
        bottom = df[5:].loc[:, ["strings", "ints", "floats"]]

        appended = top.append(bottom, sort=sort)
        assert isna(appended["strings"][0:4]).all()
        assert isna(appended["bools"][5:]).all()

    def test_append_many(self, sort, float_frame):
        """A list of frames can be appended in a single call."""
        bounds = [(None, 5), (5, 10), (10, 15), (15, None)]
        chunks = [float_frame[lo:hi] for lo, hi in bounds]

        result = chunks[0].append(chunks[1:])
        tm.assert_frame_equal(result, float_frame)

        # give only the last chunk an extra column
        chunks[-1] = chunks[-1].copy()
        chunks[-1]["foo"] = "bar"
        result = chunks[0].append(chunks[1:], sort=sort)
        tm.assert_frame_equal(result.loc[:, float_frame.columns], float_frame)
        assert (result["foo"][15:] == "bar").all()
        assert result["foo"][:15].isna().all()

    def test_append_preserve_index_name(self):
        """The index name survives appending onto an empty indexed frame."""
        # #980
        df1 = DataFrame(columns=["A", "B", "C"])
        df1 = df1.set_index(["A"])
        df2 = DataFrame(
            data=[[1, 4, 7], [2, 5, 8], [3, 6, 9]], columns=["A", "B", "C"]
        )
        df2 = df2.set_index(["A"])

        result = df1.append(df2)
        assert result.index.name == "A"

    # index flavours used as column labels by the parametrized tests below
    indexes_can_append = [
        pd.RangeIndex(3),
        Index([4, 5, 6]),
        Index([4.5, 5.5, 6.5]),
        Index(list("abc")),
        pd.CategoricalIndex("A B C".split()),
        pd.CategoricalIndex("D E F".split(), ordered=True),
        pd.IntervalIndex.from_breaks([7, 8, 9, 10]),
        pd.DatetimeIndex(
            [
                dt.datetime(2013, 1, 3, 0, 0),
                dt.datetime(2013, 1, 3, 6, 10),
                dt.datetime(2013, 1, 3, 7, 12),
            ]
        ),
        pd.MultiIndex.from_arrays(["A B C".split(), "D E F".split()]),
    ]

    @pytest.mark.parametrize(
        "index", indexes_can_append, ids=lambda x: type(x).__name__
    )
    def test_append_same_columns_type(self, index):
        """Frame and Series labels come from the same index, widths differ."""
        # GH18359
        full_index = index
        short_index = index[:2]
        row_labels = [0, 1, 2]

        # df wider than ser
        df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=full_index)
        ser = Series([7, 8], index=short_index, name=2)
        result = df.append(ser)
        expected = DataFrame(
            [[1, 2, 3.0], [4, 5, 6], [7, 8, np.nan]],
            index=row_labels,
            columns=full_index,
        )
        # integer dtype is preserved for columns present in ser.index
        assert expected.dtypes.iloc[0].kind == "i"
        assert expected.dtypes.iloc[1].kind == "i"

        tm.assert_frame_equal(result, expected)

        # ser wider than df
        df = DataFrame([[1, 2], [4, 5]], columns=short_index)
        ser = Series([7, 8, 9], index=full_index, name=2)
        result = df.append(ser)
        expected = DataFrame(
            [[1, 2, np.nan], [4, 5, np.nan], [7, 8, 9]],
            index=row_labels,
            columns=full_index,
        )
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize(
        "df_columns, series_index",
        combinations(indexes_can_append, r=2),
        ids=lambda x: type(x).__name__,
    )
    def test_append_different_columns_types(self, df_columns, series_index):
        """Frame columns and Series index are of different index flavours."""
        # GH18359
        # See also test 'test_append_different_columns_types_raises' below
        # for errors raised when appending

        df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=df_columns)
        ser = Series([7, 8, 9], index=series_index, name=2)

        result = df.append(ser)

        # expected columns: the frame's labels, then the labels only in ser
        only_in_ser = ser.index.difference(df_columns)
        combined_columns = Index(df_columns.tolist()).append(only_in_ser)
        missing = np.nan
        expected = DataFrame(
            [
                [1.0, 2.0, 3.0, missing, missing, missing],
                [4, 5, 6, missing, missing, missing],
                [missing, missing, missing, 7, 8, 9],
            ],
            index=[0, 1, 2],
            columns=combined_columns,
        )
        tm.assert_frame_equal(result, expected)

    def test_append_dtype_coerce(self, sort):
        """datetime64 columns stay datetime64 when one side lacks a column."""
        # GH 4993
        # appending with datetime will incorrectly convert datetime64

        df1 = DataFrame(
            index=[1, 2],
            data=[dt.datetime(2013, 1, 1, 0, 0), dt.datetime(2013, 1, 2, 0, 0)],
            columns=["start_time"],
        )
        df2 = DataFrame(
            index=[4, 5],
            data=[
                [dt.datetime(2013, 1, 3, 0, 0), dt.datetime(2013, 1, 3, 6, 10)],
                [dt.datetime(2013, 1, 4, 0, 0), dt.datetime(2013, 1, 4, 7, 10)],
            ],
            columns=["start_time", "end_time"],
        )

        end_times = Series(
            [
                pd.NaT,
                pd.NaT,
                dt.datetime(2013, 1, 3, 6, 10),
                dt.datetime(2013, 1, 4, 7, 10),
            ],
            name="end_time",
        )
        start_times = Series(
            [
                dt.datetime(2013, 1, 1, 0, 0),
                dt.datetime(2013, 1, 2, 0, 0),
                dt.datetime(2013, 1, 3, 0, 0),
                dt.datetime(2013, 1, 4, 0, 0),
            ],
            name="start_time",
        )
        expected = concat([end_times, start_times], axis=1, sort=sort)

        result = df1.append(df2, ignore_index=True, sort=sort)

        # sorted output is alphabetical, unsorted keeps df1's column first
        if sort:
            column_order = ["end_time", "start_time"]
        else:
            column_order = ["start_time", "end_time"]
        expected = expected[column_order]
        tm.assert_frame_equal(result, expected)

    def test_append_missing_column_proper_upcast(self, sort):
        """Columns padded with NA upcast: int64 -> float64, bool -> object."""
        ints = DataFrame({"A": np.array([1, 2, 3, 4], dtype="i8")})
        bools = DataFrame({"B": np.array([True, False, True, False], dtype=bool)})

        appended = ints.append(bools, ignore_index=True, sort=sort)
        assert appended["A"].dtype == "f8"
        assert appended["B"].dtype == "O"

    def test_append_empty_frame_to_series_with_dateutil_tz(self):
        """A Series holding a dateutil-tz Timestamp appends to an empty frame."""
        # GH 23682
        date = Timestamp("2018-10-24 07:30:00", tz=dateutil.tz.tzutc())
        ser = Series({"a": 1.0, "b": 2.0, "date": date})
        df = DataFrame(columns=["c", "d"])
        all_columns = ["c", "d", "a", "b", "date"]
        row_values = [np.nan, np.nan, 1.0, 2.0, date]

        result_a = df.append(ser, ignore_index=True)
        expected = DataFrame([row_values], columns=all_columns)
        # These columns get cast to object after append
        for col in ("c", "d"):
            expected[col] = expected[col].astype(object)
        tm.assert_frame_equal(result_a, expected)

        expected = DataFrame([row_values] * 2, columns=all_columns)
        for col in ("c", "d"):
            expected[col] = expected[col].astype(object)

        result_b = result_a.append(ser, ignore_index=True)
        tm.assert_frame_equal(result_b, expected)

        result = df.append([ser, ser], ignore_index=True)
        tm.assert_frame_equal(result, expected)

    def test_append_empty_tz_frame_with_datetime64ns(self):
        """NaT appended to an empty tz-aware column gives an object column."""
        # https://github.com/pandas-dev/pandas/issues/35460
        df = DataFrame(columns=["a"]).astype("datetime64[ns, UTC]")

        # pd.NaT gets inferred as tz-naive, so append result is tz-naive
        result = df.append({"a": pd.NaT}, ignore_index=True)
        expected = DataFrame({"a": [pd.NaT]}).astype(object)
        tm.assert_frame_equal(result, expected)

        # also test with typed value to append
        df = DataFrame(columns=["a"]).astype("datetime64[ns, UTC]")
        other = Series({"a": pd.NaT}, dtype="datetime64[ns]")
        result = df.append(other, ignore_index=True)
        expected = DataFrame({"a": [pd.NaT]}).astype(object)
        tm.assert_frame_equal(result, expected)

        # mismatched tz
        other = Series({"a": pd.NaT}, dtype="datetime64[ns, US/Pacific]")
        result = df.append(other, ignore_index=True)
        expected = DataFrame({"a": [pd.NaT]}).astype(object)
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize(
        "dtype_str", ["datetime64[ns, UTC]", "datetime64[ns]", "Int64", "int64"]
    )
    @pytest.mark.parametrize("val", [1, "NaT"])
    def test_append_empty_frame_with_timedelta64ns_nat(self, dtype_str, val):
        """timedelta64 values appended to an empty typed frame become object."""
        # https://github.com/pandas-dev/pandas/issues/35460
        df = DataFrame(columns=["a"]).astype(dtype_str)
        other = DataFrame({"a": [np.timedelta64(val, "ns")]})

        result = df.append(other, ignore_index=True)

        tm.assert_frame_equal(result, other.astype(object))

    @pytest.mark.parametrize(
        "dtype_str", ["datetime64[ns, UTC]", "datetime64[ns]", "Int64", "int64"]
    )
    @pytest.mark.parametrize("val", [1, "NaT"])
    def test_append_frame_with_timedelta64ns_nat(self, dtype_str, val):
        """timedelta64 values appended to a non-empty typed frame become object."""
        # https://github.com/pandas-dev/pandas/issues/35460
        df = DataFrame({"a": pd.array([1], dtype=dtype_str)})
        other = DataFrame({"a": [np.timedelta64(val, "ns")]})

        result = df.append(other, ignore_index=True)

        both_values = [df.iloc[0, 0], other.iloc[0, 0]]
        expected = DataFrame({"a": both_values}, dtype=object)
        tm.assert_frame_equal(result, expected)
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from pandas import ExcelWriter
import xlrd
from scipy.cluster.hierarchy import fcluster
from scipy.cluster.hierarchy import linkage
from scipy.cluster.hierarchy import dendrogram

# Load the raw price table; the "날짜" column holds the row's date.
df = pd.read_excel('C:/Users/user/Desktop/price all 2000x2000.xlsx')

# Keep only the rows dated between 2018.08.01 and 2018.09.01 (inclusive).
df = df[df.날짜 <= '2018.09.01']
df = df[df.날짜 >= '2018.08.01']

# Renumber the surviving rows from 0 and drop the date column so that only
# the price columns remain.
df.index = pd.RangeIndex(len(df.index))
df = df.drop(columns="날짜")

# The context manager saves and closes the workbook even if writing fails
# (ExcelWriter.save() no longer exists in pandas >= 2.0).
with ExcelWriter('all_price.xlsx') as writer:
    df.to_excel(writer, sheet_name='Sheet1')

# Pairwise Pearson correlation between the remaining columns.
corr = df.corr(method='pearson')

with ExcelWriter('corr.xlsx') as writer:
    corr.to_excel(writer, sheet_name='Sheet1')
def _nonempty_index(idx): typ = type(idx) if typ is pd.RangeIndex: return pd.RangeIndex(2, name=idx.name) elif typ in _numeric_index_types: return typ([1, 2], name=idx.name) elif typ is pd.Index: return pd.Index(["a", "b"], name=idx.name) elif typ is pd.DatetimeIndex: start = "1970-01-01" # Need a non-monotonic decreasing index to avoid issues with # partial string indexing see https://github.com/dask/dask/issues/2389 # and https://github.com/pandas-dev/pandas/issues/16515 # This doesn't mean `_meta_nonempty` should ever rely on # `self.monotonic_increasing` or `self.monotonic_decreasing` try: return pd.date_range(start=start, periods=2, freq=idx.freq, tz=idx.tz, name=idx.name) except ValueError: # older pandas versions data = [start, "1970-01-02"] if idx.freq is None else None return pd.DatetimeIndex(data, start=start, periods=2, freq=idx.freq, tz=idx.tz, name=idx.name) elif typ is pd.PeriodIndex: return pd.period_range(start="1970-01-01", periods=2, freq=idx.freq, name=idx.name) elif typ is pd.TimedeltaIndex: start = np.timedelta64(1, "D") try: return pd.timedelta_range(start=start, periods=2, freq=idx.freq, name=idx.name) except ValueError: # older pandas versions start = np.timedelta64(1, "D") data = [start, start + 1] if idx.freq is None else None return pd.TimedeltaIndex(data, start=start, periods=2, freq=idx.freq, name=idx.name) elif typ is pd.CategoricalIndex: if len(idx.categories) == 0: data = pd.Categorical(_nonempty_index(idx.categories), ordered=idx.ordered) else: data = pd.Categorical.from_codes([-1, 0], categories=idx.categories, ordered=idx.ordered) return pd.CategoricalIndex(data, name=idx.name) elif typ is pd.MultiIndex: levels = [_nonempty_index(l) for l in idx.levels] codes = [[0, 0] for i in idx.levels] try: return pd.MultiIndex(levels=levels, codes=codes, names=idx.names) except TypeError: # older pandas versions return pd.MultiIndex(levels=levels, labels=codes, names=idx.names) raise TypeError("Don't know how to handle index of type 
{0}".format( typename(type(idx))))
def read_charge(self, name, iscf=Opt.ANY, imd=Opt.ANY, key_scf="scf", as_dataframe=False):
    r"""Read charges calculated in SCF loop or MD loop (or both)

    Siesta enables many different modes of writing out charges.

    NOTE: currently Mulliken charges are not implemented.

    The below table shows a list of different cases that
    may be encountered, the letters are referred to in the
    return section to indicate what is returned.

    +-----------+-----+-----+--------+-------+------------------+
    | Case      | *A* | *B* | *C*    | *D*   | *E*              |
    +-----------+-----+-----+--------+-------+------------------+
    | Charge    | MD  | SCF | MD+SCF | Final | Orbital resolved |
    +-----------+-----+-----+--------+-------+------------------+
    | Voronoi   | +   | +   | +      | +     | -                |
    +-----------+-----+-----+--------+-------+------------------+
    | Hirshfeld | +   | +   | +      | +     | -                |
    +-----------+-----+-----+--------+-------+------------------+
    | Mulliken  | +   | +   | +      | +     | +                |
    +-----------+-----+-----+--------+-------+------------------+

    Notes
    -----
    Errors will be raised if one requests information not present. I.e.
    passing an integer or `Opt.ALL` for `iscf` will raise an error if
    the SCF charges are not present. For `Opt.ANY` it will return
    the most information, effectively SCF will be returned if present.

    Currently Mulliken is not implemented, any help in reading this would be
    very welcome.

    Parameters
    ----------
    name: {"voronoi", "hirshfeld"}
        the name of the charges that you want to read
    iscf: int or Opt, optional
        index (0-based) of the scf iteration you want the charges for.
        If the enum specifier `Opt.ANY` or `Opt.ALL` are used, then
        the returned quantities depend on what is present.
        If ``None/Opt.NONE`` it will not return any SCF charges.
        If both `imd` and `iscf` are ``None`` then only the final charges will be returned.
    imd: int or Opt, optional
        index (0-based) of the md step you want the charges for.
        If the enum specifier `Opt.ANY` or `Opt.ALL` are used, then
        the returned quantities depend on what is present.
        If ``None/Opt.NONE`` it will not return any MD charges.
        If both `imd` and `iscf` are ``None`` then only the final charges will be returned.
    key_scf : str, optional
        the key lookup for the scf iterations (a ":" will automatically be appended)
    as_dataframe: boolean, optional
        whether charges should be returned as a pandas dataframe.

    Returns
    -------
    numpy.ndarray
        if a specific MD+SCF index is requested (or special cases where output is
        not complete)
    list of numpy.ndarray
        if either `iscf` or `imd` is different from ``None/Opt.NONE``.
    pandas.DataFrame
        if `as_dataframe` is requested. The dataframe will have multi-indices if multiple
        SCF or MD steps are requested.

    Raises
    ------
    ValueError
        if `name` is not one of the known charge names
    NotImplementedError
        if Mulliken charges are requested and found in the output
    SileError
        if the requested charges are not present in the file, or the
        `iscf`/`imd` flags cannot be honoured
    """
    if not hasattr(self, 'fh'):
        # The file is not opened yet: open it and re-enter this method.
        # (The bare name `read_charge` is not in scope inside a method, the
        # call has to go through `self`.)
        with self:
            return self.read_charge(name, iscf, imd, key_scf, as_dataframe)

    # names accepted for the `name` argument (used in the error message)
    known_charges = ["voronoi", "hirshfeld", "mulliken"]

    namel = name.lower()
    if as_dataframe:
        import pandas as pd

        def _empty_charge():
            # build a fake dataframe with no indices
            return pd.DataFrame(index=pd.Index([], name="atom", dtype=np.int32),
                                dtype=np.float32)
    else:
        pd = None

        def _empty_charge():
            # return for single value with nan values
            return _a.arrayf([[None]])

    # define helper function for reading voronoi+hirshfeld charges
    def _voronoi_hirshfeld_charges():
        """ Read output from Voronoi/Hirshfeld charges """
        nonlocal pd

        # Expecting something like this:
        # Voronoi Atomic Populations:
        # Atom #     dQatom  Atom pop         S        Sx        Sy        Sz  Species
        #      1   -0.02936   4.02936   0.00000  -0.00000   0.00000   0.00000  C

        # Define the function that parses the charges
        def _parse_charge(line):
            atom_idx, *vals, symbol = line.split()
            # assert that this is a proper line
            # this should catch cases where the following line of charge output
            # is still parseable
            atom_idx = int(atom_idx)
            return list(map(float, vals))

        # first line is the header
        header = (self.readline()
                  .replace("dQatom", "dq")  # dQatom in master
                  .replace(" Qatom", " dq")  # Qatom in 4.1
                  .replace("Atom pop", "e")  # not found in 4.1
                  .split())[2:-1]

        # We have found the header, prepare a list to read the charges
        atom_charges = []
        while True:
            line = self.readline()
            try:
                charge_vals = _parse_charge(line)
            except ValueError:
                # Too few fields (blank line / end of file) or a non-numeric
                # field: the charge table has ended.
                break
            atom_charges.append(charge_vals)

        if pd is None:
            # not as_dataframe
            return _a.arrayf(atom_charges)

        # determine how many columns we have
        # this will remove atom indices and species, so only inside
        ncols = len(atom_charges[0])
        assert ncols == len(header)

        # the precision is limited, so no need for double precision
        return pd.DataFrame(atom_charges, columns=header, dtype=np.float32,
                            index=pd.RangeIndex(stop=len(atom_charges), name="atom"))

    # define helper function for reading mulliken charges
    def _mulliken_charges():
        """ Read output from Mulliken charges """
        raise NotImplementedError("Mulliken charges are not implemented currently")

    # Check that a known charge has been requested
    if namel == "voronoi":
        _r_charge = _voronoi_hirshfeld_charges
        charge_keys = ["Voronoi Atomic Populations",
                       "Voronoi Net Atomic Populations"]
    elif namel == "hirshfeld":
        _r_charge = _voronoi_hirshfeld_charges
        charge_keys = ["Hirshfeld Atomic Populations",
                       "Hirshfeld Net Atomic Populations"]
    elif namel == "mulliken":
        _r_charge = _mulliken_charges
        charge_keys = ["mulliken: Atomic and Orbital Populations"]
    else:
        raise ValueError(f"{self.__class__.__name__}.read_charge name argument should be one of {known_charges}, got {name}?")

    # Ensure the key_scf matches exactly (prepend a space)
    key_scf = f" {key_scf.strip()}:"

    # Reading charges may be quite time consuming for large MD simulations.

    # to see if we finished a MD read, we check for these keys
    search_keys = [
        # two keys can signal ending SCF
        "SCF Convergence", "SCF_NOT_CONV",
        "siesta: Final energy",
        key_scf,
        *charge_keys
    ]
    # adjust the below while loop to take into account any additional
    # segments of search_keys
    IDX_SCF_END = [0, 1]
    IDX_FINAL = [2]
    IDX_SCF = [3]
    # the rest are charge keys
    IDX_CHARGE = list(range(len(search_keys) - len(charge_keys),
                            len(search_keys)))

    # state to figure out where we are
    state = PropertyDict()
    state.INITIAL = 0
    state.MD = 1
    state.SCF = 2
    state.CHARGE = 3
    state.FINAL = 4

    # a list of scf_charge
    md_charge = []
    md_scf_charge = []
    scf_charge = []
    final_charge = None

    # signal that any first reads are INITIAL charges
    current_state = state.INITIAL
    charge = _empty_charge()
    FOUND_SCF = False
    FOUND_MD = False
    FOUND_FINAL = False

    # TODO whalrus
    ret = self.step_to(search_keys, case=True, ret_index=True, reread=False)
    while ret[0]:
        # ret[2] is the index into search_keys of the key that matched
        if ret[2] in IDX_SCF_END:
            # we finished all SCF iterations
            current_state = state.MD
            md_scf_charge.append(scf_charge)
            scf_charge = []

        elif ret[2] in IDX_SCF:
            current_state = state.SCF
            # collect scf-charges (possibly none)
            scf_charge.append(charge)

        elif ret[2] in IDX_FINAL:
            current_state = state.FINAL
            # don't do anything, this is the final charge construct
            # regardless of where it comes from.

        elif ret[2] in IDX_CHARGE:
            # also read charge
            charge = _r_charge()

            if state.INITIAL == current_state or state.CHARGE == current_state:
                # this signals scf charges
                FOUND_SCF = True
                # There *could* be 2 steps if we are mixing H,
                # this is because it first does
                # compute H -> compute DM -> compute H
                # in the first iteration, subsequently we only do
                # compute compute DM -> compute H
                # once we hit ret[2] in IDX_SCF we will append
                scf_charge = []

            elif state.MD == current_state:
                FOUND_MD = True
                # we just finished an SCF cycle.
                # So any output between SCF ending and
                # a new one beginning *must* be that geometries
                # charge

                # Here `charge` may be NONE signalling
                # we don't have charge in MD steps.
                md_charge.append(charge)

                # reset charge
                charge = _empty_charge()

            elif state.SCF == current_state:
                FOUND_SCF = True

            elif state.FINAL == current_state:
                FOUND_FINAL = True
                # a special state writing out the charges after everything
                final_charge = charge
                charge = _empty_charge()
                scf_charge = []
                # we should be done and no other charge reads should be found!
                # should we just break?

            current_state = state.CHARGE

        # step to next entry
        ret = self.step_to(search_keys, case=True, ret_index=True, reread=False)

    if not any((FOUND_SCF, FOUND_MD, FOUND_FINAL)):
        raise SileError(f"{str(self)} does not contain any charges ({name})")

    # if the scf-charges are not stored, it means that the MD step finalization
    # has not been read. So correct
    if len(scf_charge) > 0:
        # NOTE(review): this assert makes the recovery code below unreachable
        # unless Python runs with -O; kept as in the original.
        assert False, "this test shouldn't reach here"
        # we must not have read through the entire MD step
        # so this has to be a running simulation
        if charge is not None:
            scf_charge.append(charge)
            charge = _empty_charge()
        md_scf_charge.append(scf_charge)

    # otherwise there is some *parsing* error, so for now we use assert
    assert len(scf_charge) == 0

    if as_dataframe:
        # convert data to proper data structures
        # regardless of user requests. This is an overhead... But probably not that big of a problem.
        if FOUND_SCF:
            md_scf_charge = pd.concat([
                pd.concat(scf_step,
                          keys=pd.RangeIndex(1, len(scf_step) + 1, name="iscf"))
                for scf_step in md_scf_charge
            ], keys=pd.RangeIndex(1, len(md_scf_charge) + 1, name="imd"))
        if FOUND_MD:
            md_charge = pd.concat(md_charge,
                                  keys=pd.RangeIndex(1, len(md_charge) + 1, name="imd"))
    else:
        if FOUND_SCF:
            nan_array = _a.emptyf(md_scf_charge[0][0].shape)
            nan_array.fill(np.nan)

            def get_md_scf_charge(scf_charge, iscf):
                # MD steps with fewer SCF iterations than `iscf` yield NaNs
                try:
                    return scf_charge[iscf]
                except IndexError:
                    return nan_array
        if FOUND_MD:
            md_charge = np.stack(md_charge)

    # option parsing is a bit *difficult* with flag enums
    # So first figure out what is there, and handle this based
    # on arguments
    def _p(flag, found):
        """ Helper routine to do the following:

        Returns
        -------
        is_opt : bool
            whether the flag is an `Opt`
        flag :
            corrected flag
        """
        if isinstance(flag, Opt):
            # correct flag depending on what `found` is
            # If the values have been found we
            # change flag to None only if flag == NONE
            # If the case has not been found, we
            # change flag to None if ANY or NONE is in flags

            if found:
                # flag is only NONE, then pass none
                if not (Opt.NONE ^ flag):
                    flag = None
            else:  # not found
                # we convert flag to none
                # if ANY or NONE in flag
                if (Opt.NONE | Opt.ANY) & flag:
                    flag = None

        return isinstance(flag, Opt), flag

    opt_imd, imd = _p(imd, FOUND_MD)
    opt_iscf, iscf = _p(iscf, FOUND_SCF)

    if not (FOUND_SCF or FOUND_MD):
        # none of these are found
        # we request that user does not request any input
        if (opt_iscf or (iscf is not None)) or \
           (opt_imd or (imd is not None)):
            raise SileError(f"{str(self)} does not contain MD/SCF charges")

    elif not FOUND_SCF:
        if opt_iscf or (iscf is not None):
            raise SileError(f"{str(self)} does not contain SCF charges")

    elif not FOUND_MD:
        if opt_imd or (imd is not None):
            raise SileError(f"{str(self)} does not contain MD charges")

    # if either are options they may hold
    if opt_imd and opt_iscf:
        if FOUND_SCF:
            return md_scf_charge
        elif FOUND_MD:
            return md_charge
        elif FOUND_FINAL:
            # I think this will never be reached
            # If neither are found they will be converted to
            # None
            return final_charge

        raise SileError(f"{str(self)} unknown argument for 'imd' and 'iscf'")

    elif opt_imd:
        # flag requested imd
        if not (imd & (Opt.ANY | Opt.ALL)):
            # wrong flag
            raise SileError(f"{str(self)} unknown argument for 'imd'")

        if FOUND_SCF and iscf is not None:
            # this should be handled, i.e. the scf should be taken out
            if as_dataframe:
                return md_scf_charge.groupby(level=[0, 2]).nth(iscf)
            return np.stack(tuple(get_md_scf_charge(x, iscf) for x in md_scf_charge))

        elif FOUND_MD and iscf is None:
            return md_charge
        raise SileError(f"{str(self)} unknown argument for 'imd' and 'iscf', could not find SCF charges")

    elif opt_iscf:
        # flag requested iscf
        if not (iscf & (Opt.ANY | Opt.ALL)):
            # wrong flag
            raise SileError(f"{str(self)} unknown argument for 'iscf'")
        if imd is None:
            # correct imd
            imd = -1
        if as_dataframe:
            md_scf_charge = md_scf_charge.groupby(level=0)
            group = list(md_scf_charge.groups.keys())[imd]
            return md_scf_charge.get_group(group).droplevel(0)
        return np.stack(md_scf_charge[imd])

    elif imd is None and iscf is None:
        if FOUND_FINAL:
            return final_charge
        raise SileError(f"{str(self)} does not contain final charges")

    elif imd is None:
        # iscf is not None, so pass through as though explicitly passed
        imd = -1

    elif iscf is None:
        # we return the last MD step and the requested scf iteration
        if as_dataframe:
            return md_charge.groupby(level=1).nth(imd)
        return md_charge[imd]

    # both imd and iscf are explicit indices here
    if as_dataframe:
        # first select imd
        md_scf_charge = md_scf_charge.groupby(level=0)
        group = list(md_scf_charge.groups.keys())[imd]
        md_scf_charge = md_scf_charge.get_group(group).droplevel(0)
        return md_scf_charge.groupby(level=1).nth(iscf)
    return md_scf_charge[imd][iscf]
#plt.show() #SINEWAVE plot example Prediction vs Actual Time series data for i in np.arange(0, sine_test_x.shape[0], 500): series_plot_input = pd.DataFrame({'Input': sine_test_x[i, :, 0]}) series_plot_predicted = pd.DataFrame({'Predicted': data_predicted[i, :]}) series_plot_actual = pd.DataFrame({'Actual': data_actual[i, :]}) nan = pd.DataFrame(np.nan, index=np.arange(0, n_out), columns=['A']) concat1 = pd.concat([series_plot_input, nan]) concat2 = pd.concat([nan, series_plot_predicted]) concat3 = pd.concat([nan, series_plot_actual]) series_plot = pd.concat([concat1, concat2, concat3], axis=1) series_plot = series_plot.loc[:, ~series_plot.columns.get_loc("A")] series_plot.index = pd.RangeIndex(len(series_plot.index)) dpi = 300 multiplier = 1e1 ticks = 1e-1 x_tick = int(math.floor(n_out / 10) * 10 / 10) fig, ax = plt.subplots(figsize=(3000 / dpi, 3500 / dpi), dpi=dpi) sns.set_context("talk") plot = sns.lineplot(data=series_plot, legend='full', alpha=1, palette="muted", dashes=False) plot.set(xlabel='Time (Index)', ylabel='Voltage', title='iEEG Values') #plot.set(xlim=(-1,n_out*2), xticks=range(0,n_out*2,x_tick), yticks=np.arange(math.floor(plot.axes[0,0].get_ylim()[0] * multiplier) / multiplier,math.ceil(plot.axes[0,0].get_ylim()[1] * multiplier) / multiplier, ticks)) plot.set_xticklabels(range(0, n_out * 2, x_tick))
# Rows of the per-country table for the date picked with the slider.
df = country_iso_df[country_iso_df['date'] == date_list[date_slider]]  #create dataframe from user-selected date
date_formatted = dt.datetime.strftime(datetime_list[date_slider], '%m/%d/%Y')  #format date
st.write(f"Global COVID-19 {dataset_type} cases as of date: ", date_formatted)  #display formatted date

st.plotly_chart(generate_map(df))  #plot on map

#Prints the relevant dataframe in streamlit
st.subheader(
    f"Top 10 countries for {dataset_type} cases as of date: {date_formatted}")
df2 = df[['country', 'count']]  #gets user-defined dataframe
df2 = df2.sort_values(
    by='count', ascending=False
)[:10]  #sorts dataframe by parameter, e.g. positive test and gets top 10
# Rank labels start at 1. Size the index from the frame itself: a date with
# fewer than 10 rows would otherwise fail with a length mismatch.
df2.index = pd.RangeIndex(1, len(df2) + 1)
st.table(df2)  #prints table

# '''
# Note: the subsequent code was written based on formatting of archived JHU data sets, which originally contained US
# state data. The archived data can be found at the following link:
# https://github.com/CSSEGISandData/COVID-19/tree/master/archived_data/archived_time_series.
# '''
# #Select state data
# st.subheader('Explore data by US state over time')
# states_url = "https://secure.ssa.gov/apps10/poms.nsf/lnx/0901501010"
# state_dfs = pd.read_html(states_url, header=0)[0]
# state_dfs.to_sql('states', conn, index_label='id', if_exists='replace') #create table of state abbreviations

# #Function to return US data by state (aggregated) or by county for each state
def test_merge(setup):
    """Compare Mars ``merge`` against pandas ``merge`` over several key configurations.

    Note [Index of Merge]

    When ``left_index`` and ``right_index`` of ``merge`` are both false, pandas generates a
    RangeIndex for the final result dataframe.

    The ``left`` and ``right`` dataframes are chunked here, so every result chunk has its own
    RangeIndex.  When the chunks are concatenated no new RangeIndex is generated for the
    result, so the index values cannot match pandas.  The content of the dataframe is still
    guaranteed to be correct, which is why both sides are sorted before comparison.
    """
    left = pd.DataFrame(np.arange(20).reshape((4, 5)) + 1, columns=['a', 'b', 'c', 'd', 'e'])
    right = pd.DataFrame(np.arange(20).reshape((5, 4)) + 1, columns=['a', 'b', 'x', 'y'])
    left_named_index = left.copy()
    left_named_index.index = pd.RangeIndex(2, 6, name='index')
    left_multi_index = left.copy()
    left_multi_index.index = pd.MultiIndex.from_tuples(
        [(i, i + 1) for i in range(4)], names=['i1', 'i2'])

    mars_left = from_pandas(left, chunk_size=2)
    mars_right = from_pandas(right, chunk_size=2)
    mars_left_named_index = from_pandas(left_named_index, chunk_size=3)
    mars_left_multi_index = from_pandas(left_multi_index, chunk_size=2)

    # Each case: (pandas left frame, Mars left frame, merge keyword arguments,
    # column to promote to the index on both results before comparing, or None).
    cases = [
        # merge with default arguments
        (left, mars_left, {}, None),
        # merge on left index and `right_on`
        (left, mars_left, dict(how='left', right_on='x', left_index=True), 'a_x'),
        # merge on `left_on` and right index
        (left, mars_left, dict(how='right', left_on='a', right_index=True), 'a'),
        # merge on `left_on` and `right_on`
        (left, mars_left, dict(how='left', left_on='a', right_on='x'), 'a_x'),
        # merge on `on`
        (left, mars_left, dict(how='right', on='a'), 'a'),
        # merge on multiple columns
        (left, mars_left, dict(how='inner', on=['a', 'b']), None),
        # merge when some on is index
        (left_named_index, mars_left_named_index,
         dict(how='inner', left_on='index', right_on='a'), None),
        # merge when on is in MultiIndex
        (left_multi_index, mars_left_multi_index,
         dict(how='inner', left_on='i1', right_on='a'), None),
        # merge when on is in MultiIndex, and on not in index
        (left_multi_index, mars_left_multi_index, dict(how='inner', on=['a', 'b']), None),
    ]

    for pandas_left, tiled_left, merge_kwargs, index_column in cases:
        expected = pandas_left.merge(right, **merge_kwargs)
        result = tiled_left.merge(mars_right, **merge_kwargs).execute().fetch()
        if index_column is not None:
            expected.set_index(index_column, inplace=True)
            result.set_index(index_column, inplace=True)
        pd.testing.assert_frame_equal(sort_dataframe_inplace(expected, 0),
                                      sort_dataframe_inplace(result, 0))
def item_get_elements(self, s, type, name, filters=None):
    """Return the elements of item `name`, using the cache where possible.

    The lookup order is: a cached value for exactly these `filters`; a cached
    unfiltered value, filtered locally; and finally a fresh load of the item.
    A freshly loaded result is stored in the cache before it is returned.

    Parameters
    ----------
    s
        Passed through to ``cache_get``, ``_get_item`` and ``cache``.
    type : str
        Item type; the code distinguishes 'set', 'par', 'equ' and 'var'.
    name
        Name of the item.
    filters : dict, optional
        Maps an index name to the values to keep for that index.

    Returns
    -------
    pandas.DataFrame
        When ``item.getDim() > 0``: one column per index name, plus
        'value'/'unit' for 'par' or 'lvl'/'mrg' for 'equ' and 'var'.
    pandas.Series
        For a zero-dimensional 'set'.
    dict
        For a zero-dimensional 'par' (keys 'value', 'unit') or 'equ'/'var'
        (keys 'lvl', 'mrg').
    """
    if filters:
        # Convert filter elements to strings
        filters = {dim: as_str_list(ele) for dim, ele in filters.items()}

    try:
        # Retrieve the cached value with this exact set of filters
        return self.cache_get(s, type, name, filters)
    except KeyError:
        pass  # Cache miss

    try:
        # Retrieve a cached, unfiltered value of the same item
        unfiltered = self.cache_get(s, type, name, None)
    except KeyError:
        pass  # Cache miss
    else:
        # Success; filter and return
        return filtered(unfiltered, filters)

    # Failed to load item from cache

    # Retrieve the item
    item = self._get_item(s, type, name, load=True)
    idx_names = list(item.getIdxNames())
    idx_sets = list(item.getIdxSets())

    # Get list of elements, using filters if provided
    if filters is not None:
        jFilter = java.HashMap()

        for idx_name, values in filters.items():
            # Retrieve the elements of the index set as a list
            idx_set = idx_sets[idx_names.index(idx_name)]
            # Recursive call; an index set comes back as a Series (see below)
            elements = self.item_get_elements(s, 'set', idx_set).tolist()

            # Filter for only included values and store
            filtered_elements = filter(lambda e: e in values, elements)
            jFilter.put(idx_name, to_jlist(filtered_elements))

        jList = item.getElements(jFilter)
    else:
        jList = item.getElements()

    if item.getDim() > 0:
        # Mapping set or multi-dimensional equation, parameter, or variable
        columns = copy(idx_names)

        # Prepare dtypes for index columns
        dtypes = {}
        for idx_name, idx_set in zip(columns, idx_sets):
            # NB using categoricals could be more memory-efficient, but
            #    requires adjustment of tests/documentation. See
            #    https://github.com/iiasa/ixmp/issues/228
            # dtypes[idx_name] = CategoricalDtype(
            #     self.item_get_elements(s, 'set', idx_set))
            dtypes[idx_name] = str

        # Prepare dtypes for additional columns
        if type == 'par':
            columns.extend(['value', 'unit'])
            dtypes['value'] = float
            # Same as above
            # dtypes['unit'] = CategoricalDtype(self.jobj.getUnitList())
            dtypes['unit'] = str
        elif type in ('equ', 'var'):
            columns.extend(['lvl', 'mrg'])
            dtypes.update({'lvl': float, 'mrg': float})

        # Prepare empty DataFrame
        result = pd.DataFrame(index=pd.RangeIndex(len(jList)),
                              columns=columns)

        # Copy vectors from Java into DataFrame columns
        # NB [:] causes JPype to use a faster code path
        for i in range(len(idx_sets)):
            result.iloc[:, i] = item.getCol(i, jList)[:]
        if type == 'par':
            result.loc[:, 'value'] = item.getValues(jList)[:]
            result.loc[:, 'unit'] = item.getUnits(jList)[:]
        elif type in ('equ', 'var'):
            result.loc[:, 'lvl'] = item.getLevels(jList)[:]
            result.loc[:, 'mrg'] = item.getMarginals(jList)[:]

        # .loc assignment above modifies dtypes; set afterwards
        result = result.astype(dtypes)
    elif type == 'set':
        # Index sets
        # dtype=object is to silence a warning in pandas 1.0
        result = pd.Series(item.getCol(0, jList), dtype=object)
    elif type == 'par':
        # Scalar parameters
        result = dict(value=float(item.getScalarValue().floatValue()),
                      unit=str(item.getScalarUnit()))
    elif type in ('equ', 'var'):
        # Scalar equations and variables
        result = dict(lvl=float(item.getScalarLevel().floatValue()),
                      mrg=float(item.getScalarMarginal().floatValue()))

    # Store cache
    self.cache(s, type, name, filters, result)

    return result
if row_['AwayTeam'] == 'Inter': df_odds['AwayTeam'][idx] = 'Inter Milan' if row_['AwayTeam'] == 'Milan': df_odds['AwayTeam'][idx] = 'AC Milan' if row_['AwayTeam'] == 'Siena': df_odds['AwayTeam'][idx] = 'Robur Siena S.S.D.' # Suppress 'SettingWithCopyWarning' & .copy(deep=True) pd.options.mode.chained_assignment = None df_teams = pd.read_excel(DATA_FILE) names_teams = set(list(df_teams['Home Team'])) df_odds = pd.read_csv(ODDS_FILE, error_bad_lines=False) df_odds_ = df_odds.loc[:5326, :] df_odds_.index = pd.RangeIndex(len(df_odds_.index)) names_odds = set(list(df_odds_['HomeTeam'])) names = names_odds.difference(names_teams) edit_odds_df() pre_df = df_odds.loc[:, :] bet_odds = [] for index, row in df_teams.iterrows(): date = row['Date'] converted_date = convert_date() found = False for i, x in pre_df.iterrows(): if converted_date == x['Date'] and row['Home Team'] == x[
class TestGrouping:
    """Tests for how ``groupby`` resolves grouping keys: Grouper objects, levels, mappings."""

    def test_grouper_index_types(self):
        """Group frames carrying several index types, in original and reversed order."""
        # related GH5375
        # groupby misbehaving when using a Floatlike index
        df = DataFrame(np.arange(10).reshape(5, 2), columns=list("AB"))
        for index in [
            tm.makeFloatIndex,
            tm.makeStringIndex,
            tm.makeUnicodeIndex,
            tm.makeIntIndex,
            tm.makeDateIndex,
            tm.makePeriodIndex,
        ]:

            df.index = index(len(df))
            df.groupby(list("abcde")).apply(lambda x: x)

            df.index = list(reversed(df.index.tolist()))
            df.groupby(list("abcde")).apply(lambda x: x)

    def test_grouper_multilevel_freq(self):
        """Grouper with ``level`` and ``freq`` matches grouping the reset-index columns."""
        # GH 7885
        # with level and freq specified in a pd.Grouper
        from datetime import date, timedelta

        d0 = date.today() - timedelta(days=14)
        dates = date_range(d0, date.today())
        date_index = pd.MultiIndex.from_product([dates, dates], names=["foo", "bar"])
        df = pd.DataFrame(np.random.randint(0, 100, 225), index=date_index)

        # Check string level
        expected = (
            df.reset_index()
            .groupby([pd.Grouper(key="foo", freq="W"), pd.Grouper(key="bar", freq="W")])
            .sum()
        )
        # reset index changes columns dtype to object
        expected.columns = pd.Index([0], dtype="int64")

        result = df.groupby(
            [pd.Grouper(level="foo", freq="W"), pd.Grouper(level="bar", freq="W")]
        ).sum()
        tm.assert_frame_equal(result, expected)

        # Check integer level
        result = df.groupby(
            [pd.Grouper(level=0, freq="W"), pd.Grouper(level=1, freq="W")]
        ).sum()
        tm.assert_frame_equal(result, expected)

    def test_grouper_creation_bug(self):
        """Grouping through ``pd.Grouper`` matches grouping by the plain key or level."""
        # GH 8795
        df = DataFrame({"A": [0, 0, 1, 1, 2, 2], "B": [1, 2, 3, 4, 5, 6]})
        g = df.groupby("A")
        expected = g.sum()

        g = df.groupby(pd.Grouper(key="A"))
        result = g.sum()
        tm.assert_frame_equal(result, expected)

        result = g.apply(lambda x: x.sum())
        tm.assert_frame_equal(result, expected)

        g = df.groupby(pd.Grouper(key="A", axis=0))
        result = g.sum()
        tm.assert_frame_equal(result, expected)

        # GH14334
        # pd.Grouper(key=...) may be passed in a list
        df = DataFrame(
            {"A": [0, 0, 0, 1, 1, 1], "B": [1, 1, 2, 2, 3, 3], "C": [1, 2, 3, 4, 5, 6]}
        )
        # Group by single column
        expected = df.groupby("A").sum()
        g = df.groupby([pd.Grouper(key="A")])
        result = g.sum()
        tm.assert_frame_equal(result, expected)

        # Group by two columns
        # using a combination of strings and Grouper objects
        expected = df.groupby(["A", "B"]).sum()

        # Group with two Grouper objects
        g = df.groupby([pd.Grouper(key="A"), pd.Grouper(key="B")])
        result = g.sum()
        tm.assert_frame_equal(result, expected)

        # Group with a string and a Grouper object
        g = df.groupby(["A", pd.Grouper(key="B")])
        result = g.sum()
        tm.assert_frame_equal(result, expected)

        # Group with a Grouper object and a string
        g = df.groupby([pd.Grouper(key="A"), "B"])
        result = g.sum()
        tm.assert_frame_equal(result, expected)

        # GH8866
        s = Series(
            np.arange(8, dtype="int64"),
            index=pd.MultiIndex.from_product(
                [list("ab"), range(2), date_range("20130101", periods=2)],
                names=["one", "two", "three"],
            ),
        )
        result = s.groupby(pd.Grouper(level="three", freq="M")).sum()
        expected = Series(
            [28], index=Index([Timestamp("2013-01-31")], freq="M", name="three")
        )
        tm.assert_series_equal(result, expected)

        # just specifying a level breaks
        result = s.groupby(pd.Grouper(level="one")).sum()
        expected = s.groupby(level="one").sum()
        tm.assert_series_equal(result, expected)

    def test_grouper_column_and_index(self):
        """Grouping by a column plus an index level matches grouping after ``reset_index``."""
        # GH 14327

        # Grouping a multi-index frame by a column and an index level should
        # be equivalent to resetting the index and grouping by two columns
        idx = pd.MultiIndex.from_tuples(
            [("a", 1), ("a", 2), ("a", 3), ("b", 1), ("b", 2), ("b", 3)]
        )
        idx.names = ["outer", "inner"]
        df_multi = pd.DataFrame(
            {"A": np.arange(6), "B": ["one", "one", "two", "two", "one", "one"]},
            index=idx,
        )
        result = df_multi.groupby(["B", pd.Grouper(level="inner")]).mean()
        expected = df_multi.reset_index().groupby(["B", "inner"]).mean()
        tm.assert_frame_equal(result, expected)

        # Test the reverse grouping order
        result = df_multi.groupby([pd.Grouper(level="inner"), "B"]).mean()
        expected = df_multi.reset_index().groupby(["inner", "B"]).mean()
        tm.assert_frame_equal(result, expected)

        # Grouping a single-index frame by a column and the index should
        # be equivalent to resetting the index and grouping by two columns
        df_single = df_multi.reset_index("outer")
        result = df_single.groupby(["B", pd.Grouper(level="inner")]).mean()
        expected = df_single.reset_index().groupby(["B", "inner"]).mean()
        tm.assert_frame_equal(result, expected)

        # Test the reverse grouping order
        result = df_single.groupby([pd.Grouper(level="inner"), "B"]).mean()
        expected = df_single.reset_index().groupby(["inner", "B"]).mean()
        tm.assert_frame_equal(result, expected)

    def test_groupby_levels_and_columns(self):
        """Grouping by level names matches grouping by the same names as columns."""
        # GH9344, GH9049
        idx_names = ["x", "y"]
        idx = pd.MultiIndex.from_tuples(
            [(1, 1), (1, 2), (3, 4), (5, 6)], names=idx_names
        )
        df = pd.DataFrame(np.arange(12).reshape(-1, 3), index=idx)

        by_levels = df.groupby(level=idx_names).mean()
        # reset_index changes columns dtype to object
        by_columns = df.reset_index().groupby(idx_names).mean()

        tm.assert_frame_equal(by_levels, by_columns, check_column_type=False)

        by_columns.columns = pd.Index(by_columns.columns, dtype=np.int64)
        tm.assert_frame_equal(by_levels, by_columns)

    def test_groupby_categorical_index_and_columns(self, observed):
        """Sum over level 0 of a CategoricalIndex on the columns, then on the index."""
        # GH18432, adapted for GH25871
        columns = ["A", "B", "A", "B"]
        categories = ["B", "A"]
        data = np.array(
            [[1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 1, 2]], int
        )
        cat_columns = CategoricalIndex(columns, categories=categories, ordered=True)
        df = DataFrame(data=data, columns=cat_columns)
        result = df.groupby(axis=1, level=0, observed=observed).sum()
        expected_data = np.array([[4, 2], [4, 2], [4, 2], [4, 2], [4, 2]], int)
        expected_columns = CategoricalIndex(
            categories, categories=categories, ordered=True
        )
        expected = DataFrame(data=expected_data, columns=expected_columns)
        tm.assert_frame_equal(result, expected)

        # test transposed version
        df = DataFrame(data.T, index=cat_columns)
        result = df.groupby(axis=0, level=0, observed=observed).sum()
        expected = DataFrame(data=expected_data.T, index=expected_columns)
        tm.assert_frame_equal(result, expected)

    def test_grouper_getting_correct_binner(self):
        """Combine a plain level Grouper with a monthly time-based level Grouper."""
        # GH 10063
        # using a non-time-based grouper and a time-based grouper
        # and specifying levels
        df = DataFrame(
            {"A": 1},
            index=pd.MultiIndex.from_product(
                [list("ab"), date_range("20130101", periods=80)], names=["one", "two"]
            ),
        )
        result = df.groupby(
            [pd.Grouper(level="one"), pd.Grouper(level="two", freq="M")]
        ).sum()
        expected = DataFrame(
            {"A": [31, 28, 21, 31, 28, 21]},
            index=MultiIndex.from_product(
                [list("ab"), date_range("20130101", freq="M", periods=3)],
                names=["one", "two"],
            ),
        )
        tm.assert_frame_equal(result, expected)

    def test_grouper_iter(self, df):
        """Iterating the ``grouper`` of ``groupby("A")`` yields the group keys."""
        assert sorted(df.groupby("A").grouper) == ["bar", "foo"]

    def test_empty_groups(self, df):
        """Grouping by an empty list raises ``ValueError``."""
        # see gh-1048
        with pytest.raises(ValueError, match="No group keys passed!"):
            df.groupby([])

    def test_groupby_grouper(self, df):
        """An existing ``grouper`` object can be passed back to ``groupby``."""
        grouped = df.groupby("A")

        result = df.groupby(grouped.grouper).mean()
        expected = grouped.mean()
        tm.assert_frame_equal(result, expected)

    def test_groupby_dict_mapping(self):
        """Grouping a Series by a dict mapping matches grouping by the mapped labels."""
        # GH #679
        from pandas import Series

        s = Series({"T1": 5})
        result = s.groupby({"T1": "T2"}).agg(sum)
        expected = s.groupby(["T2"]).agg(sum)
        tm.assert_series_equal(result, expected)

        s = Series([1.0, 2.0, 3.0, 4.0], index=list("abcd"))
        mapping = {"a": 0, "b": 0, "c": 1, "d": 1}

        result = s.groupby(mapping).mean()
        result2 = s.groupby(mapping).agg(np.mean)
        expected = s.groupby([0, 0, 1, 1]).mean()
        expected2 = s.groupby([0, 0, 1, 1]).mean()
        tm.assert_series_equal(result, expected)
        tm.assert_series_equal(result, result2)
        tm.assert_series_equal(result, expected2)

    def test_groupby_grouper_f_sanity_checked(self):
        """A grouping function yielding the wrong number of labels raises ``AssertionError``."""
        dates = date_range("01-Jan-2013", periods=12, freq="MS")
        ts = Series(np.random.randn(12), index=dates)

        # GH3035
        # index.map is used to apply grouper to the index
        # if it fails on the elements, map tries it on the entire index as
        # a sequence. That can yield invalid results that cause trouble
        # down the line.
        # the surprise comes from using key[0:6] rather than str(key)[0:6]
        # when the elements are Timestamp.
        # the result is Index[0:6], very confusing.

        msg = r"Grouper result violates len\(labels\) == len\(data\)"
        with pytest.raises(AssertionError, match=msg):
            ts.groupby(lambda key: key[0:6])

    def test_grouping_error_on_multidim_input(self, df):
        """Constructing a ``Grouping`` from a DataFrame raises ``ValueError``."""
        msg = "Grouper for '<class 'pandas.core.frame.DataFrame'>' not 1-dimensional"
        with pytest.raises(ValueError, match=msg):
            Grouping(df.index, df[["A", "A"]])

    def test_multiindex_passthru(self):
        """Grouping the columns by all MultiIndex levels and taking ``first`` returns the frame."""
        # GH 7997
        # regression from 0.14.1
        df = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
        df.columns = pd.MultiIndex.from_tuples([(0, 1), (1, 1), (2, 1)])

        result = df.groupby(axis=1, level=[0, 1]).first()
        tm.assert_frame_equal(result, df)

    def test_multiindex_negative_level(self, mframe):
        """Negative level numbers select the same levels as the names 'first' and 'second'."""
        # GH 13901
        result = mframe.groupby(level=-1).sum()
        expected = mframe.groupby(level="second").sum()
        tm.assert_frame_equal(result, expected)

        result = mframe.groupby(level=-2).sum()
        expected = mframe.groupby(level="first").sum()
        tm.assert_frame_equal(result, expected)

        result = mframe.groupby(level=[-2, -1]).sum()
        expected = mframe
        tm.assert_frame_equal(result, expected)

        result = mframe.groupby(level=[-1, "first"]).sum()
        expected = mframe.groupby(level=["second", "first"]).sum()
        tm.assert_frame_equal(result, expected)

    def test_multifunc_select_col_integer_cols(self, df):
        """Smoke test: dict aggregation on a selected column when labels are integers."""
        df.columns = np.arange(len(df.columns))

        # it works!
        df.groupby(1, as_index=False)[2].agg({"Q": np.mean})

    def test_multiindex_columns_empty_level(self):
        """A MultiIndex column whose second level is '' groups the same by name or by tuple."""
        lst = [["count", "values"], ["to filter", ""]]
        midx = MultiIndex.from_tuples(lst)

        df = DataFrame([[1, "A"]], columns=midx)

        grouped = df.groupby("to filter").groups
        assert grouped["A"] == [0]

        grouped = df.groupby([("to filter", "")]).groups
        assert grouped["A"] == [0]

        df = DataFrame([[1, "A"], [2, "B"]], columns=midx)

        expected = df.groupby("to filter").groups
        result = df.groupby([("to filter", "")]).groups
        assert result == expected

        df = DataFrame([[1, "A"], [2, "A"]], columns=midx)

        expected = df.groupby("to filter").groups
        result = df.groupby([("to filter", "")]).groups
        tm.assert_dict_equal(result, expected)

    def test_groupby_multiindex_tuple(self):
        """A tuple key gives the same groups whether passed bare or inside a list."""
        # GH 17979
        df = pd.DataFrame(
            [[1, 2, 3, 4], [3, 4, 5, 6], [1, 4, 2, 3]],
            columns=pd.MultiIndex.from_arrays([["a", "b", "b", "c"], [1, 1, 2, 2]]),
        )
        expected = df.groupby([("b", 1)]).groups
        result = df.groupby(("b", 1)).groups
        tm.assert_dict_equal(expected, result)

        df2 = pd.DataFrame(
            df.values,
            columns=pd.MultiIndex.from_arrays(
                [["a", "b", "b", "c"], ["d", "d", "e", "e"]]
            ),
        )
        expected = df2.groupby([("b", "d")]).groups
        # NOTE(review): result is still computed from df, not df2; the frames share values
        result = df.groupby(("b", 1)).groups
        tm.assert_dict_equal(expected, result)

        df3 = pd.DataFrame(df.values, columns=[("a", "d"), ("b", "d"), ("b", "e"), "c"])
        expected = df3.groupby([("b", "d")]).groups
        # NOTE(review): as above, result is computed from df rather than df3
        result = df.groupby(("b", 1)).groups
        tm.assert_dict_equal(expected, result)

    @pytest.mark.parametrize("sort", [True, False])
    def test_groupby_level(self, sort, mframe, df):
        """Group by level number and level name on both axes; level=1 needs a MultiIndex."""
        # GH 17537
        frame = mframe
        deleveled = frame.reset_index()

        result0 = frame.groupby(level=0, sort=sort).sum()
        result1 = frame.groupby(level=1, sort=sort).sum()

        expected0 = frame.groupby(deleveled["first"].values, sort=sort).sum()
        expected1 = frame.groupby(deleveled["second"].values, sort=sort).sum()

        expected0.index.name = "first"
        expected1.index.name = "second"

        assert result0.index.name == "first"
        assert result1.index.name == "second"

        tm.assert_frame_equal(result0, expected0)
        tm.assert_frame_equal(result1, expected1)
        assert result0.index.name == frame.index.names[0]
        assert result1.index.name == frame.index.names[1]

        # groupby level name
        result0 = frame.groupby(level="first", sort=sort).sum()
        result1 = frame.groupby(level="second", sort=sort).sum()
        tm.assert_frame_equal(result0, expected0)
        tm.assert_frame_equal(result1, expected1)

        # axis=1

        result0 = frame.T.groupby(level=0, axis=1, sort=sort).sum()
        result1 = frame.T.groupby(level=1, axis=1, sort=sort).sum()
        tm.assert_frame_equal(result0, expected0.T)
        tm.assert_frame_equal(result1, expected1.T)

        # raise exception for non-MultiIndex
        msg = "level > 0 or level < -1 only valid with MultiIndex"
        with pytest.raises(ValueError, match=msg):
            df.groupby(level=1)

    def test_groupby_level_index_names(self):
        """Grouping by the index name works; an unknown level name raises ``ValueError``."""
        # GH4014 this used to raise ValueError since 'exp'>1 (in py2)
        df = DataFrame({"exp": ["A"] * 3 + ["B"] * 3, "var1": range(6)}).set_index(
            "exp"
        )
        df.groupby(level="exp")
        msg = "level name foo is not the name of the index"
        with pytest.raises(ValueError, match=msg):
            df.groupby(level="foo")

    @pytest.mark.parametrize("sort", [True, False])
    def test_groupby_level_with_nas(self, sort):
        """Sum by level 0, first without and then with a missing (-1) code in that level."""
        # GH 17537
        index = MultiIndex(
            levels=[[1, 0], [0, 1, 2, 3]],
            codes=[[1, 1, 1, 1, 0, 0, 0, 0], [0, 1, 2, 3, 0, 1, 2, 3]],
        )

        # factorizing doesn't confuse things
        s = Series(np.arange(8.0), index=index)
        result = s.groupby(level=0, sort=sort).sum()
        expected = Series([6.0, 22.0], index=[0, 1])
        tm.assert_series_equal(result, expected)

        index = MultiIndex(
            levels=[[1, 0], [0, 1, 2, 3]],
            codes=[[1, 1, 1, 1, -1, 0, 0, 0], [0, 1, 2, 3, 0, 1, 2, 3]],
        )

        # factorizing doesn't confuse things
        s = Series(np.arange(8.0), index=index)
        result = s.groupby(level=0, sort=sort).sum()
        expected = Series([6.0, 18.0], index=[0.0, 1.0])
        tm.assert_series_equal(result, expected)

    def test_groupby_args(self, mframe):
        """Calling ``groupby`` with neither ``by`` nor ``level`` raises ``TypeError``."""
        # PR8618 and issue 8015
        frame = mframe

        msg = "You have to supply one of 'by' and 'level'"
        with pytest.raises(TypeError, match=msg):
            frame.groupby()

        msg = "You have to supply one of 'by' and 'level'"
        with pytest.raises(TypeError, match=msg):
            frame.groupby(by=None, level=None)

    @pytest.mark.parametrize(
        "sort,labels",
        [
            [True, [2, 2, 2, 0, 0, 1, 1, 3, 3, 3]],
            [False, [0, 0, 0, 1, 1, 2, 2, 3, 3, 3]],
        ],
    )
    def test_level_preserve_order(self, sort, labels, mframe):
        """Check the grouper labels produced by level grouping for each ``sort`` setting."""
        # GH 17537
        grouped = mframe.groupby(level=0, sort=sort)
        exp_labels = np.array(labels, np.intp)
        tm.assert_almost_equal(grouped.grouper.labels[0], exp_labels)

    def test_grouping_labels(self, mframe):
        """Check the grouper labels when grouping by the values of index level 0."""
        grouped = mframe.groupby(mframe.index.get_level_values(0))
        exp_labels = np.array([2, 2, 2, 0, 0, 1, 1, 3, 3, 3], dtype=np.intp)
        tm.assert_almost_equal(grouped.grouper.labels[0], exp_labels)

    def test_list_grouper_with_nat(self):
        """Group by a yearly Grouper on a date column whose last value is ``NaT``."""
        # GH 14715
        df = pd.DataFrame({"date": pd.date_range("1/1/2011", periods=365, freq="D")})
        df.iloc[-1] = pd.NaT
        grouper = pd.Grouper(key="date", freq="AS")

        # Grouper in a list grouping
        result = df.groupby([grouper])
        expected = {pd.Timestamp("2011-01-01"): pd.Index(list(range(364)))}
        tm.assert_dict_equal(result.groups, expected)

        # Test case without a list
        result = df.groupby(grouper)
        expected = {pd.Timestamp("2011-01-01"): 365}
        tm.assert_dict_equal(result.groups, expected)

    @pytest.mark.parametrize(
        "func,expected",
        [
            ("transform", pd.Series(name=2, index=pd.RangeIndex(0, 0, 1))),
            ("agg", pd.Series(name=2, index=pd.Float64Index([], name=1))),
            ("apply", pd.Series(name=2, index=pd.Float64Index([], name=1))),
        ],
    )
    def test_evaluate_with_empty_groups(self, func, expected):
        """Apply transform/agg/apply with an identity function to an empty grouped column."""
        # 26208
        # test transform'ing empty groups
        # (not testing other agg fns, because they return
        # different index objects.)
        df = pd.DataFrame({1: [], 2: []})
        g = df.groupby(1)
        result = getattr(g[2], func)(lambda x: x)
        tm.assert_series_equal(result, expected)

    def test_groupby_empty(self):
        """Group an empty Series by an empty list and inspect the resulting grouper."""
        # https://github.com/pandas-dev/pandas/issues/27190
        s = pd.Series([], name="name")
        gr = s.groupby([])

        result = gr.mean()
        tm.assert_series_equal(result, s)

        # check group properties
        assert len(gr.grouper.groupings) == 1
        tm.assert_numpy_array_equal(
            gr.grouper.group_info[0], np.array([], dtype=np.dtype("int64"))
        )

        tm.assert_numpy_array_equal(
            gr.grouper.group_info[1], np.array([], dtype=np.dtype("int"))
        )

        assert gr.grouper.group_info[2] == 0

        # check name
        assert s.groupby(s).grouper.names == ["name"]

    def test_groupby_level_index_value_all_na(self):
        """Grouping by two levels where level 'B' is entirely NaN gives an empty frame."""
        # issue 20519
        df = DataFrame(
            [["x", np.nan, 10], [None, np.nan, 20]], columns=["A", "B", "C"]
        ).set_index(["A", "B"])
        result = df.groupby(level=["A", "B"]).sum()
        expected = DataFrame(
            data=[],
            index=MultiIndex(
                levels=[Index(["x"], dtype="object"), Index([], dtype="float64")],
                codes=[[], []],
                names=["A", "B"],
            ),
            columns=["C"],
            dtype="int64",
        )
        tm.assert_frame_equal(result, expected)
def robustness_index(self, data, model, targetcol=None, labels=None, nsamples=10, nrecods=100,
                     clevels=None, random_state=None):
    """Estimate model robustness from bootstrap resamples of ``data``.

    Draws ``nsamples`` samples of ``nrecods`` rows each, with replacement.  For every
    sample the model's predictions and the actual target values are handed to
    ``self.prepare_stats_sample``.  After the last sample
    ``self.prepare_robustness_index(clevels)`` is called and ``self.stats_df`` is returned.

    :param data: pandas DataFrame or H2O frame holding the features.  When ``labels`` is
        empty, the pandas path reads the target from the last column and the H2O path
        reads it from ``targetcol``.
    :param model: object exposing ``predict``; if the prediction has more than one
        dimension, its first column is used.
    :param targetcol: name of the target column; overwritten when ``labels`` is given.
    :param labels: target values to append to ``data`` as an extra last column.  Must have
        one entry per row of ``data``.  ``None`` (default) means no separate labels.
    :param nsamples: number of bootstrap samples.
    :param nrecods: number of rows per bootstrap sample.
    :param clevels: levels passed to ``self.prepare_robustness_index``; ``None`` (default)
        means ``[95]``.
    :param random_state: forwarded unchanged to ``resample`` for every sample.
    :return: ``self.stats_df``
    :raises AssertionError: if neither ``targetcol`` nor ``labels`` is given, or the number
        of labels differs from the number of rows.
    """
    # None replaces the former mutable list defaults, which were shared between calls.
    if labels is None:
        labels = []
    if clevels is None:
        clevels = [95]

    is_h2o_model = False
    cols = data.columns
    nlen = len(labels)

    # Raised explicitly (not via `assert`) so validation survives `python -O`, while the
    # exception type seen by existing callers stays the same.
    if targetcol is None and nlen == 0:
        raise AssertionError("Either targetcol or labels must be specified")

    # cbind features and target
    if nlen > 0:
        if nlen != data.shape[0]:
            raise AssertionError("Number of observations and number of labels must match")

        if isinstance(cols, pd.RangeIndex):
            targetcol = pd.Index([len(cols)])
            cols = pd.RangeIndex(start=0, stop=len(cols) + 1, step=1)
        else:
            targetcol = 'target'
            if isinstance(cols, pd.Index):
                # Index.append returns a new Index; the result must be kept.
                cols = cols.append(pd.Index([targetcol]))
            else:
                # Plain sequence of names; build a new list instead of mutating
                # whatever object `data.columns` returned.
                cols = list(cols) + [targetcol]

        if is_h2o_frame(data):
            data = h2o.cbind(data, labels)
        else:
            data = pd.concat([data.reset_index(drop=True), labels], axis=1)

    # check the type of data
    if is_h2o_frame(data):
        # if h2o then convert it
        np_data = h2o.as_list(data).values
        col_types = [v for v in data.types.values()]
        is_h2o_model = True
    else:
        np_data = data.values

    # The target is the last column.  Keep both its position and its label: the pandas
    # path needs the label (position only equals the label for range-style columns).
    target_pos = len(cols) - 1
    target_label = cols[target_pos]

    for i in range(1, nsamples + 1):
        logging.info("Sampling %s", i)
        # NOTE(review): presumably sklearn.utils.resample; if so, an integer
        # random_state makes every iteration draw the same sample -- confirm.
        data_boot = resample(np_data, replace=True, n_samples=nrecods, random_state=random_state)
        data_boot_df = pd.DataFrame(data=data_boot[0:, 0:], columns=cols)

        if is_h2o_model:
            data_boot_df = h2o.H2OFrame(data_boot_df, column_types=col_types)
            y_act = h2o.as_list(data_boot_df[targetcol]).values
            drop_key = target_pos  # H2O path keeps the original positional drop
        else:
            y_act = data_boot_df[target_label]
            drop_key = target_label

        # remove the appended target column before predicting
        if nlen > 0:
            data_boot_df = data_boot_df.drop([drop_key], axis=1)

        preds = model.predict(data_boot_df)
        if preds.ndim == 1:
            y_preds = preds
        else:
            y_preds = preds[:, 0]

        # make necessary transformation for h2o frames
        if is_h2o_frame(y_preds):
            y_preds = convert_h2o_list(y_preds)
        if is_h2o_frame(y_act):
            y_act = convert_h2o_list(y_act)

        self.prepare_stats_sample(y_act, y_preds)

    # We have estimations from multiple samples. Now get the mean, se, and CI
    self.prepare_robustness_index(clevels)

    return self.stats_df
def testStringMethod(self):
    """Check inferred metadata and tiled chunks for several ``Series.str`` methods."""
    raw = pd.Series(['a', 'b', 'c'], name='s')
    series = from_pandas_series(raw, chunk_size=2)

    def rows_of(pos):
        # Index labels expected in the chunk at row position `pos` (chunk size 2).
        return raw.index[pos * 2: (pos + 1) * 2]

    def check_series_chunks(tiled, dtype):
        # Every chunk of a Series result keeps dtype and name and covers its own rows.
        for pos, chunk in enumerate(tiled.chunks):
            self.assertEqual(chunk.index, (pos,))
            self.assertEqual(chunk.dtype, dtype)
            self.assertEqual(chunk.name, raw.name)
            pd.testing.assert_index_equal(chunk.index_value.to_pandas(), rows_of(pos))
            self.assertEqual(chunk.shape, (2,) if pos == 0 else (1,))

    def check_frame_chunks(tiled, n_columns):
        # Every chunk of a DataFrame result spans all `n_columns` columns.
        for pos, chunk in enumerate(tiled.chunks):
            self.assertEqual(chunk.index, (pos, 0))
            pd.testing.assert_index_equal(chunk.index_value.to_pandas(), rows_of(pos))
            pd.testing.assert_index_equal(chunk.columns_value.to_pandas(),
                                          pd.RangeIndex(n_columns))
            self.assertEqual(chunk.shape, (2, n_columns) if pos == 0 else (1, n_columns))

    # unknown accessor attribute
    with self.assertRaises(AttributeError):
        _ = series.str.non_exist

    # contains -> boolean series
    contained = series.str.contains('c')
    self.assertEqual(contained.dtype, np.bool_)
    self.assertEqual(contained.name, raw.name)
    pd.testing.assert_index_equal(contained.index_value.to_pandas(), raw.index)
    self.assertEqual(contained.shape, raw.shape)
    check_series_chunks(contained.tiles(), np.bool_)

    # split with expand=True -> dataframe with two columns
    parts = series.str.split(',', expand=True, n=1)
    self.assertEqual(parts.op.output_types[0], OutputType.dataframe)
    self.assertEqual(parts.shape, (3, 2))
    pd.testing.assert_index_equal(parts.index_value.to_pandas(), raw.index)
    pd.testing.assert_index_equal(parts.columns_value.to_pandas(), pd.RangeIndex(2))
    check_frame_chunks(parts.tiles(), 2)

    # invalid `others` arguments for cat
    invalid_cat_args = (
        (TypeError, [['1', '2']]),
        (ValueError, ['1', '2']),
        (ValueError, ','),
        (TypeError, {'1', '2', '3'}),
    )
    for error_type, others in invalid_cat_args:
        with self.assertRaises(error_type):
            _ = series.str.cat(others)

    # cat with only a separator -> scalar
    joined = series.str.cat(sep=',')
    self.assertEqual(joined.op.output_types[0], OutputType.scalar)
    self.assertEqual(joined.dtype, raw.dtype)
    joined = joined.tiles()
    self.assertEqual(len(joined.chunks), 1)
    self.assertEqual(joined.chunks[0].op.output_types[0], OutputType.scalar)
    self.assertEqual(joined.chunks[0].dtype, raw.dtype)

    # extract with expand=False -> series
    extracted = series.str.extract(r'[ab](\d)', expand=False)
    self.assertEqual(extracted.op.output_types[0], OutputType.series)
    self.assertEqual(extracted.dtype, raw.dtype)
    check_series_chunks(extracted.tiles(), raw.dtype)

    # extract with expand=True -> dataframe with one column
    extracted = series.str.extract(r'[ab](\d)', expand=True)
    self.assertEqual(extracted.op.output_types[0], OutputType.dataframe)
    self.assertEqual(extracted.shape, (3, 1))
    pd.testing.assert_index_equal(extracted.index_value.to_pandas(), raw.index)
    pd.testing.assert_index_equal(extracted.columns_value.to_pandas(), pd.RangeIndex(1))
    check_frame_chunks(extracted.tiles(), 1)

    self.assertIn('lstrip', dir(series.str))