def test_put_dataframe(self):
    """Round-trip a simple DataFrame through the store."""
    original = pd.DataFrame({'a': list(range(1, 10)), 'b': list(range(1, 10))})
    store = OmegaStore(prefix='')
    store.put(original, 'mydata')
    restored = store.get('mydata')
    self.assertTrue(original.equals(restored), "expected dataframes to be equal")
def test_put_python_dict(self):
    """Store a plain python dict and read it back.

    The stored dict comes back wrapped in a list (one document).
    """
    # create some data
    data = {'a': list(range(1, 10)), 'b': list(range(1, 10))}
    store = OmegaStore(prefix='')
    store.put(data, 'mydata')
    data2 = store.get('mydata')
    # assertEquals is a deprecated alias of assertEqual (removed in Python 3.12)
    self.assertEqual([data], data2)
def test_get_dataframe_projected_mixin(self):
    """Test column projection via the ``name[spec]`` access syntax."""
    df = pd.DataFrame({
        'a': list(range(1, 10)),
        'b': list(range(1, 10)),
        'c': list(range(1, 10)),
    })
    store = OmegaStore(prefix='')
    store.put(df, 'mydata')
    # each spec exercises a different projection form:
    # single column, open/closed slices, full slice, and exclusion
    for spec in ('a', ':b', ':', 'b:', '^c'):
        # filter in mongodb via the bracket syntax
        df2 = store.get('mydata[{}]'.format(spec))
        # build the locally-filtered equivalent for comparison
        if spec == ':':
            expected = df.loc[:, :]
        elif ':' in spec:
            start, stop = spec.split(':')
            expected = df.loc[:, slice(start or None, stop or None)]
        elif spec.startswith('^'):
            excluded = spec[1:].split(',')
            expected = df[[col for col in df.columns if col not in excluded]]
        else:
            expected = df[[spec]]
        self.assertTrue(expected.equals(df2), "expected dataframes to be equal")
def __init__(self, defaults=None, mongo_url=None, celeryconf=None, bucket=None,
             **kwargs):
    """
    Initialize the client API

    Without arguments create the client API according to the user's
    configuration in :code:`~/omegaml/config.yml`.

    Arguments override the user's configuration.

    :param defaults: the DefaultsContext
    :param mongo_url: the fully qualified URI to the mongo database,
        of format :code:`mongodb://user:password@host:port/database`
    :param celeryconf: the celery configuration dictionary
    :param bucket: the bucket to use for the object stores and runtime
    """
    from omegaml.util import settings  # avoid circular imports
    from omegaml.notebook.jobs import OmegaJobs
    from omegaml.runtimes import OmegaRuntime
    from omegaml.store import OmegaStore
    # celery and mongo configuration
    self.defaults = defaults or settings()
    self.mongo_url = mongo_url or self.defaults.OMEGA_MONGO_URL
    self.bucket = bucket
    # setup storage locations
    self.models = OmegaStore(mongo_url=self.mongo_url, bucket=bucket,
                             prefix='models/', defaults=self.defaults)
    self.datasets = OmegaStore(mongo_url=self.mongo_url, bucket=bucket,
                               prefix='data/', defaults=self.defaults)
    self._jobdata = OmegaStore(mongo_url=self.mongo_url, bucket=bucket,
                               prefix='jobs/', defaults=self.defaults)
    # fix: pass bucket to the scripts store as well, consistent with the
    # models/datasets/jobs stores above (previously omitted)
    self.scripts = OmegaStore(mongo_url=self.mongo_url, bucket=bucket,
                              prefix='scripts/', defaults=self.defaults)
    # runtimes environments
    self.runtime = OmegaRuntime(self, bucket=bucket, defaults=self.defaults,
                                celeryconf=celeryconf)
    self.jobs = OmegaJobs(store=self._jobdata)
def test_store_irregular_column_names(self):
    """ test storing irregular column names """
    df = pd.DataFrame({'x_1': range(10)})
    store = OmegaStore()
    store.put(df, 'foo', append=False)
    df2 = store.get('foo')
    # comparing two pd.Index objects with assertEqual raises ValueError
    # ("truth value of an array is ambiguous") -- compare as lists instead
    self.assertEqual(list(df.columns), list(df2.columns))
def test_lazy_unique(self):
    """Get an MDataFrame lazily and retrieve a column's unique values."""
    raw = {'a': list(range(1, 10)), 'b': list(range(1, 10))}
    frame = pd.DataFrame(raw)
    store = OmegaStore()
    meta = store.put(frame, 'foo', append=False)
    # lazy=True defers evaluation until .value is accessed
    result = store.get('foo', lazy=True).a.unique().value
    self.assertListEqual(raw['a'], list(result))
def test_store_series(self):
    """ test storing a pandas series with its own (string) index """
    from string import ascii_lowercase
    letters = list(ascii_lowercase[:10])
    series = pd.Series(range(10), index=letters)
    store = OmegaStore()
    store.put(series, 'fooseries', append=False)
    roundtripped = store.get('fooseries')
    assert_series_equal(series, roundtripped)
def test_store_datetime(self):
    """ test storing naive datetimes """
    dates = pd.date_range(datetime(2016, 1, 1), datetime(2016, 1, 10))
    df = pd.DataFrame({'x': dates})
    store = OmegaStore()
    store.put(df, 'test-date', append=False)
    restored = store.get('test-date')
    assert_frame_equal(df, restored)
def test_store_dict_in_df(self):
    """Round-trip a DataFrame whose column holds dict values."""
    frame = pd.DataFrame({'x': [{'foo': 'bar '}]})
    store = OmegaStore()
    store.put(frame, 'test-dict', append=False)
    frame2 = store.get('test-dict')
    testing.assert_frame_equal(frame, frame2)
def test_hidden_temp_handling(self):
    """Temp/hidden names only appear in list() when explicitly requested."""
    store = OmegaStore(bucket='foo')
    # leading underscore marks a temporary object
    store.put({}, '_temp')
    self.assertNotIn('_temp', store.list(include_temp=False))
    self.assertIn('_temp', store.list(include_temp=True))
    # leading dot marks a hidden object
    store.put({}, '.hidden')
    self.assertNotIn('.hidden', store.list(hidden=False))
    self.assertIn('.hidden', store.list(hidden=True))
def test_put_dataframe_with_index(self):
    """Store a dataframe with mixed asc/desc index columns."""
    df = pd.DataFrame({'a': list(range(1, 10)), 'b': list(range(1, 10))})
    store = OmegaStore(prefix='')
    # '-b' requests a descending index on column b
    store.put(df, 'mydata', index=['a', '-b'])
    index_info = list(store.collection('mydata').list_indexes())
    names = [dict(entry).get('name') for entry in index_info]
    self.assertIn('asc_a__desc_b', names)
def test_put_append_false(self):
    """ test creating a new dataframe without previous metadata """
    frame = pd.DataFrame({'a': list(range(1, 10)), 'b': list(range(1, 10))})
    store = OmegaStore()
    # a random name guarantees no prior metadata exists for this object
    name = uuid.uuid4().hex
    meta = store.put(frame, name, append=False)
    self.assertEqual(meta['name'], name)
def test_put_dataframe_with_index(self):
    """Index creation yields the expected humanized index name."""
    frame = pd.DataFrame({'a': list(range(1, 10)), 'b': list(range(1, 10))})
    store = OmegaStore(prefix='')
    # '-b' requests a descending index on column b
    store.put(frame, 'mydata', index=['a', '-b'])
    info = store.collection('mydata').index_information()
    names = humanize_index(info)
    self.assertIn('asc__id_asc_a_desc_b_asc__idx#0_0_asc__om#rowid', names)
def test_store_tz_datetime(self):
    """ test storing timezoned datetimes """
    tz_dates = pd.date_range('2019-10-01', periods=5, tz='US/Eastern',
                             normalize=True)
    df = pd.DataFrame({'y': tz_dates})
    store = OmegaStore()
    store.put(df, 'test-date', append=False)
    df2 = store.get('test-date')
    testing.assert_frame_equal(df, df2)
def test_store_series_timeindex(self):
    """ test storing a pandas series with its own datetime index """
    from datetime import datetime
    # pd.datetime was deprecated in pandas 0.25 and removed in 1.0;
    # use the stdlib datetime instead
    series = pd.Series(range(10),
                       name='foo',
                       index=pd.date_range(datetime(2016, 1, 1),
                                           datetime(2016, 1, 10)))
    store = OmegaStore()
    store.put(series, 'fooseries', append=False)
    series2 = store.get('fooseries')
    assert_series_equal(series, series2)
def test_get_dataframe_project(self):
    """Column projection via the columns= keyword matches local projection."""
    df = pd.DataFrame({'a': list(range(1, 10)), 'b': list(range(1, 10))})
    store = OmegaStore(prefix='')
    store.put(df, 'mydata')
    # project server-side in mongodb
    projected = store.get('mydata', columns=['a'])
    # compare against the locally projected dataframe
    expected = df[['a']]
    self.assertTrue(expected.equals(projected), "expected dataframes to be equal")
def test_get_dataframe_filter(self):
    """Row filtering with mongo-style operators matches local filtering."""
    df = pd.DataFrame({'a': list(range(1, 10)), 'b': list(range(1, 10))})
    store = OmegaStore(prefix='')
    store.put(df, 'mydata')
    # filter server-side in mongodb (a > 1 and a < 10)
    filtered = store.get('mydata', filter=dict(a__gt=1, a__lt=10))
    # compare against the locally filtered dataframe
    expected = df[(df.a > 1) & (df.a < 10)]
    self.assertTrue(expected.equals(filtered), "expected dataframes to be equal")
def test_put_python_dict_multiple(self):
    """Storing the same dict twice under one name appends a second document."""
    # create some data
    data = {'a': list(range(1, 10)), 'b': list(range(1, 10))}
    store = OmegaStore(prefix='')
    store.put(data, 'mydata')
    store.put(data, 'mydata')
    data2 = store.get('mydata')
    # we will have stored the same object twice
    # assertEquals is a deprecated alias of assertEqual (removed in Python 3.12)
    self.assertEqual(data, data2[0])
    self.assertEqual(data, data2[1])
def test_existing_arbitrary_collection_mdataframe(self):
    """Access a pre-existing mongo collection as an MDataFrame."""
    data = {
        'foo': 'bar',
        'bax': {
            'fox': 'fax',
        }
    }
    store = OmegaStore()
    store.register_backend(PandasRawDictBackend.KIND, PandasRawDictBackend)
    foo_coll = store.mongodb['foo']
    # Collection.insert() was deprecated in pymongo 3.0 and removed in 4.0;
    # insert_one() is the supported single-document equivalent
    foo_coll.insert_one(data)
    store.make_metadata('myfoo', collection='foo', kind='pandas.rawdict').save()
    self.assertIn('myfoo', store.list())
    # test we get back _id column if raw=True
    mdf = store.getl('myfoo', raw=True)
    self.assertIsInstance(mdf, MDataFrame)
    data_ = mdf.value
    assert_frame_equal(json_normalize(data), data_)
    # test we get just the data column
    mdf = store.getl('myfoo', raw=False)
    self.assertIsInstance(mdf, MDataFrame)
    data_ = mdf.value
    cols = ['foo', 'bax.fox']
    assert_frame_equal(json_normalize(data)[cols], data_[cols])
def test_put_dataframe_xtra_large(self):
    """Round-trip a large dataframe (exercises the fast insert path)."""
    n = int(1e4 + 1)
    df = pd.DataFrame({'a': list(range(0, n)), 'b': list(range(0, n))})
    store = OmegaStore(prefix='')
    store.put(df, 'mydata')
    df2 = store.get('mydata')
    self.assertTrue(df.equals(df2), "expected dataframes to be equal")
def test_store_with_metadata(self):
    """attributes= metadata is preserved for dicts, dataframes and models."""
    om = OmegaStore(prefix='')
    attributes = {'foo': 'bar'}
    # dict
    data = {
        'a': list(range(1, 10)),
        'b': list(range(1, 10))
    }
    meta = om.put(data, 'data', attributes=attributes)
    self.assertEqual(meta.kind, 'python.data')
    self.assertEqual(meta.attributes, attributes)
    data2 = om.get('data')
    self.assertEqual([data], data2)
    # dataframe
    df = pd.DataFrame(data)
    meta = om.put(df, 'datadf', attributes=attributes)
    self.assertEqual(meta.kind, 'pandas.dfrows')
    self.assertEqual(meta.attributes, attributes)
    df2 = om.get('datadf')
    assert_frame_equal(df, df2)
    # model
    lr = LogisticRegression(solver='liblinear', multi_class='auto')
    meta = om.put(lr, 'mymodel', attributes=attributes)
    self.assertEqual(meta.kind, 'sklearn.joblib')
    self.assertEqual(meta.attributes, attributes)
    lr2 = om.get('mymodel')
    self.assertIsInstance(lr2, LogisticRegression)
def test_list_raw(self):
    """list() with pattern/regexp/kind filters returns Metadata entries."""
    df = pd.DataFrame({'a': list(range(1, 10)), 'b': list(range(1, 10))})
    store = OmegaStore()
    meta = store.put(df, 'hdfdf', as_hdf=True)

    def check_single(entries):
        # each filter variant should yield exactly the one Metadata entry
        self.assertTrue(isinstance(entries[0], Metadata))
        self.assertEqual('hdfdf', entries[0].name)
        self.assertEqual(len(entries), 1)

    # list with pattern
    check_single(store.list(pattern='hdf*', raw=True))
    # list with regexp
    check_single(store.list(regexp='hdf.*', raw=True))
    # list without pattern nor regexp
    check_single(store.list('hdfdf', kind=Metadata.PANDAS_HDF, raw=True))
    # subset kind: mismatching kind yields nothing, matching kind yields one
    self.assertEqual(
        len(store.list('hdfdf', raw=True, kind=Metadata.PANDAS_DFROWS)), 0)
    self.assertEqual(
        len(store.list('hdfdf', raw=True, kind=Metadata.PANDAS_HDF)), 1)
def __init__(self, mongo_url=None, backend=None, broker=None, celeryconf=None,
             defaults=None):
    """
    Initialize the client API

    Without arguments create the client API according to the user's
    configuration in :code:`~/omegaml/config.yml`.

    Arguments override the user's configuration.

    :param mongo_url: the fully qualified URI to the mongo database,
        of format :code:`mongodb://user:password@host:port/database`
    :param broker: the celery broker URI
    :param backend: the celery result backend URI
    :param celeryconf: the celery configuration dictionary
    :param defaults: the DefaultsContext providing configuration values
    """
    from omegaml.util import settings  # avoid circular imports
    from omegaml.notebook.jobs import OmegaJobs
    from omegaml.runtimes import OmegaRuntime
    from omegaml.store import OmegaStore
    # celery and mongo configuration; explicit args take precedence over
    # the configured defaults
    self.defaults = defaults or settings()
    self.mongo_url = mongo_url or self.defaults.OMEGA_MONGO_URL
    self.broker = broker or self.defaults.OMEGA_BROKER
    self.backend = backend or self.defaults.OMEGA_RESULT_BACKEND
    self.celeryconf = celeryconf
    # setup storage locations
    # NOTE(review): the stores receive the raw mongo_url argument (possibly
    # None) rather than self.mongo_url -- presumably OmegaStore falls back
    # to its own default in that case; confirm this is intended
    self.models = OmegaStore(mongo_url=mongo_url, prefix='models/', defaults=self.defaults)
    self.datasets = OmegaStore(mongo_url=mongo_url, prefix='data/', defaults=self.defaults)
    self._jobdata = OmegaStore(mongo_url=mongo_url, prefix='jobs/', defaults=self.defaults)
    # runtimes environments
    self.runtime = OmegaRuntime(self, backend=backend, broker=broker,
                                celeryconf=celeryconf, defaults=self.defaults)
    self.jobs = OmegaJobs(store=self._jobdata)
def test_store_tz_datetime_dst(self): """ test storing timezoned datetimes """ # 2019 11 03 02:00 is the end of US DST https://www.timeanddate.com/time/dst/2019.html # pymongo will transform the object into a naive dt at UTC time at +3h (arguably incorrectly so) # while pandas creates the Timestamp as UTC -4 (as the day starts at 00:00, not 02:00). # On rendering back to a tz-aware datetime, this yields the wrong date (1 day eaerlier) because # pandas applies -4 on converting from UTC to US/Eastern (correctly). df = pd.DataFrame({ 'y': pd.date_range('2019-11-01', periods=5, tz='US/Eastern', normalize=True) }) store = OmegaStore() store.put(df, 'test-date', append=False) df2 = store.get('test-date') # currently this fails, see @skip reason testing.assert_frame_equal(df, df2)
def test_raw_files(self):
    """Store raw file content both from a file-like object and a real file."""
    store = OmegaStore()
    store.register_backend(PythonRawFileBackend.KIND, PythonRawFileBackend)
    # test we can write from a file-like object
    payload = "some data".encode('utf-8')
    store.put(BytesIO(payload), 'myfile')
    self.assertEqual(payload, store.get('myfile').read())
    # test we can write from an actual file
    payload = "some other data".encode('utf-8')
    with open('/tmp/testfile.txt', 'wb') as fout:
        fout.write(payload)
    store.put('/tmp/testfile.txt', 'myfile')
    self.assertEqual(payload, store.get('myfile').read())
def _make_store(self, prefix):
    """Create an OmegaStore for *prefix* bound to this instance's settings."""
    from omegaml.store import OmegaStore
    store_kwargs = dict(mongo_url=self.mongo_url,
                        bucket=self.bucket,
                        prefix=prefix,
                        defaults=self.defaults,
                        dbalias=self._dbalias)
    return OmegaStore(**store_kwargs)
def test_put_model_with_prefix(self):
    """Store a fitted model in a prefixed store and predict after reload."""
    iris = load_iris()
    X, Y = iris.data, iris.target
    model = LogisticRegression()
    model.fit(X, Y)
    expected = model.predict(X)
    # store it remote under the models/ prefix
    store = OmegaStore(prefix='models/')
    store.put(model, 'foo')
    # get it back, try predicting
    restored = store.get('foo')
    self.assertIsInstance(restored, LogisticRegression)
    self.assertTrue((expected == restored.predict(X)).all())
def test_help(self):
    """_resolve_help_backend resolves identically by name and by object."""
    store = OmegaStore(bucket='foo')
    obj = {}
    store.put(obj, '_temp')
    # get backend for different signatures
    by_name = store._resolve_help_backend('_temp')
    by_obj = store._resolve_help_backend(obj)
    self.assertEqual(by_name, by_obj)
    self.assertIsInstance(by_obj, OmegaStore)
    # get backend for scikit model
    reg = LinearRegression()
    store.put(reg, 'regmodel')
    by_name = store._resolve_help_backend('regmodel')
    by_obj = store._resolve_help_backend(reg)
    self.assertIsInstance(by_name, ScikitLearnBackend)
    self.assertIsInstance(by_obj, ScikitLearnBackend)
def test_long_index_name(self):
    """Hashed names allow very long dataset and index column names."""
    store = OmegaStore(bucket='foo', prefix='foo/')
    store.defaults.OMEGA_STORE_HASHEDNAMES = True
    frame = pd.DataFrame({'xyz' * 100: range(100), 'yyz' * 300: range(100)})
    frame = frame.set_index('yyz' * 300)
    # name is limited by index key limit in MongoDB
    # see https://docs.mongodb.com/manual/reference/limits/#Index-Key-Limit
    long_name = 'a' * 990
    error = ''
    raised = False
    try:
        store.put(frame, long_name)
    except Exception as exc:
        raised = True
        error = str(exc)
    # the put must succeed; report the exception message if it did not
    self.assertFalse(raised, error)
def test_put_model(self):
    """Store a fitted model under a path-style name and predict after reload."""
    iris = load_iris()
    X, Y = iris.data, iris.target
    model = LogisticRegression(solver='liblinear', multi_class='auto')
    model.fit(X, Y)
    expected = model.predict(X)
    # store it remote
    store = OmegaStore()
    store.put(model, 'models/foo')
    # get it back, try predicting
    restored = store.get('models/foo')
    self.assertIsInstance(restored, LogisticRegression)
    self.assertTrue((expected == restored.predict(X)).all())