Exemplo n.º 1
0
 def test_put_dataframe(self):
     """Round-trip a small two-column DataFrame through the store."""
     frame = pd.DataFrame({
         'a': list(range(1, 10)),
         'b': list(range(1, 10)),
     })
     om_store = OmegaStore(prefix='')
     om_store.put(frame, 'mydata')
     restored = om_store.get('mydata')
     # what is read back must match what was written
     self.assertTrue(frame.equals(restored), "expected dataframes to be equal")
Exemplo n.º 2
0
 def test_put_python_dict(self):
     """Store a plain dict and read it back.

     As asserted below, ``get`` is expected to return a list holding the
     stored dict rather than the dict itself.
     """
     # create some data
     data = {'a': list(range(1, 10)), 'b': list(range(1, 10))}
     store = OmegaStore(prefix='')
     store.put(data, 'mydata')
     data2 = store.get('mydata')
     # assertEquals is a deprecated alias that was removed in Python 3.12
     self.assertEqual([data], data2)
Exemplo n.º 3
0
 def test_get_dataframe_projected_mixin(self):
     """Compare column specs given in the dataset name with local selection.

     Each spec is appended to the name as ``mydata[<spec>]``; the frame
     returned by the store must equal the same selection made with pandas.
     """
     df = pd.DataFrame({
         name: list(range(1, 10)) for name in ('a', 'b', 'c')
     })

     def select_locally(spec):
         # build the frame the store is expected to return for this spec
         if spec == ':':
             return df.loc[:, :]
         if ':' in spec:
             first, last = spec.split(':')
             return df.loc[:, slice(first or None, last or None)]
         if spec.startswith('^'):
             excluded = spec[1:].split(',')
             return df[[col for col in df.columns if col not in excluded]]
         return df[[spec]]

     store = OmegaStore(prefix='')
     store.put(df, 'mydata')
     for spec in ('a', ':b', ':', 'b:', '^c'):
         # projection done by the store
         projected = store.get('mydata[{}]'.format(spec))
         # projection done locally
         expected = select_locally(spec)
         self.assertTrue(expected.equals(projected), "expected dataframes to be equal")
Exemplo n.º 4
0
    def __init__(self, defaults=None, mongo_url=None, celeryconf=None, bucket=None,
                 **kwargs):
        """
        Initialize the client API

        Without arguments create the client API according to the user's
        configuration in :code:`~/omegaml/config.yml`.

        Arguments override the user's configuration. ``defaults`` falls
        back to ``settings()`` and ``mongo_url`` to
        ``defaults.OMEGA_MONGO_URL`` when not given.

        :param defaults: the DefaultsContext
        :param mongo_url: the fully qualified URI to the mongo database,
            of format :code:`mongodb://user:password@host:port/database`
        :param celeryconf: the celery configuration dictionary
        :param bucket: passed to the models, datasets and jobs stores and
            to the runtime; the scripts store is created without it
        :param kwargs: accepted but not used in this method
        """
        # imported locally to avoid circular imports
        from omegaml.util import settings
        from omegaml.notebook.jobs import OmegaJobs
        from omegaml.runtimes import OmegaRuntime
        from omegaml.store import OmegaStore

        # resolve configuration
        self.defaults = defaults or settings()
        self.mongo_url = mongo_url or self.defaults.OMEGA_MONGO_URL
        self.bucket = bucket

        # storage locations; these arguments are common to every store
        common = dict(mongo_url=self.mongo_url, defaults=self.defaults)
        self.models = OmegaStore(prefix='models/', bucket=bucket, **common)
        self.datasets = OmegaStore(prefix='data/', bucket=bucket, **common)
        self._jobdata = OmegaStore(prefix='jobs/', bucket=bucket, **common)
        # NOTE(review): unlike the other stores, scripts gets no bucket -- confirm this is intended
        self.scripts = OmegaStore(prefix='scripts/', **common)

        # runtime environments
        self.runtime = OmegaRuntime(self, bucket=bucket, defaults=self.defaults, celeryconf=celeryconf)
        self.jobs = OmegaJobs(store=self._jobdata)
Exemplo n.º 5
0
 def test_store_irregular_column_names(self):
     """ test storing irregular column names """
     df = pd.DataFrame({'x_1': range(10)})
     store = OmegaStore()
     store.put(df, 'foo', append=False)
     df2 = store.get('foo')
     # compare as plain lists: assertEqual on two pandas Index objects
     # evaluates an element-wise array in boolean context, which raises
     # ValueError as soon as there is more than one column
     self.assertListEqual(list(df.columns), list(df2.columns))
Exemplo n.º 6
0
 def test_lazy_unique(self):
     """ test unique values obtained through a lazily loaded MDataFrame """
     values = {'a': list(range(1, 10)), 'b': list(range(1, 10))}
     store = OmegaStore()
     store.put(pd.DataFrame(values), 'foo', append=False)
     lazy_frame = store.get('foo', lazy=True)
     uniques = lazy_frame.a.unique().value
     self.assertListEqual(values['a'], list(uniques))
Exemplo n.º 7
0
 def test_store_series(self):
     """ test that a pandas series with its own index survives a round trip """
     from string import ascii_lowercase
     letters = (ch for ch in ascii_lowercase[0:10])
     original = pd.Series(range(10), index=letters)
     store = OmegaStore()
     store.put(original, 'fooseries', append=False)
     restored = store.get('fooseries')
     assert_series_equal(original, restored)
Exemplo n.º 8
0
 def test_store_datetime(self):
     """ test storing naive datetimes """
     first_day = datetime(2016, 1, 1)
     last_day = datetime(2016, 1, 10)
     frame = pd.DataFrame({'x': pd.date_range(first_day, last_day)})
     store = OmegaStore()
     store.put(frame, 'test-date', append=False)
     restored = store.get('test-date')
     assert_frame_equal(frame, restored)
Exemplo n.º 9
0
 def test_store_dict_in_df(self):
     """ test storing a dataframe whose cells hold dict values """
     cell = {'foo': 'bar '}
     frame = pd.DataFrame({'x': [cell]})
     store = OmegaStore()
     store.put(frame, 'test-dict', append=False)
     restored = store.get('test-dict')
     testing.assert_frame_equal(frame, restored)
Exemplo n.º 10
0
 def test_hidden_temp_handling(self):
     """ test that temp and hidden names are listed only on request """
     foo_store = OmegaStore(bucket='foo')
     # each name is paired with the list() flag that reveals it
     cases = (('_temp', 'include_temp'), ('.hidden', 'hidden'))
     for name, flag in cases:
         foo_store.put({}, name)
         self.assertNotIn(name, foo_store.list(**{flag: False}))
         self.assertIn(name, foo_store.list(**{flag: True}))
Exemplo n.º 11
0
 def test_put_dataframe_with_index(self):
     """Store a DataFrame with an index spec and check the index name.

     The spec ``['a', '-b']`` is expected to produce a collection index
     named ``asc_a__desc_b``.
     """
     # create some dataframe
     df = pd.DataFrame({'a': list(range(1, 10)), 'b': list(range(1, 10))})
     store = OmegaStore(prefix='')
     store.put(df, 'mydata', index=['a', '-b'])
     idxs = store.collection('mydata').list_indexes()
     # build a real list: on Python 3 map() is a one-shot iterator and a
     # failing assertIn would only print "<map object ...>"
     idx_names = [dict(idx).get('name') for idx in idxs]
     self.assertIn('asc_a__desc_b', idx_names)
Exemplo n.º 12
0
 def test_put_append_false(self):
     """ test if we can create a new dataframe without previous metadata """
     frame = pd.DataFrame({
         'a': list(range(1, 10)),
         'b': list(range(1, 10)),
     })
     store = OmegaStore()
     # a fresh random name is used for the stored object
     unique_name = uuid.uuid4().hex
     meta = store.put(frame, unique_name, append=False)
     self.assertEqual(meta['name'], unique_name)
Exemplo n.º 13
0
 def test_put_dataframe_with_index(self):
     """Store a DataFrame with an index spec and check the humanized index names."""
     frame = pd.DataFrame({
         'a': list(range(1, 10)),
         'b': list(range(1, 10)),
     })
     store = OmegaStore(prefix='')
     store.put(frame, 'mydata', index=['a', '-b'])
     index_info = store.collection('mydata').index_information()
     self.assertIn('asc__id_asc_a_desc_b_asc__idx#0_0_asc__om#rowid',
                   humanize_index(index_info))
Exemplo n.º 14
0
 def test_store_tz_datetime(self):
     """ test storing timezoned datetimes """
     eastern_days = pd.date_range('2019-10-01', periods=5, tz='US/Eastern', normalize=True)
     frame = pd.DataFrame({'y': eastern_days})
     store = OmegaStore()
     store.put(frame, 'test-date', append=False)
     restored = store.get('test-date')
     testing.assert_frame_equal(frame, restored)
Exemplo n.º 15
0
 def test_store_series_timeindex(self):
     """ test storing a named pandas series with a daily datetime index """
     # pd.datetime was deprecated in pandas 1.0 and removed in 2.0; date
     # strings give the same daily range without that alias
     series = pd.Series(range(10),
                        name='foo',
                        index=pd.date_range('2016-01-01', '2016-01-10'))
     store = OmegaStore()
     store.put(series, 'fooseries', append=False)
     series2 = store.get('fooseries')
     assert_series_equal(series, series2)
Exemplo n.º 16
0
 def test_get_dataframe_project(self):
     """Request a single column via ``columns=`` and compare with pandas."""
     frame = pd.DataFrame({
         'a': list(range(1, 10)),
         'b': list(range(1, 10)),
     })
     store = OmegaStore(prefix='')
     store.put(frame, 'mydata')
     # column selection requested from the store
     projected = store.get('mydata', columns=['a'])
     # the same selection done locally
     expected = frame[['a']]
     self.assertTrue(expected.equals(projected), "expected dataframes to be equal")
Exemplo n.º 17
0
 def test_get_dataframe_filter(self):
     """Request rows via ``filter=`` and compare with a local boolean mask."""
     frame = pd.DataFrame({
         'a': list(range(1, 10)),
         'b': list(range(1, 10)),
     })
     store = OmegaStore(prefix='')
     store.put(frame, 'mydata')
     # row filter requested from the store
     criteria = {'a__gt': 1, 'a__lt': 10}
     filtered = store.get('mydata', filter=criteria)
     # the same filter applied locally
     mask = (frame.a > 1) & (frame.a < 10)
     expected = frame[mask]
     self.assertTrue(expected.equals(filtered), "expected dataframes to be equal")
Exemplo n.º 18
0
 def test_put_python_dict_multiple(self):
     """Store the same dict twice under one name and read both back."""
     # create some data
     data = {'a': list(range(1, 10)), 'b': list(range(1, 10))}
     store = OmegaStore(prefix='')
     store.put(data, 'mydata')
     store.put(data, 'mydata')
     data2 = store.get('mydata')
     # we will have stored the same object twice
     # (assertEquals is a deprecated alias that was removed in Python 3.12)
     self.assertEqual(data, data2[0])
     self.assertEqual(data, data2[1])
Exemplo n.º 19
0
 def test_existing_arbitrary_collection_mdataframe(self):
     """Expose a pre-existing MongoDB collection as an MDataFrame.

     A document is written directly into the ``foo`` collection, registered
     under the name ``myfoo`` with kind ``pandas.rawdict`` and then read
     back lazily with and without ``raw=True``.
     """
     data = {
         'foo': 'bar',
         'bax': {
             'fox': 'fax',
         }
     }
     store = OmegaStore()
     store.register_backend(PandasRawDictBackend.KIND, PandasRawDictBackend)
     foo_coll = store.mongodb['foo']
     # Collection.insert() was deprecated in PyMongo 3 and removed in
     # PyMongo 4; insert_one() likewise adds the generated _id to ``data``,
     # which the raw=True comparison below depends on
     foo_coll.insert_one(data)
     store.make_metadata('myfoo', collection='foo',
                         kind='pandas.rawdict').save()
     self.assertIn('myfoo', store.list())
     # test we get back _id column if raw=True
     mdf = store.getl('myfoo', raw=True)
     self.assertIsInstance(mdf, MDataFrame)
     data_ = mdf.value
     assert_frame_equal(json_normalize(data), data_)
     # test we get just the data column
     mdf = store.getl('myfoo', raw=False)
     self.assertIsInstance(mdf, MDataFrame)
     data_ = mdf.value
     cols = ['foo', 'bax.fox']
     assert_frame_equal(json_normalize(data)[cols], data_[cols])
Exemplo n.º 20
0
 def test_put_dataframe_xtra_large(self):
     """Round-trip a DataFrame of 10,001 rows (intended to force fast insert)."""
     n_rows = int(1e4 + 1)
     frame = pd.DataFrame({
         'a': list(range(n_rows)),
         'b': list(range(n_rows)),
     })
     store = OmegaStore(prefix='')
     store.put(frame, 'mydata')
     restored = store.get('mydata')
     self.assertTrue(frame.equals(restored), "expected dataframes to be equal")
Exemplo n.º 21
0
 def test_store_with_metadata(self):
     """Store a dict, a DataFrame and a model, each with custom attributes.

     For every object the returned metadata must carry the expected kind
     and the attributes passed to ``put``, and the object must be readable.
     """
     om = OmegaStore(prefix='')
     attributes = {'foo': 'bar'}
     data = {
         'a': list(range(1, 10)),
         'b': list(range(1, 10))
     }
     # plain python data
     dict_meta = om.put(data, 'data', attributes=attributes)
     self.assertEqual(dict_meta.kind, 'python.data')
     self.assertEqual(dict_meta.attributes, attributes)
     self.assertEqual([data], om.get('data'))
     # dataframe
     frame = pd.DataFrame(data)
     frame_meta = om.put(frame, 'datadf', attributes=attributes)
     self.assertEqual(frame_meta.kind, 'pandas.dfrows')
     self.assertEqual(frame_meta.attributes, attributes)
     assert_frame_equal(frame, om.get('datadf'))
     # model
     model = LogisticRegression(solver='liblinear', multi_class='auto')
     model_meta = om.put(model, 'mymodel', attributes=attributes)
     self.assertEqual(model_meta.kind, 'sklearn.joblib')
     self.assertEqual(model_meta.attributes, attributes)
     self.assertIsInstance(om.get('mymodel'), LogisticRegression)
Exemplo n.º 22
0
 def test_list_raw(self):
     """List stored objects as raw Metadata entries.

     Covers selection by glob pattern, by regular expression and by plain
     name, plus narrowing the result by kind.
     """
     data = {'a': list(range(1, 10)), 'b': list(range(1, 10))}
     df = pd.DataFrame(data)
     store = OmegaStore()
     store.put(df, 'hdfdf', as_hdf=True)

     def assert_single_hdfdf(entries):
         # check the count first so an empty result fails the assertion
         # instead of raising IndexError on entries[0]
         self.assertEqual(len(entries), 1)
         self.assertIsInstance(entries[0], Metadata)
         self.assertEqual('hdfdf', entries[0].name)

     # list with pattern
     assert_single_hdfdf(store.list(pattern='hdf*', raw=True))
     # list with regexp
     assert_single_hdfdf(store.list(regexp='hdf.*', raw=True))
     # list without pattern nor regexp
     assert_single_hdfdf(store.list('hdfdf', kind=Metadata.PANDAS_HDF, raw=True))
     # subset kind
     entries = store.list('hdfdf', raw=True, kind=Metadata.PANDAS_DFROWS)
     self.assertEqual(len(entries), 0)
     entries = store.list('hdfdf', raw=True, kind=Metadata.PANDAS_HDF)
     self.assertEqual(len(entries), 1)
Exemplo n.º 23
0
    def __init__(self,
                 mongo_url=None,
                 backend=None,
                 broker=None,
                 celeryconf=None,
                 defaults=None):
        """
        Initialize the client API

        Without arguments create the client API according to the user's
        configuration in :code:`~/omegaml/config.yml`.

        Arguments override the user's configuration. The resolved values
        are kept on the instance, while the stores and the runtime are
        given the arguments as passed in, together with ``defaults``.

        :param mongo_url: the fully qualified URI to the mongo database,
            of format :code:`mongodb://user:password@host:port/database`
        :param backend: the celery result backend URI
        :param broker: the celery broker URI
        :param celeryconf: the celery configuration dictionary
        :param defaults: the settings object; ``settings()`` is used when
            not given
        """
        # imported locally to avoid circular imports
        from omegaml.util import settings
        from omegaml.notebook.jobs import OmegaJobs
        from omegaml.runtimes import OmegaRuntime
        from omegaml.store import OmegaStore

        # celery and mongo configuration, falling back to the defaults
        self.defaults = defaults or settings()
        self.mongo_url = mongo_url or self.defaults.OMEGA_MONGO_URL
        self.broker = broker or self.defaults.OMEGA_BROKER
        self.backend = backend or self.defaults.OMEGA_RESULT_BACKEND
        self.celeryconf = celeryconf

        # storage locations; every store gets the same connection arguments
        store_kwargs = dict(mongo_url=mongo_url, defaults=self.defaults)
        self.models = OmegaStore(prefix='models/', **store_kwargs)
        self.datasets = OmegaStore(prefix='data/', **store_kwargs)
        self._jobdata = OmegaStore(prefix='jobs/', **store_kwargs)

        # runtime environments
        runtime_kwargs = dict(backend=backend,
                              broker=broker,
                              celeryconf=celeryconf,
                              defaults=self.defaults)
        self.runtime = OmegaRuntime(self, **runtime_kwargs)
        self.jobs = OmegaJobs(store=self._jobdata)
Exemplo n.º 24
0
 def test_store_tz_datetime_dst(self):
     """ test storing timezoned datetimes across the end of US DST """
     # 2019-11-03 02:00 is the end of US DST, see
     # https://www.timeanddate.com/time/dst/2019.html
     # pymongo will transform the object into a naive dt at UTC time at +3h
     # (arguably incorrectly so), while pandas creates the Timestamp as
     # UTC -4 (as the day starts at 00:00, not 02:00). On rendering back to
     # a tz-aware datetime, this yields the wrong date (1 day earlier)
     # because pandas applies -4 on converting from UTC to US/Eastern
     # (correctly).
     eastern_days = pd.date_range('2019-11-01', periods=5, tz='US/Eastern', normalize=True)
     frame = pd.DataFrame({'y': eastern_days})
     store = OmegaStore()
     store.put(frame, 'test-date', append=False)
     restored = store.get('test-date')
     # currently this fails, see @skip reason
     testing.assert_frame_equal(frame, restored)
Exemplo n.º 25
0
 def test_raw_files(self):
     """Store raw content from a file-like object, then from a file path."""
     store = OmegaStore()
     store.register_backend(PythonRawFileBackend.KIND, PythonRawFileBackend)
     # first from an in-memory file-like object
     payload = "some data".encode('utf-8')
     store.put(BytesIO(payload), 'myfile')
     self.assertEqual(payload, store.get('myfile').read())
     # then from a file on disk, passed by its path
     payload = "some other data".encode('utf-8')
     source = BytesIO(payload)
     with open('/tmp/testfile.txt', 'wb') as fout:
         fout.write(source.read())
     store.put('/tmp/testfile.txt', 'myfile')
     self.assertEqual(payload, store.get('myfile').read())
Exemplo n.º 26
0
 def _make_store(self, prefix):
     """Create an OmegaStore for ``prefix`` from this instance's settings.

     The store receives this instance's mongo_url, bucket, defaults and
     _dbalias along with the given prefix.
     """
     from omegaml.store import OmegaStore
     store_kwargs = {
         'mongo_url': self.mongo_url,
         'bucket': self.bucket,
         'prefix': prefix,
         'defaults': self.defaults,
         'dbalias': self._dbalias,
     }
     return OmegaStore(**store_kwargs)
Exemplo n.º 27
0
 def test_put_model_with_prefix(self):
     """Store a fitted model in a store created with prefix 'models/'."""
     # fit a reference model on the iris data
     iris = load_iris()
     X, Y = iris.data, iris.target
     model = LogisticRegression()
     model.fit(X, Y)
     expected = model.predict(X)
     # store it remote
     store = OmegaStore(prefix='models/')
     store.put(model, 'foo')
     # the restored model must predict exactly like the original
     restored = store.get('foo')
     self.assertIsInstance(restored, LogisticRegression)
     actual = restored.predict(X)
     self.assertTrue((expected == actual).all())
Exemplo n.º 28
0
 def test_help(self):
     """Resolve the help backend by stored name and by object."""
     foo_store = OmegaStore(bucket='foo')
     # plain python object: name and object must resolve to the same result
     plain_obj = {}
     foo_store.put(plain_obj, '_temp')
     by_name = foo_store._resolve_help_backend('_temp')
     by_object = foo_store._resolve_help_backend(plain_obj)
     self.assertEqual(by_name, by_object)
     self.assertIsInstance(by_object, OmegaStore)
     # scikit model: both lookups must yield a ScikitLearnBackend
     regressor = LinearRegression()
     foo_store.put(regressor, 'regmodel')
     by_name = foo_store._resolve_help_backend('regmodel')
     by_object = foo_store._resolve_help_backend(regressor)
     self.assertIsInstance(by_name, ScikitLearnBackend)
     self.assertIsInstance(by_object, ScikitLearnBackend)
Exemplo n.º 29
0
 def test_long_index_name(self):
     """Store a frame with very long column, index and object names.

     With OMEGA_STORE_HASHEDNAMES enabled the put is expected to succeed;
     any exception is reported as the failure message.
     """
     store = OmegaStore(bucket='foo', prefix='foo/')
     store.defaults.OMEGA_STORE_HASHEDNAMES = True
     index_col = 'yyz' * 300
     frame = pd.DataFrame({'xyz' * 100: range(100), index_col: range(100)})
     frame = frame.set_index(index_col)
     # name is limited by index key limit in MongoDB
     # see https://docs.mongodb.com/manual/reference/limits/#Index-Key-Limit
     long_name = 'a' * 990
     try:
         store.put(frame, long_name)
     except Exception as exc:
         raised, error = True, str(exc)
     else:
         raised, error = False, ''
     self.assertFalse(raised, error)
Exemplo n.º 30
0
 def test_put_model(self):
     """Store a fitted model under 'models/foo' and predict with the copy."""
     # fit a reference model on the iris data
     iris = load_iris()
     X, Y = iris.data, iris.target
     model = LogisticRegression(solver='liblinear', multi_class='auto')
     model.fit(X, Y)
     expected = model.predict(X)
     # store it remote
     store = OmegaStore()
     store.put(model, 'models/foo')
     # the restored model must predict exactly like the original
     restored = store.get('models/foo')
     self.assertIsInstance(restored, LogisticRegression)
     actual = restored.predict(X)
     self.assertTrue((expected == actual).all())