def test_task_mapreduce_script(self):
    """om.runtime.mapreduce() runs mapped tasks plus a reducing callback script

    each predict over sample[x] appends 9 rows to 'callback_results':
    run 1 maps two predicts (18 rows total), run 2 maps one more (27 rows).
    """
    om = Omega()
    df = pd.DataFrame({'x': range(1, 10), 'y': range(5, 14)})
    lr = LinearRegression()
    om.datasets.put(df, 'sample')
    om.models.put(lr, 'regmodel')
    om.runtime.model('regmodel').fit('sample[x]', 'sample[y]').get()
    om = Omega()
    # deploy the demo callback package shipped with omegaml
    basepath = os.path.join(
        os.path.dirname(sys.modules['omegaml'].__file__), 'example')
    pkgpath = os.path.abspath(os.path.join(basepath, 'demo', 'callback'))
    pkgsrc = 'pkg://{}'.format(pkgpath)
    om.scripts.put(pkgsrc, 'callback')
    with om.runtime.mapreduce() as ctr:
        # two tasks to map
        ctr.model('regmodel').predict('sample[x]')
        ctr.model('regmodel').predict('sample[x]')
        # one task to reduce
        ctr.script('callback').run(as_callback=True)
        result = ctr.run()
    result.get()
    self.assertEqual(len(om.datasets.get('callback_results')), 18)
    with om.runtime.mapreduce() as ctr:
        # one task to map (original comment said two -- only one is queued)
        ctr.model('regmodel').predict('sample[x]')
        # one task to reduce
        ctr.script('callback').run(as_callback=True)
        result = ctr.run()
    result.get()
    self.assertEqual(len(om.datasets.get('callback_results')), 27)
def setUp(self):
    """prepare a clean Omega instance with the MLFlow backends registered"""
    om = self.om = Omega()
    self.clean()
    for backend in (MLFlowModelBackend, MLFlowRegistryBackend):
        om.models.register_backend(backend.KIND, backend)
def test_promotion_to_other_db_works(self):
    """promote() copies models and datasets to another database, as a snapshot

    subsequent changes to the original must not affect the promoted copy.
    """
    om = self.om
    other = Omega(mongo_url=om.mongo_url + '_promotest')
    # start with a clean target database
    # fix: the original used list comprehensions purely for side effects
    for name in other.models.list(include_temp=True):
        other.models.drop(name, force=True)
    for name in other.datasets.list(include_temp=True):
        other.datasets.drop(name, force=True)
    reg = LinearRegression()
    reg.coef_ = 10
    # try models
    om.models.put(reg, 'mymodel')
    self.assertIn('mymodel', om.models.list())
    self.assertNotIn('mymodel', other.models.list())
    om.models.promote('mymodel', other.models)
    self.assertIn('mymodel', other.models.list())
    # ensure changes only in original
    reg.coef_ = 15
    om.models.put(reg, 'mymodel')
    self.assertNotEqual(om.models.get('mymodel').coef_,
                        other.models.get('mymodel').coef_)
    # try datasets
    om.datasets.put(['foo'], 'foo')
    # -- ensure only in original
    self.assertIn('foo', om.datasets.list())
    self.assertNotIn('foo', other.datasets.list())
    # -- promote to other
    om.datasets.promote('foo', other.datasets)
    self.assertIn('foo', other.datasets.list())
    self.assertEqual(om.datasets.get('foo'), other.datasets.get('foo'))
    # change original ensure copy not changed
    om.datasets.put(['foo'], 'foo', append=True)
    self.assertNotEqual(om.datasets.get('foo'), other.datasets.get('foo'))
def test_ping(self):
    """runtime.ping echoes back positional args and kwargs from the worker"""
    om = Omega()
    result = om.runtime.ping('foo', fox='bar')
    self.assertIn('message', result)
    self.assertIn('worker', result)
    self.assertEqual(result['args'], ('foo',))
    self.assertEqual(result['kwargs'], {'fox': 'bar'})
def setUp(self):
    """enable TF eager execution and register the TF dataset backend"""
    import os
    os.environ['TF_EAGER'] = '1'
    tf_perhaps_eager_execution()
    self.om = Omega()
    self.om.models.register_backend(TFDatasetBackend.KIND, TFDatasetBackend)
    self.clean()
def setUp(self):
    """store a sample dataset of two identical repeated-range columns"""
    TestCase.setUp(self)
    values = list(range(0, 10)) + list(range(0, 10))
    df = self.df = pd.DataFrame({'x': values, 'y': values})
    om = self.om = Omega()
    om.datasets.put(df, 'sample', append=False)
    self.coll = om.datasets.collection('sample')
def test_partial_fit_chunked(self):
    """partial_fit on a chunked dataset improves on a single-batch fit"""
    # create some data
    x = np.array(list(range(0, 100)))
    y = x * 2
    df = pd.DataFrame({'x': x, 'y': y})
    # put into Omega
    os.environ['DJANGO_SETTINGS_MODULE'] = ''
    om = Omega()
    om.runtime.celeryapp.conf.CELERY_ALWAYS_EAGER = True
    # generate a large dataset by appending the same frame 100 times
    for i in range(100):
        om.datasets.put(df, 'data', append=(i > 0))
    # create a model locally, store (unfitted) in Omega
    # -- ignore warnings on y shape
    import warnings
    warnings.filterwarnings("ignore", category=DataConversionWarning)
    lr = SGDRegressor(max_iter=1000, tol=1e-3, random_state=42)
    om.models.put(lr, 'mymodel2')
    # have Omega fit the model to get a start, then predict
    result = om.runtime.model('mymodel2').fit(df[['x']], df[['y']])
    result.get()
    result = om.runtime.model('mymodel2').predict('data[x]')
    pred1 = result.get()
    mse = mean_squared_error(om.datasets.get('data[y]'), pred1)
    self.assertGreater(mse, 40)
    # partial_fit over the dataset ('#' suffix -- presumably chunked
    # iteration; confirm against omegaml dataset addressing docs), then
    # verify the updated model predicts better
    result = om.runtime.model('mymodel2').partial_fit('data[x]#', 'data[y]#')
    result = om.runtime.model('mymodel2').predict('data[x]')
    pred1 = result.get()
    mse_2 = mean_squared_error(om.datasets.get('data[y]'), pred1)
    self.assertLess(mse_2, mse)
def test_predict_hdf_dataframe(self):
    """runtime prediction on an HDF-stored series matches local sklearn"""
    # create some data
    x = np.array(list(range(0, 10)))
    y = x * 2
    df = pd.DataFrame({'x': x, 'y': y})
    X = df['x']
    Y = df['y']
    # put into Omega -- assume a client with pandas, scikit learn
    os.environ['DJANGO_SETTINGS_MODULE'] = ''
    om = Omega()
    om.runtime.pure_python = True
    om.runtime.celeryapp.conf.CELERY_ALWAYS_EAGER = True
    om.datasets.put(X, 'datax', as_hdf=True)
    om.datasets.put(Y, 'datay', as_hdf=True)
    # fit locally for comparison, then store the fitted model
    lr = LinearRegression()
    lr.fit(reshaped(X), reshaped(Y))
    pred = lr.predict(reshaped(X))
    om.models.put(lr, 'mymodel2')
    # have Omega predict from the stored dataset
    # note this is the same as
    #    om.datasets.put(X, 'foo')
    #    om.runtimes.model('mymodel2').predict('foo')
    result = om.runtime.model('mymodel2').predict('datax')
    pred2 = result.get()
    # fix: the original asserted (pred == pred2).all() twice (copy-paste
    # duplicate with messages "(1)" and "(2)"); one assertion suffices
    self.assertTrue(
        (pred == pred2).all(), "runtimes prediction is different(1)")
def test_predict(self):
    """prediction works with stored datasets as well as locally provided data"""
    # create some data
    x = np.array(list(range(0, 10)))
    y = x * 2
    df = pd.DataFrame({'x': x, 'y': y})
    X = df[['x']]
    Y = df[['y']]
    # put into Omega
    os.environ['DJANGO_SETTINGS_MODULE'] = ''
    om = Omega()
    om.runtime.celeryapp.conf.CELERY_ALWAYS_EAGER = True
    om.datasets.put(X, 'datax')
    om.datasets.put(Y, 'datay')
    om.datasets.get('datax')
    om.datasets.get('datay')
    # create a model locally, fit it, store in Omega
    lr = LinearRegression()
    lr.fit(X, Y)
    pred = lr.predict(X)
    om.models.put(lr, 'mymodel')
    self.assertIn('mymodel', om.models.list('*'))
    # have Omega predict it
    # -- using data already in Omega
    result = om.runtime.model('mymodel').predict('datax')
    pred1 = result.get()
    # -- using data provided locally
    # note this is the same as
    #    om.datasets.put(X, 'foo')
    #    om.runtimes.model('mymodel').predict('foo')
    result = om.runtime.model('mymodel').predict(X)
    pred2 = result.get()
    self.assertTrue((pred == pred1).all(),
                    "runtimes prediction is different(1)")
    self.assertTrue((pred == pred2).all(),
                    "runtimes prediction is different(2)")
def setUp(self):
    """register the tensorflow SavedModel backend on a clean Omega"""
    from omegaml.backends.tensorflow.tfsavedmodel import TensorflowSavedModelBackend
    self.om = Omega()
    self.om.models.register_backend(TensorflowSavedModelBackend.KIND,
                                    TensorflowSavedModelBackend)
    self.clean()
def test_fit_pipeline(self):
    """a sklearn Pipeline can be fitted on the runtime and used to predict"""
    # create some data
    x = np.array(list(range(0, 10)))
    y = x * 2
    df = pd.DataFrame({'x': x, 'y': y})
    X = df[['x']]
    Y = df[['y']]
    # put into Omega
    os.environ['DJANGO_SETTINGS_MODULE'] = ''
    om = Omega()
    om.runtime.celeryapp.conf.CELERY_ALWAYS_EAGER = True
    om.datasets.put(X, 'datax')
    om.datasets.put(Y, 'datay')
    om.datasets.get('datax')
    om.datasets.get('datay')
    # create a pipeline locally, store (unfitted) in Omega
    p = Pipeline([
        ('lr', LinearRegression()),
    ])
    om.models.put(p, 'mymodel2')
    self.assertIn('mymodel2', om.models.list('*'))
    # predict locally for comparison
    p.fit(reshaped(X), reshaped(Y))
    pred = p.predict(reshaped(X))
    # have Omega fit the model then predict
    result = om.runtime.model('mymodel2').fit('datax', 'datay')
    result.get()
    result = om.runtime.model('mymodel2').predict('datax')
    pred1 = result.get()
    self.assertTrue((pred == pred1).all(),
                    "runtimes prediction is different(1)")
def test_default_bucket_fails(self):
    """a model in the default bucket is not served when another bucket is requested"""
    # put a model in the default bucket
    om = Omega()
    X = np.arange(10).reshape(-1, 1)
    y = X * 2
    # train model locally
    clf = LinearRegression()
    clf.fit(X, y)
    result = clf.predict(X)
    # store model in om
    om.models.put(clf, 'regression')
    resp = self.client.put('/api/v1/model/regression/predict', json={
        'columns': ['v'],
        'data': [dict(v=[5])],
    }, auth=self.auth, headers=self._headers)
    # we expect an error because the model does not exist in the default bucket
    self.assertEqual(resp.status_code, 500)
    # see if we can get it to predict with the correct bucket (all other tests do this)
    # -- note we simply remove the 'bucket' header which reverts to the default
    resp = self.client.put('/api/v1/model/regression/predict', json={
        'columns': ['v'],
        'data': dict(v=[5])
    }, auth=self.auth)
    self.assertEqual(resp.status_code, 200)
def test_promotion_to_other_db_works(self):
    """smoke test: promotion of a model and a dataset to another db succeeds"""
    om = self.om
    other = Omega(mongo_url=om.mongo_url + '_test')
    # promote an (unfitted) model
    reg = LinearRegression()
    om.models.put(reg, 'mymodel')
    om.models.promote('mymodel', other.models)
    # promote a dataset
    om.datasets.put(['foo'], 'foo')
    om.datasets.promote('foo', other.datasets)
def test_predict_multiple_samples(self):
    """smoke test: predicting several samples in one call does not raise"""
    om = Omega()
    reg = LinearRegression()
    df = pd.DataFrame({'x': range(10)})
    df['y'] = df['x'] * 2 + 3
    reg.fit(df[['x']], df['y'])
    om.models.put(reg, 'regmodel')
    result = om.runtime.model('regmodel').predict([[5], [6]]).get()
def setUp(self):
    """register the TF estimator backend and enable eager execution"""
    from omegaml.backends.tensorflow.tfestimatormodel import TFEstimatorModelBackend
    self.om = Omega()
    self.om.models.register_backend(TFEstimatorModelBackend.KIND,
                                    TFEstimatorModelBackend)
    self.clean()
    tf_perhaps_eager_execution()
def setUp(self):
    """register both Keras backends and enable eager execution"""
    self.om = Omega()
    for backend in (TensorflowKerasBackend, TensorflowKerasSavedModelBackend):
        self.om.models.register_backend(backend.KIND, backend)
    self.clean()
    tf_perhaps_eager_execution()
def setUp(self):
    """prepare the default and 'restore' Omega buckets plus a clean export archive"""
    self.om = om = Omega()
    self.om_restore = self.om['restore']
    for store in (self.om, self.om_restore):
        self._apply_store_mixin(store)
    self.clean()
    self.clean(bucket='restore')
    # start from an empty archive directory
    with OmegaExportArchive('/tmp/test', None) as arc:
        arc.clear()
def setUp(self):
    """ensure the pip-sourced package backend is configured and its path exists"""
    self.defaults = settings()
    backends = self.defaults.OMEGA_STORE_BACKENDS
    self.backend = backend = 'omegaml.backends.package.PythonPipSourcedPackageData'
    if PythonPipSourcedPackageData.KIND not in backends:
        backends[PythonPipSourcedPackageData.KIND] = backend
    self.om = Omega()
    delete_database()
    self.pkgsdir = self.om.scripts.get_backend_bykind(
        PythonPipSourcedPackageData.KIND).packages_path
    mkdirs(self.pkgsdir)
def test_mapreduce_getall(self):
    """mapreduce collects the results of all queued jobs via getall()"""
    om = Omega()
    code = """print('hello')"""
    om.jobs.create(code, 'myjob')
    # --mapreduce: queue the same job five times, then run and collect
    # NOTE(review): om.runtime.job() is used inside the mapreduce context;
    # presumably the context captures these calls rather than executing
    # them immediately -- confirm against the runtime implementation
    with om.runtime.mapreduce() as crt:
        for i in range(5):
            # fix: was f'myjob' -- an f-string with no placeholders
            om.runtime.job('myjob').run()
        result = crt.run()
    results = result.getall()
    self.assertEqual(len(results), 5)
def setUp(self):
    """register both Keras backends (locally imported) and enable eager execution"""
    from omegaml.backends.tensorflow.tfkeras import TensorflowKerasBackend
    from omegaml.backends.tensorflow.tfkerassavedmodel import TensorflowKerasSavedModelBackend
    self.om = Omega()
    for backend in (TensorflowKerasBackend, TensorflowKerasSavedModelBackend):
        self.om.models.register_backend(backend.KIND, backend)
    self.clean()
    tf_perhaps_eager_execution()
def test_defaults_repr(self):
    """repr(om.defaults) must not raise RecursionError (e.g. in a shell)"""
    om = Omega()
    # fix: the original bound the exception to an unused variable and
    # tracked success via a not_raised flag; self.fail() is equivalent
    # (both report a test failure, not an error) and clearer
    try:
        repr(om.defaults)
    except RecursionError:
        self.fail('repr(om.defaults) raised RecursionError')
def test_fit(self):
    """fitting on the runtime stores data references and enables prediction"""
    # create some data
    x = np.array(list(range(0, 10)))
    y = x * 2
    df = pd.DataFrame({'x': x, 'y': y})
    X = df[['x']]
    Y = df[['y']]
    # put into Omega
    os.environ['DJANGO_SETTINGS_MODULE'] = ''
    om = Omega()
    om.runtime.celeryapp.conf.CELERY_ALWAYS_EAGER = True
    om.datasets.put(X, 'datax')
    om.datasets.put(Y, 'datay')
    om.datasets.get('datax')
    om.datasets.get('datay')
    # create a model locally, store (unfitted) in Omega
    lr = LinearRegression()
    om.models.put(lr, 'mymodel2')
    self.assertIn('mymodel2', om.models.list('*'))
    # predict locally for comparison
    lr.fit(X, Y)
    pred = lr.predict(X)
    # try predicting without fitting
    with self.assertRaises(NotFittedError):
        result = om.runtime.model('mymodel2').predict('datax')
        result.get()
    # have Omega fit the model then predict
    result = om.runtime.model('mymodel2').fit('datax', 'datay')
    result.get()
    # check the new model version metadata includes the datax/y references
    meta = om.models.metadata('mymodel2')
    self.assertIn('metaX', meta.attributes)
    self.assertIn('metaY', meta.attributes)
    # -- using data already in Omega
    result = om.runtime.model('mymodel2').predict('datax')
    pred1 = result.get()
    # -- using data provided locally
    # note this is the same as
    #    om.datasets.put(X, 'foo')
    #    om.runtimes.model('mymodel2').predict('foo')
    result = om.runtime.model('mymodel2').fit(X, Y)
    result = om.runtime.model('mymodel2').predict(X)
    pred2 = result.get()
    # -- check the local data provided to fit was stored as intended
    meta = om.models.metadata('mymodel2')
    self.assertIn('metaX', meta.attributes)
    self.assertIn('metaY', meta.attributes)
    self.assertIn('_fitX', meta.attributes.get('metaX').get('collection'))
    self.assertIn('_fitY', meta.attributes.get('metaY').get('collection'))
    self.assertTrue((pred == pred1).all(),
                    "runtimes prediction is different(1)")
    self.assertTrue((pred == pred2).all(),
                    "runtimes prediction is different(2)")
def consumer(q, url):
    """consume the 'test' stream in windows of 2 into the 'consumer' dataset

    note the stream decorator blocks the consumer and runs the decorated
    function asynchronously upon the window criteria is satisfied
    """
    om = Omega(mongo_url=url)

    @streaming('test', size=2, url=url, keep=True, queue=q,
               sink=DatasetSink(om, 'consumer'))
    def myprocess(window):
        return {'myprocess': True, 'data': window.data}
def test_task_sequence(self):
    """om.runtime.sequence() runs chained tasks and returns the final result"""
    om = Omega()
    df = pd.DataFrame({'x': range(1, 10), 'y': range(5, 14)})
    lr = LinearRegression()
    om.datasets.put(df, 'sample')
    om.models.put(lr, 'regmodel')
    with om.runtime.sequence() as ctr:
        # ping first, then fit, then predict -- in order
        ctr.ping(wait=False)
        ctr.model('regmodel').fit('sample[x]', 'sample[y]')
        ctr.model('regmodel').predict('sample[x]')
        result = ctr.run()
    data = result.get()
    # y = x + 4 exactly, so the fitted line reproduces y
    assert_array_almost_equal(df['y'].values, data[:, 0])
def test_partial_fit(self):
    """partial_fit with successive mini batches monotonically reduces MSE"""
    # create some data
    x = np.array(list(range(0, 10)))
    y = x * 2
    df = pd.DataFrame({'x': x, 'y': y})
    X = df[['x']][0:2]
    Y = df[['y']][0:2]
    # put into Omega
    os.environ['DJANGO_SETTINGS_MODULE'] = ''
    om = Omega()
    om.runtime.celeryapp.conf.CELERY_ALWAYS_EAGER = True
    om.datasets.put(df[['x']], 'datax-full')
    om.datasets.put(X, 'datax')
    om.datasets.put(Y, 'datay')
    om.datasets.get('datax')
    om.datasets.get('datay')
    # create a model locally, store (unfitted) in Omega
    # -- ignore warnings on y shape
    import warnings
    warnings.filterwarnings("ignore", category=DataConversionWarning)
    lr = SGDRegressor()
    om.models.put(lr, 'mymodel2')
    # have Omega fit the model to get a start, then predict
    result = om.runtime.model('mymodel2').fit('datax', 'datay')
    result.get()
    # the 2-row fit should predict the full range poorly
    result = om.runtime.model('mymodel2').predict('datax-full')
    pred1 = result.get()
    mse = mean_squared_error(df.y, pred1)
    self.assertGreater(mse, 90)
    # fit mini batches with better training data, update model
    batch_size = 2
    # fix: was `for i, start in enumerate(range(0, len(df)))` -- the
    # enumerate index i was never used
    for start in range(0, len(df)):
        previous_mse = mse
        X = df[['x']][start:start + batch_size]
        Y = df[['y']][start:start + batch_size]
        om.datasets.put(X, 'datax-update', append=False)
        om.datasets.put(Y, 'datay-update', append=False)
        result = om.runtime.model('mymodel2').partial_fit(
            'datax-update', 'datay-update')
        result.get()
        # each batch must improve the full-range prediction
        result = om.runtime.model('mymodel2').predict('datax-full')
        pred1 = result.get()
        mse = mean_squared_error(df.y, pred1)
        self.assertLess(mse, previous_mse)
    # mse == 0 is most accurate the best
    self.assertLess(mse, 1.0)
def _get_omega_from_config(configfile, qualifier=None):
    """build an Omega instance from a yaml configuration file

    :param configfile: path to the yaml configuration file
    :param qualifier: the configuration qualifier, defaults to 'default'
    :returns: the configured Omega instance
    """
    from omegaml import Omega
    from omegaml import settings, _base_config
    defaults = settings()
    with open(configfile, 'r') as fconfig:
        configs = yaml.safe_load(fconfig)
    qualifier = qualifier or 'default'
    # the 'default' qualifier may be implicit (flat config file)
    if qualifier == 'default':
        config = configs.get(qualifier, configs)
    else:
        config = configs[qualifier]
    _base_config.update_from_dict(config, attrs=defaults)
    settings(reload=True)
    return Omega(defaults=defaults)
def get_omega_from_apikey(userid, apikey, api_url=None, requested_userid=None,
                          qualifier=None, view=False):
    """
    setup an Omega instance from userid and apikey

    :param userid: the userid
    :param apikey: the apikey
    :param api_url: the api URL
    :param requested_userid: the userid to request config for. in this case
      userid and apikey must for a staff user for the request to succeed
    :param qualifier: the database qualifier requested. defaults to 'default'
    :returns: the Omega instance configured for the given user
    """
    from omegaml import Omega
    from omegaml import settings, _base_config
    defaults = settings()
    qualifier = qualifier or 'default'
    api_url = api_url or defaults.OMEGA_RESTAPI_URL
    if api_url.startswith('http') or any('test' in v for v in sys.argv):
        # fetch the user configuration from the REST API
        api_auth = OmegaRestApiAuth(userid, apikey)
        configs = get_user_config_from_api(api_auth, api_url=api_url,
                                           requested_userid=requested_userid,
                                           view=view)
        configs = configs['objects'][0]['data']
    elif api_url == 'local':
        # use the local defaults as the configuration
        configs = {
            k: getattr(defaults, k) for k in dir(defaults)
            if k.startswith('OMEGA')
        }
    else:
        raise ValueError('invalid api_url {}'.format(api_url))
    if qualifier == 'default':
        config = configs.get(qualifier, configs)
    else:
        config = configs[qualifier]
    # fix: apply the config onto the defaults object passed to Omega below,
    # consistent with _get_omega_from_config -- without attrs=defaults the
    # user config never reached Omega(defaults=defaults)
    _base_config.update_from_dict(config, attrs=defaults)
    settings(reload=True)
    om = Omega(defaults=defaults)
    return om
def test_gridsearch(self):
    """gridsearch on the runtime stores a retrievable GridSearchCV model"""
    X, y = make_classification()
    logreg = LogisticRegression(solver='liblinear')
    os.environ['DJANGO_SETTINGS_MODULE'] = ''
    om = Omega()
    om.runtime.celeryapp.conf.CELERY_ALWAYS_EAGER = True
    om.models.put(logreg, 'logreg')
    params = {'C': [0.1, 0.5, 1.0]}
    # gridsearch on runtimes
    om.runtime.model('logreg').gridsearch(X, y, parameters=params).get()
    # check gridsearch was saved
    meta = om.models.metadata('logreg')
    self.assertIn('gridsearch', meta.attributes)
    self.assertEqual(len(meta.attributes['gridsearch']), 1)
    self.assertIn('gsModel', meta.attributes['gridsearch'][0])
    # check we can get back the gridsearch model
    gs_model = om.models.get(meta.attributes['gridsearch'][0]['gsModel'])
    self.assertIsInstance(gs_model, GridSearchCV)
def test_task_callback(self):
    """each callback-wrapped predict appends one row to 'callback_results'"""
    om = Omega()
    # deploy the demo callback package shipped with omegaml
    basepath = os.path.join(
        os.path.dirname(sys.modules['omegaml'].__file__), 'example')
    pkgpath = os.path.abspath(os.path.join(basepath, 'demo', 'callback'))
    pkgsrc = 'pkg://{}'.format(pkgpath)
    om.scripts.put(pkgsrc, 'callback')
    # fit a model locally and store it
    df = pd.DataFrame({'x': range(1, 10), 'y': range(5, 14)})
    lr = LinearRegression()
    lr.fit(df[['x']], df['y'])
    om.datasets.put(df, 'sample')
    om.models.put(lr, 'regmodel')
    # run twice; the callback records one result per run
    result = (om.runtime.callback('callback').model('regmodel')
              .predict('sample[x]').get())
    self.assertEqual(len(om.datasets.get('callback_results')), 1)
    result = (om.runtime.callback('callback').model('regmodel')
              .predict('sample[x]').get())
    self.assertEqual(len(om.datasets.get('callback_results')), 2)
def test_logging_mode(self):
    """ test task python output can be logged for all requests """
    om = Omega()
    om.logger.reset()
    # -- request logging
    om.runtime.mode(local=True, logging=True)
    om.runtime.ping(fox='bar')
    self.assertEqual(len(om.logger.dataset.get(level='INFO')), 4)
    # -- switch off logging
    om.logger.reset()
    om.runtime.mode(local=True, logging=False)
    om.runtime.ping(fox='bar')
    self.assertEqual(len(om.logger.dataset.get(level='INFO')), 0)
    # -- request specific logger
    om.logger.reset()
    om.runtime.mode(local=True, logging=('celery', 'DEBUG'))
    om.runtime.ping(fox='bar')
    self.assertEqual(len(om.logger.dataset.get(level='DEBUG')), 3)