def train_classifier(when):
    """Create and run a GLZ classifier training procedure on dataset1,
    restricted by the given WHEN clause."""
    config = {
        "type": "classifier.train",
        "params": {
            "trainingData": {
                "select": "{* EXCLUDING (x)} as features, x as label",
                "when": when,
                "from": {"id": "dataset1"}
            },
            "configuration": {
                "glz": {
                    "type": "glz",
                    "verbosity": 3,
                    "normalize": True,
                    "regularization": 'l2'
                }
            },
            "algorithm": "glz",
            "modelFileUrl": "file://tmp/MLDB-945.tng.cls"
        }
    }
    mldb.put("/v1/procedures/tng_classif", config)
    mldb.post('/v1/procedures/tng_classif/runs')
def train_svd(when, output_index):
    """Run an SVD over svd_example with the given WHEN clause.

    Bumps the module-level dataset_index so each run writes to a fresh
    row-output dataset; returns the number of columns in that output.
    """
    global dataset_index
    dataset_index += 1
    proc_url = "/v1/procedures/when_svd"
    mldb.put(proc_url, {
        'type': 'svd.train',
        'params': {
            "trainingData": {
                "from": {"id": "svd_example"},
                "when": when
            },
            "rowOutputDataset": {
                "id": "when_svd_row_" + str(dataset_index),
                'type': "embedding"
            },
            "columnOutputDataset": {
                "id": "svd_embedding_" + str(output_index),
                "type": "embedding"
            }
        }
    })
    mldb.post(proc_url + '/runs')
    rows = mldb.get(
        '/v1/query',
        q="SELECT * FROM when_svd_row_" + str(dataset_index)).json()
    return len(rows[0]["columns"])
def train_tsne(when):
    """Run a t-SNE over svd_example with the given WHEN clause and
    return the number of columns in the resulting embedding."""
    global dataset_index
    dataset_index += 1
    proc_url = "/v1/procedures/when_tsne"
    mldb.put(proc_url, {
        'type': 'tsne.train',
        'params': {
            "trainingData": {
                "from": {"id": "svd_example"},
                "when": when
            },
            "rowOutputDataset": {
                "id": "tsne_embedding_" + str(dataset_index),
                'type': "embedding"
            }
        }
    })
    mldb.post(proc_url + '/runs')
    rows = mldb.get(
        '/v1/query',
        q="SELECT * FROM tsne_embedding_" + str(dataset_index)).json()
    return len(rows[0]["columns"])
def test_most_frequent(self):
    """summary.statistics must report the most frequent items of a
    categorical column, keeping counts for the top values.

    Fixes: removed the unused local ``row_num`` and replaced the
    hand-rolled ``Counter`` iterator class with ``itertools.count``.
    """
    import itertools

    ds = mldb.create_dataset({
        'id': 'most_freq_source',
        'type': 'sparse.mutable'
    })
    # value -> number of rows that record it
    vals = {
        'a': 5, 'b': 4, 'c': 3, 'd': 2,
        'e': 1, 'f': 1, 'g': 1, 'h': 1, 'i': 1,
        'j': 1, 'k': 1, 'l': 1, 'm': 1,
    }
    row_ids = itertools.count(1)  # row names 1, 2, 3, ... as before
    for k, count in vals.items():
        for _ in range(count):
            ds.record_row(next(row_ids), [['col', k, 0]])
    ds.commit()

    mldb.post('/v1/procedures', {
        'type': 'summary.statistics',
        'params': {
            'runOnCreation': True,
            'inputData': "SELECT * FROM most_freq_source",
            'outputDataset': {
                'id': 'most_freq_output',
                'type': 'sparse.mutable'
            }
        }
    })
    res = mldb.query("SELECT * FROM most_freq_output ORDER BY rowName()")
    self.assertTableResultEquals(res, [
        ["_rowName", "value.data_type",
         "value.most_frequent_items.a", "value.most_frequent_items.b",
         "value.most_frequent_items.c", "value.most_frequent_items.d",
         "value.most_frequent_items.h", "value.most_frequent_items.i",
         "value.most_frequent_items.j", "value.most_frequent_items.k",
         "value.most_frequent_items.l", "value.most_frequent_items.m",
         "value.num_null", "value.num_unique"],
        ["col", "categorical", 5, 4, 3, 2, 1, 1, 1, 1, 1, 1, 0, 13]])
def test_string_over_null(self):
    """ The primary test. The issue column, which was always null for
    training, is now having a string values for testing. """
    ds = mldb.create_dataset({
        'id': 'test_string_over_null_ds',
        'type': 'sparse.mutable'
    })
    for row_idx in range(10):
        cells = [
            ['line', row_idx, 0],
            ['label', 0, 0],
            ['feature', random.random() + 0.6, 0],
            ['noise', random.random(), 0],
            ['issue', 'STRING', 0],
        ]
        ds.record_row('row{}'.format(row_idx), cells)
    ds.commit()

    mldb.post('/v1/procedures', {
        'type': 'classifier.test',
        'params': {
            'mode': 'boolean',
            'testingData':
                'SELECT score_it({features: {feature, noise, issue}})[score] AS score, '
                'label FROM test_string_over_null_ds'
        }
    })
def test_record_null_row_name(self):
    """Recording a row whose rowName is null must be rejected."""
    mldb.put('/v1/datasets/ds_null', {'type': 'sparse.mutable'})
    payload = {
        'rowName': None,
        'columns': [['colA', 1, 1]]
    }
    with self.assertRaises(ResponseException):  # noqa
        mldb.post('/v1/datasets/ds_null/rows', payload)
def test_set_return_0(self):
    """A route handler calling set_return("", 0) must produce a 500
    on every HTTP verb.

    Fixes: the four verb checks were copy-pasted; they are now a
    single loop over the client methods, keeping assertions identical.
    """
    mldb.put(
        "/v1/plugins/mldb2114", {
            "type": "python",
            "params": {
                "source": {
                    "routes": """request.set_return("", 0)"""
                }
            }
        })
    # Identical expectation for GET/POST/PUT/DELETE.
    for method in (mldb.get, mldb.post, mldb.put, mldb.delete):
        with self.assertRaises(ResponseException) as e:
            method('/v1/plugins/mldb2114/routes/foo')
        self.assertEqual(e.exception.response.status_code, 500)
def test_no_set_return(self):
    """A route handler that never sets a return value must fail with a
    500 and an explanatory message, on every HTTP verb.

    Fixes: the four verb checks were copy-pasted; they are now a
    single loop over the client methods, keeping assertions identical.
    """
    mldb.put(
        "/v1/plugins/mldb2114", {
            "type": "python",
            "params": {
                "source": {
                    "routes": """
from mldb import mldb
mldb.log('no return')
"""
                }
            }
        })
    msg = "Return value is required for route handlers but not set"
    # Identical expectation for GET/POST/PUT/DELETE.
    for method in (mldb.get, mldb.post, mldb.put, mldb.delete):
        with self.assertRaisesRegex(ResponseException, msg) as e:
            method('/v1/plugins/mldb2114/routes/foo')
        self.assertEqual(e.exception.response.status_code, 500)
def test_dottest_col_names(self):
    """summary.statistics must handle column names containing dots."""
    ds = mldb.create_dataset({
        'id': 'dotted_col_ds',
        'type': 'sparse.mutable'
    })
    ds.record_row('row1', [['col.a', 1, 0]])
    ds.commit()

    mldb.post('/v1/procedures', {
        'type': 'summary.statistics',
        'params': {
            'runOnCreation': True,
            'inputData': 'SELECT * FROM dotted_col_ds',
            'outputDataset': {
                'id': 'output_dotted_col_ds',
                'type': 'sparse.mutable'
            }
        }
    })

    expected_header = [
        "_rowName", "value.1st_quartile", "value.3rd_quartile",
        "value.data_type", "value.max", "value.avg", "value.median",
        "value.min", "value.most_frequent_items.1", "value.num_null",
        "value.num_unique", "value.stddev"
    ]
    expected_row = ['"col.a"', 1, 1, "number", 1, 1, 1, 1, 1, 0, 1, "NaN"]
    res = mldb.query("SELECT * FROM output_dotted_col_ds")
    self.assertTableResultEquals(res, [expected_header, expected_row])
def insert_with_ts(ts):
    """Insert row1 into dataset ds with colA=1 at the given timestamp."""
    payload = {
        'rowName': 'row1',
        'columns': [['colA', 1, ts]],
    }
    mldb.post('/v1/datasets/ds/rows', payload)
def test_import_missing_param(self):
    """mongodb.import must reject configs missing required parameters."""
    out_ds = {'id': 'out', 'type': 'sparse.mutable'}

    # Missing uriConnectionScheme
    with self.assertRaisesRegex(
            ResponseException,
            'uriConnectionScheme is a required property'):
        mldb.post('/v1/procedures', {
            'type': 'mongodb.import',
            'params': {
                'collection': 'users',
                'outputDataset': out_ds
            }
        })

    # Missing collection
    with self.assertRaisesRegex(
            ResponseException,
            'collection is a required property and must not be empty'):
        mldb.post('/v1/procedures', {
            'type': 'mongodb.import',
            'params': {
                'uriConnectionScheme': 'mongodb://localhost:27017/tutorial',
                'outputDataset': out_ds
            }
        })
def test_rowHash(self):
    """Importing with a rowHash() filter must keep roughly a third of
    the rows; importing without it keeps them all."""
    csv_url = ("https://raw.githubusercontent.com/datacratic/"
               "mldb-pytanic-plugin/master/titanic_train.csv")

    def import_csv(output, extra_params):
        params = {
            "dataFileUrl": csv_url,
            'outputDataset': output,
            'runOnCreation': True,
        }
        params.update(extra_params)
        mldb.post('/v1/procedures', {'type': 'import.text',
                                     'params': params})

    import_csv("titanic_hashed", {"where": "rowHash() % 3 = 0"})
    import_csv("titanic_no_hashed", {})

    self.assertTableResultEquals(
        mldb.query("select count(*) from titanic_hashed"),
        [["_rowName", "count(*)"], ["[]", 287]])
    self.assertTableResultEquals(
        mldb.query("select count(*) from titanic_no_hashed"),
        [["_rowName", "count(*)"], ["[]", 891]])
def test_function_creation_bug(self):
    """Regression test around function creation during an experiment."""
    mldb.post("/v1/procedures", {
        "type": "import.text",
        "params": {
            "dataFileUrl": "http://public.mldb.ai/narrow_test.csv.gz",
            "outputDataset": "narrow",
            "runOnCreation": True
        }
    })

    # it seems that the training fails to save the function but we
    # proceed to testing where we try to use the function but then
    # can't find it
    # 1) we should not move to testing if function-creation fails
    #    we should report that function-creation failed
    # 2) function creation should not fail for a dt on this dataset
    mldb.put("/v1/procedures/train", {
        "type": "classifier.experiment",
        "params": {
            "experimentName": "x",
            "inputData": "select {a} as features, b as label from narrow",
            "algorithm": "dt",
            "mode": "regression",
            "configurationFile":
                "./mldb/container_files/classifiers.json",
            "modelFileUrlPattern":
                "file://tmp/MLDB-1597-creation$runid.cls",
            "runOnCreation": True
        }
    })
def train_svd(order_by, where, offset, limit):
    """Run an SVD over svd_example constrained by the given ORDER BY /
    WHERE / OFFSET / LIMIT; return the row-output column count."""
    proc_url = "/v1/procedures/order_svd"
    training_data = {
        "from": {"id": "svd_example"},
        "select": "x, y, z",
        "orderBy": order_by,
        "where": where,
        "offset": offset,
        "limit": limit,
    }
    mldb.put(proc_url, {
        'type': 'svd.train',
        'params': {
            "trainingData": training_data,
            "rowOutputDataset": {"id": "svd_row", 'type': "embedding"},
            "columnOutputDataset": {"id": "svd_column",
                                    "type": "embedding"}
        }
    })
    mldb.post(proc_url + '/runs')
    rows = mldb.get('/v1/query', q="SELECT * FROM svd_row").json()
    return len(rows[0]["columns"])
def test_do_not_run_on_creation(self):
    """A transform created with runOnCreation=False must only fail
    (missing inputData) when a run is actually requested — for both
    PUT-named and POST-anonymous procedures."""
    lazy_params = {
        'type': 'transform',
        'params': {
            'skipEmptyRows': False,
            'runOnCreation': False
        }
    }
    msg = 'You need to define inputData'

    # Named procedure via PUT: creation succeeds, run fails.
    mldb.put('/v1/procedures/do_not_run_on_creation', lazy_params)
    with self.assertRaisesRegex(ResponseException, msg):
        mldb.put('/v1/procedures/do_not_run_on_creation/runs/r1',
                 {'params': {}})

    # Anonymous procedure via POST: same behavior.
    created = mldb.post('/v1/procedures', lazy_params).json()
    with self.assertRaisesRegex(ResponseException, msg):
        mldb.post('/v1/procedures/{}/runs'.format(created['id']),
                  {'params': {}})
def test_join_with_and(self):
    """A join with an AND condition should produce a dataset of the
    same size as the equivalent join in ds_train."""
    baseline = mldb.query('select * from ds_train')
    mldb.log(baseline)

    mldb.post("/v1/procedures", {
        "type": "transform",
        "params": {
            "inputData": """
                select * from ds
                left join ds_stats
                on (ds.dow=ds_stats.dow and ds.a_int=ds_stats.a_int)
                limit 10
            """,
            "outputDataset": {
                "id": "ds_train2",
                "type": "tabular",
                "params": {"unknownColumns": "add"}
            },
            "runOnCreation": True
        }
    })
    joined = mldb.query('select * from ds_train2')
    mldb.log(joined)

    # equivalent join conditions should be returning the same dataset
    # this is a very weak check because the columns and the row ordering
    # of these two equivalent joins are currently very different
    self.assertEqual(len(baseline), len(joined),
                     'expected response sizes to match')
def setUpClass(cls):
    """Build a tiny labelled dataset and train a GLZ classifier on it."""
    ds = mldb.create_dataset({'id': 'ds', 'type': 'sparse.mutable'})
    rows = [
        ('row1', 1, 1, 1),
        ('row2', 0, 1, 0),
        ('row3', 0, 0, 0),
    ]
    for name, label, feat1, feat2 in rows:
        ds.record_row(name, [['label', label, 12],
                             ['feat1', feat1, 0],
                             ['feat2', feat2, 0]])
    ds.commit()

    mldb.post('/v1/procedures', {
        'type': 'classifier.train',
        'params': {
            'runOnCreation': True,
            "mode": "boolean",
            'algorithm': 'glz',
            "configuration": {
                "glz": {
                    "type": "glz",
                    "verbosity": 3,
                    "normalize": False,
                    "regularization": 'l2'
                }
            },
            'trainingData': """
                SELECT {* EXCLUDING(label)} AS features, label FROM ds
            """,
            "modelFileUrl": "file://build/x86_64/tmp/fmlhTODO.cls",
        }
    })
def test_c_over_a_or_b(self):
    """ This is an alternate test of unknown values. The model was
    built with column a_or_b having always a value of either a or b.
    Here, we test with a dataset having always the "never seen value"
    of c for that column. """
    ds = mldb.create_dataset({
        'id': 'test_c_over_a_or_b_ds',
        'type': 'sparse.mutable'
    })
    for row_idx in range(10):
        cells = [
            ['line', row_idx, 0],
            ['label', 0, 0],
            ['feature', random.random() + 0.6, 0],
            ['noise', random.random(), 0],
            ['a_or_b', 'c', 0],
        ]
        ds.record_row('row{}'.format(row_idx), cells)
    ds.commit()

    mldb.post('/v1/procedures', {
        'type': 'classifier.test',
        'params': {
            'mode': 'boolean',
            'testingData':
                'SELECT score_it({features: {feature, noise, a_or_b}})[score] AS score, '
                'label FROM test_c_over_a_or_b_ds'
        }
    })
def test_too_many_requested_rows(self):
    """Requesting more rows than the source has must fail without
    replacement, succeed with replacement, and the sampled dataset
    must stay read-only."""
    # too many requested rows without sampling
    conf = {
        "type": "sampled",
        "params": {
            "dataset": {"id": "toy"},
            "rows": 25000,
            "withReplacement": False
        }
    }
    with self.assertRaises(ResponseException) as re:
        mldb.put("/v1/datasets/patate", conf)
    self.assertEqual(re.exception.response.status_code, 400)

    # with replacement the oversampling is allowed
    conf["params"]["withReplacement"] = True
    mldb.put("/v1/datasets/patate", conf)

    # try to insert and make sure we get an exception
    with self.assertRaises(ResponseException) as re:
        mldb.post("/v1/datasets/patate/rows",
                  {"rowName": "patato", "columns": [["a", "b", 0]]})
    self.assertEqual(re.exception.response.status_code, 400)
def setUpClass(cls):
    """Import the enron corpus, define a bag-of-words function, and
    transform the messages into a feature dataset."""
    # Import only the first enron dataset, naming rows predictably.
    mldb.post('/v1/procedures', {
        'type': 'import.text',
        'params': {
            'dataFileUrl': 'http://public.mldb.ai/datasets/enron.csv.gz',
            'outputDataset': 'enron_data',
            'named': "'enron_' + dataset + '_mail_' + index",
            'where': 'dataset = 1'
        }
    })

    # Tokenizer function producing a bag of words per message.
    mldb.put('/v1/functions/bow', {
        'type': 'sql.expression',
        'params': {
            'expression': """
                tokenize(msg, {splitChars: ' :.-!?''"()[],', quoteChar: ''}) as bow
            """
        }
    })

    # Materialize the features plus a boolean spam label.
    mldb.post('/v1/procedures', {
        'type': 'transform',
        'params': {
            'inputData': """
                select bow({msg})[bow] as *, label = 'spam' as message_is_spam
                from enron_data
            """,
            'outputDataset': 'enron_features'
        }
    })
def test_delete(self):
    """Create, commit and then delete a dataset via REST."""
    # this test depends on put and post
    ds_url = '/v1/datasets/ds'
    mldb.put(ds_url, {'type': 'sparse.mutable'})
    mldb.post(ds_url + '/commit')
    mldb.delete(ds_url)
def test_offset_not_accepted(self):
    """An OFFSET clause must be rejected whether it appears in
    inputData or in testingDataOverride."""
    base_sql = ("SELECT {* EXCLUDING(label)} AS features, label "
                "FROM toy")
    offset_sql = base_sql + " OFFSET 1"

    # Two variants differing only in where the OFFSET shows up.
    variants = [
        {"inputData": offset_sql},
        {"inputData": base_sql, "testingDataOverride": offset_sql},
    ]
    for extra in variants:
        params = {
            'runOnCreation': True,
            "experimentName": "test_no_test_data",
            "modelFileUrlPattern":
                "file://build/x86_64/tmp/test_no_test_data_$runid.cls",
            "algorithm": "glz",
            "equalizationFactor": 0.5,
            "mode": "boolean",
            "configuration": {
                "glz": {
                    "type": "glz",
                    "verbosity": 3,
                    "normalize": False,
                    "regularization": 'l2'
                }
            },
            "outputAccuracyDataset": False
        }
        params.update(extra)
        with self.assertRaises(ResponseException):
            mldb.post("/v1/procedures",
                      {"type": "classifier.experiment",
                       "params": params})
def test_post_empty_row_name(self):
    """An empty-string rowName is valid and queries back as ''."""
    mldb.put('/v1/datasets/empty_row_name', {'type': 'sparse.mutable'})
    mldb.post('/v1/datasets/empty_row_name/rows', {
        'rowName': '',
        'columns': [['col', 'val', 0]]
    })
    mldb.post('/v1/datasets/empty_row_name/commit')
    res = mldb.query("SELECT * FROM empty_row_name")
    self.assertTableResultEquals(
        res,
        [['_rowName', 'col'],
         ['""', "val"]])
def setUpClass(cls):
    """Register the classifier.explain function backed by the MNIST
    GLZ categorical model fixture."""
    explain_conf = {
        "id": "explain",
        "type": "classifier.explain",
        "params": {
            "modelFileUrl":
                "file://mldb/testing/fixtures/mnist_glz_categorical.cls"
        }
    }
    mldb.post("/v1/functions", explain_conf)
def test_post_row_name_none(self):
    """Posting a row with rowName=None must be rejected."""
    mldb.put('/v1/datasets/ds2', {'type': 'sparse.mutable'})
    bad_row = {
        'rowName': None,  # should not work
        'columns': [['colA', 1, 1]]
    }
    with self.assertRaises(ResponseException):  # noqa
        mldb.post('/v1/datasets/ds2/rows', bad_row)
def test_post_no_row_name(self):
    """Posting a row without any rowName key must be rejected."""
    mldb.put('/v1/datasets/ds3', {'type': 'sparse.mutable'})
    # rowName missing -> should not work
    bad_row = {'columns': [['colA', 1, 1]]}
    with self.assertRaises(ResponseException):  # noqa
        mldb.post('/v1/datasets/ds3/rows', bad_row)
def test_post(self):
    """Anonymous dataset creation via POST, then row insert and commit."""
    created = mldb.post("/v1/datasets", {'type': 'sparse.mutable'})
    ds_url = '/v1/datasets/{}'.format(created.json()['id'])
    mldb.post(ds_url + '/rows', {
        'rowName': 'row1',
        'columns': [['colA', 1, 0]]
    })
    mldb.post(ds_url + '/commit')
def test_query(self):
    """Insert one row into a mutable dataset and query it back."""
    ds_url = '/v1/datasets/ds'
    mldb.put(ds_url, {'type': 'sparse.mutable'})
    mldb.post(ds_url + '/rows', {
        'rowName': 'row1',
        'columns': [['colA', 1, 0]]
    })
    mldb.post(ds_url + '/commit')

    res = mldb.query('SELECT * FROM ds')
    self.assertEqual(res, [['_rowName', 'colA'],
                           ['row1', 1]])
def run_proc(input_data):
    """Run a summary.statistics procedure over input_data, writing to
    the 'error' sparse dataset."""
    proc_conf = {
        'type': 'summary.statistics',
        'params': {
            'runOnCreation': True,
            'inputData': input_data,
            'outputDataset': {
                'id': 'error',
                'type': 'sparse.mutable'
            }
        }
    }
    mldb.post('/v1/procedures', proc_conf)
def do_it(self, ds_type):
    """Transform the class-level query into a dataset of the given
    type, using the type name as the dataset id."""
    transform_conf = {
        "type": "transform",
        "params": {
            "inputData": self.__class__.query,
            "outputDataset": {"id": ds_type, "type": ds_type},
            'runOnCreation': True
        }
    }
    mldb.post("/v1/procedures", transform_conf)