def train_classifier(when):
    """Train a GLZ classifier on dataset1 using the given 'when' clause."""
    config = {
        "type": "classifier.train",
        "params": {
            "trainingData": {
                "select": "{* EXCLUDING (x)} as features, x as label",
                "when": when,
                "from": {"id": "dataset1"},
            },
            "configuration": {
                "glz": {
                    "type": "glz",
                    "verbosity": 3,
                    "normalize": True,
                    "regularization": 'l2',
                }
            },
            "algorithm": "glz",
            "modelFileUrl": "file://tmp/MLDB-945.tng.cls",
        },
    }
    mldb.put("/v1/procedures/tng_classif", config)
    mldb.post('/v1/procedures/tng_classif/runs')
def test_svd(self):
    """An SVD may output into a dataset whose earlier creation failed."""
    # Creating the merged dataset throws because 'not_yet_created'
    # does not exist yet.
    with self.assertRaises(ResponseException) as re:
        mldb.put("/v1/datasets/training_data", {
            "type": "merged",
            "params": {
                "datasets": [
                    {"id": "sample"},
                    {"id": "not_yet_created"}  # attention
                ]
            }
        })

    # We want to store output in 'not_yet_created'. The fact we tried
    # to access 'not_yet_created' above makes the first attempt to
    # create it fail.
    svd_config = {
        "type": "svd.train",
        "params": {
            "rowOutputDataset": "not_yet_created",  # attention
            "outputColumn": "svd.embedding.00",
            "modelFileUrl": "file://tmp/svd.bin.test.gz",
            "trainingData": "select * from sample",
            "numSingularValues": 1,
            "runOnCreation": True
        }
    }
    mldb.put("/v1/procedures/train_svd", svd_config)

    # this should now work
    mldb.get("/v1/query", q="select x from not_yet_created")
def test_mldb_put_dataset(self):
    """A unicode dataset id survives a PUT/GET round trip."""
    dataset_id = 'épopée'
    url = quote('/v1/datasets/' + dataset_id)
    mldb.log(url)
    mldb.put(url, {'type': 'sparse.mutable'})
    res = mldb.get(url).json()
    self.assertEqual(res['id'], dataset_id)
def test_query_first_row(self):
    """Pass a query straight to mongodb through a mongodb.query function.

    The result comes back formatted as an MLDB result.
    """
    mldb.put('/v1/functions/mongo_query', {
        'type' : 'mongodb.query',
        'params' : {
            'uriConnectionScheme' : self.connection_scheme,
            'collection' : 'test_collection'
        }
    })

    # first query by document type
    type_query = json.dumps({'type' : {'$eq' : 'nested_obj'}})
    res = mldb.get('/v1/functions/mongo_query/application',
                   input={'query' : type_query}).json()
    self.assertEqual(res['output']['type'], 'nested_obj')

    # then re-query by the _id returned above
    id_query = json.dumps({'_id' : res['output']['_id']})
    res = mldb.get('/v1/functions/mongo_query/application',
                   input={'query' : id_query}).json()
    self.assertEqual(res['output']['type'], 'nested_obj')
def test_onevsall_simple(self):
    """One-vs-all multilabel training on the 'trivial' dataset.

    Trains a decision tree with the one-vs-all strategy, then checks the
    per-label scores returned when classifying a single feature vector.
    """
    conf = {
        "type": "classifier.train",
        "params": {
            "trainingData": """
                select {* EXCLUDING(label0, label1)} as features,
                {label0, label1} as label
                from trivial
            """,
            "modelFileUrl": "file://build/x86_64/tmp/multilabel1.cls",
            "algorithm": "dt",
            "mode": "multilabel",
            "multilabelStrategy": "one-vs-all",
            "functionName": "classifyMe",
            "configuration": {
                "dt": {
                    "type": "decision_tree",
                    "max_depth": 8,
                    "verbosity": 0,
                    "update_alg": "gentle",
                    "random_feature_propn": 1
                }
            },
        }
    }
    mldb.put("/v1/procedures/multilabel_train", conf)
    res = mldb.query(
        "SELECT classifyMe({features : {5 as feat1, 0 as feat2}}) as *")
    # NOTE(review): the exact float scores assume training is fully
    # deterministic -- confirm reproducibility if this flakes.
    self.assertTableResultEquals(res, [
        [
            "_rowName",
            "scores.\"\"\"label0\"\"\"",
            "scores.\"\"\"label1\"\"\""
        ],
        ["result", 0.9999726414680481, 2.73847472271882e-05]])
def test_empty_json(self):
    """ Empty JSON returns the proper code """
    # Route handler: GET and DELETE return {} with 200; other verbs
    # return {} with 201.
    mldb.put(
        "/v1/plugins/mldb2114", {
            "type": "python",
            "params": {
                "source": {
                    "routes": """
if request.verb in ['GET', 'DELETE']:
    request.set_return({}, 200)
else:
    request.set_return({}, 201)
"""
                }
            }
        })
    res = mldb.get('/v1/plugins/mldb2114/routes/foo')
    self.assertEqual(res.status_code, 200)
    self.assertEqual(res.json(), {})
    res = mldb.post('/v1/plugins/mldb2114/routes/foo')
    self.assertEqual(res.status_code, 201)
    self.assertEqual(res.json(), {})
    res = mldb.put('/v1/plugins/mldb2114/routes/foo')
    self.assertEqual(res.status_code, 201)
    self.assertEqual(res.json(), {})
    res = mldb.delete('/v1/plugins/mldb2114/routes/foo')
    self.assertEqual(res.status_code, 200)
    self.assertEqual(res.json(), {})
def test_set_return_0(self):
    """A route handler returning status code 0 yields a 500 on all verbs."""
    mldb.put(
        "/v1/plugins/mldb2114", {
            "type": "python",
            "params": {
                "source": {
                    "routes": """request.set_return("", 0)"""
                }
            }
        })

    route = '/v1/plugins/mldb2114/routes/foo'
    for call in (mldb.get, mldb.post, mldb.put, mldb.delete):
        with self.assertRaises(ResponseException) as e:
            call(route)
        self.assertEqual(e.exception.response.status_code, 500)
def test_wildcard(self):
    """A label wildcard overlapping the feature columns must be rejected."""
    cls_config = {
        "my_fasttext": {
            "type": "fasttext",
            "verbosity" : 0,
            "dims" : 4,
            "epoch" : 5,
        }
    }
    train_conf = {
        "type": "classifier.train",
        "params": {
            "trainingData": "SELECT {tokens.*} as features, {tokens.*} as label FROM bag_of_words",
            "modelFileUrl": "file://tmp/src_fasttext.cls",
            "functionName" : 'myclassify',
            "algorithm": "my_fasttext",
            "mode": "multilabel",
            "runOnCreation": True,
            "configuration": cls_config
        }
    }
    with self.assertMldbRaises(expected_regexp=
            "Dataset column 'tokens.alabama' cannot be used in both label and feature because of label wildcard 'tokens'"):
        mldb.put("/v1/procedures/trainer", train_conf)
def test_do_not_run_on_creation(self):
    """Running a runOnCreation=False transform without inputData fails
    with an explicit error, whether created via PUT or POST."""
    proc_config = {
        'type' : 'transform',
        'params' : {
            'skipEmptyRows' : False,
            'runOnCreation' : False
        }
    }
    msg = 'You need to define inputData'

    # procedure created with an explicit id
    mldb.put('/v1/procedures/do_not_run_on_creation', proc_config)
    with self.assertRaisesRegex(ResponseException, msg):
        mldb.put('/v1/procedures/do_not_run_on_creation/runs/r1',
                 {'params' : {}})

    # procedure created with a generated id
    res = mldb.post('/v1/procedures', proc_config).json()
    with self.assertRaisesRegex(ResponseException, msg):
        mldb.post('/v1/procedures/{}/runs'.format(res['id']),
                  {'params' : {}})
def test_assert_full_result_equals(self):
    """Exercise assertFullResultEquals on a two-row sparse dataset."""
    url = '/v1/datasets/ds'
    mldb.put(url, {'type' : 'sparse.mutable'})
    for row_name, columns in [('row1', [['colA', 1, 0]]),
                              ('row2', [['colB', 2, 1]])]:
        mldb.post(url + '/rows',
                  {'rowName' : row_name, 'columns' : columns})
    mldb.post(url + '/commit')

    res = mldb.get(
        '/v1/query',
        q="SELECT colA, colB FROM ds ORDER BY rowName()").json()
    expected = [
        {
            'rowName' : 'row1',
            'columns' : [['colA', 1, '1970-01-01T00:00:00Z'],
                         ['colB', None, '-Inf']]
        },
        {
            'rowName' : 'row2',
            'columns' : [['colA', None, '-Inf'],
                         ['colB', 2, '1970-01-01T00:00:01Z']]
        }
    ]
    self.assertFullResultEquals(res, expected)
def setUpClass(cls):
    """Import the fasttext training CSV and build a bag-of-words dataset.

    Creates 'src_train' from the CSV file, then tokenizes the lowercased
    Body column into the 'bag_of_words' dataset used by the tests.

    Fix: setUpClass is a classmethod; its first parameter is renamed from
    'self' to 'cls', consistent with the other setUpClass in this file.
    """
    mldb.put("/v1/procedures/csv_proc", {
        "type": "import.text",
        "params": {
            'dataFileUrl' : 'file://mldb/testing/dataset/fasttext_train.csv',
            "outputDataset": {
                "id": "src_train",
            },
            "ignoreBadLines" : True,
            "allowMultiLines" : True,
            "structuredColumnNames" : True,
            "limit" : 10000,
        }
    })
    mldb.put("/v1/procedures/baggify", {
        "type": "transform",
        "params": {
            "inputData": """
                select Theme,
                tokenize(lower(Body),
                    {splitChars:' ,.:;«»[]()%!?', quoteChar:'',
                     minTokenLength: 2}) as tokens
                from src_train
            """,
            "outputDataset": {
                "id": "bag_of_words",
                "type": "sparse.mutable"
            },
        }
    })
def test_fasttext_explain(self):
    """Train a fasttext categorical classifier, then verify the output of
    classifier.explain, including the error raised for an unknown label."""
    mldb.log("explain")
    cls_config = {
        "my_fasttext": {
            "type": "fasttext",
            "verbosity" : 0,
            "dims" : 4,
            "epoch" : 5,
        }
    }
    # model file lives under the build tmp dir; NamedTemporaryFile removes
    # it automatically when the object is finalized
    tmp_file = tempfile.NamedTemporaryFile(
        prefix=os.getcwd() + '/build/x86_64/tmp/')
    mldb.put("/v1/procedures/trainer", {
        "type": "classifier.train",
        "params": {
            "trainingData": "SELECT {tokens.*} as features, Theme as label FROM bag_of_words",
            "modelFileUrl": "file:///" + tmp_file.name,
            "functionName" : 'myclassify',
            "algorithm": "my_fasttext",
            "mode": "categorical",
            "runOnCreation": True,
            "configuration": cls_config
        }
    })
    # explain function loads the model trained just above
    mldb.put("/v1/functions/explain", {
        "type": "classifier.explain",
        "params": {
            "modelFileUrl": "file:///" + tmp_file.name,
        }
    })
    res = mldb.query("""SELECT explain({features : {tokenize(lower(' hockey Alabama Futbol'),
        {splitChars:' ,.:;«»[]()%!?', quoteChar:'', minTokenLength: 2}) as tokens},
        label : 'Politique'}) as * """)
    # NOTE(review): exact float weights assume deterministic training --
    # confirm reproducibility if this flakes.
    self.assertTableResultEquals(res, [
        [
            "_rowName",
            "bias",
            "explanation.tokens.alabama",
            "explanation.tokens.futbol",
            "explanation.tokens.hockey"
        ],
        [
            "result",
            0,
            -0.006820799317210913,
            -0.07053825259208679,
            -0.08547607064247131
        ]
    ]);
    # asking to explain a label absent from the model must fail
    with self.assertRaisesRegex(ResponseException, "label not in model"):
        res = mldb.query("""SELECT explain({features : {tokenize(lower(' hockey Alabama Futbol'),
            {splitChars:' ,.:;«»[]()%!?', quoteChar:'', minTokenLength: 2}) as tokens},
            label : 'Futurama'}) as * """)
def setUpClass(cls):
    """Install the deepteach plugin from its git branch."""
    plugin_config = {
        "type": "python",
        "params": {
            "address": "git://github.com/mldbai/deepteach#ip-rndfrst-prob",
        }
    }
    mldb.put('/v1/plugins/deepteach', plugin_config)
def train_svd(when, output_index):
    """Run an SVD over svd_example with the given 'when' clause.

    Returns the number of columns of the first row of the row-output
    dataset. Bumps the global dataset_index so each run gets a fresh
    row-output dataset name.
    """
    global dataset_index
    dataset_index += 1
    procedure_url = "/v1/procedures/when_svd"
    row_output_id = "when_svd_row_" + str(dataset_index)

    config = {
        'type': 'svd.train',
        'params': {
            "trainingData": {
                "from": {"id": "svd_example"},
                "when": when
            },
            "rowOutputDataset": {
                "id": row_output_id,
                'type': "embedding"
            },
            "columnOutputDataset": {
                "id": "svd_embedding_" + str(output_index),
                "type": "embedding"
            }
        }
    }
    mldb.put(procedure_url, config)
    mldb.post(procedure_url + '/runs')

    response = mldb.get(
        '/v1/query', q="SELECT * FROM " + row_output_id).json()
    return len(response[0]["columns"])
def test_spread(self):
    """A reproducible 80/20 split spreads ds1 rows across both outputs."""
    split_config = {
        "type": "split",
        "params": {
            "labels": "SELECT * FROM ds1",
            "reproducible": True,
            "splits": [0.8, 0.2],
            "outputDatasets": [
                {"id": "ds_train", "type": "sparse.mutable"},
                {"id": "ds_test", "type": "sparse.mutable"}
            ],
        }
    }
    mldb.put("/v1/procedures/split", split_config)

    train_rows = mldb.query(
        "SELECT * FROM ds_train ORDER BY rowName() DESC")
    test_rows = mldb.query(
        "SELECT * FROM ds_test ORDER BY rowName() DESC")
    self.assertEqual(
        train_rows,
        [["_rowName", "y", "x"], ["3", 1, None], ["0", None, 1]])
    self.assertEqual(
        test_rows,
        [["_rowName", "y", "x"], ["2", 1, None], ["1", None, 1]])
def test_const_userfunction_var(self):
    """A fetcher call over row data is non-const, and a function declared
    non-deterministic is non-const even on a constant argument."""
    expected = [
        [
            '_rowName',
            'isconst',
        ],
        ['row1', False],
    ]

    mldb.put('/v1/functions/fetch', {'type': 'fetcher'})
    res = mldb.query(
        "SELECT __isconst(fetch({url: a})) as isconst FROM ds1 ORDER BY rowName()"
    )
    self.assertTableResultEquals(res, expected)

    mldb.put('/v1/functions/fetch2', {
        'type': 'fetcher',
        'deterministic': False
    })
    res = mldb.query(
        "SELECT __isconst(fetch2({url: 'itdoesntreallymatter'})) as isconst FROM ds1 ORDER BY rowName()"
    )
    self.assertTableResultEquals(res, expected)
def test_selection_of_creds(self):
    """The most specific stored credentials are picked, even when invalid."""
    # store a dummy credential for a specific path
    mldb.put(
        "/v1/credentials/badcred", {
            "store": {
                "resourceType": "aws:s3",
                "resource": "s3://dummy",
                "credential": {
                    "provider": "Credentials collection",
                    "protocol": "http",
                    "location": "s3.amazonaws.com",
                    "id": "this is my key",
                    "secret": "this is my secret"
                }
            }
        })

    csv_conf = {
        "type": "import.text",
        "params": {
            'dataFileUrl': 's3://dummy/test.csv',
            "outputDataset": {"id": "test"},
            "runOnCreation": True
        }
    }
    # this is expected to pick the most specific but invalid credentials
    with self.assertRaises(ResponseException) as re:
        mldb.put("/v1/procedures/import", csv_conf)
def test_function_creation_bug(self):
    """Train a regression dt experiment on an imported narrow dataset.

    Historically training failed to save the function yet proceeded to
    testing, where the missing function could not be found:
    1) testing should not start when function-creation fails, and the
       failure should be reported;
    2) function creation should not fail for a dt on this dataset.
    """
    import_config = {
        "type": "import.text",
        "params": {
            "dataFileUrl": "http://public.mldb.ai/narrow_test.csv.gz",
            "outputDataset": "narrow",
            "runOnCreation": True
        }
    }
    mldb.post("/v1/procedures", import_config)

    train_config = {
        "type": "classifier.experiment",
        "params": {
            "experimentName": "x",
            "inputData": "select {a} as features, b as label from narrow",
            "algorithm": "dt",
            "mode": "regression",
            "configurationFile": "./mldb/container_files/classifiers.json",
            "modelFileUrlPattern": "file://tmp/MLDB-1597-creation$runid.cls",
            "runOnCreation": True
        }
    }
    mldb.put("/v1/procedures/train", train_config)
def test_no_set_return(self):
    """A route handler that never calls set_return yields a 500 with an
    explanatory message, for every HTTP verb."""
    mldb.put(
        "/v1/plugins/mldb2114", {
            "type": "python",
            "params": {
                "source": {
                    # handler only logs; it never sets a return value
                    "routes": """
from mldb import mldb
mldb.log('no return')
"""
                }
            }
        })
    msg = "Return value is required for route handlers but not set"
    with self.assertRaisesRegex(ResponseException, msg) as e:
        mldb.get('/v1/plugins/mldb2114/routes/foo')
    self.assertEqual(e.exception.response.status_code, 500)
    with self.assertRaisesRegex(ResponseException, msg) as e:
        mldb.post('/v1/plugins/mldb2114/routes/foo')
    self.assertEqual(e.exception.response.status_code, 500)
    with self.assertRaisesRegex(ResponseException, msg) as e:
        mldb.put('/v1/plugins/mldb2114/routes/foo')
    self.assertEqual(e.exception.response.status_code, 500)
    with self.assertRaisesRegex(ResponseException, msg) as e:
        mldb.delete('/v1/plugins/mldb2114/routes/foo')
    self.assertEqual(e.exception.response.status_code, 500)
def train_svd(order_by, where, offset, limit):
    """Run an SVD over svd_example with the given query modifiers.

    Returns the number of columns of the first row of the row-output
    dataset.
    """
    procedure_url = "/v1/procedures/order_svd"
    config = {
        'type': 'svd.train',
        'params': {
            "trainingData": {
                "from": {"id": "svd_example"},
                "select": "x, y, z",
                "orderBy": order_by,
                "where": where,
                "offset": offset,
                "limit": limit
            },
            "rowOutputDataset": {
                "id": "svd_row",
                'type': "embedding"
            },
            "columnOutputDataset": {
                "id": "svd_column",
                "type": "embedding"
            }
        }
    }
    mldb.put(procedure_url, config)
    mldb.post(procedure_url + '/runs')

    rows = mldb.get('/v1/query', q="SELECT * FROM svd_row").json()
    return len(rows[0]["columns"])
def test_empty_str_json(self):
    """An empty string body with code 200 comes back intact on all verbs."""
    mldb.put(
        "/v1/plugins/mldb2114", {
            "type": "python",
            "params": {
                "source": {
                    "routes": """request.set_return("", 200)"""
                }
            }
        })

    route = '/v1/plugins/mldb2114/routes/foo'
    for call in (mldb.get, mldb.post, mldb.put, mldb.delete):
        res = call(route)
        self.assertEqual(res.status_code, 200)
        self.assertEqual(res.json(), "")
def test_long_quoted_lines(self):
    """Very long quoted CSV fields import correctly at several sizes."""
    word_counts = (50, 100, 1000, 10000)
    with open("tmp/broken_csv.csv", 'wt', encoding="utf-8") as f:
        f.write("a,b\n")
        for count in word_counts:
            f.write("1,\"" + " ".join(["word " for x in range(count)])
                    + "\"\n")

    csv_conf = {
        "type": "import.text",
        "params": {
            'dataFileUrl' : 'file://tmp/broken_csv.csv',
            "outputDataset": {
                "id": "x",
            },
            "runOnCreation": True,
            "ignoreBadLines": False
        }
    }
    mldb.put("/v1/procedures/csv_proc", csv_conf)

    result = mldb.get(
        "/v1/query",
        q="select tokenize(b, {splitChars: ' '}) as cnt "
          "from x order by rowName() ASC")
    js_rez = result.json()
    mldb.log(js_rez)

    # row names start at 2 because row 1 is the header
    answers = {"2": 50, "3": 100, "4": 1000, "5": 10000}
    for row in js_rez:
        self.assertEqual(answers[row["rowName"]], row["columns"][0][1])
def test_dataset(self):
    """A read-only mongodb.dataset supports MLDB queries over a collection."""
    mldb.put('/v1/datasets/ds', {
        'type' : 'mongodb.dataset',
        'params' : {
            'uriConnectionScheme' : self.connection_scheme,
            'collection' : 'test_collection',
        }
    })

    res = mldb.query("SELECT * FROM ds")
    self.assertEqual(len(res), 5)

    res = mldb.query("SELECT * FROM ds WHERE unexisting_field='Finch'")
    self.assertEqual(len(res), 1)

    res = mldb.query("SELECT * FROM ds WHERE type='simple'")
    self.assertEqual(len(res), 2)
    self.assertEqual(res[1][2], 'simple')

    res = mldb.query("SELECT type FROM ds ORDER BY type")
    for row_idx, value in [(1, None), (2, 'nested_arr'),
                           (3, 'nested_obj'), (4, 'simple')]:
        self.assertEqual(res[row_idx][1], value)

    res = mldb.query("SELECT username FROM ds WHERE unexisting != 'Finch'")
    self.assertEqual(len(res), 1)

    res = mldb.query("SELECT username FROM ds WHERE type != 'simple'")
    self.assertEqual(len(res), 3)
def test_no_cls_write_perms(self):
    """Training fails cleanly when the model file cannot be written."""
    conf = {
        "type": "classifier.experiment",
        "params": {
            "experimentName": "my_test_no_write",
            "inputData": "select {* EXCLUDING(label)} as features, label from toy",
            "kfold": 2,
            # root-level path, not writable from the test environment
            "modelFileUrlPattern": "file:///bouya-$runid.cls",
            "algorithm": "glz",
            "mode": "boolean",
            "configuration": {
                "glz": {
                    "type": "glz",
                    "verbosity": 3,
                    "normalize": False,
                    "regularization": 'l2'
                }
            },
            "outputAccuracyDataset": False,
            "runOnCreation": True
        }
    }
    with self.assertRaisesRegex(ResponseException, 'Error when trying'):
        mldb.put("/v1/procedures/rocket_science", conf)
def test_it(self):
    """Bucketize preserves the original timestamps in its output."""
    url = '/v1/datasets/input'
    mldb.put(url, {'type' : 'sparse.mutable'})
    for row_name, columns in [('row1', [['score', 5, 6]]),
                              ('row2', [['score', 1, 5]])]:
        mldb.post(url + '/rows',
                  {'rowName' : row_name, 'columns' : columns})
    mldb.post(url + '/commit', {})

    mldb.post('/v1/procedures', {
        'type' : 'bucketize',
        'params' : {
            'inputData' : 'SELECT * FROM input ORDER BY score',
            'outputDataset' : {
                'id' : 'output',
                'type' : 'sparse.mutable'
            },
            'percentileBuckets': {'b1': [0, 50], 'b2': [50, 100]},
            'runOnCreation' : True
        }
    })

    res = mldb.query('SELECT latest_timestamp({*}) FROM output')
    # 6 is the latest timestamp recorded above
    self.assertEqual(res[1][1], '1970-01-01T00:00:06Z')
def test_record_null_row_name(self):
    """Recording a row with a null rowName must be rejected."""
    mldb.put('/v1/datasets/ds_null', {'type': 'sparse.mutable'})
    payload = {
        'rowName': None,
        'columns': [['colA', 1, 1]]
    }
    with self.assertRaises(ResponseException):  # noqa
        mldb.post('/v1/datasets/ds_null/rows', payload)
def run_MLDBFB_545_with_ds_type(self, ds_type):
    """Reproduce MLDBFB-545: a WHERE over a merged dataset must return the
    same result as the identical query over the underlying dataset.

    NOTE(review): both queries are expected to return only the header row
    (len == 1), since no recorded row has converted='c'.
    """
    id1 = ds_type + 'mldbfb545_1'
    ds = mldb.create_dataset({'id': id1, 'type': ds_type + '.mutable'})
    ds.record_row('user1', [['converted', 'n', 0]])
    ds.commit()

    id2 = ds_type + 'mldbfb545_2'
    ds = mldb.create_dataset({'id': id2, 'type': ds_type + '.mutable'})
    ds.record_row('user2', [['blah', 'blah', 0]])
    ds.commit()

    # query directly on the dataset works
    res = mldb.query("""
        SELECT * FROM {} WHERE converted='c' LIMIT 1
    """.format(id1))
    self.assertEqual(len(res), 1)

    merge_id = ds_type + 'mldbfb545merged'
    mldb.put("/v1/datasets/" + merge_id, {
        "type": "merged",
        "params": {
            "datasets": [{
                "id": id1
            }, {
                "id": id2
            }]
        }
    })

    # query on the merged dataset yields incorrect results
    res = mldb.query("""
        SELECT * FROM {} WHERE converted='c' LIMIT 1
    """.format(merge_id))
    mldb.log(res)
    self.assertEqual(len(res), 1)
def test_too_many_requested_rows(self):
    """Oversampling requires withReplacement, and the sampled dataset is
    read-only."""
    sampled_dataset_conf = {
        "type": "sampled",
        "params": {
            "dataset": {"id": "toy"},
            "rows": 25000,
            "withReplacement": False
        }
    }

    # too many requested rows without sampling
    with self.assertRaises(ResponseException) as re:
        mldb.put("/v1/datasets/patate", sampled_dataset_conf)
    self.assertEqual(re.exception.response.status_code, 400)

    # with replacement the same request succeeds
    sampled_dataset_conf["params"]["withReplacement"] = True
    mldb.put("/v1/datasets/patate", sampled_dataset_conf)

    # try to insert and make sure we get an exception
    with self.assertRaises(ResponseException) as re:
        mldb.post("/v1/datasets/patate/rows", {
            "rowName": "patato",
            "columns": [["a", "b", 0]]
        })
    self.assertEqual(re.exception.response.status_code, 400)
def test_name_with_space(self):
    """A dataset id containing a space survives a PUT/GET round trip."""
    dataset_id = "name with space"
    url = '/v1/datasets/' + quote(dataset_id)
    mldb.log(url)
    mldb.put(url, {'type': 'sparse.mutable'})
    res = mldb.get(url).json()
    self.assertEqual(res['id'], dataset_id)
def train_tsne(when):
    """Train a t-SNE embedding over svd_example with the given 'when'
    clause.

    Returns the number of columns of the first embedded row. Bumps the
    global dataset_index so each run gets a fresh output dataset name.
    """
    global dataset_index
    dataset_index += 1
    procedure_url = "/v1/procedures/when_tsne"
    output_id = "tsne_embedding_" + str(dataset_index)

    config = {
        'type': 'tsne.train',
        'params': {
            "trainingData": {
                "from": {"id": "svd_example"},
                "when": when
            },
            "rowOutputDataset": {
                "id": output_id,
                'type': "embedding"
            }
        }
    }
    mldb.put(procedure_url, config)
    mldb.post(procedure_url + '/runs')

    rows = mldb.get('/v1/query', q="SELECT * FROM " + output_id).json()
    return len(rows[0]["columns"])