Пример #1
0
def train_classifier(when):
    """Create the ``tng_classif`` classifier training procedure and run it.

    Args:
        when: value forwarded as the ``when`` clause of the training data.
    """
    glz_settings = {
        "type": "glz",
        "verbosity": 3,
        "normalize": True,
        "regularization": 'l2'
    }
    training_data = {
        "select": "{* EXCLUDING (x)} as features, x as label",
        "when": when,
        "from": {"id": "dataset1"}
    }
    procedure_config = {
        "type": "classifier.train",
        "params": {
            "trainingData": training_data,
            "configuration": {"glz": glz_settings},
            "algorithm": "glz",
            "modelFileUrl": "file://tmp/MLDB-945.tng.cls"
        }
    }

    # Register the procedure first, then trigger one run of it.
    mldb.put("/v1/procedures/tng_classif", procedure_config)
    mldb.post('/v1/procedures/tng_classif/runs')
Пример #2
0
def train_svd(when, output_index):
    """Train an SVD restricted by ``when`` and report the row output width.

    Increments the module-level ``dataset_index`` so that each call writes
    its row output to a new dataset id.

    Args:
        when: value forwarded as the ``when`` clause of the training data.
        output_index: suffix appended to the column output dataset id.

    Returns:
        The length of the ``columns`` entry of the first row returned when
        querying the row output dataset.
    """
    global dataset_index
    dataset_index += 1

    row_dataset = "when_svd_row_" + str(dataset_index)
    column_dataset = "svd_embedding_" + str(output_index)
    procedure_url = "/v1/procedures/when_svd"

    # svd procedure configuration
    mldb.put(procedure_url, {
        'type': 'svd.train',
        'params': {
            "trainingData": {"from": {"id": "svd_example"}, "when": when},
            "rowOutputDataset": {"id": row_dataset, 'type': "embedding"},
            "columnOutputDataset": {"id": column_dataset, "type": "embedding"}
        }
    })
    mldb.post(procedure_url + '/runs')

    rows = mldb.get('/v1/query', q="SELECT * FROM " + row_dataset).json()
    return len(rows[0]["columns"])
Пример #3
0
def train_tsne(when):
    """Train a t-SNE procedure restricted by ``when`` and report its width.

    Increments the module-level ``dataset_index`` so that each call writes
    to a new output dataset id.

    Args:
        when: value forwarded as the ``when`` clause of the training data.

    Returns:
        The length of the ``columns`` entry of the first row returned when
        querying the output dataset.
    """
    global dataset_index
    dataset_index += 1

    output_dataset = "tsne_embedding_" + str(dataset_index)
    procedure_url = "/v1/procedures/when_tsne"

    # t-sne procedure configuration
    mldb.put(procedure_url, {
        'type': 'tsne.train',
        'params': {
            "trainingData": {"from": {"id": "svd_example"}, "when": when},
            "rowOutputDataset": {"id": output_dataset, 'type': "embedding"}
        }
    })
    mldb.post(procedure_url + '/runs')

    rows = mldb.get('/v1/query', q="SELECT * FROM " + output_dataset).json()
    return len(rows[0]["columns"])
Пример #4
0
    def test_most_frequent(self):
        """Check the most-frequent-items output of ``summary.statistics``.

        Records a single column ``col`` in which ``'a'`` appears five times,
        ``'b'`` four, ``'c'`` three, ``'d'`` twice and ``'e'`` through
        ``'m'`` once each, runs the procedure over that dataset and compares
        the output dataset with the expected table.
        """
        import itertools

        ds = mldb.create_dataset({
            'id': 'most_freq_source',
            'type': 'sparse.mutable'
        })

        # value -> number of rows that carry it
        vals = {
            'a': 5,
            'b': 4,
            'c': 3,
            'd': 2,
            'e': 1,
            'f': 1,
            'g': 1,
            'h': 1,
            'i': 1,
            'j': 1,
            'k': 1,
            'l': 1,
            'm': 1,
        }
        # Row names are consecutive integers starting at 1.
        row_names = itertools.count(1)
        for k, count in vals.items():
            for _ in range(count):
                ds.record_row(next(row_names), [['col', k, 0]])

        ds.commit()

        mldb.post(
            '/v1/procedures', {
                'type': 'summary.statistics',
                'params': {
                    'runOnCreation': True,
                    'inputData': "SELECT * FROM most_freq_source",
                    'outputDataset': {
                        'id': 'most_freq_output',
                        'type': 'sparse.mutable'
                    }
                }
            })

        res = mldb.query("SELECT * FROM most_freq_output ORDER BY rowName()")
        self.assertTableResultEquals(res, [[
            "_rowName", "value.data_type", "value.most_frequent_items.a",
            "value.most_frequent_items.b", "value.most_frequent_items.c",
            "value.most_frequent_items.d", "value.most_frequent_items.h",
            "value.most_frequent_items.i", "value.most_frequent_items.j",
            "value.most_frequent_items.k", "value.most_frequent_items.l",
            "value.most_frequent_items.m", "value.num_null", "value.num_unique"
        ], ["col", "categorical", 5, 4, 3, 2, 1, 1, 1, 1, 1, 1, 0, 13]])
    def test_string_over_null(self):
        """
        The primary test.
        The ``issue`` column, which was always null during training, now
        holds string values at testing time.
        """
        dataset = mldb.create_dataset({
            'id': 'test_string_over_null_ds',
            'type': 'sparse.mutable'
        })
        for idx in range(10):
            # Draw order matters for reproducibility: feature, then noise.
            feature = random.random() + 0.6
            noise = random.random()
            cells = [
                ['line', idx, 0],
                ['label', 0, 0],
                ['feature', feature, 0],
                ['noise', noise, 0],
                ['issue', 'STRING', 0],
            ]
            dataset.record_row('row{}'.format(idx), cells)
        dataset.commit()

        testing_query = (
            'SELECT score_it({features: {feature, noise, issue}})[score] AS score, '
            'label FROM test_string_over_null_ds')
        mldb.post('/v1/procedures', {
            'type': 'classifier.test',
            'params' : {'mode': 'boolean', 'testingData': testing_query}
        })
Пример #6
0
 def test_record_null_row_name(self):
     """Posting a row whose ``rowName`` is ``None`` must raise."""
     null_named_row = {'rowName': None, 'columns': [['colA', 1, 1]]}

     mldb.put('/v1/datasets/ds_null', {'type': 'sparse.mutable'})
     with self.assertRaises(ResponseException):  # noqa
         mldb.post('/v1/datasets/ds_null/rows', null_named_row)
Пример #7
0
    def test_set_return_0(self):
        """A route that calls ``request.set_return("", 0)`` answers with 500.

        The plugin is installed once, then its ``foo`` route is hit with
        GET, POST, PUT and DELETE, each expected to raise with status 500.
        """
        plugin_config = {
            "type": "python",
            "params": {
                "source": {
                    "routes": """request.set_return("", 0)"""
                }
            }
        }
        mldb.put("/v1/plugins/mldb2114", plugin_config)

        route = '/v1/plugins/mldb2114/routes/foo'
        for call in (mldb.get, mldb.post, mldb.put, mldb.delete):
            with self.assertRaises(ResponseException) as caught:
                call(route)
            self.assertEqual(caught.exception.response.status_code, 500)
Пример #8
0
    def test_no_set_return(self):
        mldb.put(
            "/v1/plugins/mldb2114", {
                "type": "python",
                "params": {
                    "source": {
                        "routes":
                        """
from mldb import mldb
mldb.log('no return')
"""
                    }
                }
            })

        msg = "Return value is required for route handlers but not set"

        with self.assertRaisesRegex(ResponseException, msg) as e:
            mldb.get('/v1/plugins/mldb2114/routes/foo')
        self.assertEqual(e.exception.response.status_code, 500)

        with self.assertRaisesRegex(ResponseException, msg) as e:
            mldb.post('/v1/plugins/mldb2114/routes/foo')
        self.assertEqual(e.exception.response.status_code, 500)

        with self.assertRaisesRegex(ResponseException, msg) as e:
            mldb.put('/v1/plugins/mldb2114/routes/foo')
        self.assertEqual(e.exception.response.status_code, 500)

        with self.assertRaisesRegex(ResponseException, msg) as e:
            mldb.delete('/v1/plugins/mldb2114/routes/foo')
        self.assertEqual(e.exception.response.status_code, 500)
Пример #9
0
    def test_dottest_col_names(self):
        """Run ``summary.statistics`` over a column whose name contains a dot.

        A single row with column ``col.a`` = 1 is recorded; the output
        dataset is expected to hold one row named ``"col.a"`` (quoted).
        """
        source = mldb.create_dataset({
            'id': 'dotted_col_ds',
            'type': 'sparse.mutable'
        })
        source.record_row('row1', [['col.a', 1, 0]])
        source.commit()

        stats_params = {
            'runOnCreation': True,
            'inputData': 'SELECT * FROM dotted_col_ds',
            'outputDataset': {
                'id': 'output_dotted_col_ds',
                'type': 'sparse.mutable'
            }
        }
        mldb.post('/v1/procedures',
                  {'type': 'summary.statistics', 'params': stats_params})

        expected_header = [
            "_rowName", "value.1st_quartile", "value.3rd_quartile",
            "value.data_type", "value.max", "value.avg", "value.median",
            "value.min", "value.most_frequent_items.1", "value.num_null",
            "value.num_unique", "value.stddev"
        ]
        expected_row = ['"col.a"', 1, 1, "number", 1, 1, 1, 1, 1, 0, 1, "NaN"]

        actual = mldb.query("SELECT * FROM output_dotted_col_ds")
        self.assertTableResultEquals(actual, [expected_header, expected_row])
def insert_with_ts(ts):
    """Post row ``row1`` with ``colA`` = 1 to dataset ``ds``.

    Args:
        ts: third element of the posted cell (presumably its timestamp;
            confirm against the MLDB row format).
    """
    cell = ['colA', 1, ts]
    payload = {'rowName' : 'row1', 'columns' : [cell]}
    mldb.post('/v1/datasets/ds/rows', payload)
Пример #11
0
    def test_import_missing_param(self):
        """``mongodb.import`` must reject configs missing a required param.

        Two cases are posted: one without ``uriConnectionScheme`` and one
        without ``collection``; each must raise with the matching message.
        """
        cases = [
            ('uriConnectionScheme is a required property', {
                'collection' : 'users',
                'outputDataset' : {'id' : 'out', 'type' : 'sparse.mutable'}
            }),
            ('collection is a required property and must not be empty', {
                'uriConnectionScheme' : 'mongodb://localhost:27017/tutorial',
                'outputDataset' : {'id' : 'out', 'type' : 'sparse.mutable'}
            }),
        ]
        for msg, params in cases:
            with self.assertRaisesRegex(ResponseException, msg):
                mldb.post('/v1/procedures',
                          {'type' : 'mongodb.import', 'params' : params})
Пример #12
0
    def test_rowHash(self):
        """Import the Titanic CSV with and without a ``rowHash()`` filter.

        The filtered import (``rowHash() % 3 = 0``) is expected to hold 287
        rows and the unfiltered one 891.
        """
        titanic_url = "https://raw.githubusercontent.com/datacratic/mldb-pytanic-plugin/master/titanic_train.csv"

        hashed_params = {
            "dataFileUrl": titanic_url,
            'outputDataset': "titanic_hashed",
            "where": "rowHash() % 3 = 0",
            'runOnCreation': True,
        }
        mldb.post('/v1/procedures',
                  {'type': 'import.text', 'params': hashed_params})

        unfiltered_params = {
            "dataFileUrl": titanic_url,
            'outputDataset': "titanic_no_hashed",
            'runOnCreation': True,
        }
        mldb.post('/v1/procedures',
                  {'type': 'import.text', 'params': unfiltered_params})

        count_header = ["_rowName", "count(*)"]

        hashed_count = mldb.query("select count(*) from titanic_hashed")
        self.assertTableResultEquals(hashed_count,
                                     [count_header, ["[]", 287]])

        full_count = mldb.query("select count(*) from titanic_no_hashed")
        self.assertTableResultEquals(full_count,
                                     [count_header, ["[]", 891]])
Пример #13
0
    def test_function_creation_bug(self):
        """Import ``narrow_test.csv.gz`` and run a ``dt`` regression experiment.

        The test only passes if neither request raises.
        """
        import_params = {
            "dataFileUrl": "http://public.mldb.ai/narrow_test.csv.gz",
            "outputDataset": "narrow",
            "runOnCreation": True
        }
        mldb.post("/v1/procedures",
                  {"type": "import.text", "params": import_params})

        # it seems that the training fails to save the function but we
        # proceed to testing, where we try to use the function but then
        # can't find it
        # 1) we should not move to testing if function-creation fails;
        #    we should report that function-creation failed
        # 2) function creation should not fail for a dt on this dataset
        experiment_params = {
            "experimentName": "x",
            "inputData": "select {a} as features, b as label from narrow",
            "algorithm": "dt",
            "mode": "regression",
            "configurationFile": "./mldb/container_files/classifiers.json",
            "modelFileUrlPattern": "file://tmp/MLDB-1597-creation$runid.cls",
            "runOnCreation": True
        }
        mldb.put("/v1/procedures/train",
                 {"type": "classifier.experiment", "params": experiment_params})
def train_svd(order_by, where, offset, limit):
    """Train the ``order_svd`` procedure and report the row output width.

    Args:
        order_by: forwarded as the ``orderBy`` clause of the training data.
        where: forwarded as the ``where`` clause.
        offset: forwarded as the ``offset`` clause.
        limit: forwarded as the ``limit`` clause.

    Returns:
        The length of the ``columns`` entry of the first row returned when
        querying the ``svd_row`` dataset.
    """
    procedure_url = "/v1/procedures/order_svd"

    training_data = {
        "from": {"id": "svd_example"},
        "select": "x, y, z",
        "orderBy": order_by,
        "where": where,
        "offset": offset,
        "limit": limit
    }
    # svd procedure configuration
    mldb.put(procedure_url, {
        'type': 'svd.train',
        'params': {
            "trainingData": training_data,
            "rowOutputDataset": {"id": "svd_row", 'type': "embedding"},
            "columnOutputDataset": {"id": "svd_column", "type": "embedding"}
        }
    })
    mldb.post(procedure_url + '/runs')

    rows = mldb.get('/v1/query', q="SELECT * FROM svd_row").json()
    return len(rows[0]["columns"])
    def test_do_not_run_on_creation(self):
        """A ``transform`` without ``inputData`` can be created but not run.

        With ``runOnCreation`` set to False the procedure is created both
        via PUT (named) and POST (server-assigned id); starting a run on
        either must raise with the "define inputData" message.
        """
        def transform_config():
            # Fresh dict per request, as in two separate literals.
            return {
                'type' : 'transform',
                'params' : {
                    'skipEmptyRows' : False,
                    'runOnCreation' : False
                }
            }

        msg = 'You need to define inputData'

        mldb.put('/v1/procedures/do_not_run_on_creation', transform_config())
        with self.assertRaisesRegex(ResponseException, msg):
            mldb.put('/v1/procedures/do_not_run_on_creation/runs/r1',
                     {'params' : {}})

        created = mldb.post('/v1/procedures', transform_config()).json()
        runs_url = '/v1/procedures/{}/runs'.format(created['id'])
        with self.assertRaisesRegex(ResponseException, msg):
            mldb.post(runs_url, {'params' : {}})
Пример #16
0
    def test_join_with_and(self):
        """Compare ``ds_train`` with a left join using an ``and`` condition.

        The join result is written to ``ds_train2`` and only the sizes of
        the two query results are compared.
        """
        # Whitespace inside the SQL literal is kept verbatim.
        join_query = """
                        select *
                        from ds left join ds_stats on (ds.dow=ds_stats.dow and ds.a_int=ds_stats.a_int)
                        limit 10
                    """

        reference = mldb.query('select * from ds_train')
        mldb.log(reference)

        transform_params = {
            "inputData": join_query,
            "outputDataset": {
                "id": "ds_train2",
                "type": "tabular",
                "params": {"unknownColumns": "add"}
            },
            "runOnCreation": True
        }
        mldb.post("/v1/procedures",
                  {"type": "transform", "params": transform_params})

        joined = mldb.query('select * from ds_train2')
        mldb.log(joined)

        # equivalent join conditions should be returning the same dataset
        # this is a very weak check because the columns and the row ordering
        # of these two equivalent joins are currently very different
        self.assertEqual(len(reference), len(joined),
                         'expected response sizes to match')
Пример #17
0
    @classmethod
    def setUpClass(cls):
        """Class-level fixture: build dataset ``ds`` and train a glz classifier.

        Decorated as a classmethod because unittest invokes ``setUpClass``
        on the class itself with no explicit arguments.
        """
        # Three rows with a label and two features each.
        ds = mldb.create_dataset({'id' : 'ds', 'type' : 'sparse.mutable'})
        ds.record_row('row1', [['label', 1, 12], ['feat1', 1, 0], ['feat2', 1, 0]])
        ds.record_row('row2', [['label', 0, 12], ['feat1', 1, 0], ['feat2', 0, 0]])
        ds.record_row('row3', [['label', 0, 12], ['feat1', 0, 0], ['feat2', 0, 0]])
        ds.commit()

        # Train on every column except ``label``; the model is written to
        # ``modelFileUrl``.
        mldb.post('/v1/procedures', {
            'type' : 'classifier.train',
            'params' : {
                'runOnCreation' : True,
                "mode": "boolean",
                'algorithm' : 'glz',
                "configuration": {
                    "glz": {
                        "type": "glz",
                        "verbosity": 3,
                        "normalize": False,
                        "regularization": 'l2'
                    }
                },
                'trainingData' : """
                    SELECT {* EXCLUDING(label)} AS features, label
                    FROM ds
                """,
                "modelFileUrl":
                    "file://build/x86_64/tmp/fmlhTODO.cls",
            }
        })
    def test_c_over_a_or_b(self):
        """
        Alternate test of unknown values. The model was built with column
        ``a_or_b`` always holding either a or b. Here the testing dataset
        always holds the never-seen value c for that column.
        """
        dataset = mldb.create_dataset({
            'id': 'test_c_over_a_or_b_ds',
            'type': 'sparse.mutable'
        })
        for idx in range(10):
            # Draw order matters for reproducibility: feature, then noise.
            feature = random.random() + 0.6
            noise = random.random()
            cells = [
                ['line', idx, 0],
                ['label', 0, 0],
                ['feature', feature, 0],
                ['noise', noise, 0],
                ['a_or_b', 'c', 0],
            ]
            dataset.record_row('row{}'.format(idx), cells)
        dataset.commit()

        testing_query = (
            'SELECT score_it({features: {feature, noise, a_or_b}})[score] AS score, '
            'label FROM test_c_over_a_or_b_ds')
        mldb.post('/v1/procedures', {
            'type': 'classifier.test',
            'params' : {'mode': 'boolean', 'testingData': testing_query}
        })
Пример #19
0
    def test_too_many_requested_rows(self):
        """Check limits of the ``sampled`` dataset built over ``toy``.

        Requesting 25000 rows without replacement must fail with 400; the
        same request with replacement is accepted, and posting a row to
        the resulting dataset must fail with 400.
        """
        def sampled_conf(with_replacement):
            return {
                "type": "sampled",
                "params": {
                    "dataset": {"id": "toy"},
                    "rows": 25000,
                    "withReplacement": with_replacement
                }
            }

        # too many requested rows without sampling
        with self.assertRaises(ResponseException) as failure:
            mldb.put("/v1/datasets/patate", sampled_conf(False))
        self.assertEqual(failure.exception.response.status_code, 400)

        mldb.put("/v1/datasets/patate", sampled_conf(True))

        # try to insert and make sure we get an exception
        new_row = {"rowName": "patato", "columns": [["a", "b", 0]]}
        with self.assertRaises(ResponseException) as failure:
            mldb.post("/v1/datasets/patate/rows", new_row)
        self.assertEqual(failure.exception.response.status_code, 400)
    @classmethod
    def setUpClass(cls):
        """Class-level fixture: import Enron data and derive bag-of-words features.

        Decorated as a classmethod because unittest invokes ``setUpClass``
        on the class itself with no explicit arguments.
        """
        # Import the rows where ``dataset = 1`` into ``enron_data``.
        mldb.post(
            '/v1/procedures', {
                'type': 'import.text',
                'params': {
                    'dataFileUrl':
                    'http://public.mldb.ai/datasets/enron.csv.gz',
                    'outputDataset': 'enron_data',
                    'named': "'enron_' + dataset + '_mail_' + index",
                    'where': 'dataset = 1'
                }
            })

        # SQL expression function that tokenizes ``msg`` into ``bow``.
        mldb.put(
            '/v1/functions/bow', {
                'type': 'sql.expression',
                'params': {
                    'expression':
                    """
                    tokenize(msg, {splitChars: ' :.-!?''"()[],', quoteChar: ''}) as bow
                    """
                }
            })

        # Apply ``bow`` to every message and add the ``message_is_spam``
        # column; results go to ``enron_features``.
        mldb.post(
            '/v1/procedures', {
                'type': 'transform',
                'params': {
                    'inputData': """
                    select bow({msg})[bow] as *, label = 'spam' as message_is_spam
                    from enron_data
                    """,
                    'outputDataset': 'enron_features'
                }
            })
Пример #21
0
 def test_delete(self):
     """Create dataset ``ds``, commit it, then delete it."""
     # this test depends on put and post
     dataset_url = '/v1/datasets/ds'
     dataset_config = {'type' : 'sparse.mutable'}

     mldb.put(dataset_url, dataset_config)
     mldb.post(dataset_url + '/commit')
     mldb.delete(dataset_url)
Пример #22
0
    def test_offset_not_accepted(self):
        """``classifier.experiment`` must reject queries that use OFFSET.

        Two configurations are posted and each must raise
        ``ResponseException``: one with OFFSET in ``inputData`` and one with
        OFFSET only in ``testingDataOverride``.
        """
        # Whitespace inside the SQL literals is kept verbatim.
        plain_query = """
                        SELECT {* EXCLUDING(label)} AS features, label
                        FROM toy"""
        offset_query = """
                        SELECT {* EXCLUDING(label)} AS features, label
                        FROM toy OFFSET 1"""

        def experiment(input_data, testing_override=None):
            # Keys are inserted in the same order as the original literals.
            params = {
                'runOnCreation': True,
                "experimentName": "test_no_test_data",
                "inputData": input_data,
            }
            if testing_override is not None:
                params["testingDataOverride"] = testing_override
            params.update([
                ("modelFileUrlPattern",
                 "file://build/x86_64/tmp/test_no_test_data_$runid.cls"),
                ("algorithm", "glz"),
                ("equalizationFactor", 0.5),
                ("mode", "boolean"),
                ("configuration", {
                    "glz": {
                        "type": "glz",
                        "verbosity": 3,
                        "normalize": False,
                        "regularization": 'l2'
                    }
                }),
                ("outputAccuracyDataset", False),
            ])
            return {"type": "classifier.experiment", "params": params}

        rejected_configs = (
            experiment(offset_query),
            experiment(plain_query, testing_override=offset_query),
        )
        for config in rejected_configs:
            with self.assertRaises(ResponseException):
                mldb.post("/v1/procedures", config)
Пример #23
0
 def test_post_empty_row_name(self):
     """A row posted with an empty ``rowName`` is stored and queryable.

     The query result is expected to show the row name as ``""``.
     """
     dataset_url = '/v1/datasets/empty_row_name'
     unnamed_row = {'rowName': '', 'columns': [['col', 'val', 0]]}

     mldb.put(dataset_url, {'type': 'sparse.mutable'})
     mldb.post(dataset_url + '/rows', unnamed_row)
     mldb.post(dataset_url + '/commit')

     expected = [['_rowName', 'col'], ['""', "val"]]
     self.assertTableResultEquals(
         mldb.query("SELECT * FROM empty_row_name"), expected)
 @classmethod
 def setUpClass(cls):
     """Class-level fixture: register the ``explain`` function.

     The function is of type ``classifier.explain`` and loads its model
     from the fixture file given in ``modelFileUrl``. Decorated as a
     classmethod because unittest invokes ``setUpClass`` on the class
     itself with no explicit arguments.
     """
     mldb.post(
         "/v1/functions", {
             "id": "explain",
             "type": "classifier.explain",
             "params": {
                 "modelFileUrl":
                 "file://mldb/testing/fixtures/mnist_glz_categorical.cls"
             }
         })
Пример #25
0
    def test_post_row_name_none(self):
        """Posting a row whose ``rowName`` is ``None`` to ``ds2`` must raise."""
        mldb.put('/v1/datasets/ds2', {'type': 'sparse.mutable'})

        invalid_row = {
            'rowName': None,  # should not work
            'columns': [['colA', 1, 1]]
        }
        with self.assertRaises(ResponseException):  # noqa
            mldb.post('/v1/datasets/ds2/rows', invalid_row)
Пример #26
0
    def test_post_no_row_name(self):
        """Posting a row without any ``rowName`` key to ``ds3`` must raise."""
        mldb.put('/v1/datasets/ds3', {'type': 'sparse.mutable'})

        # rowName missing -> should not work
        nameless_row = {'columns': [['colA', 1, 1]]}
        with self.assertRaises(ResponseException):  # noqa
            mldb.post('/v1/datasets/ds3/rows', nameless_row)
Пример #27
0
    def test_post(self):
        """Create a dataset via POST, add one row, and commit it.

        The dataset id is read from the creation response and used to
        build the row and commit URLs.
        """
        created = mldb.post("/v1/datasets", {'type' : 'sparse.mutable'})
        dataset_url = '/v1/datasets/{}'.format(created.json()['id'])

        first_row = {'rowName' : 'row1', 'columns' : [['colA', 1, 0]]}
        mldb.post(dataset_url + '/rows', first_row)
        mldb.post(dataset_url + '/commit')
Пример #28
0
    def test_query(self):
        """Record one row in dataset ``ds`` and read it back with a query."""
        dataset_url = '/v1/datasets/ds'
        first_row = {'rowName' : 'row1', 'columns' : [['colA', 1, 0]]}

        mldb.put(dataset_url, {'type' : 'sparse.mutable'})
        mldb.post(dataset_url + '/rows', first_row)
        mldb.post(dataset_url + '/commit')

        expected = [['_rowName', 'colA'], ['row1', 1]]
        self.assertEqual(mldb.query('SELECT * FROM ds'), expected)
Пример #29
0
 def run_proc(input_data):
     """Post a ``summary.statistics`` procedure over ``input_data``.

     The procedure runs on creation and writes to dataset ``error``.
     """
     stats_params = {
         'runOnCreation': True,
         'inputData': input_data,
         'outputDataset': {'id': 'error', 'type': 'sparse.mutable'}
     }
     mldb.post('/v1/procedures',
               {'type': 'summary.statistics', 'params': stats_params})
Пример #30
0
 def do_it(self, ds_type):
     """Run the class-level ``query`` through a ``transform`` procedure.

     Args:
         ds_type: used both as the id and as the type of the output
             dataset.
     """
     output_dataset = {"id": ds_type, "type": ds_type}
     transform_params = {
         "inputData": self.__class__.query,
         "outputDataset": output_dataset,
         'runOnCreation': True
     }
     mldb.post("/v1/procedures",
               {"type": "transform", "params": transform_params})