def test_no_table_error_message(self):
        """Wildcard selects issued without a FROM clause must be rejected.

        Every query listed below is expected to raise a ResponseException
        whose message matches the wildcard/FROM error text.
        """
        expected_error = 'Wildcard usage requires a FROM statement'
        fromless_queries = (
            "SELECT *",
            "SELECT * WHERE columnCount() > 0",
        )
        for sql in fromless_queries:
            with self.assertRaisesRegex(ResponseException, expected_error):
                mldb.query(sql)
Пример #2
0
    def test_reshape_row(self):
        """Check the column names produced by reshape() applied to rows.

        Each case pairs a query with the header row it must produce; the
        single data row is the same (values 1 to 4) for every case.
        """
        cases = [
            # flat row reshaped to 2x2
            ('SELECT reshape({"0": 1, "1": 2, "2": 3, "3": 4}, [2, 2]) as *',
             ["_rowName", "0.0", "0.1", "1.0", "1.1"]),
            # flat row reshaped to 1x4
            ('SELECT reshape({"0": 1, "1": 2, "2": 3, "3": 4}, [1, 4]) as *',
             ["_rowName", "0.0", "0.1", "0.2", "0.3"]),
            # nested row reshaped to a flat vector of 4
            ('SELECT reshape({"0": {"0": 1, "1": 2}, "1": {"0": 3, "1": 4}}, [4]) as *',
             ["_rowName", "0", "1", "2", "3"]),
            # nested row reshaped to 1x4
            ('SELECT reshape({"0": {"0": 1, "1": 2}, "1": {"0": 3, "1": 4}}, [1, 4]) as *',
             ["_rowName", "0.0", "0.1", "0.2", "0.3"]),
        ]

        for sql, header in cases:
            table = mldb.query(sql)
            self.assertTableResultEquals(table,
                                         [header, ["result", 1, 2, 3, 4]])
Пример #3
0
    def test_reshape_not_embedding(self):
        """reshape() must fail when either argument is not an embedding.

        Each entry pairs a query with the error pattern expected in the
        raised ResponseException.
        """
        failing_calls = (
            ("SELECT shape(reshape('not an embedding', [1])) as dim",
             'Null embedding'),
            ("SELECT shape(reshape([1], 'not an embedding')) as dim",
             'requires an embedding'),
        )
        for sql, pattern in failing_calls:
            with self.assertRaisesRegex(ResponseException, pattern):
                mldb.query(sql)
Пример #4
0
    def test_rowHash(self):
        """Import one CSV twice, with and without a rowHash() filter.

        The first import keeps only rows where ``rowHash() % 3 = 0``; the
        second keeps everything. The resulting row counts are then checked
        against fixed expected values.
        """
        csv_url = ("https://raw.githubusercontent.com/datacratic/"
                   "mldb-pytanic-plugin/master/titanic_train.csv")

        filtered_import = {
            "dataFileUrl": csv_url,
            'outputDataset': "titanic_hashed",
            "where": "rowHash() % 3 = 0",
            'runOnCreation': True,
        }
        unfiltered_import = {
            "dataFileUrl": csv_url,
            'outputDataset': "titanic_no_hashed",
            'runOnCreation': True,
        }
        for import_params in (filtered_import, unfiltered_import):
            mldb.post('/v1/procedures', {
                'type': 'import.text',
                'params': import_params,
            })

        expected_counts = (("titanic_hashed", 287),
                           ("titanic_no_hashed", 891))
        for dataset_id, row_count in expected_counts:
            self.assertTableResultEquals(
                mldb.query("select count(*) from " + dataset_id),
                [["_rowName", "count(*)"], ["[]", row_count]])
Пример #5
0
 def test_sequence(self):
     """Check the error messages of builtin functions given bad arguments.

     Two failures are exercised in order: exp() called with a string
     argument, and sqrt() called with three arguments. Each must raise a
     ResponseException matching the corresponding message.
     """
     # The context managers are deliberately not bound to a name: the
     # original `as re` bindings were never read and shadowed the stdlib
     # module name `re`.
     conversion_error = ("Executing builtin function exp: Can't convert "
                         "value 'a' of type 'ASCII_STRING' to double")
     with self.assertRaisesRegex(ResponseException, conversion_error):
         mldb.query("SELECT exp('a')")

     arity_error = "Binding builtin function sqrt: expected 1 argument, got 3"
     with self.assertRaisesRegex(ResponseException, arity_error):
         mldb.query("SELECT sqrt(1,2,3)")
Пример #6
0
 def test_bad_alias_rhs_where(self):
     """A WHERE clause comparing against `undefined.column` must raise.

     The query is expected to fail with a ResponseException; the message
     itself is not checked.
     """
     sql = """
             SELECT *
             FROM a
             WHERE a.rowName()=undefined.column
         """
     with self.assertRaises(ResponseException):
         mldb.query(sql)
Пример #7
0
    def test_toy_regression_works(self):
        """Run classifier.test in regression mode and check its report.

        Checks the reported mse, compares the 0.5 and 0.9 quantile errors
        with cells of a query sorted by relative error, and verifies the
        size of the query result on the output dataset.
        """
        procedure_config = {
            "type": "classifier.test",
            "params": {
                "mode": "regression",
                "testingData": """
                    SELECT score as score, label as label from toy_regression
                """,
                "outputDataset": "toy_reg_output",
                "runOnCreation": True
            }
        }
        report = mldb.put("/v1/procedures/toy_reg", procedure_config).json()
        mldb.log(report)

        run_status = report["status"]["firstRun"]["status"]
        self.assertEqual(run_status["mse"], 0.375)

        sorted_errors = mldb.query(
            """select abs((label-score)/label) as prnct_error, label, score
                                  from toy_regression order by prnct_error ASC"""
        )
        mldb.log("------------------------ here")
        mldb.log(sorted_errors)

        # Each quantile is compared with column index 2 of a fixed row of
        # the sorted result table.
        for quantile, row_index in (("0.5", 2), ("0.9", 3)):
            self.assertAlmostEqual(
                run_status["quantileErrors"][quantile],
                sorted_errors[row_index][2])

        # Check the accuracy dataset
        output_table = mldb.query("select * from toy_reg_output")
        self.assertEqual(len(output_table), 5)
Пример #8
0
    def test_incomplete(self):
        """Split ds5 80/20 and check the reported incomplete labels.

        After the split procedure runs, the status must list "y" under
        incompleteLabels, and the per-column sums of the two output
        datasets must match the expected tables.
        """
        output_datasets = [{
            "id": "ds_train",
            "type": "sparse.mutable"
        }, {
            "id": "ds_test",
            "type": "sparse.mutable"
        }]
        split_config = {
            "type": "split",
            "params": {
                "reproducible": True,
                "labels": "SELECT * FROM ds5",
                "splits": [0.8, 0.2],
                "outputDatasets": output_datasets,
            }
        }
        response = mldb.put("/v1/procedures/split", split_config)

        run_status = response.json()["status"]["firstRun"]["status"]
        self.assertEqual(run_status["incompleteLabels"], ["y"])

        train_sums = mldb.query("SELECT sum({*}) FROM ds_train")
        test_sums = mldb.query("SELECT sum({*}) FROM ds_test")

        expected_train = [["_rowName", "sum({*}).x", "sum({*}).y"],
                          ["[]", 2, 1]]
        expected_test = [["_rowName", "sum({*}).x"], ["[]", 1]]

        self.assertEqual(train_sums, expected_train)
        self.assertEqual(test_sums, expected_test)
Пример #9
0
    def test_no_duplicate_rows_in_left_join_with_batch_exec(self):
        """Check LEFT JOIN row counts for three variants of the ON clause.

        Each query counts the rows of ``left_table LEFT JOIN right_table``
        and the count is compared with a fixed expected value.
        """

        def joined_row_count(sql):
            # The count is the cell right after the row name in the first
            # data row of the result table.
            return mldb.query(sql)[1][1]

        # the left condition is always true
        count = joined_row_count("""
            SELECT count(*) FROM left_table LEFT JOIN right_table 
                            ON left_table.c = right_table.c
        """)
        self.assertEqual(count, 1000, "expected 1000 rows to be returned")

        # the right condition is always false
        count = joined_row_count("""
            SELECT count(*) FROM left_table LEFT JOIN right_table 
                            ON left_table.c = right_table.c AND
                            2 < right_table.d
        """)
        self.assertEqual(count, 100, "expected 100 rows to be returned")

        # the right condition is half the time true
        count = joined_row_count("""
            SELECT count(*) FROM left_table LEFT JOIN right_table 
                            ON left_table.c = right_table.c AND
                            right_table.d = 1
        """)

        # When the row index is even the condition always fails, which
        # accounts for 50 rows. When the index is odd, each left row matches
        # 10 different right rows, which accounts for 50 * 10 rows.
        self.assertEqual(count, 550, "expected 550 rows to be returned")
Пример #10
0
    def test_join_with_and(self):
        """Compare ds_train with a dataset built from an AND-joined query.

        A transform procedure writes the first 10 rows of a left join with
        a two-part AND condition into ds_train2; the result tables of
        ds_train and ds_train2 must have the same length.
        """
        reference_table = mldb.query('select * from ds_train')
        mldb.log(reference_table)

        transform_params = {
            "inputData": """
                        select *
                        from ds left join ds_stats on (ds.dow=ds_stats.dow and ds.a_int=ds_stats.a_int)
                        limit 10
                    """,
            "outputDataset": {
                "id": "ds_train2",
                "type": "tabular",
                "params": {
                    "unknownColumns": "add"
                }
            },
            "runOnCreation": True
        }
        mldb.post("/v1/procedures", {
            "type": "transform",
            "params": transform_params
        })

        joined_table = mldb.query('select * from ds_train2')
        mldb.log(joined_table)

        # Equivalent join conditions should return the same dataset. This is
        # a very weak check because the columns and the row ordering of the
        # two equivalent joins are currently very different.
        self.assertEqual(len(reference_table), len(joined_table),
                         'expected response sizes to match')
Пример #11
0
        def test_fasttext_explain(self):
            """Train a fasttext classifier and query classifier.explain on it.

            The explain function is first called with a label and the result
            table is compared with fixed expected values; it is then called
            with the label 'Futurama', which must raise a ResponseException
            matching "label not in model".
            """
            mldb.log("explain")

            # NOTE(review): the temporary file is never closed explicitly and
            # its directory is passed through `prefix` rather than `dir`;
            # kept as is because the model file lifetime expected by the
            # server is not visible from here.
            tmp_file =  tempfile.NamedTemporaryFile(prefix=os.getcwd() + '/build/x86_64/tmp/')
            model_url = "file:///" + tmp_file.name

            fasttext_config = {
                "my_fasttext": {
                    "type": "fasttext",
                    "verbosity" : 0,
                    "dims" : 4,
                    "epoch" : 5,
                }
            }

            training_params = {
                "trainingData": "SELECT {tokens.*} as features, Theme as label FROM bag_of_words",
                "modelFileUrl": model_url,
                "functionName" : 'myclassify',
                "algorithm": "my_fasttext",
                "mode": "categorical",
                "runOnCreation": True,
                "configuration": fasttext_config
            }
            mldb.put("/v1/procedures/trainer", {
                "type": "classifier.train",
                "params": training_params
            })

            mldb.put("/v1/functions/explain", {
                "type": "classifier.explain",
                "params": {
                    "modelFileUrl": model_url,
                }
            })

            explanation = mldb.query("""SELECT explain({features : {tokenize(lower(' hockey Alabama Futbol'), {splitChars:' ,.:;«»[]()%!?', quoteChar:'', minTokenLength: 2}) as tokens},
                                                label : 'Politique'}) as * 
                            """)

            expected_header = [
                "_rowName",
                "bias",
                "explanation.tokens.alabama",
                "explanation.tokens.futbol",
                "explanation.tokens.hockey"
            ]
            expected_row = [
                "result",
                0,
                -0.006820799317210913,
                -0.07053825259208679,
                -0.08547607064247131
            ]
            self.assertTableResultEquals(explanation,
                                         [expected_header, expected_row])

            # A label that the model was not trained on must be rejected.
            with self.assertRaisesRegex(ResponseException, "label not in model"):
                mldb.query("""SELECT explain({features : {tokenize(lower(' hockey Alabama Futbol'), {splitChars:' ,.:;«»[]()%!?', quoteChar:'', minTokenLength: 2}) as tokens},
                                                    label : 'Futurama'}) as * 
                                """)
Пример #12
0
    def test_const_and_var(self):
        """__isconst() must be False for AND expressions involving column a.

        Three forms are checked against ds1: ``a AND true``,
        ``true AND a`` and ``a AND a``.
        """
        queries = (
            "SELECT __isconst(a AND true) as isconst FROM ds1 ORDER BY rowName()",
            "SELECT __isconst(true AND a) as isconst FROM ds1 ORDER BY rowName()",
            "SELECT __isconst(a AND a) as isconst FROM ds1 ORDER BY rowName()",
        )
        for sql in queries:
            table = mldb.query(sql)
            # The expected table is rebuilt for every comparison.
            self.assertTableResultEquals(
                table, [['_rowName', 'isconst'], ['row1', False]])
Пример #13
0
 def test_bad_alias_rhs_inner_join(self):
     """An INNER JOIN condition using `undefined.rowName()` must raise.

     The query is expected to fail with a ResponseException; the message
     itself is not checked.
     """
     sql = """
             SELECT *
             FROM a
             INNER JOIN b ON a.rowName() = undefined.rowName()
         """
     with self.assertRaises(ResponseException):
         mldb.query(sql)
Пример #14
0
    def test_hour_equivalence(self):
        """Intervals spelled in hours, minutes or seconds must compare equal.

        Every query compares two INTERVAL literals and must return True in
        the `equal` column.
        """
        equality_queries = (
            "select INTERVAL '2H' = INTERVAL '120m' as equal",
            "select INTERVAL '2 hour' = INTERVAL '2 HOUR' as equal",
            "select INTERVAL '24 H' = INTERVAL '1440 m' as equal",
            "select INTERVAL '24 H' = INTERVAL '86400 s' as equal",
        )
        for sql in equality_queries:
            self.assertTableResultEquals(
                mldb.query(sql),
                [["_rowName", "equal"], ["result", True]])
    def test_mldbfb_516_aggregator_incorrect_with_join(self):
        """temporal_count() must give the same counts with and without a join.

        Records one row in ds516 and one in conv, then checks
        temporal_count over ds516 alone and over ds516 inner-joined with
        conv on the row name.
        """
        events = mldb.create_dataset({'id': 'ds516', 'type': 'sparse.mutable'})
        events.record_row('user3', [['behA', 1, 11], ['conv', 1, 70],
                                    ['behB', 1, 14], ['behA', 1, 14]])
        events.commit()

        conversions = mldb.create_dataset({'id': 'conv',
                                           'type': 'sparse.mutable'})
        conversions.record_row('user3', [['ts', 70, 0]])
        conversions.commit()

        header = ['_rowName', 'behA', 'behB', 'conv']

        plain_counts = mldb.query("""
            SELECT temporal_count({ds516.*}) AS *
            FROM ds516
        """)
        mldb.log(plain_counts)
        self.assertTableResultEquals(plain_counts,
                                     [list(header), ['user3', 2, 1, 1]])

        joined_counts = mldb.query("""
            SELECT temporal_count({ds516.* as *}) AS *
            FROM ds516 INNER JOIN conv ON ds516.rowName() = conv.rowName()
        """)
        mldb.log(joined_counts)
        self.assertTableResultEquals(
            joined_counts,
            [list(header), ['[user3]-[user3]', 2, 1, 1]])
        mldb.log(joined_counts)
Пример #16
0
    def test_it(self):
        """A JavaScript exception thrown inside jseval must reach the caller.

        The same query template is run twice: once with an empty script
        prefix, which must succeed, and once with a `throw` statement, which
        must raise a ResponseException whose JSON body mentions the thrown
        text under the 'error' key.
        """
        ds = mldb.create_dataset({'id': 'ds', 'type': 'sparse.mutable'})
        ds.record_row('row1', [])
        ds.commit()

        # `{}` is the slot for the injected script; the doubled braces are
        # literal braces escaped for str.format.
        query = """
            SELECT jseval('
                {}
                return {{"foo" : "bar"}};
                ',
                'cols',
                {{*}}
            ) AS *
            FROM ds
                """

        # the query works
        mldb.log(mldb.query(query.format("")))

        # unittest assertions are used instead of bare `assert` statements,
        # which are stripped when Python runs with -O and would leave this
        # check silently disabled.
        with self.assertRaises(ResponseException) as raised:
            mldb.query(query.format('throw "this query is weird";'))

        error_body = raised.exception.response.json()
        mldb.log(error_body)
        self.assertIn('this query is weird', error_body['error'])
Пример #17
0
 def test_columnPathElem(self):
     """A COLUMN EXPR aliased with columnPathElement(1) must be rejected.

     The query is expected to raise a ResponseException matching the
     NULL column name message.
     """
     sql = '''
             select COLUMN EXPR (AS columnPathElement(1)
                 WHERE columnName() LIKE '%topics%Junk%') from example
         '''
     with self.assertRaisesRegex(ResponseException,
                                 "Cannot have a NULL column name"):
         mldb.query(sql)
Пример #18
0
    def test_it(self):
        """Exercise parse_json() on its own and wrapped in try().

        Valid JSON must yield its value; invalid JSON must raise without
        try(), return the fallback with try(expr, fallback), and return a
        descriptive message with single-argument try().
        """

        def first_cell(sql):
            # First value of the first data row, right after the row name.
            return mldb.query(sql)[1][1]

        parsed = first_cell("""
            SELECT parse_json('{"a" : 5}')
        """)
        self.assertEqual(parsed, 5)

        failure_pattern = 'Executing builtin function parse_json'
        with self.assertRaisesRegex(ResponseException, failure_pattern):
            mldb.query("""
                SELECT parse_json('coco')
            """)

        guarded_valid = first_cell("""
            SELECT try(parse_json('{"a" : 5}'), 'err')
        """)
        self.assertEqual(guarded_valid, 5)

        guarded_invalid = first_cell("""
            SELECT try(parse_json('coco'), 'err')
        """)
        self.assertEqual(guarded_invalid, 'err')

        reported_error = first_cell("""
            SELECT try(parse_json('coco'))
        """)
        self.assertRegex(
            reported_error,
            "JSON passed to parse_json must be an object or an array")
Пример #19
0
    def test_no_duplicate_rows_in_left_join_with_pipeline_exec(self):
        """Check LEFT JOIN row counts when the ON clause has a cross condition.

        Each query counts the rows of ``left_table LEFT JOIN right_table``
        with a condition comparing columns of both tables, and the count is
        compared with a fixed expected value.
        """

        def joined_row_count(sql):
            # The count is the cell right after the row name in the first
            # data row of the result table.
            return mldb.query(sql)[1][1]

        # the cross condition is always true
        count = joined_row_count("""
            SELECT count(*) FROM left_table LEFT JOIN right_table 
                            ON left_table.c = right_table.c AND
                            left_table.const > right_table.d
        """)
        self.assertEqual(count, 1000, "expected 1000 rows to be returned")

        # the cross condition is always false
        count = joined_row_count("""
            SELECT count(*) FROM left_table LEFT JOIN right_table 
                            ON left_table.c = right_table.c AND
                            left_table.const < right_table.d
        """)
        self.assertEqual(count, 100, "expected 100 rows to be returned")

        # the right condition is half the time true, the cross condition is
        # always true
        count = joined_row_count("""
            SELECT count(*) FROM left_table LEFT JOIN right_table 
                            ON left_table.c = right_table.c AND
                            left_table.const > right_table.d AND
                            right_table.d = 1 order by rowName()
        """)
        self.assertEqual(count, 550, "expected 550 rows to be returned")
Пример #20
0
    def test_spread(self):
        """Split ds1 80/20 reproducibly and check both output datasets.

        The contents of ds_train and ds_test, ordered by descending row
        name, are compared with fixed expected tables.
        """
        split_params = {
            "labels": "SELECT * FROM ds1",
            "reproducible": True,
            "splits": [0.8, 0.2],
            "outputDatasets": [{
                "id": "ds_train",
                "type": "sparse.mutable"
            }, {
                "id": "ds_test",
                "type": "sparse.mutable"
            }],
        }
        mldb.put("/v1/procedures/split", {
            "type": "split",
            "params": split_params
        })

        train_rows = mldb.query("SELECT * FROM ds_train ORDER BY rowName() DESC")
        test_rows = mldb.query("SELECT * FROM ds_test ORDER BY rowName() DESC")

        expected_train = [["_rowName", "y", "x"],
                          ["3", 1, None],
                          ["0", None, 1]]
        expected_test = [["_rowName", "y", "x"],
                         ["2", 1, None],
                         ["1", None, 1]]

        self.assertEqual(train_rows, expected_train)
        self.assertEqual(test_rows, expected_test)
Пример #21
0
    def test_const_userfunction_var(self):
        """__isconst() must be False for the two fetcher calls below.

        The first fetcher is called with a column argument; the second is
        created with 'deterministic': False and called with a literal URL.
        """

        def assert_not_const(sql):
            # Expected table: a single row 'row1' with isconst == False.
            self.assertTableResultEquals(
                mldb.query(sql),
                [['_rowName', 'isconst'], ['row1', False]])

        mldb.put('/v1/functions/fetch', {'type': 'fetcher'})
        assert_not_const(
            "SELECT __isconst(fetch({url: a})) as isconst FROM ds1 ORDER BY rowName()"
        )

        non_deterministic_fetcher = {
            'type': 'fetcher',
            'deterministic': False
        }
        mldb.put('/v1/functions/fetch2', non_deterministic_fetcher)
        assert_not_const(
            "SELECT __isconst(fetch2({url: 'itdoesntreallymatter'})) as isconst FROM ds1 ORDER BY rowName()"
        )
Пример #22
0
    def test_not_equivalent(self):
        """Intervals in different calendar units must not compare equal.

        Every query compares two INTERVAL literals and must return False in
        the `equal` column.
        """
        unequal_queries = (
            # because of daylight saving
            "select INTERVAL '1 day' = INTERVAL '24H' as equal",
            # because months were not all created equal
            "select INTERVAL '1 month' = INTERVAL '30day' as equal",
            # because months were not all created equal
            "select INTERVAL '1 month' = INTERVAL '4 week' as equal",
            # because of leap years
            "select INTERVAL '1 year' = INTERVAL '365 day' as equal",
        )
        for sql in unequal_queries:
            self.assertTableResultEquals(
                mldb.query(sql),
                [["_rowName", "equal"], ["result", False]])
Пример #23
0
    def run_MLDBFB_545_with_ds_type(self, ds_type):
        """Run the MLDBFB-545 scenario for one mutable dataset type.

        Creates two datasets of type ``ds_type + '.mutable'``, merges them,
        and checks that filtering on ``converted='c'`` yields a result of
        length 1 both on the first dataset and on the merged one.

        ds_type -- dataset type prefix, concatenated with '.mutable' and
                   used as a prefix for the dataset ids.
        """
        mutable_type = ds_type + '.mutable'
        filter_template = """
            SELECT * FROM {} WHERE converted='c' LIMIT 1
        """

        id1 = ds_type + 'mldbfb545_1'
        first = mldb.create_dataset({'id': id1, 'type': mutable_type})
        first.record_row('user1', [['converted', 'n', 0]])
        first.commit()

        id2 = ds_type + 'mldbfb545_2'
        second = mldb.create_dataset({'id': id2, 'type': mutable_type})
        second.record_row('user2', [['blah', 'blah', 0]])
        second.commit()

        # query directly on the dataset works
        direct_result = mldb.query(filter_template.format(id1))
        self.assertEqual(len(direct_result), 1)

        merge_id = ds_type + 'mldbfb545merged'
        merged_config = {
            "type": "merged",
            "params": {
                "datasets": [{"id": id1}, {"id": id2}]
            }
        }
        mldb.put("/v1/datasets/" + merge_id, merged_config)

        # the same filter on the merged dataset must give the same length
        merged_result = mldb.query(filter_template.format(merge_id))
        mldb.log(merged_result)
        self.assertEqual(len(merged_result), 1)
Пример #24
0
    def test_string_mixing(self):
        """Check how single-quoted brackets mix with dotted column access.

        ``a.f['b']`` must return the table asserted below, while
        ``a['f'].b`` must raise a ResponseException.
        """
        table = mldb.query("SELECT a.f['b'] FROM (SELECT {f: {b: 123}} AS a)")
        expected = [['_rowName', '"a.f[\'b\']"'], ['result', 'b']]
        self.assertEqual(table, expected)

        with self.assertRaises(ResponseException):
            mldb.query("SELECT a['f'].b FROM (SELECT {f: {b: 123}} AS a)")
Пример #25
0
    def test_domain_parsing(self):
        """Exercise extract_domain() on URLs, a bare host and row values.

        Covers three URLs with and without removeSubdomain, a string
        without a scheme (which must raise), and a query over row_dataset
        whose expected output is asserted below.
        """
        literal_urls_sql = """
                select 
                    extract_domain('http://www.datacratic.com/pwetpwet/houa.html') as c1,
                    extract_domain('http://datacratic.com/pwetpwet/houa.html') as c2,
                    extract_domain('http://data.datacratic.com/pwetpwet/houa.html') as c3,

                    extract_domain('http://www.datacratic.com/pwetpwet/houa.html', {removeSubdomain:1}) as c1nosub,
                    extract_domain('http://datacratic.com/pwetpwet/houa.html', {removeSubdomain:1}) as c2nosub,
                    extract_domain('http://data.datacratic.com/pwetpwet/houa.html', {removeSubdomain:1}) as c3nosub
            """
        expected_domains = [
            ["_rowName", "c1", "c1nosub", "c2", "c2nosub", "c3", "c3nosub"],
            [
                "result", "www.datacratic.com", "datacratic.com",
                "datacratic.com", "datacratic.com", "data.datacratic.com",
                "datacratic.com"
            ],
        ]
        self.assertTableResultEquals(mldb.query(literal_urls_sql),
                                     expected_domains)

        # A string without a scheme is not accepted as a URL.
        with self.assertRaisesRegex(
                ResponseException, 'Attempt to create a URL without a scheme'):
            mldb.query("SELECT extract_domain('pwet.com') as c4")

        row_dataset_sql = """
                select extract_domain(patate) as domain,
                       extract_domain(value) as domain2
                from (
                    select * from row_dataset({"domain": 'http://www.domain.com'})
                )
            """
        expected_from_rows = [["_rowName", "domain", "domain2"],
                              ["0", None, "www.domain.com"]]
        self.assertTableResultEquals(mldb.query(row_dataset_sql),
                                     expected_from_rows)
Пример #26
0
 def test_valid_aggregator_on_wildcard_builtin(self):
     """Aggregators wrapping wildcard builtins must run with GROUP BY.

     The queries are only executed; their results are not inspected.
     """
     accepted_queries = (
         "select count(*), earliest(temporal_earliest({*})) from sample group by x",
         "select count(*), earliest({horizontal_earliest({*})}) from sample group by x",
     )
     for sql in accepted_queries:
         mldb.query(sql)
    def test_it(self):
        """avg() called with two arguments must report the arity error.

        A one-row dataset is created first so the query has a table to
        read from.
        """
        dataset = mldb.create_dataset({'id': 'ds', 'type': 'sparse.mutable'})
        dataset.record_row('row1', [['colA', 1, 0]])
        dataset.commit()

        with self.assertRaisesRegex(
                ResponseException,
                "function avg expected 1 argument, got 2"):
            mldb.query("SELECT avg(colA, 2) FROM ds")
Пример #28
0
    def test_mixing_double_quotes(self):
        """Double-quoted brackets must mix with dotted column access.

        Both spellings are expected to return the value 123 under the
        column name ``a.f[b]``.
        """
        spellings = (
            'SELECT a.f["b"] FROM (SELECT {f: {b: 123}} AS a)',
            'SELECT a["f"].b FROM (SELECT {f: {b: 123}} AS a)',
        )
        for sql in spellings:
            table = mldb.query(sql)
            self.assertEqual(table,
                             [['_rowName', 'a.f[b]'], ['result', 123]])
Пример #29
0
 def test_invalid_group_by_and_wildcard_builtin(self):
     """A non-aggregator wildcard builtin with GROUP BY must be rejected.

     The query is expected to raise a ResponseException whose message
     matches the "Non-aggregator ... with GROUP BY" pattern.
     """
     # Raw string literals: the backslashes are regex escapes for the
     # parentheses, braces and star. In a non-raw literal `\(`, `\{`, `\*`
     # and `\}` are invalid string escapes that Python warns about.
     pattern = (r"Non-aggregator 'temporal_earliest\(\{\*\}\)' "
                r"with GROUP BY clause is not allowed")
     with self.assertRaisesRegex(ResponseException, pattern):
         mldb.query(
             "select temporal_earliest({*}) as earliest from sample group by x"
         )
    def test_value_desc_on_wrong_params(self):
        """sqrt() called with two arguments must report the arity error.

        The same message is expected whether the extra argument is NULL or
        a number.
        """
        arity_error = 'Binding builtin function sqrt: expected 1 argument, got 2'
        for sql in ("select sqrt(2, NULL)", "select sqrt(2, 1)"):
            with self.assertRaisesRegex(ResponseException, arity_error):
                mldb.query(sql)