Example #1
    def test_clickhouse_ds(self):
        from mindsdb_datasources import ClickhouseDS
        LIMIT = 100

        clickhouse_ds = ClickhouseDS(
            host=self.HOST,
            port=self.PORT,
            user=self.USER,
            password=self.PASSWORD,
            query='SELECT * FROM {}.{} LIMIT {}'.format(
                self.DATABASE,
                'home_rentals',
                LIMIT
            )
        )

        # test filter
        for val in clickhouse_ds.filter([['location', 'like', 'ood']])['location']:
            assert val == 'good'

        assert len(clickhouse_ds.filter([['rental_price', '>', 2500]], 3)) == 3
        assert len(clickhouse_ds.filter([['initial_price', '<', 0]], 3)) == 0

        # mess with the values inside then try to analyze it
        clickhouse_ds.df = break_dataset(clickhouse_ds.df)
        assert len(clickhouse_ds) <= LIMIT
        F.analyse_dataset(from_data=clickhouse_ds)
Example #2
    def test_predictor_deduplicate_data(self):
        n_points = 100
        input_dataframe = pd.DataFrame({
            'numeric_int': [x % 44 for x in list(range(n_points))],
            'numeric_int_2': [x % 20 for x in list(range(n_points))],
        }, index=list(range(n_points)))
        input_dataframe['y'] = input_dataframe['numeric_int'] % 10

        # Add duplicate row
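        # (note: DataFrame.append was removed in pandas 2.0; on newer pandas use
        # pd.concat([input_dataframe, input_dataframe.iloc[[99]]], ignore_index=True))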
        input_dataframe = input_dataframe.append(input_dataframe.iloc[99], ignore_index=True)

        mdb = Predictor(name='test_drop_duplicates')
        mdb.learn(
            from_data=input_dataframe,
            to_predict='y',
            stop_training_in_x_seconds=1,
            use_gpu=False
        )

        model_data = F.get_model_data('test_drop_duplicates')

        # Ensure the duplicate row was not used for training or analysis

        assert model_data['data_preparation']['total_row_count'] == n_points
        assert model_data['data_preparation']['used_row_count'] <= n_points

        assert sum([model_data['data_preparation']['train_row_count'],
                   model_data['data_preparation']['validation_row_count'],
                   model_data['data_preparation']['test_row_count']]) == n_points

        assert sum([mdb.transaction.input_data.train_df.shape[0],
                    mdb.transaction.input_data.test_df.shape[0],
                    mdb.transaction.input_data.validation_df.shape[0]]) == n_points

        # Disable deduplication and ensure the duplicate row is used
        mdb = Predictor(name='test_drop_duplicates')
        mdb.learn(
            from_data=input_dataframe,
            to_predict='y',
            stop_training_in_x_seconds=1,
            use_gpu=False,
            advanced_args={
                'deduplicate_data': False
            }
        )

        model_data = F.get_model_data('test_drop_duplicates')

        # Duplicate row was used for analysis and training

        assert model_data['data_preparation']['total_row_count'] == n_points+1
        assert model_data['data_preparation']['used_row_count'] <= n_points+1

        assert sum([model_data['data_preparation']['train_row_count'],
                    model_data['data_preparation']['validation_row_count'],
                    model_data['data_preparation']['test_row_count']]) == n_points+1

        assert sum([mdb.transaction.input_data.train_df.shape[0],
                    mdb.transaction.input_data.test_df.shape[0],
                    mdb.transaction.input_data.validation_df.shape[0]]) == n_points+1
Example #3
    def test_mysql_ds(self):
        from mindsdb_datasources import MySqlDS

        LIMIT = 400

        mysql_ds = MySqlDS(
            host=self.HOST,
            user=self.USER,
            password=self.PASSWORD,
            database=self.DATABASE,
            port=self.PORT,
            query=
            ' (SELECT * FROM (SELECT * FROM {table} LIMIT {limit}) as t1) UNION ALL (SELECT * FROM (SELECT * FROM {table} LIMIT {limit}) as t1)'
            .format(table=self.TABLE, limit=int(LIMIT / 2)))

        mysql_ds.df = break_dataset(mysql_ds.df)

        assert len(mysql_ds) <= LIMIT

        F.analyse_dataset(mysql_ds)

        # Our SQL parsing fails here, test if we're still able to filter via the dataframe fallback
        for val in mysql_ds.filter([['sex', 'like', 'fem']])['sex']:
            assert val == 'female'

        assert len(mysql_ds.filter([['age', '>', 20]], 12)) == 12
        assert len(mysql_ds.filter([['age', '=', 60]], 1)) == 1
        assert len(mysql_ds.filter([['age', '>', 150]], 11)) == 0
Example #4
    def delete_model(self, name):
        predictor_record = Predictor.query.filter_by(
            company_id=self.company_id, name=name, is_custom=False).first()
        id = predictor_record.id
        session.delete(predictor_record)
        session.commit()
        F.delete_model(name)
        self.dbw.unregister_predictor(name)
        self.fs_store.delete(f'predictor_{self.company_id}_{id}')
Example #5
    def delete_model(self, name):
        from mindsdb_native import F
        from mindsdb_native.libs.constants.mindsdb import DATA_SUBTYPES
        from mindsdb.interfaces.storage.db import session, Predictor

        predictor_record = Predictor.query.filter_by(company_id=self.company_id, name=name, is_custom=False).first()
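        # Remove the record from MindsDB's internal database, delete the native
        # model, unregister it from the database wrapper and drop its stored files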
        id = predictor_record.id
        session.delete(predictor_record)
        session.commit()
        F.delete_model(name)
        self.dbw.unregister_predictor(name)
        self.fs_store.delete(f'predictor_{self.company_id}_{id}')
        return 0
Example #6
    def test_timeseries(self, tmp_path):
        ts_hours = 12
        data_len = 120
        train_file_name = os.path.join(str(tmp_path), 'train_data.csv')
        test_file_name = os.path.join(str(tmp_path), 'test_data.csv')

        features = generate_value_cols(['date', 'int'], data_len,
                                       ts_hours * 3600)
        labels = [generate_timeseries_labels(features)]

        feature_headers = list(map(lambda col: col[0], features))
        label_headers = list(map(lambda col: col[0], labels))

        # Create the training dataset and save it to a file
        columns_train = list(
            map(lambda col: col[1:int(len(col) * 3 / 4)], features))
        columns_train.extend(
            list(map(lambda col: col[1:int(len(col) * 3 / 4)], labels)))
        columns_to_file(columns_train,
                        train_file_name,
                        headers=[*feature_headers, *label_headers])
        # Create the testing dataset and save it to a file
        columns_test = list(
            map(lambda col: col[int(len(col) * 3 / 4):], features))
        columns_to_file(columns_test, test_file_name, headers=feature_headers)

        mdb = Predictor(name='test_timeseries')

        mdb.learn(from_data=train_file_name,
                  to_predict=label_headers,
                  timeseries_settings={
                      'order_by': [feature_headers[0]],
                      'window': 3
                  },
                  stop_training_in_x_seconds=10,
                  use_gpu=False,
                  advanced_args={'force_predict': True})

        results = mdb.predict(when_data=test_file_name, use_gpu=False)

        for row in results:
            expect_columns = [
                label_headers[0], label_headers[0] + '_confidence'
            ]
            for col in expect_columns:
                assert col in row

        models = F.get_models()
        model_data = F.get_model_data(models[0]['name'])
        assert model_data
Example #7
    def test_analyze_dataset(self):
        n_points = 100
        n_category_values = 4
        input_dataframe = pd.DataFrame({
            'numeric_int': [x % 10 for x in list(range(n_points))],
            'numeric_float': np.linspace(0, n_points, n_points),
            'date_timestamp': [
                (datetime.now() - timedelta(minutes=int(i))).isoformat() for i in
                range(n_points)],
            'date_date': [
                (datetime.now() - timedelta(days=i)).strftime('%Y-%m-%d') for i in
                range(n_points)],
            'categorical_str': [f'a{x}' for x in (
                    list(range(n_category_values)) * (
                        n_points // n_category_values))],
            'categorical_int': [x for x in (list(range(n_category_values)) * (
                    n_points // n_category_values))],
            'categorical_binary': [0, 1] * (n_points // 2),
            'sequential_array': [f"1,2,3,4,5,{i}" for i in range(n_points)]
        }, index=list(range(n_points)))

        model_data = F.analyse_dataset(from_data=input_dataframe)
        for col, col_data in model_data['data_analysis_v2'].items():
            expected_type = test_column_types[col][0]
            expected_subtype = test_column_types[col][1]
            assert col_data['typing']['data_type'] == expected_type
            assert col_data['typing']['data_subtype'] == expected_subtype

            assert col_data['empty']
            assert col_data['histogram']
            assert 'percentage_buckets' in col_data
            assert 'nr_warnings' in col_data
            assert not col_data['is_foreign_key']

        assert isinstance(json.dumps(model_data), str)
Example #8
    def analyse_dataset(self, ds):
        from mindsdb_datasources import FileDS, ClickhouseDS, MariaDS, MySqlDS, PostgresDS, MSSQLDS, MongoDS, SnowflakeDS, AthenaDS
        from mindsdb_native import F

        ds = eval(ds['class'])(*ds['args'], **ds['kwargs'])
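        # Analyse the datasource that was just rebuilt from its serialized class name and arguments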
        analysis = F.analyse_dataset(ds)
        return self._pack(analysis)
Example #9
    def test_multilabel_prediction(self, tmp_path):
        train_file_name = os.path.join(str(tmp_path), 'train_data.csv')
        test_file_name = os.path.join(str(tmp_path), 'test_data.csv')
        data_len = 60

        features = generate_value_cols(['int', 'float', 'int', 'float'], data_len)
        labels = []
        labels.append(generate_log_labels(features))
        labels.append(generate_timeseries_labels(features))

        feature_headers = list(map(lambda col: col[0], features))
        label_headers = list(map(lambda col: col[0], labels))

        # Create the training dataset and save it to a file
        columns_train = list(
            map(lambda col: col[1:int(len(col) * 3 / 4)], features))
        columns_train.extend(
            list(map(lambda col: col[1:int(len(col) * 3 / 4)], labels)))
        columns_to_file(columns_train, train_file_name,
                        headers=[*feature_headers, *label_headers])

        # Create the testing dataset and save it to a file
        columns_test = list(
            map(lambda col: col[int(len(col) * 3 / 4):], features))
        columns_to_file(columns_test, test_file_name,
                        headers=feature_headers)

        mdb = Predictor(name='test_multilabel_prediction')
        mdb.learn(
            from_data=train_file_name,
            to_predict=label_headers,
            stop_training_in_x_seconds=1,
            use_gpu=False,
            advanced_args={'force_predict': True}
        )

        results = mdb.predict(when_data=test_file_name)
        models = F.get_models()
        model_data = F.get_model_data(models[0]['name'])
        assert model_data

        for i in range(len(results)):
            row = results[i]
            for label in label_headers:
                expect_columns = [label, label + '_confidence']
                for col in expect_columns:
                    assert col in row
Example #10
    def test_category_tags_output(self):
        vocab = random.sample(SMALL_VOCAB, 10)
        vocab = {i: word for i, word in enumerate(vocab)}
        # x1 contains the index of the first tag present
        # x2 contains the index of the second tag present
        # if a tag is missing, x1/x2 contain -1 instead
        # Thus the dataset should be perfectly predictable
        n_points = 5000
        x1 = [
            random.randint(0,
                           len(vocab) - 1) if random.random() > 0.1 else -1
            for i in range(n_points)
        ]
        x2 = [
            random.randint(0,
                           len(vocab) - 1) if random.random() > 0.1 else -1
            for i in range(n_points)
        ]
        tags = []
        for x1_index, x2_index in zip(x1, x2):
            row_tags = set([vocab.get(x1_index), vocab.get(x2_index)])
            row_tags = [x for x in row_tags if x is not None]
            tags.append(','.join(row_tags))

        df = pd.DataFrame({'x1': x1, 'x2': x2, 'tags': tags})

        df_train = df.iloc[:round(n_points * 0.9)]
        df_test = df.iloc[round(n_points * 0.9):]

        predictor = Predictor('test')

        predictor.learn(from_data=df_train,
                        to_predict='tags',
                        advanced_args=dict(deduplicate_data=False),
                        stop_training_in_x_seconds=60,
                        use_gpu=False)

        model_data = F.get_model_data('test')
        assert model_data['data_analysis_v2']['tags']['typing'][
            'data_type'] == DATA_TYPES.CATEGORICAL
        assert model_data['data_analysis_v2']['tags']['typing'][
            'data_subtype'] == DATA_SUBTYPES.TAGS

        predictions = predictor.predict(when_data=df_test)
        test_tags = df_test.tags.apply(lambda x: x.split(','))

        predicted_tags = []
        for i in range(len(predictions)):
            predicted_tags.append(predictions[i]['tags'])

        test_tags_encoded = predictor.transaction.model_backend.predictor._mixer.encoders[
            'tags'].encode(test_tags)
        pred_labels_encoded = predictor.transaction.model_backend.predictor._mixer.encoders[
            'tags'].encode(predicted_tags)
        score = f1_score(test_tags_encoded,
                         pred_labels_encoded,
                         average='weighted')

        assert score >= 0.3
Example #11
    def test_mongodb_ds(self):

        mongodb_ds = MongoDS(collection=self.COLLECTION,
                             query={},
                             host=self.HOST,
                             port=self.PORT,
                             user=self.USER,
                             password=self.PASSWORD,
                             database=self.DATABASE)

        F.analyse_dataset(from_data=mongodb_ds)

        for val in mongodb_ds.filter([['location', 'like',
                                       'ood']])['location']:
            assert val == 'good'

        assert len(mongodb_ds.filter([['rental_price', '>', 2500]], 3)) == 3
        assert len(mongodb_ds.filter([['initial_price', '<', 0]], 3)) == 0
Example #12
    def test_postgres_ds(self):
        from mindsdb_datasources import PostgresDS

        LIMIT = 100

        postgres_ds = PostgresDS(host=self.HOST,
                                 user=self.USER,
                                 password=self.PASSWORD,
                                 database=self.DATABASE,
                                 port=self.PORT,
                                 query='SELECT * FROM {}.{} LIMIT {}'.format(
                                     'test_data', self.TABLE, LIMIT))

        postgres_ds.df = break_dataset(postgres_ds.df)

        assert len(postgres_ds) == LIMIT

        F.analyse_dataset(postgres_ds)
Example #13
def test_postgres_ds():
    import pg8000
    from mindsdb_native.libs.data_sources.postgres_ds import PostgresDS

    HOST = 'localhost'
    USER = '******'
    PASSWORD = ''
    DBNAME = 'postgres'
    PORT = 5432

    con = pg8000.connect(database=DBNAME,
                         user=USER,
                         password=PASSWORD,
                         host=HOST,
                         port=PORT)
    cur = con.cursor()

    cur.execute('DROP TABLE IF EXISTS test_mindsdb')
    cur.execute(
        'CREATE TABLE test_mindsdb(col_1 Text, col_2 Int,  col_3 Boolean, col_4 Date, col_5 Int [])'
    )
    for i in range(0, 200):
        dt = datetime.datetime.now() - datetime.timedelta(days=i)
        dt_str = dt.strftime('%Y-%m-%d')
        cur.execute(
            f'INSERT INTO test_mindsdb VALUES (\'String {i}\', {i}, {i % 2 == 0}, \'{dt_str}\', ARRAY [1, 2, {i}])'
        )
    con.commit()
    con.close()

    postgres_ds = PostgresDS(table='test_mindsdb',
                             host=HOST,
                             user=USER,
                             password=PASSWORD,
                             database=DBNAME,
                             port=PORT)

    assert postgres_ds.name() == 'PostgresDS: postgres/test_mindsdb'

    assert (len(postgres_ds._df) == 200)

    mdb = Predictor(name='analyse_dataset_test_predictor',
                    log_level=logging.ERROR)
    F.analyse_dataset(from_data=postgres_ds)
Example #14
    def test_data_source_setting(self):
        data_url = 'https://raw.githubusercontent.com/mindsdb/mindsdb-examples/master/classics/german_credit_data/processed_data/test.csv'
        data_source = FileDS(data_url)
        data_source.set_subtypes({})

        data_source_mod = FileDS(data_url)
        data_source_mod.set_subtypes({
            'credit_usage': 'Int',
            'Average_Credit_Balance': 'Short Text',
            'existing_credits': 'Binary Category'
        })

        analysis = F.analyse_dataset(data_source)
        analysis_mod = F.analyse_dataset(data_source_mod)

        a1 = analysis['data_analysis_v2']
        a2 = analysis_mod['data_analysis_v2']
        assert (len(a1) == len(a2))
        assert (a1['over_draft']['typing']['data_type'] == a2['over_draft']
                ['typing']['data_type'])

        assert (a1['credit_usage']['typing']['data_type'] == a2['credit_usage']
                ['typing']['data_type'])
        assert (a1['credit_usage']['typing']['data_subtype'] !=
                a2['credit_usage']['typing']['data_subtype'])
        assert (
            a2['credit_usage']['typing']['data_subtype'] == DATA_SUBTYPES.INT)

        assert (a1['Average_Credit_Balance']['typing']['data_type'] !=
                a2['Average_Credit_Balance']['typing']['data_type'])
        assert (a1['Average_Credit_Balance']['typing']['data_subtype'] !=
                a2['Average_Credit_Balance']['typing']['data_subtype'])
        assert (a2['Average_Credit_Balance']['typing']['data_subtype'] ==
                DATA_SUBTYPES.SHORT)
        assert (a2['Average_Credit_Balance']['typing']['data_type'] ==
                DATA_TYPES.TEXT)

        assert (a1['existing_credits']['typing']['data_type'] ==
                a2['existing_credits']['typing']['data_type'])
        assert (a1['existing_credits']['typing']['data_subtype'] !=
                a2['existing_credits']['typing']['data_subtype'])
        assert (a2['existing_credits']['typing']['data_subtype'] ==
                DATA_SUBTYPES.SINGLE)
Example #15
    def test_analyze_dataset_empty_column(self):
        n_points = 100
        input_dataframe = pd.DataFrame({
            'numeric_int': [x % 10 for x in list(range(n_points))],
            'empty_column': [None for i in range(n_points)]
        }, index=list(range(n_points)))

        model_data = F.analyse_dataset(from_data=input_dataframe)

        assert model_data['data_analysis_v2']['empty_column']['empty']['is_empty'] is True
Example #16
def test_mongodb_ds():
    from pymongo import MongoClient
    from mindsdb_native.libs.data_sources.mongodb_ds import MongoDS

    HOST = 'localhost'
    USER = '******'
    PASSWORD = '******'
    DATABASE = 'database'
    COLLECTION_NAME = 'test_mindsdb'
    PORT = 27017

    con = MongoClient(host=HOST, port=PORT, username=USER, password=PASSWORD)

    db = con[DATABASE]

    if COLLECTION_NAME in db.list_collection_names():
        db[COLLECTION_NAME].drop()

    collection = db[COLLECTION_NAME]

    for i in range(0, 200):
        collection.insert_one({
            'col_1': "This is string number {}".format(i),
            'col_2': i,
            'col_3': (i % 2) == 0
        })

    mongodb_ds = MongoDS(collection=COLLECTION_NAME,
                         query={},
                         host=HOST,
                         port=PORT,
                         user=USER,
                         password=PASSWORD,
                         database=DATABASE)

    assert mongodb_ds.name() == 'MongoDS: database/test_mindsdb'

    assert (len(mongodb_ds._df) == 200)

    mdb = Predictor(name='analyse_dataset_test_predictor',
                    log_level=logging.ERROR)
    F.analyse_dataset(from_data=mongodb_ds)
Example #17
    def test_analyze_dataset_empty_values(self):
        n_points = 100
        input_dataframe = pd.DataFrame({
            'numeric_int': [x % 10 for x in list(range(n_points))],
            'numeric_int2': list(range(n_points)),
        }, index=list(range(n_points)))
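        # Blank out every other value so half of 'numeric_int' ends up empty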
        input_dataframe['numeric_int'].iloc[::2] = None

        model_data = F.analyse_dataset(from_data=input_dataframe)

        assert model_data['data_analysis_v2']['numeric_int']['empty']['empty_percentage'] == 50
Example #18
def test_mysql_ds():
    import mysql.connector
    from mindsdb_native.libs.data_sources.mysql_ds import MySqlDS

    HOST = 'localhost'
    USER = '******'
    PASSWORD = ''
    DATABASE = 'mysql'
    PORT = 3306

    con = mysql.connector.connect(host=HOST,
                                  port=PORT,
                                  user=USER,
                                  password=PASSWORD,
                                  database=DATABASE)
    cur = con.cursor()

    cur.execute('DROP TABLE IF EXISTS test_mindsdb')
    cur.execute(
        'CREATE TABLE test_mindsdb(col_1 Text, col_2 BIGINT, col_3 BOOL)')
    for i in range(0, 200):
        cur.execute(
            f'INSERT INTO test_mindsdb VALUES ("This is string number {i}", {i}, {i % 2 == 0})'
        )
    con.commit()
    con.close()

    mysql_ds = MySqlDS(table='test_mindsdb',
                       host=HOST,
                       user=USER,
                       password=PASSWORD,
                       database=DATABASE,
                       port=PORT)

    assert mysql_ds.name() == 'MySqlDS: mysql/test_mindsdb'

    assert (len(mysql_ds._df) == 200)

    mdb = Predictor(name='analyse_dataset_test_predictor',
                    log_level=logging.ERROR)
    F.analyse_dataset(from_data=mysql_ds)
Example #19
def test_mssql_ds():
    import pytds
    from mindsdb_native.libs.data_sources.ms_sql_ds import MSSQLDS

    HOST = 'localhost'
    USER = '******'
    PASSWORD = '******'
    DATABASE = 'master'
    PORT = 1433

    with pytds.connect(dsn=HOST,
                       user=USER,
                       password=PASSWORD,
                       database=DATABASE) as con:
        with con.cursor() as cur:
            cur.execute(
                "IF OBJECT_ID('dbo.test_mindsdb') IS NOT NULL DROP TABLE dbo.test_mindsdb"
            )
            cur.execute(
                'CREATE TABLE test_mindsdb(col_1 Text, col_2 BIGINT, col_3 BIT)'
            )
            for i in range(0, 200):
                cur.execute(
                    f"INSERT INTO test_mindsdb ([col_1], [col_2], [col_3]) VALUES ('This is string number {i}', {i}, {i % 2})"
                )
        con.commit()

    mssql_ds = MSSQLDS(table='test_mindsdb',
                       host=HOST,
                       user=USER,
                       password=PASSWORD,
                       database=DATABASE,
                       port=PORT)

    assert mssql_ds.name() == 'MSSQLDS: master/test_mindsdb'

    assert (len(mssql_ds._df) == 200)

    mdb = Predictor(name='analyse_dataset_test_predictor',
                    log_level=logging.ERROR)
    F.analyse_dataset(from_data=mssql_ds)
Example #20
    def test_sample_for_analysis(self):
        n_points = 100
        n_category_values = 4
        input_dataframe = pd.DataFrame(
            {
                'numeric_int': [x % 10 for x in list(range(n_points))],
                'numeric_float':
                np.linspace(0, n_points, n_points),
                'date_timestamp':
                [(datetime.now() - timedelta(minutes=int(i))).isoformat()
                 for i in range(n_points)],
                'date_date': [
                    (datetime.now() - timedelta(days=i)).strftime('%Y-%m-%d')
                    for i in range(n_points)
                ],
                'categorical_str': [
                    f'a{x}' for x in (list(range(n_category_values)) *
                                      (n_points // n_category_values))
                ],
                'categorical_int': [
                    x for x in (list(range(n_category_values)) *
                                (n_points // n_category_values))
                ],
                'categorical_binary': [0, 1] * (n_points // 2),
                'sequential_array':
                [f"1,2,3,4,5,{i}" for i in range(n_points)]
            },
            index=list(range(n_points)))

        mock_function = PickableMock(spec=sample_data, wraps=sample_data)
        setattr(mock_function, '__name__', 'mock_sample_data')
        with mock.patch(
                'mindsdb_native.libs.controllers.predictor.sample_data',
                mock_function):
            model_data = F.analyse_dataset(
                from_data=input_dataframe,
                sample_settings={'sample_for_analysis': True})
            assert mock_function.called

        for col, col_data in model_data['data_analysis_v2'].items():
            if col == 'columns':
                continue
            expected_type = test_column_types[col][0]
            expected_subtype = test_column_types[col][1]
            assert col_data['typing']['data_type'] == expected_type
            assert col_data['typing']['data_subtype'] == expected_subtype

            assert col_data['empty']
            assert col_data['histogram']
            assert 'percentage_buckets' in col_data
            assert 'nr_warnings' in col_data
            assert col_data['identifier'] is None
Example #21
    def get_models(self, status='any'):
        models = F.get_models()
        if status != 'any':
            models = [x for x in models if x['status'] == status]

        for i in range(len(models)):
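            # Normalize timestamp fields, stripping fractional seconds when present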
            for k in ['train_end_at', 'updated_at', 'created_at']:
                if k in models[i] and models[i][k] is not None:
                    try:
                        models[i][k] = parse_datetime(
                            str(models[i][k]).split('.')[0])
                    except Exception:
                        models[i][k] = parse_datetime(str(models[i][k]))
        return models
Example #22
    def get_model_data(self, name, native_view=False):
        model = F.get_model_data(name)
        if native_view:
            return model

        data_analysis = model['data_analysis_v2']
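        # Give empty columns a placeholder typing so database integrations don't break on them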
        for column in data_analysis['columns']:
            if len(data_analysis[column]) == 0 or data_analysis[column].get(
                    'empty', {}).get('is_empty', False):
                data_analysis[column]['typing'] = {
                    'data_subtype': DATA_SUBTYPES.INT
                }

        return model
Example #23
def test_clickhouse_ds():
    from mindsdb_native.libs.data_sources.clickhouse_ds import ClickhouseDS

    HOST = 'localhost'
    PORT = 8123

    clickhouse_url = f'http://{HOST}:{PORT}'
    queries = [
        'CREATE DATABASE IF NOT EXISTS test', 'DROP TABLE IF EXISTS test.mock',
        '''
            CREATE TABLE test.mock(
                col1 String
                ,col2 Int64
                ,col3 Array(UInt8)
            ) ENGINE = MergeTree()
                ORDER BY col2
                PARTITION BY col1
        ''', "INSERT INTO test.mock VALUES ('a',1,[1,2,3])",
        "INSERT INTO test.mock VALUES ('b',2,[2,3,1])",
        "INSERT INTO test.mock VALUES ('c',3,[3,1,2])"
    ]
    for q in queries:
        r = requests.post(clickhouse_url, data=q)
        assert r.status_code == 200

    clickhouse_ds = ClickhouseDS(
        'SELECT * FROM test.mock ORDER BY col2 DESC LIMIT 2',
        host=HOST,
        port=PORT)

    assert (len(clickhouse_ds.df) == 2)
    assert (sum(map(int, clickhouse_ds.df['col2'])) == 5)
    assert (len(list(clickhouse_ds.df['col3'][1])) == 3)
    assert (set(clickhouse_ds.df.columns) == set(['col1', 'col2', 'col3']))

    mdb = Predictor(name='analyse_dataset_test_predictor')
    F.analyse_dataset(from_data=clickhouse_ds)
Example #24
    def test_maria_ds(self):
        from mindsdb_datasources import MariaDS

        LIMIT = 200

        maria_ds = MariaDS(host=self.HOST,
                           user=self.USER,
                           password=self.PASSWORD,
                           database=self.DATABASE,
                           port=self.PORT,
                           query='SELECT * FROM `{}` LIMIT {}'.format(
                               self.TABLE, LIMIT))

        maria_ds.df = break_dataset(maria_ds.df)

        assert len(maria_ds) <= LIMIT

        F.analyse_dataset(from_data=maria_ds)

        # Our SQL parsing succeeds here, but the query fails; test if we're still able to filter via the dataframe fallback
        maria_ds._query = maria_ds._query.replace(self.TABLE,
                                                  'wrongly_named_table')
        assert len(maria_ds.filter([['Population', '<', 33098932]], 8)) == 8
        assert len(maria_ds.filter([['Development_Index', '!=', 3]], 12)) == 12
Example #25
    def test_category_tags_input(self):
        vocab = random.sample(SMALL_VOCAB, 10)
        # tags contains up to 2 randomly selected tags
        # y contains the sum of indices of tags
        # the dataset should be nearly perfectly predicted
        n_points = 5000
        tags = []
        y = []
        for i in range(n_points):
            row_tags = []
            row_y = 0
            for k in range(2):
                if random.random() > 0.2:
                    selected_index = random.randint(0, len(vocab) - 1)
                    if vocab[selected_index] not in row_tags:
                        row_tags.append(vocab[selected_index])
                        row_y += selected_index
            tags.append(','.join(row_tags))
            y.append(row_y)

        df = pd.DataFrame({'tags': tags, 'y': y})

        df_train = df.iloc[:round(n_points * 0.9)]
        df_test = df.iloc[round(n_points * 0.9):]

        predictor = Predictor(name='test')

        predictor.learn(from_data=df_train,
                        to_predict='y',
                        advanced_args=dict(deduplicate_data=False),
                        stop_training_in_x_seconds=40,
                        use_gpu=False)

        model_data = F.get_model_data('test')
        assert model_data['data_analysis_v2']['tags']['typing'][
            'data_type'] == DATA_TYPES.CATEGORICAL
        assert model_data['data_analysis_v2']['tags']['typing'][
            'data_subtype'] == DATA_SUBTYPES.TAGS

        predictions = predictor.predict(when_data=df_test)
        test_y = df_test.y.apply(str)

        predicted_y = []
        for i in range(len(predictions)):
            predicted_y.append(predictions[i]['y'])

        score = accuracy_score(test_y, predicted_y)
        assert score >= 0.2
Example #26
    def get_model_data(self, name, db_fix=True):
        model = F.get_model_data(name)

        # Make some corrections so databases don't break when dealing with empty columns
        if db_fix:
            data_analysis = model['data_analysis_v2']
            for column in data_analysis['columns']:
                analysis = data_analysis.get(column)
                if isinstance(analysis,
                              dict) and (len(analysis) == 0 or analysis.get(
                                  'empty', {}).get('is_empty', False)):
                    data_analysis[column]['typing'] = {
                        'data_subtype': DATA_SUBTYPES.INT
                    }

        return model
Example #27
    def test_mssql_ds(self):
        from mindsdb_datasources import MSSQLDS

        HOST = DB_CREDENTIALS['mssql']['host']
        USER = DB_CREDENTIALS['mssql']['user']
        PASSWORD = DB_CREDENTIALS['mssql']['password']
        DATABASE = DB_CREDENTIALS['mssql']['database']
        PORT = DB_CREDENTIALS['mssql']['port']

        mssql_ds = MSSQLDS(query='SELECT * FROM dbo.insurance',
                           host=HOST,
                           user=USER,
                           password=PASSWORD,
                           database=DATABASE,
                           port=PORT)

        assert (len(mssql_ds.df) > 200)
        analysis = F.analyse_dataset(from_data=mssql_ds)
Example #28
    def get_model_data(self, name, db_fix=True):
        from mindsdb_native import F
        from mindsdb_native.libs.constants.mindsdb import DATA_SUBTYPES
        from mindsdb.interfaces.storage.db import session, Predictor

        predictor_record = Predictor.query.filter_by(
            company_id=self.company_id, name=name, is_custom=False).first()
        predictor_record = self._try_outdate_db_status(predictor_record)
        model = predictor_record.data
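        # If the stored analysis is missing or still marked as training, try to pull
        # the predictor from storage and refresh the record from the native model data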
        if model is None or model['status'] == 'training':
            try:
                self.fs_store.get(
                    name, f'predictor_{self.company_id}_{predictor_record.id}',
                    self.config['paths']['predictors'])
                new_model_data = F.get_model_data(name)
            except Exception:
                new_model_data = None

            if predictor_record.data is None or (
                    new_model_data is not None
                    and len(new_model_data) > len(predictor_record.data)):
                predictor_record.data = new_model_data
                model = new_model_data
                session.commit()

        # Make some corrections so databases don't break when dealing with empty columns
        if db_fix:
            data_analysis = model['data_analysis_v2']
            for column in model['columns']:
                analysis = data_analysis.get(column)
                if isinstance(analysis,
                              dict) and (len(analysis) == 0 or analysis.get(
                                  'empty', {}).get('is_empty', False)):
                    data_analysis[column]['typing'] = {
                        'data_subtype': DATA_SUBTYPES.INT
                    }

        model['created_at'] = str(
            parse_datetime(str(predictor_record.created_at).split('.')[0]))
        model['updated_at'] = str(
            parse_datetime(str(predictor_record.updated_at).split('.')[0]))
        model['predict'] = predictor_record.to_predict
        model['update'] = predictor_record.update_status
        return self._pack(model)
Example #29
    def predict(self, name, when_data=None, kwargs={}):
        if name not in self.predictor_cache:
            # Clear the cache entirely if we have less than 1.2 GB of memory left
            if psutil.virtual_memory().available < 1.2 * pow(10, 9):
                self.predictor_cache = {}

            if F.get_model_data(name)['status'] == 'complete':
                self.predictor_cache[name] = {
                    'predictor':
                    mindsdb_native.Predictor(name=name,
                                             run_env={'trigger': 'mindsdb'}),
                    'created':
                    datetime.datetime.now()
                }

        predictions = self.predictor_cache[name]['predictor'].predict(
            when_data=when_data, **kwargs)

        return predictions
Example #30
    def get_models(self, status='any'):
        models = F.get_models()
        if status != 'any':
            models = [x for x in models if x['status'] == status]
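        # Drop models still marked as 'training' from before the last MindsDB restart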
        models = [
            x for x in models
            if x['status'] != 'training' or parse_datetime(x['created_at']) >
            parse_datetime(self.config['mindsdb_last_started_at'])
        ]

        for i in range(len(models)):
            for k in ['train_end_at', 'updated_at', 'created_at']:
                if k in models[i] and models[i][k] is not None:
                    try:
                        models[i][k] = parse_datetime(
                            str(models[i][k]).split('.')[0])
                    except Exception:
                        models[i][k] = parse_datetime(str(models[i][k]))
        return models