示例#1
0
    def test_explain_prediction(self):
        """Train on a small synthetic frame and check the prediction explanation.

        The target column ``numeric_y`` is built as
        ``numeric_x + 2 * categorical_x``, so it is fully determined by the
        two input columns.
        """
        mdb = Predictor(name='test_home_rentals')

        row_count = 100
        row_ids = range(row_count)
        input_dataframe = pd.DataFrame(
            {
                'numeric_x': list(row_ids),
                # 1 on even rows, 0 on odd rows.
                'categorical_x': [1 if i % 2 == 0 else 0 for i in row_ids],
            },
            index=list(row_ids),
        )
        input_dataframe['numeric_y'] = (
            input_dataframe['numeric_x'] + 2 * input_dataframe['categorical_x']
        )

        mdb.learn(
            from_data=input_dataframe,
            to_predict='numeric_y',
            stop_training_in_x_seconds=1,
        )

        predictions = mdb.predict(when={"numeric_x": 10, 'categorical_x': 1})
        explanation = predictions[0].explanation['numeric_y']

        assert explanation['predicted_value'] is not None
        assert explanation['confidence_interval']
        assert explanation['confidence'] >= 0.8
        assert explanation['important_missing_information'] == []
        assert explanation['prediction_quality'] == 'very confident'
示例#2
0
    def test_analyze_dataset_empty_values(self):
        """Check that a column with every other value missing is 50% empty.

        The frame has a single column, ``numeric_int``, whose even-positioned
        rows are missing and whose odd-positioned rows hold the row number.
        """
        predictor = Predictor(name='test')

        n_points = 100
        # Build the missing values directly into the column. The previous
        # chained assignment (``df[col].iloc[::2] = None``) writes to an
        # intermediate Series and is not guaranteed to modify the DataFrame
        # (it never does under pandas copy-on-write).
        input_dataframe = pd.DataFrame({
            'numeric_int': [None if i % 2 == 0 else i for i in range(n_points)],
        }, index=list(range(n_points)))

        model_data = predictor.analyse_dataset(from_data=input_dataframe)

        assert model_data['data_analysis_v2']['numeric_int']['empty']['empty_percentage'] == 50
示例#3
0
    def test_analyze_dataset_empty_column(self):
        """Check that a column holding only ``None`` is flagged as empty."""
        predictor = Predictor(name='test')

        row_count = 100
        row_ids = list(range(row_count))
        columns = {
            'numeric_int': list(range(row_count)),
            'empty_column': [None] * row_count,
        }
        input_dataframe = pd.DataFrame(columns, index=row_ids)

        analysis = predictor.analyse_dataset(from_data=input_dataframe)
        empty_info = analysis['data_analysis_v2']['empty_column']['empty']

        assert empty_info['is_empty'] is True
示例#4
0
def test():
    """Start learning ``rental_price`` from the hosted home-rentals CSV.

    ``CONFIG.DEBUG_BREAK_POINT`` is set to ``PHASE_DATA_EXTRACTOR`` before the
    predictor is created. ``PHASE_DATA_EXTRACTOR`` is not defined in this
    function and must already be in scope.
    """
    from mindsdb.libs.controllers.predictor import Predictor
    from mindsdb import CONFIG

    CONFIG.DEBUG_BREAK_POINT = PHASE_DATA_EXTRACTOR

    # Where to learn from: a path to a file, or (as here) a URL.
    data_location = "https://raw.githubusercontent.com/mindsdb/mindsdb/master/docs/examples/basic/home_rentals.csv"
    # The column to learn to predict given all the data in the file.
    target_column = 'rental_price'

    mdb = Predictor(name='home_rentals')
    mdb.learn(
        from_data=data_location,
        to_predict=target_column,
        sample_margin_of_error=0.02,
    )
示例#5
0
    def test_explain_prediction(self):
        """Train on the hosted home-rentals CSV and check the explanation fields.

        The first ``rental_price`` entry returned by ``explain()`` must have a
        truthy value for each of the checked keys.
        """
        mdb = Predictor(name='test_home_rentals')

        learn_options = {
            'from_data': "https://s3.eu-west-2.amazonaws.com/mindsdb-example-data/home_rentals.csv",
            'to_predict': 'rental_price',
            'stop_training_in_x_seconds': 1,
        }
        mdb.learn(**learn_options)

        predictions = mdb.predict(when={"number_of_rooms": 2, "sqft": 1384})
        first_explanation = predictions[0].explain()['rental_price'][0]

        required_keys = ('value', 'confidence', 'explanation', 'simple', 'model_result')
        for key in required_keys:
            assert first_explanation[key]
示例#6
0
    def test_analyze_dataset(self):
        """Analyse a synthetic frame and check each column's reported typing.

        The expected type and subtype of every column are read from
        ``test_column_types`` (defined outside this method) at positions 0 and
        1 of the entry for that column.
        """
        predictor = Predictor(name='test')

        n_points = 100
        n_category_values = 4
        # Category values 0..n_category_values-1 repeated to fill a column.
        category_cycle = list(range(n_category_values)) * (n_points // n_category_values)

        columns = {}
        columns['numeric_int'] = list(range(n_points))
        columns['numeric_float'] = np.linspace(0, n_points, n_points)
        # ISO timestamps stepping back one minute per row.
        columns['date_timestamp'] = [
            (datetime.now() - timedelta(minutes=int(offset))).isoformat()
            for offset in range(n_points)
        ]
        # Calendar dates stepping back one day per row.
        columns['date_date'] = [
            (datetime.now() - timedelta(days=offset)).strftime('%Y-%m-%d')
            for offset in range(n_points)
        ]
        columns['categorical_str'] = [f'a{value}' for value in category_cycle]
        columns['categorical_int'] = list(category_cycle)
        columns['categorical_binary'] = [0, 1] * (n_points // 2)
        columns['sequential_array'] = [f"1,2,3,4,5,{i}" for i in range(n_points)]
        columns['sequential_text'] = [
            f'lorem ipsum long text {i}' for i in range(n_points)
        ]

        input_dataframe = pd.DataFrame(columns, index=list(range(n_points)))

        model_data = predictor.analyse_dataset(from_data=input_dataframe)
        analysis = model_data['data_analysis_v2']
        for column_name, column_report in analysis.items():
            wanted_type = test_column_types[column_name][0]
            wanted_subtype = test_column_types[column_name][1]
            typing_report = column_report['typing']
            assert typing_report['data_type'] == wanted_type
            assert typing_report['data_subtype'] == wanted_subtype

            assert column_report['empty']
            assert column_report['histogram']
            assert 'percentage_buckets' in column_report
            assert 'nr_warnings' in column_report
            assert not column_report['is_foreign_key']
示例#7
0
    def test_data_source_setting(self):
        """Check that subtypes set on a ``FileDS`` are reflected in the analysis.

        The same CSV is analysed twice: once with an empty subtype mapping and
        once with explicit subtypes for three columns. The typing reported for
        those columns is then compared between the two analyses.
        """
        data_url = 'https://raw.githubusercontent.com/mindsdb/mindsdb-examples/master/benchmarks/german_credit_data/processed_data/test.csv'

        data_source = FileDS(data_url)
        data_source.set_subtypes({})

        subtype_overrides = {
            'credit_usage': 'Int',
            'Average_Credit_Balance': 'Text',
            'existing_credits': 'Binary Category'
        }
        data_source_mod = FileDS(data_url)
        data_source_mod.set_subtypes(subtype_overrides)

        analysis = Predictor('analyzer1').analyse_dataset(data_source)
        analysis_mod = Predictor('analyzer2').analyse_dataset(data_source_mod)

        plain = analysis['data_analysis_v2']
        modified = analysis_mod['data_analysis_v2']

        def typing_pair(column):
            # Typing reports of one column: (without overrides, with overrides).
            return plain[column]['typing'], modified[column]['typing']

        assert len(plain) == len(modified)

        # Column without an override: same data type in both analyses.
        before, after = typing_pair('over_draft')
        assert before['data_type'] == after['data_type']

        before, after = typing_pair('credit_usage')
        assert before['data_type'] == after['data_type']
        assert before['data_subtype'] != after['data_subtype']
        assert after['data_subtype'] == DATA_SUBTYPES.INT

        before, after = typing_pair('Average_Credit_Balance')
        assert before['data_type'] != after['data_type']
        assert before['data_subtype'] != after['data_subtype']
        assert after['data_subtype'] == DATA_SUBTYPES.TEXT
        assert after['data_type'] == DATA_TYPES.SEQUENTIAL

        before, after = typing_pair('existing_credits')
        assert before['data_type'] == after['data_type']
        assert before['data_subtype'] != after['data_subtype']
        assert after['data_subtype'] == DATA_SUBTYPES.SINGLE
示例#8
0
def test():
    """Train a home-rentals predictor and explain two single-row predictions."""
    from mindsdb.libs.controllers.predictor import Predictor
    from mindsdb import CONFIG

    mdb = Predictor(name='home_rentals')

    # Where to learn from: a path to a file, or (as here) a URL.
    data_location = "https://s3.eu-west-2.amazonaws.com/mindsdb-example-data/home_rentals.csv"
    mdb.learn(
        from_data=data_location,
        # The column to learn to predict given all the data in the file.
        to_predict='rental_price',
        stop_training_in_x_seconds=6,
    )

    # Use the model to make one prediction per query and explain each.
    queries = (
        {"number_of_rooms": 2, "sqft": 1384},
        {"number_of_rooms": 1, "sqft": 384},
    )
    for when in queries:
        result = mdb.predict(when=when)
        result[0].explain()
示例#9
0
    def test_timeseries(self, tmp_path):
        """Train an ordered, windowed predictor on generated data and predict.

        Each generated column carries its header at position 0. Values up to
        the three-quarter mark go to the training file (features and labels);
        the remainder of the feature columns goes to the test file.
        """
        ts_hours = 12
        data_len = 120
        out_dir = str(tmp_path)
        train_file_name = os.path.join(out_dir, 'train_data.csv')
        test_file_name = os.path.join(out_dir, 'test_data.csv')

        features = generate_value_cols(
            ['date', 'int'], data_len, ts_hours * 3600)
        labels = [generate_timeseries_labels(features)]

        feature_headers = [column[0] for column in features]
        label_headers = [column[0] for column in labels]

        def train_slice(column):
            # Skip the header, stop at the three-quarter mark.
            return column[1:int(len(column) * 3 / 4)]

        def test_slice(column):
            # Everything from the three-quarter mark onwards.
            return column[int(len(column) * 3 / 4):]

        # Create the training dataset and save it to a file.
        columns_train = [train_slice(column) for column in features]
        columns_train.extend(train_slice(column) for column in labels)
        columns_to_file(
            columns_train,
            train_file_name,
            headers=feature_headers + label_headers)

        # Create the testing dataset and save it to a file.
        columns_test = [test_slice(column) for column in features]
        columns_to_file(columns_test, test_file_name, headers=feature_headers)

        mdb = Predictor(name='test_timeseries')

        mdb.learn(
            from_data=train_file_name,
            to_predict=label_headers,
            order_by=feature_headers[0],
            window_size=3,
            stop_training_in_x_seconds=1)

        results = mdb.predict(when_data=test_file_name, use_gpu=False)

        for row in results:
            target = label_headers[0]
            for expected in (target, target + '_confidence'):
                assert expected in row

        models = mdb.get_models()
        first_model_name = models[0]['name']
        model_data = mdb.get_model_data(first_model_name)
        assert model_data
示例#10
0
    def test_multilabel_prediction(self, tmp_path):
        """Train a predictor with two target columns and check its output rows.

        Each generated column carries its header at position 0. Values up to
        the three-quarter mark go to the training file (features and labels);
        the remainder of the feature columns goes to the test file. Every
        result row must contain each label and its ``_confidence`` column.
        """
        out_dir = str(tmp_path)
        train_file_name = os.path.join(out_dir, 'train_data.csv')
        test_file_name = os.path.join(out_dir, 'test_data.csv')
        data_len = 60

        features = generate_value_cols(
            ['int', 'float', 'int', 'float'], data_len)
        labels = [
            generate_log_labels(features),
            generate_timeseries_labels(features),
        ]

        feature_headers = [column[0] for column in features]
        label_headers = [column[0] for column in labels]

        def train_slice(column):
            # Skip the header, stop at the three-quarter mark.
            return column[1:int(len(column) * 3 / 4)]

        def test_slice(column):
            # Everything from the three-quarter mark onwards.
            return column[int(len(column) * 3 / 4):]

        # Create the training dataset and save it to a file.
        columns_train = [train_slice(column) for column in features]
        columns_train.extend(train_slice(column) for column in labels)
        columns_to_file(
            columns_train,
            train_file_name,
            headers=feature_headers + label_headers)

        # Create the testing dataset and save it to a file.
        columns_test = [test_slice(column) for column in features]
        columns_to_file(columns_test, test_file_name, headers=feature_headers)

        mdb = Predictor(name='test_multilabel_prediction')
        mdb.learn(
            from_data=train_file_name,
            to_predict=label_headers,
            stop_training_in_x_seconds=1)

        results = mdb.predict(when_data=test_file_name)
        models = mdb.get_models()
        first_model_name = models[0]['name']
        model_data = mdb.get_model_data(first_model_name)
        assert model_data

        for row in results:
            for label in label_headers:
                for expected in (label, label + '_confidence'):
                    assert expected in row
示例#11
0
    def test_custom_backend(self):
        """Train and predict through a user-supplied backend object.

        A small backend class is defined inline, passed as ``backend`` to both
        ``learn`` and ``predict``, and the test asserts that the resulting
        predictions are truthy.
        """
        predictor = Predictor(name='custom_model_test_predictor')

        class CustomDTModel:
            """Backend that label-encodes every column and fits a linear regression."""

            def __init__(self):
                self.clf = LinearRegression()

            def set_transaction(self, transaction):
                """Store ``transaction`` and derive the column lists from its ``lmd``."""
                self.transaction = transaction
                self.output_columns = self.transaction.lmd['predict_columns']
                # Every known column that is not a prediction target is an input.
                self.input_columns = [x for x in self.transaction.lmd['columns']
                                      if x not in self.output_columns]
                self.train_df = self.transaction.input_data.train_df
                self.test_dt = self.transaction.input_data.test_df

            def train(self):
                """Fit one encoder per column, then fit the regression on the train split."""
                self.le_arr = {}
                for col in [*self.output_columns, *self.input_columns]:
                    self.le_arr[col] = preprocessing.LabelEncoder()
                    # Fit on train + test + validation so that every value seen
                    # in any split has an encoding.
                    self.le_arr[col].fit(pd.concat(
                        [self.transaction.input_data.train_df,
                         self.transaction.input_data.test_df,
                         self.transaction.input_data.validation_df])[col])

                X = []
                for col in self.input_columns:
                    X.append(self.le_arr[col].transform(
                        self.transaction.input_data.train_df[col]))

                # X was collected one encoded column at a time; swap the axes
                # so that rows come first.
                X = np.swapaxes(X, 1, 0)

                # Only works with one output column
                Y = self.le_arr[self.output_columns[0]].transform(
                    self.transaction.input_data.train_df[self.output_columns[0]])

                self.clf.fit(X, Y)

            def predict(self, mode='predict', ignore_columns=None):
                """Predict the first output column for the frame selected by ``mode``.

                Args:
                    mode: ``'predict'``, ``'validate'`` or ``'test'``; selects
                        which frame of the transaction's input data is used.
                    ignore_columns: Accepted but not used by this backend.

                Returns:
                    A dict mapping the first output column name to the
                    regression's predictions.

                Raises:
                    ValueError: If ``mode`` is not one of the supported values.
                """
                input_data = self.transaction.input_data
                if mode == 'predict':
                    df = input_data.data_frame
                elif mode == 'validate':
                    df = input_data.validation_df
                elif mode == 'test':
                    df = input_data.test_df
                else:
                    # Fail with a clear message instead of an UnboundLocalError
                    # on ``df`` further down.
                    raise ValueError(f"Unsupported prediction mode: {mode!r}")

                X = []
                for col in self.input_columns:
                    X.append(self.le_arr[col].transform(df[col]))

                # Same column-list to row-first conversion as in ``train``.
                X = np.swapaxes(X, 1, 0)

                predictions = self.clf.predict(X)

                formated_predictions = {self.output_columns[0]: predictions}

                return formated_predictions

        dt_model = CustomDTModel()

        predictor.learn(to_predict='rental_price',
                        from_data="https://s3.eu-west-2.amazonaws.com/mindsdb-example-data/home_rentals.csv",
                        backend=dt_model)
        predictions = predictor.predict(
            when_data="https://s3.eu-west-2.amazonaws.com/mindsdb-example-data/home_rentals.csv",
            backend=dt_model)

        assert predictions
示例#12
0
def session():
    """Create and return a ``Predictor`` named ``'test'``."""
    predictor = Predictor(name='test')
    return predictor