def test_multiple_categories_as_input(self):
        """Check that a multiple-categorical input column can drive a numeric target.

        Every row holds up to two distinct tags drawn at random from the
        vocabulary, and ``y`` is the sum of the vocabulary indices of the tags
        that were kept, so the target is fully determined by the tags.
        """
        vocab = self.get_vocab(10)
        n_points = 10000

        tags, y = [], []
        for _row in range(n_points):
            chosen_tags = []
            index_total = 0
            for _slot in range(2):
                # A slot stays empty whenever the draw is not above 0.2.
                if random.random() <= 0.2:
                    continue
                picked = random.randint(0, len(vocab) - 1)
                tag = vocab[picked]
                # A tag that was already drawn for this row is counted once only.
                if tag in chosen_tags:
                    continue
                chosen_tags.append(tag)
                index_total += picked
            tags.append(chosen_tags)
            y.append(index_total)

        df = pd.DataFrame({'tags': tags, 'y': y})

        input_features = [
            {'name': 'tags', 'type': ColumnDataTypes.MULTIPLE_CATEGORICAL}
        ]
        output_features = [
            {'name': 'y', 'type': ColumnDataTypes.NUMERIC}
        ]
        config = {
            'input_features': input_features,
            'output_features': output_features,
            'mixer': {
                'class': NnMixer,
                'kwargs': {'stop_training_after_seconds': 10}
            }
        }

        # First 90% of the rows train the model, the remaining 10% evaluate it.
        split_at = round(n_points * 0.9)
        df_train = df.iloc[:split_at]
        df_test = df.iloc[split_at:]

        predictor = Predictor(config)
        predictor.learn(from_data=df_train)
        predictions = predictor.predict(when_data=df_test)

        score = r2_score(df_test.y, predictions['y']['predictions'])
        print('Test R2 score', score)
        # Training is capped at a few seconds, so the threshold is deliberately
        # low: this only checks that the model learns something and predicts
        # properly, it is not a benchmark.
        self.assertGreaterEqual(score, 0.15)
Exemplo n.º 2
0
    def test_learn_and_predict_nnmixer(self):
        """Train an NnMixer predictor on the home rentals CSV and predict with it.

        The test makes no assertion on prediction quality; it passes as long as
        learning and predicting complete without raising.
        """
        input_features = [
            {'name': 'sqft', 'type': 'numeric'},
            {'name': 'days_on_market', 'type': 'numeric'},
            {'name': 'neighborhood', 'type': 'categorical', 'dropout': 0.4},
        ]
        output_features = [
            {
                'name': 'number_of_rooms',
                'type': 'categorical',
                'weights': {'0': 0.8, '1': 0.6, '2': 0.5, '3': 0.7, '4': 1},
            },
            {
                'name': 'number_of_bathrooms',
                'type': 'categorical',
                'weights': {'0': 0.8, '1': 0.6, '2': 4},
            },
            {'name': 'rental_price', 'type': 'numeric'},
            {'name': 'location', 'type': 'categorical'},
        ]
        config = {
            'input_features': input_features,
            'output_features': output_features,
            'mixer': {
                'class': NnMixer,
                'kwargs': {
                    'eval_every_x_epochs': 4,
                    'stop_training_after_seconds': 10,
                },
            },
        }

        df = pd.read_csv(
            'https://mindsdb-example-data.s3.eu-west-2.amazonaws.com/home_rentals.csv'
        )

        predictor = Predictor(config)
        predictor.learn(from_data=df)

        # Predict from a frame that no longer contains the target columns.
        target_columns = [feature['name'] for feature in config['output_features']]
        df = df.drop(target_columns, axis=1)
        predictor.predict(when_data=df)
    def test_multiple_categories_as_output(self):
        """Check that a multiple-categorical column can be predicted from two categorical inputs.

        ``x1`` and ``x2`` each hold a vocabulary index, or -1 when that slot is
        empty. ``tags`` holds the distinct vocabulary entries looked up for
        those two indices, so the target is fully determined by the inputs.

        NOTE: the original author recorded this test as failing with
        "AssertionError: 0.0 not greater than or equal to 0.15".
        """
        vocab = self.get_vocab(10)
        n_points = 10000

        def draw_index():
            # -1 marks an empty slot; otherwise a random vocabulary index.
            if random.random() > 0.2:
                return random.randint(0, len(vocab) - 1)
            return -1

        x1 = [draw_index() for _ in range(n_points)]
        x2 = [draw_index() for _ in range(n_points)]

        tags = []
        for first_index, second_index in zip(x1, x2):
            # The set collapses a tag that appears in both slots; lookups that
            # return None (presumably the -1 marker) are dropped.
            looked_up = set([vocab.get(first_index), vocab.get(second_index)])
            tags.append([tag for tag in looked_up if tag is not None])

        df = pd.DataFrame({'x1': x1, 'x2': x2, 'tags': tags})

        config = {
            'input_features': [
                {'name': 'x1', 'type': ColumnDataTypes.CATEGORICAL},
                {'name': 'x2', 'type': ColumnDataTypes.CATEGORICAL}
            ],
            'output_features': [
                {'name': 'tags', 'type': ColumnDataTypes.MULTIPLE_CATEGORICAL}
            ],
            'mixer': {'class': NnMixer, 'kwargs': {'stop_training_after_seconds': 25}}
        }

        # First 90% of the rows train the model, the remaining 10% are held out.
        split_at = round(n_points * 0.9)
        df_train = df.iloc[:split_at]
        df_test = df.iloc[split_at:]

        predictor = Predictor(config)
        predictor.learn(from_data=df_train)

        # Score the training rows first, then the held-out rows, the same way.
        for label, frame in (('Train f1 score', df_train), ('Test f1 score', df_test)):
            predictions = predictor.predict(when_data=frame)
            encoder = predictor._mixer.encoders['tags']
            expected_encoded = encoder.encode(frame.tags)
            predicted_encoded = encoder.encode(predictions['tags']['predictions'])
            score = f1_score(expected_encoded, predicted_encoded, average='weighted')
            print(label, score)
            self.assertGreaterEqual(score, 0.15)
Exemplo n.º 4
0
        'name': 'next',
        'type': 'numeric'
    }]
}


def iter_function(epoch, error, test_error, test_error_gradient):
    """Print one progress line for a training iteration.

    The line contains the four arguments plus ``train_accuracy`` read from
    the module-level ``predictor``.
    """
    template = 'epoch: {iter}, error: {error}, test_error: {test_error}, test_error_gradient: {test_error_gradient}, accuracy: {accuracy}'
    fields = {
        'iter': epoch,
        'error': error,
        'test_error': test_error,
        'test_error_gradient': test_error_gradient,
        'accuracy': predictor.train_accuracy,
    }
    print(template.format(**fields))


# Build the training frame from the rows prepared earlier in the script.
data = pandas.DataFrame(ts_data, columns=['time', 'ts', 'next'])

predictor = Predictor(config)
predictor.learn(
    from_data=data,
    callback_on_iter=iter_function,
    eval_every_x_epochs=10,
)

# Query with a space-separated window of sine samples for steps 11 .. 10 + ts_len - 1.
# NOTE(review): `max` is a name defined outside this excerpt that shadows the
# builtin; presumably the period divisor used to generate ts_data - confirm.
ret = predictor.predict(when={
    'ts': " ".join(str(math.sin(step / max)) for step in range(11, 10 + ts_len))
})

# Print the same window extended by one more step, followed by the prediction.
print(" ".join(str(math.sin(step / max)) for step in range(11, 10 + ts_len + 1)))
print(ret)
Exemplo n.º 5
0
import pandas as pd
from lightwood import Predictor

####################
# Six input features (four numeric, two categorical) and one numeric target.
config = {
    'input_features': [
        {'name': 'number_of_rooms', 'type': 'numeric'},
        {'name': 'number_of_bathrooms', 'type': 'numeric'},
        {'name': 'sqft', 'type': 'numeric'},
        {'name': 'location', 'type': 'categorical'},
        {'name': 'days_on_market', 'type': 'numeric'},
        {'name': 'neighborhood', 'type': 'categorical'},
    ],
    'output_features': [
        {'name': 'rental_price', 'type': 'numeric'},
    ],
}

# Home rentals example data, fetched over HTTP.
df = pd.read_csv(
    "https://mindsdb-example-data.s3.eu-west-2.amazonaws.com/home_rentals.csv"
)

predictor = Predictor(config)

def iter_function(epoch, error, test_error, test_error_gradient):
    """Print a single progress line for the given epoch.

    ``accuracy`` in the output is ``train_accuracy`` of the module-level
    ``predictor``; everything else comes from the arguments.
    """
    line_template = 'epoch: {iter}, error: {error}, test_error: {test_error}, test_error_gradient: {test_error_gradient}, accuracy: {accuracy}'
    accuracy = predictor.train_accuracy
    message = line_template.format(
        iter=epoch,
        error=error,
        test_error=test_error,
        test_error_gradient=test_error_gradient,
        accuracy=accuracy,
    )
    print(message)


# Train on the full frame, reporting progress through iter_function.
predictor.learn(from_data=df, callback_on_iter=iter_function, eval_every_x_epochs=10)

# Predict for a single row; only four of the six configured input features
# are supplied ('days_on_market' and 'neighborhood' are left out).
print(predictor.predict(when={'number_of_rooms':3, 'number_of_bathrooms':2, 'sqft':700, 'location':'great'}))
Exemplo n.º 6
0
# Placeholder so iter_function can refer to `predictor` before one is loaded.
predictor = None
# Path of the saved predictor that is loaded further down.
file = '/tmp/predictor.mdb'

def iter_function(epoch, error, test_error, test_error_gradient, test_accuracy):
    """Print one progress line for a training iteration.

    The line contains the five arguments plus ``train_accuracy`` read from
    the module-level ``predictor``.
    """
    template = 'epoch: {iter}, error: {error}, test_error: {test_error}, test_error_gradient: {test_error_gradient}, accuracy: {accuracy}, test_accuracy: {test_accuracy}'
    values = dict(
        iter=epoch,
        error=error,
        test_error=test_error,
        test_error_gradient=test_error_gradient,
        accuracy=predictor.train_accuracy,
        test_accuracy=test_accuracy,
    )
    print(template.format(**values))

config = config_rp

# Disabled training run; presumably how the saved file loaded below was made.
# predictor = Predictor(config)
# predictor.learn(from_data=df, callback_on_iter=iter_function, eval_every_x_epochs=10)
# predictor.save('/tmp/predictor.mdb')

predictor = Predictor(load_from_path=file)
print(predictor.overall_certainty)

# Repeat the same query; the lower the overall certainty, the more repetitions.
# Only non-None predictions of the first output feature are kept.
ret = []
for i in range(int(100 - predictor.overall_certainty * 100)):
    ret_val = predictor.predict(
        when={'number_of_bedrooms': 2, 'sqft': 2300}
    )[config['output_features'][0]['name']]['predictions'][0]
    if ret_val is None:
        continue
    ret.append(ret_val)


# Bucket the collected predictions and count how many there are in total.
hist, bin_edges = numpy.histogram(ret)
net = sum(hist)

for j, count in enumerate(hist):
Exemplo n.º 7
0
def run_full_test(USE_CUDA, CACHE_ENCODED_DATA, SELFAWARE, PLINEAR):
    '''
    Run full test example with home_rentals dataset.

    Trains a predictor for ``number_of_rooms`` on the home rentals CSV,
    predicts on the frame without the target column, round-trips the
    predictor through ``test.pkl`` and then queries it for 100 ``sqft``
    values.

    :param USE_CUDA: value written to ``lightwood.config.config.CONFIG.USE_CUDA``
    :param CACHE_ENCODED_DATA: value used for the data source's
        ``cache_transformed_data`` option
    :param SELFAWARE: value used for the mixer's ``selfaware`` option
    :param PLINEAR: value written to ``lightwood.config.config.CONFIG.PLINEAR``
    '''
    lightwood.config.config.CONFIG.USE_CUDA = USE_CUDA
    lightwood.config.config.CONFIG.PLINEAR = PLINEAR

    config = {
        'input_features': [{
            'name': 'number_of_bathrooms',
            'type': 'numeric'
        }, {
            'name': 'sqft',
            'type': 'numeric'
        }, {
            'name': 'location',
            'type': 'categorical'
        }, {
            'name': 'days_on_market',
            'type': 'numeric'
        }, {
            'name': 'neighborhood',
            'type': 'categorical',
            'dropout': 0.4
        }, {
            'name': 'rental_price',
            'type': 'numeric'
        }],
        'output_features': [{
            'name': 'number_of_rooms',
            'type': 'categorical',
            # 'weights':{
            #       '0': 0.8,
            #       '1': 0.6,
            #       '2': 0.5,
            #       '3': 0.7,
            #       '4': 1,
            # }
        }],
        'data_source': {
            'cache_transformed_data': CACHE_ENCODED_DATA
        },
        'mixer': {
            'class': lightwood.BUILTIN_MIXERS.NnMixer,
            'selfaware': SELFAWARE
        }
    }

    df = pd.read_csv(
        "https://mindsdb-example-data.s3.eu-west-2.amazonaws.com/home_rentals.csv"
    )

    def iter_function(epoch, error, test_error, test_error_gradient,
                      test_accuracy):
        """Print one progress line built only from the callback arguments."""
        # The template has no {accuracy} field, so nothing beyond the
        # arguments is passed to format().
        print(
            'epoch: {iter}, error: {error}, test_error: {test_error}, test_error_gradient: {test_error_gradient}, test_accuracy: {test_accuracy}'
            .format(iter=epoch,
                    error=error,
                    test_error=test_error,
                    test_error_gradient=test_error_gradient,
                    test_accuracy=test_accuracy))

    predictor = Predictor(config)
    # stop_training_after_seconds given in order to not get timeouts in travis
    predictor.learn(from_data=df,
                    callback_on_iter=iter_function,
                    eval_every_x_epochs=4,
                    stop_training_after_seconds=40)

    # Predict from a frame that no longer contains the target column.
    df = df.drop([x['name'] for x in config['output_features']], axis=1)
    predictor.predict(when_data=df)

    # Round-trip through disk so the reloaded predictor is the one queried below.
    predictor.save('test.pkl')
    predictor = Predictor(load_from_path='test.pkl')

    # Tally how often each number_of_rooms value is predicted for sqft 0, 10, ..., 990.
    preds = {}
    for j in range(100):
        pred = predictor.predict(
            when={'sqft': round(j * 10)})['number_of_rooms']['predictions'][0]
        preds[pred] = preds.get(pred, 0) + 1
Exemplo n.º 8
0
        'type': 'image',
        'encoder_attrs': {
            'aim': 'speed'
        }
    }],
    'output_features': [{
        'name': 'superclass',
        'type': 'categorical',
        'encoder_attrs': {}
    }, {
        'name': 'class',
        'type': 'categorical',
        'encoder_attrs': {}
    }]
}
# Build the predictor from the config dict that ends just above.
predictor = Predictor(config)


def iter_function(epoch, error, test_error, test_error_gradient):
    """Print a progress line for one training iteration.

    Alongside the arguments, the line reports ``train_accuracy`` of the
    module-level ``predictor``.
    """
    report = 'epoch: {iter}, error: {error}, test_error: {test_error}, test_error_gradient: {test_error_gradient}, accuracy: {accuracy}'
    print(report.format_map({
        'iter': epoch,
        'error': error,
        'test_error': test_error,
        'test_error_gradient': test_error_gradient,
        'accuracy': predictor.train_accuracy,
    }))


# Train on the local sample CSV, reporting progress through iter_function
# (eval_every_x_epochs=1).
predictor.learn(from_data=pd.read_csv('train_sample.csv'),
                callback_on_iter=iter_function,
                eval_every_x_epochs=1)
Exemplo n.º 9
0
        'name': 'number_of_rooms',
        'type': 'categorical'
    }, {
        'name': 'rental_price',
        'type': 'numeric'
    }],
    'mixer': {
        'class': lightwood.BUILTIN_MIXERS.BayesianNnMixer
    }
}
#'mixer':{'class': lightwood.BUILTIN_MIXERS.NnMixer}}

# Home rentals example data, fetched over HTTP.
df = pd.read_csv(
    "https://mindsdb-example-data.s3.eu-west-2.amazonaws.com/home_rentals.csv")

# Build the predictor from the config dict defined above.
predictor = Predictor(config)


def iter_function(epoch, error, test_error, test_error_gradient,
                  test_accuracy):
    """Print one progress line for a training iteration.

    The line is built only from the five arguments. The template has no
    ``{accuracy}`` field, so no value is looked up on the module-level
    ``predictor``.
    """
    print(
        'epoch: {iter}, error: {error}, test_error: {test_error}, test_error_gradient: {test_error_gradient}, test_accuracy: {test_accuracy}'
        .format(iter=epoch,
                error=error,
                test_error=test_error,
                test_error_gradient=test_error_gradient,
                test_accuracy=test_accuracy))


predictor.learn(from_data=df,
Exemplo n.º 10
0
### Generate a dataset
datapoints = 1000

# Two columns of random integers between -10 and 10; every x value is drawn
# before any y value.
data = {
    'x': [random.randint(-10, 10) for _ in range(datapoints)],
    'y': [random.randint(-10, 10) for _ in range(datapoints)],
}

# Target column: the row-wise product of x and y.
data['z'] = [x_val * y_val for x_val, y_val in zip(data['x'], data['y'])]

data_frame = pandas.DataFrame(data)
print(data_frame)

predictor = Predictor(output=['z'])


def iter_function(epoch, error, test_error, test_error_gradient, test_accuracy):
    """Print one progress line for a training iteration.

    Only the five arguments are formatted. The template has no ``{accuracy}``
    field, so the module-level ``predictor`` is not consulted.
    """
    print(
        'epoch: {iter}, error: {error}, test_error: {test_error}, test_error_gradient: {test_error_gradient}, test_accuracy: {test_accuracy}'.format(
            iter=epoch, error=error, test_error=test_error, test_error_gradient=test_error_gradient,
            test_accuracy=test_accuracy))



# Train on the generated frame, reporting progress through iter_function.
predictor.learn(from_data=data_frame, callback_on_iter=iter_function)
# Accuracy recorded on the predictor by training.
print('accuracy')
print(predictor.train_accuracy)
# Accuracy recomputed over the same frame that was used for training.
print('accuracy over all dataset')
print(predictor.calculate_accuracy(from_data=data_frame))
Exemplo n.º 11
0
# One time-series input column ('ts') and one numeric target ('next').
config = {
    'input_features': [
        {'name': 'ts', 'type': COLUMN_DATA_TYPES.TIME_SERIES},
    ],
    'output_features': [
        {'name': 'next', 'type': 'numeric'},
    ],
}


def iter_function(epoch, training_error, test_error, test_error_gradient,
                  test_accuracy):
    """Print one progress line built from the five callback arguments."""
    progress_line = f'Epoch: {epoch}, Train Error: {training_error}, Test Error: {test_error}, Test Error Gradient: {test_error_gradient}, Test Accuracy: {test_accuracy}'
    print(progress_line)


# Build the training frame from the rows prepared earlier in the script.
data = pandas.DataFrame(ts_data, columns=['time', 'ts', 'next'])
predictor = Predictor(config)

# No progress callback is passed here.
predictor.learn(from_data=data)

print('\n\n')
# Query with the list of sine samples for steps 11 .. 10 + ts_len - 1.
# NOTE(review): `max` is a name defined outside this excerpt that shadows the
# builtin; presumably the period divisor used to generate ts_data - confirm.
ret = predictor.predict(when={
    'ts': list(map(math.sin, (step / max for step in range(11, 10 + ts_len)))),
})
# Print the same window extended by one more step, then the prediction.
print(list(map(math.sin, (step / max for step in range(11, 10 + ts_len + 1)))))
print('Got predictions: ')
print(ret)
Exemplo n.º 12
0
    def test_home_rentals(self):
        """Train on the home rentals CSV, reload the saved predictor and query it.

        Global lightwood settings and two config options come from the
        module-level names USE_CUDA, PLINEAR, CACHE_ENCODED_DATA and SELFAWARE.
        """
        lightwood.config.config.CONFIG.USE_CUDA = USE_CUDA
        lightwood.config.config.CONFIG.PLINEAR = PLINEAR

        input_features = [{
            'name': 'sqft',
            'type': 'numeric'
        }, {
            'name': 'days_on_market',
            'type': 'numeric'
        }, {
            'name': 'neighborhood',
            'type': 'categorical',
            'dropout': 0.4
        }]
        output_features = [{
            'name': 'number_of_rooms',
            'type': 'categorical',
            'weights': {'0': 0.8, '1': 0.6, '2': 0.5, '3': 0.7, '4': 1}
        }, {
            'name': 'number_of_bathrooms',
            'type': 'categorical',
            'weights': {'0': 0.8, '1': 0.6, '2': 4}
        }, {
            'name': 'rental_price',
            'type': 'numeric'
        }, {
            'name': 'location',
            'type': 'categorical'
        }]
        config = {
            'input_features': input_features,
            'output_features': output_features,
            'data_source': {'cache_transformed_data': CACHE_ENCODED_DATA},
            'mixer': {
                'class': NnMixer,
                'kwargs': {
                    'selfaware': SELFAWARE,
                    'eval_every_x_epochs': 4,
                    'stop_training_after_seconds': 80
                }
            }
        }

        df = pd.read_csv('https://mindsdb-example-data.s3.eu-west-2.amazonaws.com/home_rentals.csv')

        predictor = Predictor(config)
        predictor.learn(from_data=df)

        # Predict from a frame that no longer contains the target columns.
        target_columns = [feature['name'] for feature in config['output_features']]
        df = df.drop(target_columns, axis=1)
        predictor.predict(when_data=df)

        # Round-trip through disk so the reloaded predictor is the one queried below.
        predictor.save('test.pkl')
        predictor = Predictor(load_from_path='test.pkl')

        # Query sqft = 0, 10, ..., 990 and check the type of each prediction.
        for j in range(100):
            result = predictor.predict(when={'sqft': round(j * 10)})
            pred = result['number_of_rooms']['predictions'][0]
            assert isinstance(pred, (str, int))
Exemplo n.º 13
0
            data_train['y'][i] = 1
# For division, replace zero divisors in the test set with 1 (presumably so
# the row-wise division below cannot fail on a zero denominator).
if op == '/':
    for i in range(m):
        if data_test['y'][i] == 0:
            data_test['y'][i] = 1

# Target variable: x <op> y for each row, where <op> is the operator in `op`.
# NOTE(review): eval() on a string built from `op` runs arbitrary code if `op`
# can ever come from untrusted input - confirm where `op` is set.
data_train['z'] = eval(
    f"""[data_train['x'][i] {op} data_train['y'][i] for i in range(n)]""")
data_test['z'] = eval(
    f"""[data_test['x'][i] {op} data_test['y'][i] for i in range(m)]""")

df_train = pandas.DataFrame(data_train)
df_test = pandas.DataFrame(data_test)

# Predictor created with only the output column name 'z'.
predictor = Predictor(output=['z'])


def iter_function(epoch, training_error, test_error, test_error_gradient,
                  test_accuracy):
    """Print a single progress line built from the five callback arguments."""
    message = f'Epoch: {epoch}, Train Error: {training_error}, Test Error: {test_error}, Test Error Gradient: {test_error_gradient}, Test Accuracy: {test_accuracy}'
    print(message)


# Train on the training frame, reporting progress through iter_function
# (eval_every_x_epochs=200).
predictor.learn(from_data=df_train,
                callback_on_iter=iter_function,
                eval_every_x_epochs=200)
# Save to disk, then continue with a predictor reloaded from that file.
predictor.save('ok.pkl')

predictor = Predictor(load_from_path='ok.pkl')
Exemplo n.º 14
0
def run_test(USE_CUDA, CACHE_ENCODED_DATA, SELFAWARE, PLINEAR):
    """Train briefly on the home rentals CSV and exercise save, load and predict.

    The four arguments are written to the attributes of the same name on
    ``lightwood.config.config.CONFIG`` before anything else runs. Training is
    capped at one second, so this checks that the pipeline runs, not how well
    it predicts.

    :param USE_CUDA: value for ``CONFIG.USE_CUDA``
    :param CACHE_ENCODED_DATA: value for ``CONFIG.CACHE_ENCODED_DATA``
    :param SELFAWARE: value for ``CONFIG.SELFAWARE``
    :param PLINEAR: value for ``CONFIG.PLINEAR``
    """
    lightwood.config.config.CONFIG.USE_CUDA = USE_CUDA
    lightwood.config.config.CONFIG.CACHE_ENCODED_DATA = CACHE_ENCODED_DATA
    lightwood.config.config.CONFIG.SELFAWARE = SELFAWARE
    lightwood.config.config.CONFIG.PLINEAR = PLINEAR

    ####################
    config = {
        'input_features': [{
            'name': 'number_of_bathrooms',
            'type': 'numeric'
        }, {
            'name': 'sqft',
            'type': 'numeric'
        }, {
            'name': 'location',
            'type': 'categorical'
        }, {
            'name': 'days_on_market',
            'type': 'numeric'
        }, {
            'name': 'neighborhood',
            'type': 'categorical',
            'dropout': 0.4
        }, {
            'name': 'rental_price',
            'type': 'numeric'
        }],
        'output_features': [{
            'name': 'number_of_rooms',
            'type': 'categorical',
            # 'weights':{
            #       '0': 0.8,
            #       '1': 0.6,
            #       '2': 0.5,
            #       '3': 0.7,
            #       '4': 1,
            # }
        }],
        'mixer': {
            'class': lightwood.BUILTIN_MIXERS.NnMixer
        }
    }

    # AX doesn't seem to work on the travis version of windows, so don't test it there as of now
    # (the optimizer line is currently disabled on every platform).
    if sys.platform not in ['win32', 'cygwin', 'windows']:
        pass
        #config['optimizer'] = lightwood.model_building.BasicAxOptimizer

    df = pd.read_csv(
        "https://mindsdb-example-data.s3.eu-west-2.amazonaws.com/home_rentals.csv"
    )

    def iter_function(epoch, error, test_error, test_error_gradient,
                      test_accuracy):
        """Print one progress line built only from the callback arguments."""
        # The template has no {accuracy} field, so nothing beyond the
        # arguments is passed to format().
        print(
            'epoch: {iter}, error: {error}, test_error: {test_error}, test_error_gradient: {test_error_gradient}, test_accuracy: {test_accuracy}'
            .format(iter=epoch,
                    error=error,
                    test_error=test_error,
                    test_error_gradient=test_error_gradient,
                    test_accuracy=test_accuracy))

    predictor = Predictor(config)
    # stop_training_after_seconds given in order to not get timeouts in travis
    predictor.learn(from_data=df,
                    callback_on_iter=iter_function,
                    eval_every_x_epochs=1,
                    stop_training_after_seconds=1)

    # First round-trip: the bulk prediction below runs on a reloaded predictor.
    predictor.save('test.pkl')

    predictor = Predictor(load_from_path='test.pkl')

    # Predict from a frame that no longer contains the target column.
    df = df.drop([x['name'] for x in config['output_features']], axis=1)
    predictor.predict(when_data=df)

    # Second round-trip, this time after the predictor has been used to predict.
    predictor.save('test.pkl')
    predictor = Predictor(load_from_path='test.pkl')

    # Tally how often each number_of_rooms value is predicted for sqft 0, 10, ..., 990.
    preds = {}
    for j in range(100):
        pred = predictor.predict(
            when={'sqft': round(j * 10)})['number_of_rooms']['predictions'][0]
        preds[pred] = preds.get(pred, 0) + 1