def test_multiple_categories_as_input(self):
    """Check that a multi-category input column can drive a numeric target.

    Every synthetic row carries up to two distinct tags taken from the
    vocabulary, and ``y`` is the sum of the vocabulary indices of the tags
    that were kept, so the target should be nearly perfectly predictable.
    The first 90% of the rows are used for training, the rest for scoring.
    """
    vocab = self.get_vocab(10)
    n_points = 10000

    def make_row():
        """Draw one ``(tags, y)`` sample holding at most two distinct tags."""
        chosen_tags = []
        index_sum = 0
        for _ in range(2):
            # A slot is only filled when the draw exceeds 0.2.
            if random.random() <= 0.2:
                continue
            index = random.randint(0, len(vocab) - 1)
            tag = vocab[index]
            # A tag drawn twice is kept (and counted) only once.
            if tag in chosen_tags:
                continue
            chosen_tags.append(tag)
            index_sum += index
        return chosen_tags, index_sum

    rows = [make_row() for _ in range(n_points)]
    df = pd.DataFrame({
        'tags': [row_tags for row_tags, _ in rows],
        'y': [row_y for _, row_y in rows],
    })

    config = {
        'input_features': [
            {'name': 'tags', 'type': ColumnDataTypes.MULTIPLE_CATEGORICAL},
        ],
        'output_features': [
            {'name': 'y', 'type': ColumnDataTypes.NUMERIC},
        ],
        'mixer': {
            'class': NnMixer,
            'kwargs': {'stop_training_after_seconds': 10},
        },
    }

    split_at = round(n_points * 0.9)
    df_train = df.iloc[:split_at]
    df_test = df.iloc[split_at:]

    predictor = Predictor(config)
    predictor.learn(from_data=df_train)
    predictions = predictor.predict(when_data=df_test)

    score = r2_score(df_test.y, predictions['y']['predictions'])
    print('Test R2 score', score)

    # The threshold is deliberately low: the model only gets a few seconds
    # of training, so this checks that it learns something and predicts
    # properly rather than benchmarking it.
    self.assertGreaterEqual(score, 0.15)
def test_learn_and_predict_nnmixer(self):
    """Smoke-test ``learn`` followed by ``predict`` using ``NnMixer``.

    Trains on the home-rentals CSV with four output columns, then predicts
    on the same frame with those output columns removed. Nothing is
    asserted; the test passes when neither call raises.
    """
    input_features = [
        {'name': 'sqft', 'type': 'numeric'},
        {'name': 'days_on_market', 'type': 'numeric'},
        {'name': 'neighborhood', 'type': 'categorical', 'dropout': 0.4},
    ]
    output_features = [
        {
            'name': 'number_of_rooms',
            'type': 'categorical',
            'weights': {'0': 0.8, '1': 0.6, '2': 0.5, '3': 0.7, '4': 1},
        },
        {
            'name': 'number_of_bathrooms',
            'type': 'categorical',
            'weights': {'0': 0.8, '1': 0.6, '2': 4},
        },
        {'name': 'rental_price', 'type': 'numeric'},
        {'name': 'location', 'type': 'categorical'},
    ]
    config = {
        'input_features': input_features,
        'output_features': output_features,
        'mixer': {
            'class': NnMixer,
            'kwargs': {
                'eval_every_x_epochs': 4,
                'stop_training_after_seconds': 10,
            },
        },
    }

    df = pd.read_csv(
        'https://mindsdb-example-data.s3.eu-west-2.amazonaws.com/home_rentals.csv'
    )

    predictor = Predictor(config)
    predictor.learn(from_data=df)

    # Predict from a frame that no longer contains the target columns.
    target_columns = [feature['name'] for feature in config['output_features']]
    inputs_only = df.drop(target_columns, axis=1)
    predictor.predict(when_data=inputs_only)
def test_multiple_categories_as_output(self):
    """Check that a multi-category output can be learned from two inputs.

    ``x1`` and ``x2`` hold the vocabulary index of the first and second tag
    of a row, or ``-1`` when that tag is absent, so the ``tags`` target
    should be perfectly predictable. The weighted F1 score is checked first
    on the training split and then on the held-out split.
    """
    # NOTE: this test has been reported to fail with
    # "AssertionError: 0.0 not greater than or equal to 0.15".
    vocab = self.get_vocab(10)
    n_points = 10000

    def draw_index():
        """Return a random vocabulary index, or -1 for a missing tag."""
        return random.randint(0, len(vocab) - 1) if random.random() > 0.2 else -1

    x1 = [draw_index() for _ in range(n_points)]
    x2 = [draw_index() for _ in range(n_points)]

    tags = []
    for first_index, second_index in zip(x1, x2):
        # -1 is not found by vocab.get, which yields None; the set also
        # collapses the case where both columns name the same tag.
        present = set([vocab.get(first_index), vocab.get(second_index)])
        tags.append([tag for tag in present if tag is not None])

    df = pd.DataFrame({'x1': x1, 'x2': x2, 'tags': tags})

    config = {
        'input_features': [
            {'name': 'x1', 'type': ColumnDataTypes.CATEGORICAL},
            {'name': 'x2', 'type': ColumnDataTypes.CATEGORICAL},
        ],
        'output_features': [
            {'name': 'tags', 'type': ColumnDataTypes.MULTIPLE_CATEGORICAL},
        ],
        'mixer': {
            'class': NnMixer,
            'kwargs': {'stop_training_after_seconds': 25},
        },
    }

    split_at = round(n_points * 0.9)
    df_train = df.iloc[:split_at]
    df_test = df.iloc[split_at:]

    predictor = Predictor(config)
    predictor.learn(from_data=df_train)

    def weighted_f1(frame):
        """Predict ``tags`` for ``frame`` and score them against its truth."""
        predictions = predictor.predict(when_data=frame)
        predicted_tags = predictions['tags']['predictions']
        # Both sides go through the mixer's own 'tags' encoder before scoring.
        truth_encoded = predictor._mixer.encoders['tags'].encode(frame.tags)
        predicted_encoded = predictor._mixer.encoders['tags'].encode(predicted_tags)
        return f1_score(truth_encoded, predicted_encoded, average='weighted')

    train_score = weighted_f1(df_train)
    print('Train f1 score', train_score)
    self.assertGreaterEqual(train_score, 0.15)

    test_score = weighted_f1(df_test)
    print('Test f1 score', test_score)
    self.assertGreaterEqual(test_score, 0.15)
'name': 'next', 'type': 'numeric' }] } def iter_function(epoch, error, test_error, test_error_gradient): print( 'epoch: {iter}, error: {error}, test_error: {test_error}, test_error_gradient: {test_error_gradient}, accuracy: {accuracy}' .format(iter=epoch, error=error, test_error=test_error, test_error_gradient=test_error_gradient, accuracy=predictor.train_accuracy)) data = pandas.DataFrame(ts_data, columns=['time', 'ts', 'next']) predictor = Predictor(config) predictor.learn(from_data=data, callback_on_iter=iter_function, eval_every_x_epochs=10) ret = predictor.predict(when={ 'ts': " ".join([str(math.sin(i / max)) for i in range(10 + 1, 10 + ts_len)]) }) print(" ".join( [str(math.sin(i / max)) for i in range(10 + 1, 10 + ts_len + 1)])) print(ret)
import pandas as pd
from lightwood import Predictor

# Example: predict rental_price from the other home_rentals columns.
config = {
    'input_features': [
        {'name': 'number_of_rooms', 'type': 'numeric'},
        {'name': 'number_of_bathrooms', 'type': 'numeric'},
        {'name': 'sqft', 'type': 'numeric'},
        {'name': 'location', 'type': 'categorical'},
        {'name': 'days_on_market', 'type': 'numeric'},
        {'name': 'neighborhood', 'type': 'categorical'},
    ],
    'output_features': [
        {'name': 'rental_price', 'type': 'numeric'},
    ],
}

df = pd.read_csv(
    "https://mindsdb-example-data.s3.eu-west-2.amazonaws.com/home_rentals.csv"
)

predictor = Predictor(config)


def iter_function(epoch, error, test_error, test_error_gradient):
    """Print the progress values handed to the training callback.

    Also reports ``predictor.train_accuracy``, read from the module-level
    predictor at the time of the call.
    """
    template = (
        'epoch: {iter}, error: {error}, test_error: {test_error}, '
        'test_error_gradient: {test_error_gradient}, accuracy: {accuracy}'
    )
    message = template.format(
        iter=epoch,
        error=error,
        test_error=test_error,
        test_error_gradient=test_error_gradient,
        accuracy=predictor.train_accuracy,
    )
    print(message)


predictor.learn(
    from_data=df,
    callback_on_iter=iter_function,
    eval_every_x_epochs=10,
)

# Single-row prediction from a partial set of input columns.
print(
    predictor.predict(
        when={
            'number_of_rooms': 3,
            'number_of_bathrooms': 2,
            'sqft': 700,
            'location': 'great',
        }
    )
)
predictor = None file = '/tmp/predictor.mdb' def iter_function(epoch, error, test_error, test_error_gradient, test_accuracy): print( 'epoch: {iter}, error: {error}, test_error: {test_error}, test_error_gradient: {test_error_gradient}, accuracy: {accuracy}, test_accuracy: {test_accuracy}'.format( iter=epoch, error=error, test_error=test_error, test_error_gradient=test_error_gradient, accuracy=predictor.train_accuracy, test_accuracy=test_accuracy)) config = config_rp # predictor = Predictor(config) # predictor.learn(from_data=df, callback_on_iter=iter_function, eval_every_x_epochs=10) # predictor.save('/tmp/predictor.mdb') predictor = Predictor(load_from_path=file) print(predictor.overall_certainty) ret = [] for i in range(int(100-predictor.overall_certainty*100)): ret_val = predictor.predict(when={'number_of_bedrooms': 2, 'sqft': 2300})[ config['output_features'][0]['name']]['predictions'][0] if ret_val is not None: ret += [ret_val] hist, bin_edges = numpy.histogram(ret) net = sum(hist) for j, count in enumerate(hist):
def run_full_test(USE_CUDA, CACHE_ENCODED_DATA, SELFAWARE, PLINEAR):
    """Run the full home_rentals example: train, predict, save and reload.

    Args:
        USE_CUDA: value assigned to ``lightwood.config.config.CONFIG.USE_CUDA``.
        CACHE_ENCODED_DATA: value placed under
            ``config['data_source']['cache_transformed_data']``.
        SELFAWARE: value placed under ``config['mixer']['selfaware']``.
        PLINEAR: value assigned to ``lightwood.config.config.CONFIG.PLINEAR``.

    The predictor is written to ``test.pkl`` in the working directory and
    loaded back before 100 single-row predictions are made.
    """
    global_config = lightwood.config.config.CONFIG
    global_config.USE_CUDA = USE_CUDA
    global_config.PLINEAR = PLINEAR

    config = {
        'input_features': [
            {'name': 'number_of_bathrooms', 'type': 'numeric'},
            {'name': 'sqft', 'type': 'numeric'},
            {'name': 'location', 'type': 'categorical'},
            {'name': 'days_on_market', 'type': 'numeric'},
            {'name': 'neighborhood', 'type': 'categorical', 'dropout': 0.4},
            {'name': 'rental_price', 'type': 'numeric'},
        ],
        'output_features': [
            # An optional per-class 'weights' mapping is left disabled here.
            {'name': 'number_of_rooms', 'type': 'categorical'},
        ],
        'data_source': {'cache_transformed_data': CACHE_ENCODED_DATA},
        'mixer': {
            'class': lightwood.BUILTIN_MIXERS.NnMixer,
            'selfaware': SELFAWARE,
        },
    }

    df = pd.read_csv(
        "https://mindsdb-example-data.s3.eu-west-2.amazonaws.com/home_rentals.csv"
    )

    def iter_function(epoch, error, test_error, test_error_gradient,
                      test_accuracy):
        """Print the progress values handed to the training callback."""
        template = (
            'epoch: {iter}, error: {error}, test_error: {test_error}, '
            'test_error_gradient: {test_error_gradient}, '
            'test_accuracy: {test_accuracy}'
        )
        # `accuracy` is still evaluated and passed although the template
        # has no placeholder for it.
        print(template.format(
            iter=epoch,
            error=error,
            test_error=test_error,
            test_error_gradient=test_error_gradient,
            accuracy=predictor.train_accuracy,
            test_accuracy=test_accuracy,
        ))

    predictor = Predictor(config)

    # stop_training_after_seconds is given so the run does not time out on
    # travis.
    predictor.learn(
        from_data=df,
        callback_on_iter=iter_function,
        eval_every_x_epochs=4,
        stop_training_after_seconds=40,
    )

    output_names = [feature['name'] for feature in config['output_features']]
    df = df.drop(output_names, axis=1)
    predictor.predict(when_data=df)

    # Round-trip the trained predictor through disk.
    predictor.save('test.pkl')
    predictor = Predictor(load_from_path='test.pkl')

    # Tally how often each predicted class shows up across a sweep of sqft.
    preds = {}
    for j in range(100):
        result = predictor.predict(when={'sqft': round(j * 10)})
        pred = result['number_of_rooms']['predictions'][0]
        preds[pred] = preds.get(pred, 0) + 1
'type': 'image', 'encoder_attrs': { 'aim': 'speed' } }], 'output_features': [{ 'name': 'superclass', 'type': 'categorical', 'encoder_attrs': {} }, { 'name': 'class', 'type': 'categorical', 'encoder_attrs': {} }] } predictor = Predictor(config) def iter_function(epoch, error, test_error, test_error_gradient): print( 'epoch: {iter}, error: {error}, test_error: {test_error}, test_error_gradient: {test_error_gradient}, accuracy: {accuracy}' .format(iter=epoch, error=error, test_error=test_error, test_error_gradient=test_error_gradient, accuracy=predictor.train_accuracy)) predictor.learn(from_data=pd.read_csv('train_sample.csv'), callback_on_iter=iter_function, eval_every_x_epochs=1)
'name': 'number_of_rooms', 'type': 'categorical' }, { 'name': 'rental_price', 'type': 'numeric' }], 'mixer': { 'class': lightwood.BUILTIN_MIXERS.BayesianNnMixer } } #'mixer':{'class': lightwood.BUILTIN_MIXERS.NnMixer}} df = pd.read_csv( "https://mindsdb-example-data.s3.eu-west-2.amazonaws.com/home_rentals.csv") predictor = Predictor(config) def iter_function(epoch, error, test_error, test_error_gradient, test_accuracy): print( 'epoch: {iter}, error: {error}, test_error: {test_error}, test_error_gradient: {test_error_gradient}, test_accuracy: {test_accuracy}' .format(iter=epoch, error=error, test_error=test_error, test_error_gradient=test_error_gradient, accuracy=predictor.train_accuracy, test_accuracy=test_accuracy)) predictor.learn(from_data=df,
### Generate a dataset
datapoints = 1000

# Two columns of random integers between -10 and 10 (inclusive).
data = {
    'x': [random.randint(-10, 10) for _ in range(datapoints)],
    'y': [random.randint(-10, 10) for _ in range(datapoints)],
}

# The target variable is the product of the two columns.
data['z'] = [x * y for x, y in zip(data['x'], data['y'])]

data_frame = pandas.DataFrame(data)
print(data_frame)

predictor = Predictor(output=['z'])


def iter_function(epoch, error, test_error, test_error_gradient, test_accuracy):
    """Print the progress values handed to the training callback."""
    template = (
        'epoch: {iter}, error: {error}, test_error: {test_error}, '
        'test_error_gradient: {test_error_gradient}, '
        'test_accuracy: {test_accuracy}'
    )
    # `accuracy` is still evaluated and passed although the template has no
    # placeholder for it.
    print(template.format(
        iter=epoch,
        error=error,
        test_error=test_error,
        test_error_gradient=test_error_gradient,
        accuracy=predictor.train_accuracy,
        test_accuracy=test_accuracy,
    ))


predictor.learn(from_data=data_frame, callback_on_iter=iter_function)

print('accuracy')
print(predictor.train_accuracy)

print('accuracy over all dataset')
print(predictor.calculate_accuracy(from_data=data_frame))
# One time-series input column ('ts') predicting one numeric column ('next').
config = {
    'input_features': [
        {'name': 'ts', 'type': COLUMN_DATA_TYPES.TIME_SERIES},
    ],
    'output_features': [
        {'name': 'next', 'type': 'numeric'},
    ],
}


def iter_function(epoch, training_error, test_error, test_error_gradient,
                  test_accuracy):
    """Print one line of training progress.

    NOTE(review): this callback is defined but not passed to ``learn`` below.
    """
    print(
        f'Epoch: {epoch}, Train Error: {training_error}, '
        f'Test Error: {test_error}, '
        f'Test Error Gradient: {test_error_gradient}, '
        f'Test Accuracy: {test_accuracy}'
    )


data = pandas.DataFrame(ts_data, columns=['time', 'ts', 'next'])

predictor = Predictor(config)
predictor.learn(from_data=data)

print('\n\n')

# Query with the sine values for i = 11 .. 10 + ts_len - 1.
ret = predictor.predict(
    when={'ts': [math.sin(i / max) for i in range(11, 10 + ts_len)]}
)

# Print the same sequence extended by one more step (i = 10 + ts_len).
print([math.sin(i / max) for i in range(11, 11 + ts_len)])

print('Got predictions: ')
print(ret)
def test_home_rentals(self):
    """Train on home rentals, round-trip through save/load, then predict.

    ``USE_CUDA``, ``PLINEAR``, ``CACHE_ENCODED_DATA`` and ``SELFAWARE`` are
    read from names defined outside this method. The predictor is trained on
    four output columns, used once on the full frame without those columns,
    saved to ``test.pkl`` and reloaded. Every ``number_of_rooms`` prediction
    from the reloaded predictor must be a ``str`` or an ``int``.

    Raises:
        AssertionError: if a prediction has any other type.
    """
    lightwood.config.config.CONFIG.USE_CUDA = USE_CUDA
    lightwood.config.config.CONFIG.PLINEAR = PLINEAR

    config = {
        'input_features': [
            {'name': 'sqft', 'type': 'numeric'},
            {'name': 'days_on_market', 'type': 'numeric'},
            {'name': 'neighborhood', 'type': 'categorical', 'dropout': 0.4},
        ],
        'output_features': [
            {
                'name': 'number_of_rooms',
                'type': 'categorical',
                'weights': {'0': 0.8, '1': 0.6, '2': 0.5, '3': 0.7, '4': 1},
            },
            {
                'name': 'number_of_bathrooms',
                'type': 'categorical',
                'weights': {'0': 0.8, '1': 0.6, '2': 4},
            },
            {'name': 'rental_price', 'type': 'numeric'},
            {'name': 'location', 'type': 'categorical'},
        ],
        'data_source': {'cache_transformed_data': CACHE_ENCODED_DATA},
        'mixer': {
            'class': NnMixer,
            'kwargs': {
                'selfaware': SELFAWARE,
                'eval_every_x_epochs': 4,
                'stop_training_after_seconds': 80,
            },
        },
    }

    df = pd.read_csv(
        'https://mindsdb-example-data.s3.eu-west-2.amazonaws.com/home_rentals.csv'
    )

    predictor = Predictor(config)
    predictor.learn(from_data=df)

    # Predict from a frame that no longer contains the target columns.
    df = df.drop([x['name'] for x in config['output_features']], axis=1)
    predictor.predict(when_data=df)

    # Round-trip the trained predictor through disk before the type checks.
    predictor.save('test.pkl')
    predictor = Predictor(load_from_path='test.pkl')

    for j in range(100):
        result = predictor.predict(when={'sqft': round(j * 10)})
        pred = result['number_of_rooms']['predictions'][0]
        # A unittest assertion (not a bare `assert`) still runs under
        # `python -O` and reports the offending value on failure.
        self.assertIsInstance(pred, (str, int))
data_train['y'][i] = 1 if op == '/': for i in range(m): if data_test['y'][i] == 0: data_test['y'][i] = 1 # target variable to be the multiplication of the two data_train['z'] = eval( f"""[data_train['x'][i] {op} data_train['y'][i] for i in range(n)]""") data_test['z'] = eval( f"""[data_test['x'][i] {op} data_test['y'][i] for i in range(m)]""") df_train = pandas.DataFrame(data_train) df_test = pandas.DataFrame(data_test) predictor = Predictor(output=['z']) def iter_function(epoch, training_error, test_error, test_error_gradient, test_accuracy): print( f'Epoch: {epoch}, Train Error: {training_error}, Test Error: {test_error}, Test Error Gradient: {test_error_gradient}, Test Accuracy: {test_accuracy}' ) predictor.learn(from_data=df_train, callback_on_iter=iter_function, eval_every_x_epochs=200) predictor.save('ok.pkl') predictor = Predictor(load_from_path='ok.pkl')
def run_test(USE_CUDA, CACHE_ENCODED_DATA, SELFAWARE, PLINEAR):
    """Run a short home_rentals train / save / load / predict cycle.

    Args:
        USE_CUDA: value assigned to ``CONFIG.USE_CUDA``.
        CACHE_ENCODED_DATA: value assigned to ``CONFIG.CACHE_ENCODED_DATA``.
        SELFAWARE: value assigned to ``CONFIG.SELFAWARE``.
        PLINEAR: value assigned to ``CONFIG.PLINEAR``.

    ``CONFIG`` above is ``lightwood.config.config.CONFIG``. The predictor is
    written to ``test.pkl`` and reloaded twice: once right after training and
    once after the full-frame prediction.
    """
    global_config = lightwood.config.config.CONFIG
    global_config.USE_CUDA = USE_CUDA
    global_config.CACHE_ENCODED_DATA = CACHE_ENCODED_DATA
    global_config.SELFAWARE = SELFAWARE
    global_config.PLINEAR = PLINEAR

    config = {
        'input_features': [
            {'name': 'number_of_bathrooms', 'type': 'numeric'},
            {'name': 'sqft', 'type': 'numeric'},
            {'name': 'location', 'type': 'categorical'},
            {'name': 'days_on_market', 'type': 'numeric'},
            {'name': 'neighborhood', 'type': 'categorical', 'dropout': 0.4},
            {'name': 'rental_price', 'type': 'numeric'},
        ],
        'output_features': [
            # An optional per-class 'weights' mapping is left disabled here.
            {'name': 'number_of_rooms', 'type': 'categorical'},
        ],
        'mixer': {'class': lightwood.BUILTIN_MIXERS.NnMixer},
    }

    # AX doesn't seem to work on the travis version of windows, so it is not
    # tested there as of now. The optimizer assignment itself is currently
    # disabled, which leaves this branch a no-op:
    #   config['optimizer'] = lightwood.model_building.BasicAxOptimizer
    if sys.platform not in ('win32', 'cygwin', 'windows'):
        pass

    df = pd.read_csv(
        "https://mindsdb-example-data.s3.eu-west-2.amazonaws.com/home_rentals.csv"
    )

    def iter_function(epoch, error, test_error, test_error_gradient,
                      test_accuracy):
        """Print the progress values handed to the training callback."""
        template = (
            'epoch: {iter}, error: {error}, test_error: {test_error}, '
            'test_error_gradient: {test_error_gradient}, '
            'test_accuracy: {test_accuracy}'
        )
        # `accuracy` is still evaluated and passed although the template
        # has no placeholder for it.
        print(template.format(
            iter=epoch,
            error=error,
            test_error=test_error,
            test_error_gradient=test_error_gradient,
            accuracy=predictor.train_accuracy,
            test_accuracy=test_accuracy,
        ))

    predictor = Predictor(config)

    # stop_training_after_seconds is given so the run does not time out on
    # travis.
    predictor.learn(
        from_data=df,
        callback_on_iter=iter_function,
        eval_every_x_epochs=1,
        stop_training_after_seconds=1,
    )

    # First disk round-trip, straight after training.
    predictor.save('test.pkl')
    predictor = Predictor(load_from_path='test.pkl')

    output_names = [feature['name'] for feature in config['output_features']]
    df = df.drop(output_names, axis=1)
    predictor.predict(when_data=df)

    # Second disk round-trip, after the full-frame prediction.
    predictor.save('test.pkl')
    predictor = Predictor(load_from_path='test.pkl')

    # Tally how often each predicted class shows up across a sweep of sqft.
    preds = {}
    for j in range(100):
        result = predictor.predict(when={'sqft': round(j * 10)})
        pred = result['number_of_rooms']['predictions'][0]
        preds[pred] = preds.get(pred, 0) + 1