def test_predict_start(self):
    # Setup variables
    primitives = [
        'sklearn.preprocessing.StandardScaler',
        'sklearn.linear_model.LogisticRegression'
    ]
    pipeline = MLPipeline(primitives)
    pipeline.fit(self.X_train, self.y_train)

    # Mock the first block
    block_mock = Mock()
    pipeline.blocks['sklearn.preprocessing.StandardScaler#1'] = block_mock

    # Run, starting after the first block
    context = {
        'X': self.X_train,
    }
    int_start = 1
    str_start = 'sklearn.linear_model.LogisticRegression#1'
    pipeline.predict(start_=int_start, **context)
    pipeline.predict(start_=str_start, **context)

    # Assert that the mocked first block has not been called
    block_mock.predict.assert_not_called()
def test_fit_output(self):
    # Setup variables
    primitives = [
        'sklearn.preprocessing.StandardScaler',
        'sklearn.linear_model.LogisticRegression'
    ]
    pipeline = MLPipeline(primitives)

    int_block = 0
    invalid_int = 10
    str_block = 'sklearn.preprocessing.StandardScaler#1'
    invalid_block = 'InvalidBlockName'
    str_block_variable = 'sklearn.preprocessing.StandardScaler#1.y'
    invalid_variable = 'sklearn.preprocessing.StandardScaler#1.invalid'

    # Run
    int_out = pipeline.fit(self.X_train[0:5], self.y_train[0:5], output_=int_block)
    str_out = pipeline.fit(self.X_train[0:5], self.y_train[0:5], output_=str_block)
    str_out_variable = pipeline.fit(
        self.X_train[0:5], self.y_train[0:5], output_=str_block_variable)
    no_output = pipeline.fit(self.X_train, self.y_train)

    # Assert successful calls
    X = np.array([
        [0.71269665, -1.45152899, 0.55344946, 0.31740553],
        [0.26726124, 1.23648766, -1.1557327, -1.0932857],
        [-1.95991577, 0.967686, -1.1557327, -1.0932857],
        [0.71269665, -0.645124, 0.39067021, 0.31740553],
        [0.26726124, -0.10752067, 1.36734573, 1.55176035]
    ])
    y = np.array([1, 0, 0, 1, 2])
    context = {'X': X, 'y': y}
    almost_equal(context, int_out)
    almost_equal(context, str_out)
    almost_equal(y, str_out_variable)
    assert no_output is None

    # Run asserting exceptions
    with self.assertRaises(IndexError):
        pipeline.fit(self.X_train[0:5], self.y_train[0:5], output_=invalid_int)

    with self.assertRaises(ValueError):
        pipeline.fit(self.X_train[0:5], self.y_train[0:5], output_=invalid_block)

    with self.assertRaises(ValueError):
        pipeline.fit(self.X_train[0:5], self.y_train[0:5], output_=invalid_variable)
def run():
    print("============================================")
    print("Testing Multi Table Pipeline")
    print("============================================")

    # Data loading.
    orders = pd.read_csv("data/Retail/orders.csv")
    order_products = pd.read_csv("data/Retail/order_products.csv")
    label_times = pd.read_csv("data/Retail/label_times.csv")

    X_train = label_times.sample(frac=0.8)
    X_test = label_times.drop(X_train.index)
    y_train = X_train["label"]
    y_test = X_test["label"]
    entity_set = make_entity_set(orders, order_products)

    multitable = MLPipeline(['dfs', 'random_forest_classifier'])

    updated_hyperparam = MLHyperparam('max_depth', 'int', [1, 10])
    updated_hyperparam.block_name = 'dfs'
    # multitable.update_tunable_hyperparams([updated_hyperparam])

    # Check that the hyperparameters are correct.
    for hyperparam in multitable.get_tunable_hyperparams():
        print(hyperparam)

    # Check that the blocks are correct.
    expected_blocks = {'dfs', 'rf_classifier'}
    blocks = set(multitable.blocks.keys())
    assert expected_blocks == blocks

    # Check that we can score properly.
    produce_params = {
        ('dfs', 'entityset'): entity_set,
        ('dfs', 'cutoff_time_in_index'): True
    }
    print("\nFitting pipeline...")
    fit_params = {
        ('dfs', 'entityset'): entity_set,
        ('dfs', 'target_entity'): "users",
        ('dfs', 'training_window'): ft.Timedelta("60 days")
    }
    multitable.fit(X_train, y_train, fit_params=fit_params, produce_params=produce_params)
    print("\nFit pipeline.")

    print("\nScoring pipeline...")
    predicted_y_val = multitable.predict(X_test, predict_params=produce_params)
    score = f1_score(predicted_y_val, y_test, average='micro')
    print("\nf1 micro score: %f" % score)
    return score
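# For reference, a minimal sketch of the make_entity_set helper that run()
# relies on. This is an assumption, not the project's actual implementation:
# the entity names, index and time columns, and the relationship are guesses
# based on how the EntitySet is used above (dfs with target_entity="users"),
# written against the pre-1.0 featuretools API that run() already uses.
def make_entity_set(orders, order_products):
    entity_set = ft.EntitySet(id='Retail')
    entity_set = entity_set.entity_from_dataframe(
        entity_id='orders',
        dataframe=orders,
        index='order_id',           # assumed index column
        time_index='order_time',    # assumed time column
    )
    entity_set = entity_set.entity_from_dataframe(
        entity_id='order_products',
        dataframe=order_products,
        index='order_product_id',   # assumed index column
    )
    entity_set = entity_set.add_relationship(ft.Relationship(
        entity_set['orders']['order_id'],
        entity_set['order_products']['order_id'],
    ))
    # A "users" entity is assumed to exist, since fit_params targets it.
    entity_set = entity_set.normalize_entity(
        base_entity_id='orders',
        new_entity_id='users',
        index='user_id',            # assumed user id column
    )
    return entity_set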
def run(train_size=160, test_size=40):
    print("============================================")
    print("Testing Audio Pipeline")
    print("============================================")

    # Data loading.
    classes = [
        'street_music', 'siren', 'jackhammer', 'gun_shot', 'engine_idling',
        'drilling', 'dog_bark', 'children_playing', 'car_horn',
        'air_conditioner'
    ]
    labels = []
    all_filepaths = []
    for label_class in classes:
        for filepath in glob.glob(
                os.path.join('data/UrbanSound/data', label_class, '*.wav')):
            all_filepaths.append(filepath)
            labels.append(label_class)

    filepaths, filepaths_test, y, y_test = train_test_split(
        all_filepaths, labels, train_size=train_size, test_size=test_size)

    audio_pipeline = MLPipeline([
        'audio_featurizer', 'audio_padder', 'pca', 'random_forest_classifier'
    ])

    # Check that the hyperparameters are correct.
    for hyperparam in audio_pipeline.get_tunable_hyperparams():
        print(hyperparam)

    # Check that the blocks are correct.
    expected_blocks = {'audio_featurizer', 'audio_padder', 'pca', 'rf_classifier'}
    blocks = set(audio_pipeline.blocks.keys())
    assert expected_blocks == blocks

    # Check that we can score properly.
    print("\nFitting pipeline...")
    X, sample_freqs = load_and_segment(filepaths)
    produce_params = {('audio_featurizer', 'sample_freqs'): sample_freqs}
    audio_pipeline.fit(X, y, produce_params=produce_params)
    print("\nFit pipeline.")

    print("\nScoring pipeline...")
    X_test, sample_freqs_test = load_and_segment(filepaths_test)
    predict_params = {('audio_featurizer', 'sample_freqs'): sample_freqs_test}
    predicted_y_val = audio_pipeline.predict(X_test, predict_params=predict_params)
    score = f1_score(predicted_y_val, y_test, average='micro')
    print("\nf1 micro score: %f" % score)
    return score
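# A hedged sketch of the load_and_segment helper used above; the real helper
# presumably also segments long clips, which is omitted here. It loads each
# wav file and returns the raw audio arrays plus the per-file sample
# frequencies that the audio_featurizer primitive consumes.
from scipy.io import wavfile

def load_and_segment(filepaths):
    audios = []
    sample_freqs = []
    for filepath in filepaths:
        # wavfile.read returns (sample_rate, data)
        sample_freq, audio = wavfile.read(filepath)
        audios.append(audio)
        sample_freqs.append(sample_freq)
    return audios, sample_freqs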
def test_fit_produce_debug_str(self):
    outputs = {
        'default': [
            {
                'name': 'a_name',
                'variable': 'a_primitive#1.a_variable',
                'type': 'a_type',
            }
        ]
    }
    mlpipeline = MLPipeline(['a_primitive'], outputs=outputs)
    mlpipeline.blocks['a_primitive#1'].fit_args = [
        {
            'name': 'fit_input',
            'type': 'whatever'
        }
    ]
    mlpipeline.blocks['a_primitive#1'].produce_args = [
        {
            'name': 'input',
            'type': 'whatever'
        }
    ]
    mlpipeline.blocks['a_primitive#1'].produce_output = [
        {
            'name': 'a_name',
            'type': 'a_type'
        }
    ]

    expected_return = dict()
    expected_return['debug'] = 'tm'
    expected_return['fit'] = {
        'a_primitive#1': {
            'time': 0,
            'memory': 0,
        }
    }
    expected_return['produce'] = {
        'a_primitive#1': {
            'time': 0,
            'memory': 0,
        }
    }

    returned, debug_returned = mlpipeline.fit(output_='default', debug='tm')

    assert len([returned]) == len(outputs['default'])
    assert isinstance(debug_returned, dict)
    assert set(debug_returned.keys()) == set(expected_return.keys())  # fit / produce

    assert set(debug_returned['fit'].keys()) == set(expected_return['fit'].keys())
    assert set(debug_returned['produce'].keys()) == set(expected_return['produce'].keys())

    for block_name, dictionary in expected_return['fit'].items():
        assert set(debug_returned['fit'][block_name].keys()) == set(dictionary.keys())

    for block_name, dictionary in expected_return['produce'].items():
        assert set(debug_returned['produce'][block_name].keys()) == set(dictionary.keys())
def test_fit_debug_str(self):
    mlpipeline = MLPipeline(['a_primitive'])
    mlpipeline.blocks['a_primitive#1'].fit_args = [
        {
            'name': 'fit_input',
            'type': 'whatever'
        }
    ]

    expected_return = dict()
    expected_return['debug'] = 'tm'
    expected_return['fit'] = {
        'a_primitive#1': {
            'time': 0,
            'memory': 0,
        }
    }

    returned = mlpipeline.fit(debug='tm')

    assert isinstance(returned, dict)
    assert set(returned.keys()) == set(expected_return.keys())  # fit / produce
    assert set(returned['fit'].keys()) == set(expected_return['fit'].keys())  # block name

    for block_name, dictionary in expected_return['fit'].items():
        assert set(returned['fit'][block_name].keys()) == set(dictionary.keys())
def test_fit_debug(self):
    mlpipeline = MLPipeline(['a_primitive'])
    mlpipeline.blocks['a_primitive#1'].fit_args = [{
        'name': 'fit_input',
        'type': 'whatever'
    }]

    expected_return = dict()
    expected_return["fit"] = {
        "a_primitive#1": {
            "elapsed": 0,
            "input": {"whatever"}
        }
    }

    returned = mlpipeline.fit(debug=True)

    assert isinstance(returned, dict)
    assert set(returned.keys()) == set(expected_return.keys())  # fit / produce
    assert set(returned["fit"].keys()) == set(expected_return["fit"].keys())  # block name

    for block_name, dictionary in expected_return["fit"].items():
        assert set(returned["fit"][block_name].keys()) == set(dictionary.keys())
def test_fit_produce_debug(self):
    outputs = {
        'default': [{
            'name': 'a_name',
            'variable': 'a_primitive#1.a_variable',
            'type': 'a_type',
        }]
    }
    mlpipeline = MLPipeline(['a_primitive'], outputs=outputs)
    mlpipeline.blocks['a_primitive#1'].fit_args = [{
        'name': 'fit_input',
        'type': 'whatever'
    }]
    mlpipeline.blocks['a_primitive#1'].produce_args = [{
        'name': 'input',
        'type': 'whatever'
    }]
    mlpipeline.blocks['a_primitive#1'].produce_output = [{
        'name': 'a_name',
        'type': 'a_type'
    }]

    expected_return = dict()
    expected_return["fit"] = {
        "a_primitive#1": {
            "elapsed": 0,
            "input": {"whatever"}
        }
    }
    expected_return["produce"] = {
        "a_primitive#1": {
            "elapsed": 0,
            "input": {"whatever"},
            "output": {"whatever"}
        }
    }

    returned, debug_returned = mlpipeline.fit(output_='default', debug=True)

    assert len([returned]) == len(outputs["default"])
    assert isinstance(debug_returned, dict)
    assert set(debug_returned.keys()) == set(expected_return.keys())  # fit / produce
    assert set(debug_returned["fit"].keys()) == set(expected_return["fit"].keys())
    assert set(debug_returned["produce"].keys()) == set(expected_return["produce"].keys())

    for block_name, dictionary in expected_return["fit"].items():
        assert set(debug_returned["fit"][block_name].keys()) == set(dictionary.keys())

    for block_name, dictionary in expected_return["produce"].items():
        assert set(debug_returned["produce"][block_name].keys()) == set(dictionary.keys())
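# Not part of the test suite: a small illustration of the shape of the debug
# dictionary that fit(debug=True) returns, per the assertions above. It maps
# a stage ('fit' / 'produce') to block names, each carrying details such as
# the elapsed time and the inputs (and, for produce, the outputs).
def print_debug_info(debug_info):
    for stage in ('fit', 'produce'):
        for block_name, details in debug_info.get(stage, {}).items():
            print(stage, block_name, sorted(details.keys()))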
def test_fit_no_debug(self):
    mlpipeline = MLPipeline(['a_primitive'])
    mlpipeline.blocks['a_primitive#1'].fit_args = [{
        'name': 'fit_input',
        'type': 'whatever'
    }]

    returned = mlpipeline.fit(debug=False)

    assert returned is None
def test_fit_pending_one_primitive(self):
    block_1 = get_mlblock_mock()
    block_2 = get_mlblock_mock()
    blocks = OrderedDict((
        ('a.primitive.Name#1', block_1),
        ('a.primitive.Name#2', block_2),
    ))

    self_ = MagicMock(autospec=MLPipeline)
    self_.blocks = blocks
    self_._last_fit_block = 'a.primitive.Name#1'

    MLPipeline.fit(self_)

    expected = [
        call('a.primitive.Name#1'),
    ]
    assert self_._fit_block.call_args_list == expected

    assert not self_._produce_block.called
def test_fit_output(self):
    # Setup variables
    primitives = [
        'sklearn.preprocessing.StandardScaler',
        'sklearn.linear_model.LogisticRegression'
    ]
    pipeline = MLPipeline(primitives)

    named = 'default'
    list_ = ['default', 0]
    int_block = 0
    invalid_int = 10
    str_block = 'sklearn.preprocessing.StandardScaler#1'
    invalid_block = 'InvalidBlockName'
    str_block_variable = 'sklearn.preprocessing.StandardScaler#1.X'
    invalid_variable = 'sklearn.preprocessing.StandardScaler#1.invalid'

    # Run
    named_out = pipeline.fit(self.X, self.y, output_=named)
    list_out = pipeline.fit(self.X, self.y, output_=list_)
    int_out = pipeline.fit(self.X, self.y, output_=int_block)
    str_out = pipeline.fit(self.X, self.y, output_=str_block)
    str_out_variable = pipeline.fit(self.X, self.y, output_=str_block_variable)
    no_output = pipeline.fit(self.X, self.y)

    # Assert successful calls
    X = np.array([
        [2., -0.5, -0.5, -0.5, -0.5],
        [-0.5, 2., -0.5, -0.5, -0.5],
        [-0.5, -0.5, 2., -0.5, -0.5],
        [-0.5, -0.5, -0.5, 2., -0.5],
        [-0.5, -0.5, -0.5, -0.5, 2.],
    ])
    y = np.array([0, 0, 0, 0, 1])
    context = {'X': X, 'y': y}
    almost_equal(named_out, y)

    assert len(list_out) == 2
    almost_equal(list_out[0], y)
    almost_equal(list_out[1], context)

    almost_equal(context, int_out)
    almost_equal(context, str_out)
    almost_equal(X, str_out_variable)

    assert no_output is None

    # Run asserting exceptions
    with self.assertRaises(IndexError):
        pipeline.fit(self.X, self.y, output_=invalid_int)

    with self.assertRaises(ValueError):
        pipeline.fit(self.X, self.y, output_=invalid_block)

    with self.assertRaises(ValueError):
        pipeline.fit(self.X, self.y, output_=invalid_variable)
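# A self-contained recap of the output_ conventions exercised by the test
# above. The data here is made up; the primitive names are the same
# scikit-learn annotations the tests use, and exact return values depend on
# the fitted models, so this is a sketch rather than a reference result.
import numpy as np
from mlblocks import MLPipeline

def demo_output_specs():
    X = np.eye(5)
    y = np.array([0, 0, 0, 0, 1])
    pipeline = MLPipeline([
        'sklearn.preprocessing.StandardScaler',
        'sklearn.linear_model.LogisticRegression'
    ])

    # A named output spec ('default') returns that output's variables.
    default_out = pipeline.fit(X, y, output_='default')

    # An int index or a block name returns the full context after that block.
    context = pipeline.fit(X, y, output_=0)

    # A 'block#N.variable' string returns just that one variable.
    scaled = pipeline.fit(X, y, output_='sklearn.preprocessing.StandardScaler#1.X')

    # With no output_ at all, fit returns None.
    assert pipeline.fit(X, y) is None

    return default_out, context, scaled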