def test_label_softclass():
    """Constructing a LabelCleaner for SOFTCLASS is expected to be unsupported.

    The test passes only if ``LabelCleaner.construct`` raises
    ``NotImplementedError`` for this problem type.
    """
    # Given / Raise exception
    with pytest.raises(NotImplementedError):
        LabelCleaner.construct(
            problem_type=SOFTCLASS,
            y=pd.Series([2, 4, 2, 2, 4, 1]),
            y_uncleaned=None,
        )
def test_label_cleaner_binary():
    """Exercise the BINARY label cleaner end to end.

    Checks the type and mapping of the constructed cleaner, that
    ``transform`` gives the same codes for Series / ndarray / categorical
    input, that the input index is carried through, that labels unseen at
    construction time become NaN, and that ``inverse_transform`` restores
    the original labels. Construction from labels with three distinct
    values is expected to raise ``AssertionError``.
    """
    # Given
    problem_type = BINARY
    raw_array = np.array(['l1', 'l2', 'l2', 'l1', 'l1', 'l2'])
    raw_series = pd.Series(raw_array)
    raw_categorical = raw_series.astype('category')
    raw_shifted = raw_series.copy()
    raw_shifted.index += 5
    unseen_array = np.array(['new', 'l1', 'l2'])

    expected_codes = pd.Series([0, 1, 1, 0, 0, 1])
    expected_unseen_codes = pd.Series([np.nan, 0, 1])
    expected_unseen_restored = pd.Series([np.nan, 'l1', 'l2'])

    # When
    label_cleaner = LabelCleaner.construct(problem_type=problem_type, y=raw_series)

    # Raise exception
    with pytest.raises(AssertionError):
        LabelCleaner.construct(problem_type=problem_type, y=unseen_array)

    # Then
    assert isinstance(label_cleaner, LabelCleanerBinary)
    assert label_cleaner.problem_type_transform == BINARY
    assert label_cleaner.cat_mappings_dependent_var == {0: 'l1', 1: 'l2'}

    codes_from_series = label_cleaner.transform(raw_series)
    codes_from_array = label_cleaner.transform(raw_array)
    codes_from_categorical = label_cleaner.transform(raw_categorical)
    codes_shifted = label_cleaner.transform(raw_shifted)
    codes_unseen = label_cleaner.transform(unseen_array)

    # The inverse of the shifted codes is taken before the index is moved
    # back below, so it still carries the shifted index.
    restored_series = label_cleaner.inverse_transform(codes_from_series)
    restored_shifted = label_cleaner.inverse_transform(codes_shifted)
    restored_unseen = label_cleaner.inverse_transform(codes_unseen)

    for codes in (codes_from_series, codes_from_array, codes_from_categorical):
        assert expected_codes.equals(codes)

    # Same values but a different index: not equal until the index is reset.
    assert not expected_codes.equals(codes_shifted)
    codes_shifted.index -= 5
    assert expected_codes.equals(codes_shifted)

    assert expected_unseen_codes.equals(codes_unseen)
    assert raw_series.equals(restored_series)
    assert raw_shifted.equals(restored_shifted)
    assert expected_unseen_restored.equals(restored_unseen)
def test_label_cleaner_regression():
    """Exercise the REGRESSION label cleaner.

    The test expects a ``LabelCleanerDummy`` whose ``transform`` and
    ``inverse_transform`` return values equal to their input, both for the
    labels used at construction and for previously unseen values.
    """
    # Given
    problem_type = REGRESSION
    raw_array = np.array([2, 4, 2, 2, 4, 1])
    raw_series = pd.Series(raw_array)
    unseen_series = pd.Series([3, 5, 2])

    expected_values = raw_series.copy()
    expected_unseen_values = unseen_series.copy()
    expected_unseen_restored = unseen_series.copy()

    # When
    label_cleaner = LabelCleaner.construct(
        problem_type=problem_type,
        y=raw_series,
        y_uncleaned=None,
    )

    # Then
    assert isinstance(label_cleaner, LabelCleanerDummy)
    assert label_cleaner.problem_type_transform == REGRESSION

    values_from_series = label_cleaner.transform(raw_series)
    values_from_array = label_cleaner.transform(raw_array)
    values_unseen = label_cleaner.transform(unseen_series)

    restored_series = label_cleaner.inverse_transform(values_from_series)
    restored_unseen = label_cleaner.inverse_transform(values_unseen)

    for values in (values_from_series, values_from_array):
        assert expected_values.equals(values)

    assert expected_unseen_values.equals(values_unseen)
    assert raw_series.equals(restored_series)
    assert expected_unseen_restored.equals(restored_unseen)
def test_label_cleaner_multiclass():
    """Exercise the MULTICLASS label cleaner end to end.

    Checks the type and mapping of the constructed cleaner, that
    ``transform`` gives the same codes for Series / ndarray / categorical
    input, that the input index is carried through, that labels unseen at
    construction time become NaN, and that ``inverse_transform`` restores
    the original labels.
    """
    # Given
    problem_type = MULTICLASS
    raw_array = np.array([2, 4, 2, 2, 4, 1])
    raw_series = pd.Series(raw_array)
    raw_categorical = raw_series.astype('category')
    raw_shifted = raw_series.copy()
    raw_shifted.index += 5
    unseen_array = np.array([3, 5, 2])

    expected_codes = pd.Series([1, 2, 1, 1, 2, 0])
    expected_unseen_codes = pd.Series([np.nan, np.nan, 1])
    expected_unseen_restored = pd.Series([np.nan, np.nan, 2])

    # When
    label_cleaner = LabelCleaner.construct(
        problem_type=problem_type,
        y=raw_series,
        y_uncleaned=raw_series,
    )

    # Then
    assert isinstance(label_cleaner, LabelCleanerMulticlass)
    assert label_cleaner.problem_type_transform == MULTICLASS
    assert label_cleaner.cat_mappings_dependent_var == {0: 1, 1: 2, 2: 4}

    codes_from_series = label_cleaner.transform(raw_series)
    codes_from_array = label_cleaner.transform(raw_array)
    codes_from_categorical = label_cleaner.transform(raw_categorical)
    codes_shifted = label_cleaner.transform(raw_shifted)
    codes_unseen = label_cleaner.transform(unseen_array)

    # The inverse of the shifted codes is taken before the index is moved
    # back below, so it still carries the shifted index.
    restored_series = label_cleaner.inverse_transform(codes_from_series)
    restored_shifted = label_cleaner.inverse_transform(codes_shifted)
    restored_unseen = label_cleaner.inverse_transform(codes_unseen)

    for codes in (codes_from_series, codes_from_array, codes_from_categorical):
        assert expected_codes.equals(codes)

    # Same values but a different index: not equal until the index is reset.
    assert not expected_codes.equals(codes_shifted)
    codes_shifted.index -= 5
    assert expected_codes.equals(codes_shifted)

    assert expected_unseen_codes.equals(codes_unseen)
    assert raw_series.equals(restored_series)
    assert raw_shifted.equals(restored_shifted)
    assert expected_unseen_restored.equals(restored_unseen)
def test_label_cleaner_multiclass_to_binary():
    """Exercise the cleaner built when MULTICLASS labels reduce to two classes.

    ``y`` holds only 'l1'/'l2' while ``y_uncleaned`` holds 'l0'..'l4'; the
    test expects a ``LabelCleanerMulticlassToBinary`` that transforms as a
    BINARY problem. Besides the usual transform / inverse_transform checks,
    it verifies that ``inverse_transform_proba`` expands a single
    probability column into a frame with one column per uncleaned class.
    """
    # Given
    problem_type = MULTICLASS
    raw_array = np.array(['l1', 'l2', 'l2', 'l1', 'l1', 'l2'])
    raw_series = pd.Series(raw_array)
    uncleaned_series = pd.Series(
        ['l0', 'l1', 'l2', 'l2', 'l1', 'l1', 'l2', 'l3', 'l4'])
    raw_categorical = raw_series.astype('category')
    raw_shifted = raw_series.copy()
    raw_shifted.index += 5
    unseen_array = np.array(['l0', 'l1', 'l2'])
    proba_input = pd.Series([0.7, 0.2, 0.5], index=[5, 2, 8])

    expected_codes = pd.Series([0, 1, 1, 0, 0, 1])
    expected_unseen_codes = pd.Series([np.nan, 0, 1])
    expected_unseen_restored = pd.Series([np.nan, 'l1', 'l2'])
    expected_proba_frame = pd.DataFrame(
        data=[
            [0, 0.3, 0.7, 0, 0],
            [0, 0.8, 0.2, 0, 0],
            [0, 0.5, 0.5, 0, 0],
        ],
        index=[5, 2, 8],
        columns=['l0', 'l1', 'l2', 'l3', 'l4'],
        dtype=np.float64,
    )

    # When
    label_cleaner = LabelCleaner.construct(
        problem_type=problem_type,
        y=raw_series,
        y_uncleaned=uncleaned_series,
    )

    # Then
    assert isinstance(label_cleaner, LabelCleanerMulticlassToBinary)
    assert label_cleaner.problem_type_transform == BINARY
    assert label_cleaner.cat_mappings_dependent_var == {0: 'l1', 1: 'l2'}

    codes_from_series = label_cleaner.transform(raw_series)
    codes_from_array = label_cleaner.transform(raw_array)
    codes_from_categorical = label_cleaner.transform(raw_categorical)
    codes_shifted = label_cleaner.transform(raw_shifted)
    codes_unseen = label_cleaner.transform(unseen_array)

    # The inverse of the shifted codes is taken before the index is moved
    # back below, so it still carries the shifted index.
    restored_series = label_cleaner.inverse_transform(codes_from_series)
    restored_shifted = label_cleaner.inverse_transform(codes_shifted)
    restored_unseen = label_cleaner.inverse_transform(codes_unseen)

    for codes in (codes_from_series, codes_from_array, codes_from_categorical):
        assert expected_codes.equals(codes)

    # Same values but a different index: not equal until the index is reset.
    assert not expected_codes.equals(codes_shifted)
    codes_shifted.index -= 5
    assert expected_codes.equals(codes_shifted)

    assert expected_unseen_codes.equals(codes_unseen)
    assert raw_series.equals(restored_series)
    assert raw_shifted.equals(restored_shifted)
    assert expected_unseen_restored.equals(restored_unseen)

    proba_frame = label_cleaner.inverse_transform_proba(
        proba_input, as_pandas=True)
    pd.testing.assert_frame_equal(expected_proba_frame, proba_frame)
label_column = 'class'  # name of the column we want to predict
train_data = train_data.head(1000)  # keep only the first 1000 rows for a faster demo

#############################################
# Training custom model outside of task.fit #
#############################################

# Split the training frame into the label column and the remaining features.
y_train = train_data[label_column]
X_train = train_data.drop(columns=[label_column])

# Infer the problem type from the labels (it could also be specified directly).
problem_type = infer_problem_type(y=y_train)

naive_bayes_model = NaiveBayesModel(
    path='AutogluonModels/',
    name='CustomNaiveBayes',
    problem_type=problem_type,
)

# The LabelCleaner converts labels to float/integer form for model
# training/inference; it can also inverse_transform back to the originals.
label_cleaner = LabelCleaner.construct(problem_type=problem_type, y=y_train)
y_train_clean = label_cleaner.transform(y_train)

# Fit the custom model on the cleaned labels.
naive_bayes_model.fit(X_train=X_train, y_train=y_train_clean)

# To save to disk and load the model, do the following:
# load_path = naive_bayes_model.path
# naive_bayes_model.save()
# del naive_bayes_model
# naive_bayes_model = NaiveBayesModel.load(path=load_path)

# Prepare test data, cleaning its labels with the cleaner built on the
# training labels.
X_test = test_data.drop(columns=[label_column])
y_test = test_data[label_column]
y_test_clean = label_cleaner.transform(y_test)