def setUp(self):
  """Loads the UCI Heart and Boston datasets and builds shared test fixtures.

  Prepares, for both datasets: an 80/20 train/test split, the TF feature
  columns, and the TFL feature configs that the canned-estimator tests pick
  and choose from.
  """
  super(CannedEstimatorsTest, self).setUp()
  # Tolerance used by tests when comparing floating point metrics.
  self.eps = 0.001

  # UCI Statlog (Heart) dataset.
  # Fetched over HTTPS (the host serves both; plain HTTP is unnecessary and
  # subject to tampering).
  heart_csv_file = tf.keras.utils.get_file(
      'heart.csv',
      'https://storage.googleapis.com/download.tensorflow.org/data/heart.csv')
  heart_df = pd.read_csv(heart_csv_file)
  # 'target' is the binary label column; pop it so heart_df holds features only.
  heart_target = heart_df.pop('target')
  # First 80% of rows are train, the rest test (no shuffling).
  heart_train_size = int(len(heart_df) * 0.8)
  self.heart_train_x = heart_df[:heart_train_size]
  self.heart_train_y = heart_target[:heart_train_size]
  self.heart_test_x = heart_df[heart_train_size:]
  self.heart_test_y = heart_target[heart_train_size:]

  # Feature columns.
  # - age
  # - sex
  # - cp        chest pain type (4 values)
  # - trestbps  resting blood pressure
  # - chol      serum cholestoral in mg/dl
  # - fbs       fasting blood sugar > 120 mg/dl
  # - restecg   resting electrocardiographic results (values 0,1,2)
  # - thalach   maximum heart rate achieved
  # - exang     exercise induced angina
  # - oldpeak   ST depression induced by exercise relative to rest
  # - slope     the slope of the peak exercise ST segment
  # - ca        number of major vessels (0-3) colored by flourosopy
  # - thal      3 = normal; 6 = fixed defect; 7 = reversable defect
  self.heart_feature_columns = [
      fc.numeric_column('age', default_value=-1),
      fc.categorical_column_with_vocabulary_list('sex', [0, 1]),
      fc.numeric_column('cp'),
      fc.numeric_column('trestbps', default_value=-1),
      fc.numeric_column('chol'),
      fc.categorical_column_with_vocabulary_list('fbs', [0, 1]),
      fc.categorical_column_with_vocabulary_list('restecg', [0, 1, 2]),
      fc.numeric_column('thalach'),
      fc.categorical_column_with_vocabulary_list('exang', [0, 1]),
      fc.numeric_column('oldpeak'),
      fc.categorical_column_with_vocabulary_list('slope', [0, 1, 2]),
      fc.numeric_column('ca'),
      fc.categorical_column_with_vocabulary_list(
          'thal', ['normal', 'fixed', 'reversible']),
  ]

  # Feature configs. Each model can pick and choose which features to use.
  # Numeric monotonicities are given both as ints (1/-1) and as strings
  # ('increasing'/'decreasing') to cover both accepted spellings; categorical
  # monotonicities are partial orders given as (smaller, larger) pairs.
  self.heart_feature_configs = [
      configs.FeatureConfig(
          name='age',
          lattice_size=3,
          pwl_calibration_num_keypoints=5,
          monotonicity=1,
          pwl_calibration_clip_max=100,
      ),
      configs.FeatureConfig(
          name='cp',
          pwl_calibration_num_keypoints=4,
          pwl_calibration_input_keypoints='uniform',
          monotonicity='increasing',
      ),
      configs.FeatureConfig(
          name='chol',
          pwl_calibration_input_keypoints=[126.0, 210.0, 247.0, 286.0, 564.0],
          monotonicity=1,
          pwl_calibration_clip_min=130,
          pwl_calibration_clamp_min=True,
          pwl_calibration_clamp_max=True,
          regularizer_configs=[
              configs.RegularizerConfig(name='calib_hessian', l2=1e-4),
          ],
      ),
      configs.FeatureConfig(
          name='fbs',
          monotonicity=[(0, 1)],
      ),
      configs.FeatureConfig(
          name='trestbps',
          pwl_calibration_num_keypoints=5,
          monotonicity='decreasing',
      ),
      configs.FeatureConfig(
          name='thalach',
          pwl_calibration_num_keypoints=5,
          monotonicity=-1,
      ),
      configs.FeatureConfig(
          name='restecg',
          monotonicity=[(0, 1), (0, 2)],
      ),
      configs.FeatureConfig(
          name='exang',
          monotonicity=[(0, 1)],
      ),
      configs.FeatureConfig(
          name='oldpeak',
          pwl_calibration_num_keypoints=5,
          monotonicity=1,
      ),
      configs.FeatureConfig(
          name='slope',
          monotonicity=[(0, 1), (1, 2)],
      ),
      configs.FeatureConfig(
          name='ca',
          pwl_calibration_num_keypoints=4,
          monotonicity='increasing',
      ),
      configs.FeatureConfig(
          name='thal',
          monotonicity=[('normal', 'fixed'), ('normal', 'reversible')],
      ),
  ]

  # UCI Boston dataset.
  # NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed
  # in 1.2 — confirm the pinned scikit-learn version still provides it.
  boston_dataset = load_boston()
  boston_df = pd.DataFrame(
      boston_dataset.data, columns=boston_dataset.feature_names)
  # CHAS is a 0/1 dummy variable; cast to int so it can feed the categorical
  # column below.
  boston_df['CHAS'] = boston_df['CHAS'].astype(np.int32)
  boston_target = pd.Series(boston_dataset.target)
  boston_train_size = int(len(boston_df) * 0.8)
  self.boston_train_x = boston_df[:boston_train_size]
  self.boston_train_y = boston_target[:boston_train_size]
  self.boston_test_x = boston_df[boston_train_size:]
  self.boston_test_y = boston_target[boston_train_size:]

  # Feature columns.
  # - CRIM     per capita crime rate by town
  # - ZN       proportion of residential land zoned for lots over 25,000 sq.ft
  # - INDUS    proportion of non-retail business acres per town
  # - CHAS     Charles River dummy variable (= 1 if tract bounds river)
  # - NOX      nitric oxides concentration (parts per 10 million)
  # - RM       average number of rooms per dwelling
  # - AGE      proportion of owner-occupied units built prior to 1940
  # - DIS      weighted distances to five Boston employment centres
  # - RAD      index of accessibility to radial highways
  # - TAX      full-value property-tax rate per $10,000
  # - PTRATIO  pupil-teacher ratio by town
  # - B        1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
  # - LSTAT    % lower status of the population
  # - Target   Median value of owner-occupied homes in $1000's
  self.boston_feature_columns = [
      fc.numeric_column('CRIM'),
      fc.numeric_column('ZN'),
      fc.numeric_column('INDUS'),
      fc.categorical_column_with_vocabulary_list('CHAS', [0, 1]),
      fc.numeric_column('NOX'),
      fc.numeric_column('RM'),
      fc.numeric_column('AGE'),
      fc.numeric_column('DIS'),
      fc.numeric_column('RAD'),
      fc.numeric_column('TAX'),
      fc.numeric_column('PTRATIO'),
      fc.numeric_column('B'),
      fc.numeric_column('LSTAT'),
  ]

  # Feature configs. Each model can pick and choose which features to use.
  # These exercise additional constraint types: convexity, unimodality,
  # trust and dominance relations between feature pairs.
  self.boston_feature_configs = [
      configs.FeatureConfig(
          name='CRIM',
          lattice_size=3,
          monotonicity=-1,
          pwl_calibration_convexity=1,
      ),
      configs.FeatureConfig(
          name='ZN',
          pwl_calibration_input_keypoints=[0.0, 25.0, 50.0, 75.0, 100.0],
          monotonicity=1,
          reflects_trust_in=[
              configs.TrustConfig(feature_name='RM', trust_type='trapezoid'),
          ],
      ),
      configs.FeatureConfig(
          name='INDUS',
          pwl_calibration_input_keypoints='uniform',
          pwl_calibration_always_monotonic=False,
          reflects_trust_in=[
              configs.TrustConfig(
                  feature_name='RM',
                  trust_type='edgeworth',
                  direction='negative'),
          ],
          regularizer_configs=[
              configs.RegularizerConfig(name='calib_wrinkle', l2=1e-4),
          ],
      ),
      configs.FeatureConfig(name='CHAS',),
      configs.FeatureConfig(name='NOX',),
      configs.FeatureConfig(
          name='RM',
          monotonicity='increasing',
          pwl_calibration_convexity='concave',
      ),
      configs.FeatureConfig(
          name='AGE',
          monotonicity=-1,
      ),
      configs.FeatureConfig(
          name='DIS',
          lattice_size=3,
          unimodality=1,
      ),
      configs.FeatureConfig(name='RAD',),
      configs.FeatureConfig(name='TAX',),
      configs.FeatureConfig(
          name='PTRATIO',
          monotonicity='decreasing',
      ),
      configs.FeatureConfig(name='B',),
      configs.FeatureConfig(
          name='LSTAT',
          monotonicity=-1,
          dominates=[
              configs.DominanceConfig(
                  feature_name='AGE', dominance_type='monotonic'),
          ],
      ),
  ]
def setUp(self):
  """Loads the UCI Heart dataset and builds the premade-model test fixtures.

  Produces train/test feature lists and labels (as plain numpy arrays in the
  exact order of `self.heart_feature_names`), label bounds, and the fully
  specified TFL feature configs (with explicitly computed calibration
  keypoints) used by the premade model tests.
  """
  super(PremadeTest, self).setUp()

  # UCI Statlog (Heart) dataset.
  # Fetched over HTTPS (the host serves both; plain HTTP is unnecessary and
  # subject to tampering).
  heart_csv_file = tf.keras.utils.get_file(
      'heart.csv', 'https://storage.googleapis.com/applied-dl/heart.csv')
  heart_df = pd.read_csv(heart_csv_file)
  # First 80% of rows are train, the rest test (no shuffling).
  heart_train_size = int(len(heart_df) * 0.8)
  heart_train_dataframe = heart_df[:heart_train_size]
  heart_test_dataframe = heart_df[heart_train_size:]

  # Features:
  # - age
  # - sex
  # - cp        chest pain type (4 values)
  # - trestbps  resting blood pressure
  # - chol      serum cholestoral in mg/dl
  # - fbs       fasting blood sugar > 120 mg/dl
  # - restecg   resting electrocardiographic results (values 0,1,2)
  # - thalach   maximum heart rate achieved
  # - exang     exercise induced angina
  # - oldpeak   ST depression induced by exercise relative to rest
  # - slope     the slope of the peak exercise ST segment
  # - ca        number of major vessels (0-3) colored by flourosopy
  # - thal      3 = normal; 6 = fixed defect; 7 = reversable defect
  #
  # This ordering of feature names will be the exact same order that we
  # construct our model to expect.
  self.heart_feature_names = [
      'age', 'sex', 'cp', 'chol', 'fbs', 'trestbps', 'thalach', 'restecg',
      'exang', 'oldpeak', 'slope', 'ca', 'thal'
  ]
  # Maps a feature name to its position in the per-feature input list.
  feature_name_indices = {
      name: index for index, name in enumerate(self.heart_feature_names)
  }

  # This is the vocab list and mapping we will use for the 'thal' categorical
  # feature.
  thal_vocab_list = ['normal', 'fixed', 'reversible']
  thal_map = {category: i for i, category in enumerate(thal_vocab_list)}

  # Custom function for converting thal categories to buckets
  def convert_thal_features(thal_features):
    # Note that two examples in the test set are already converted.
    return np.array([
        thal_map[feature] if feature in thal_vocab_list else feature
        for feature in thal_features
    ])

  # Custom function for extracting each feature.
  def extract_features(dataframe, label_name='target'):
    features = []
    for feature_name in self.heart_feature_names:
      if feature_name == 'thal':
        features.append(
            convert_thal_features(dataframe[feature_name].values).astype(
                float))
      else:
        features.append(dataframe[feature_name].values.astype(float))
    labels = dataframe[label_name].values.astype(float)
    return features, labels

  self.heart_train_x, self.heart_train_y = extract_features(
      heart_train_dataframe)
  self.heart_test_x, self.heart_test_y = extract_features(
      heart_test_dataframe)

  # Let's define our label minimum and maximum.
  self.heart_min_label = float(np.min(self.heart_train_y))
  self.heart_max_label = float(np.max(self.heart_train_y))
  # Our lattice models may have predictions above 1.0 due to numerical errors.
  # We can subtract this small epsilon value from our output_max to make sure
  # we do not predict values outside of our label bound.
  self.numerical_error_epsilon = 1e-5

  def compute_quantiles(features,
                        num_keypoints=10,
                        clip_min=None,
                        clip_max=None,
                        missing_value=None):
    # Clip min and max if desired.
    if clip_min is not None:
      features = np.maximum(features, clip_min)
      features = np.append(features, clip_min)
    if clip_max is not None:
      features = np.minimum(features, clip_max)
      features = np.append(features, clip_max)
    # Make features unique.
    unique_features = np.unique(features)
    # Remove missing values if specified.
    if missing_value is not None:
      unique_features = np.delete(unique_features,
                                  np.where(unique_features == missing_value))
    # Compute and return quantiles over unique non-missing feature values.
    # NOTE(review): `interpolation=` was renamed `method=` in NumPy 1.22 and
    # the old name is deprecated; kept as-is for older-NumPy compatibility —
    # confirm against the pinned NumPy version.
    return np.quantile(
        unique_features,
        np.linspace(0., 1., num=num_keypoints),
        interpolation='nearest').astype(float)

  self.heart_feature_configs = [
      configs.FeatureConfig(
          name='age',
          lattice_size=3,
          monotonicity='increasing',
          # We must set the keypoints manually.
          pwl_calibration_num_keypoints=5,
          pwl_calibration_input_keypoints=compute_quantiles(
              self.heart_train_x[feature_name_indices['age']],
              num_keypoints=5,
              clip_max=100),
          # Per feature regularization.
          regularizer_configs=[
              configs.RegularizerConfig(name='calib_wrinkle', l2=0.1),
          ],
      ),
      configs.FeatureConfig(
          name='sex',
          num_buckets=2,
      ),
      configs.FeatureConfig(
          name='cp',
          monotonicity='increasing',
          # Keypoints that are uniformly spaced.
          pwl_calibration_num_keypoints=4,
          pwl_calibration_input_keypoints=np.linspace(
              np.min(self.heart_train_x[feature_name_indices['cp']]),
              np.max(self.heart_train_x[feature_name_indices['cp']]),
              num=4),
      ),
      configs.FeatureConfig(
          name='chol',
          monotonicity='increasing',
          # Explicit input keypoints initialization.
          pwl_calibration_input_keypoints=[126.0, 210.0, 247.0, 286.0, 564.0],
          # Calibration can be forced to span the full output range
          # by clamping.
          pwl_calibration_clamp_min=True,
          pwl_calibration_clamp_max=True,
          # Per feature regularization.
          regularizer_configs=[
              configs.RegularizerConfig(name='calib_hessian', l2=1e-4),
          ],
      ),
      configs.FeatureConfig(
          name='fbs',
          # Partial monotonicity: output(0) <= output(1)
          monotonicity=[(0, 1)],
          num_buckets=2,
      ),
      configs.FeatureConfig(
          name='trestbps',
          monotonicity='decreasing',
          pwl_calibration_num_keypoints=5,
          pwl_calibration_input_keypoints=compute_quantiles(
              self.heart_train_x[feature_name_indices['trestbps']],
              num_keypoints=5),
      ),
      configs.FeatureConfig(
          name='thalach',
          monotonicity='decreasing',
          pwl_calibration_num_keypoints=5,
          pwl_calibration_input_keypoints=compute_quantiles(
              self.heart_train_x[feature_name_indices['thalach']],
              num_keypoints=5),
      ),
      configs.FeatureConfig(
          name='restecg',
          # Partial monotonicity:
          # output(0) <= output(1), output(0) <= output(2)
          monotonicity=[(0, 1), (0, 2)],
          num_buckets=3,
      ),
      configs.FeatureConfig(
          name='exang',
          # Partial monotonicity: output(0) <= output(1)
          monotonicity=[(0, 1)],
          num_buckets=2,
      ),
      configs.FeatureConfig(
          name='oldpeak',
          monotonicity='increasing',
          pwl_calibration_num_keypoints=5,
          pwl_calibration_input_keypoints=compute_quantiles(
              self.heart_train_x[feature_name_indices['oldpeak']],
              num_keypoints=5),
      ),
      configs.FeatureConfig(
          name='slope',
          # Partial monotonicity:
          # output(0) <= output(1), output(1) <= output(2)
          monotonicity=[(0, 1), (1, 2)],
          num_buckets=3,
      ),
      configs.FeatureConfig(
          name='ca',
          monotonicity='increasing',
          pwl_calibration_num_keypoints=4,
          pwl_calibration_input_keypoints=compute_quantiles(
              self.heart_train_x[feature_name_indices['ca']],
              num_keypoints=4),
      ),
      configs.FeatureConfig(
          name='thal',
          # Partial monotonicity:
          # output(normal) <= output(fixed)
          # output(normal) <= output(reversible)
          monotonicity=[('normal', 'fixed'), ('normal', 'reversible')],
          num_buckets=3,
          # We must specify the vocabulary list in order to later set the
          # monotonicities since we used names and not indices.
          vocabulary_list=thal_vocab_list,
      ),
  ]
  # Translate the name-based categorical monotonicity pairs above into the
  # index-based form the models expect.
  premade_lib.set_categorical_monotonicities(self.heart_feature_configs)
def test_updates(self):
  """Tests configs.apply_updates on a CalibratedLatticeConfig.

  Builds a config with four features and one global regularizer, applies a
  batch of double-underscore-delimited hparam updates (some updating existing
  entries, some inserting new features/regularizers, some unrelated keys that
  must be ignored), and verifies the resulting config contents.
  """
  model_config = configs.CalibratedLatticeConfig(
      output_min=0,
      regularizer_configs=[
          configs.RegularizerConfig(name='torsion', l2=2e-3),
      ],
      feature_configs=[
          configs.FeatureConfig(
              name='feature_a',
              pwl_calibration_input_keypoints='quantiles',
              pwl_calibration_num_keypoints=8,
              monotonicity=1,
              pwl_calibration_clip_max=100,
          ),
          configs.FeatureConfig(
              name='feature_b',
              lattice_size=3,
              unimodality='valley',
              pwl_calibration_input_keypoints='uniform',
              pwl_calibration_num_keypoints=5,
              pwl_calibration_clip_min=130,
              pwl_calibration_convexity='convex',
              regularizer_configs=[
                  configs.RegularizerConfig(name='calib_hessian', l2=3e-3),
              ],
          ),
          configs.FeatureConfig(
              name='feature_c',
              pwl_calibration_input_keypoints=[0.0, 0.5, 1.0],
              reflects_trust_in=[
                  configs.TrustConfig(feature_name='feature_a'),
                  configs.TrustConfig(feature_name='feature_b', direction=-1),
              ],
          ),
          configs.FeatureConfig(
              name='feature_d',
              num_buckets=3,
              vocabulary_list=['a', 'b', 'c'],
              default_value=-1,
          ),
      ])
  updates = [
      # Update values can be passed in as numbers.
      ('output_max', 1.0),  # update
      ('regularizer__torsion__l2', 0.004),  # update
      ('regularizer__calib_hessian__l1', 0.005),  # insert
      ('feature__feature_a__lattice_size', 3),  # update
      ('feature__feature_e__lattice_size', 4),  # insert
      # Update values can be strings.
      ('unrelated_hparams_not_affecting_config', 'unrelated'),
      ('feature__feature_a__regularizer__calib_wrinkle__l1', '0.6'),  # insert
      ('feature__feature_b__regularizer__calib_hessian__l1', '0.7'),  # update
      ('yet__another__unrelated_config', '4'),
  ]
  # Exactly 7 of the 9 updates apply; the two unrelated keys are ignored.
  self.assertEqual(configs.apply_updates(model_config, updates), 7)
  # Direct mutation after apply_updates: update an existing feature and
  # insert a brand-new one via feature_config_by_name.
  model_config.feature_config_by_name('feature_a').monotonicity = 'none'
  model_config.feature_config_by_name(
      'feature_f').num_buckets = 4  # insert
  feature_names = [
      feature_config.name for feature_config in model_config.feature_configs
  ]
  expected_feature_names = [
      'feature_a', 'feature_b', 'feature_c', 'feature_d', 'feature_e',
      'feature_f'
  ]
  self.assertCountEqual(feature_names, expected_feature_names)
  global_regularizer_names = [
      regularizer_config.name
      for regularizer_config in model_config.regularizer_configs
  ]
  expected_global_regularizer_names = ['torsion', 'calib_hessian']
  self.assertCountEqual(global_regularizer_names,
                        expected_global_regularizer_names)
  # Spot-check updated, inserted, and untouched values.
  self.assertEqual(model_config.output_max, 1.0)
  self.assertEqual(
      model_config.feature_config_by_name('feature_a').lattice_size, 3)
  self.assertEqual(
      model_config.feature_config_by_name(
          'feature_b').pwl_calibration_convexity, 'convex')
  self.assertEqual(
      model_config.feature_config_by_name('feature_e').lattice_size, 4)
  self.assertEqual(
      model_config.regularizer_config_by_name('torsion').l2, 0.004)
  self.assertEqual(
      model_config.regularizer_config_by_name('calib_hessian').l1, 0.005)
  # String update values ('0.6', '0.7') are parsed into floats.
  self.assertEqual(
      model_config.feature_config_by_name(
          'feature_a').regularizer_config_by_name('calib_wrinkle').l1, 0.6)
  self.assertEqual(
      model_config.feature_config_by_name(
          'feature_b').regularizer_config_by_name('calib_hessian').l1, 0.7)
import copy
import json

from absl import logging
import numpy as np
import pandas as pd
import tensorflow as tf

from tensorflow_lattice.python import configs
from tensorflow_lattice.python import premade
from tensorflow_lattice.python import premade_lib

# Shared fixture: two numeric features with explicit uniform keypoints in
# [0, 1] and one two-bucket categorical feature with a partial monotonicity
# constraint, used by tests below.
unspecified_feature_configs = [
    configs.FeatureConfig(
        name='numerical_1',
        lattice_size=2,
        pwl_calibration_input_keypoints=np.linspace(0.0, 1.0, num=10),
    ),
    configs.FeatureConfig(
        name='numerical_2',
        lattice_size=2,
        pwl_calibration_input_keypoints=np.linspace(0.0, 1.0, num=10),
    ),
    configs.FeatureConfig(
        name='categorical',
        lattice_size=2,
        num_buckets=2,
        # Partial monotonicity over vocabulary entries: output('0.0') <=
        # output('1.0').
        monotonicity=[('0.0', '1.0')],
        vocabulary_list=['0.0', '1.0'],
    ),
]
def test_from_config(self):
  """Tests get_config/from_config round-trips for all three model configs.

  For CalibratedLatticeEnsembleConfig, CalibratedLatticeConfig, and
  CalibratedLinearConfig, serializes the config with get_config() and
  verifies that from_config() reconstructs an identical config dict.
  """
  feature_configs = [
      configs.FeatureConfig(
          name='feature_a',
          pwl_calibration_input_keypoints='quantiles',
          pwl_calibration_num_keypoints=8,
          monotonicity=1,
          pwl_calibration_clip_max=100,
      ),
      configs.FeatureConfig(
          name='feature_b',
          lattice_size=3,
          unimodality='valley',
          pwl_calibration_input_keypoints='uniform',
          pwl_calibration_num_keypoints=5,
          pwl_calibration_clip_min=130,
          pwl_calibration_convexity='convex',
          regularizer_configs=[
              # Fixed typo: was 'calib_hesian'; the regularizer used
              # throughout this file is 'calib_hessian'.
              configs.RegularizerConfig(name='calib_hessian', l2=3e-3),
          ],
      ),
      configs.FeatureConfig(
          name='feature_c',
          pwl_calibration_input_keypoints=[0.0, 0.5, 1.0],
          reflects_trust_in=[
              configs.TrustConfig(feature_name='feature_a'),
              configs.TrustConfig(feature_name='feature_b', direction=-1),
          ],
          dominates=[
              configs.DominanceConfig(
                  feature_name='feature_d', dominance_type='monotonic'),
          ],
      ),
      configs.FeatureConfig(
          name='feature_d',
          num_buckets=3,
          vocabulary_list=['a', 'b', 'c'],
          default_value=-1,
      ),
  ]
  # First we test CalibratedLatticeEnsembleConfig
  model_config = configs.CalibratedLatticeEnsembleConfig(
      feature_configs=feature_configs,
      lattices=[['feature_a', 'feature_b'], ['feature_c', 'feature_d']],
      separate_calibrators=True,
      regularizer_configs=[
          configs.RegularizerConfig('torsion', l2=1e-4),
      ],
      output_min=0.0,
      output_max=1.0,
      output_calibration=True,
      output_calibration_num_keypoints=5,
      output_initialization=[0.0, 1.0])
  model_config_copy = configs.CalibratedLatticeEnsembleConfig.from_config(
      model_config.get_config(), tfl_custom_objects)
  self.assertDictEqual(model_config.get_config(),
                       model_config_copy.get_config())
  # Next we test CalibratedLatticeConfig
  model_config = configs.CalibratedLatticeConfig(
      feature_configs=feature_configs,
      regularizer_configs=[
          configs.RegularizerConfig('torsion', l2=1e-4),
      ],
      output_min=0.0,
      output_max=1.0,
      output_calibration=True,
      output_calibration_num_keypoints=8,
      output_initialization='quantiles')
  model_config_copy = configs.CalibratedLatticeConfig.from_config(
      model_config.get_config(), tfl_custom_objects)
  self.assertDictEqual(model_config.get_config(),
                       model_config_copy.get_config())
  # Last we test CalibratedLinearConfig
  model_config = configs.CalibratedLinearConfig(
      feature_configs=feature_configs,
      regularizer_configs=[
          configs.RegularizerConfig('calib_hessian', l2=1e-4),
      ],
      use_bias=True,
      output_min=0.0,
      output_max=None,
      output_calibration=True,
      output_initialization='uniform')
  model_config_copy = configs.CalibratedLinearConfig.from_config(
      model_config.get_config(), tfl_custom_objects)
  self.assertDictEqual(model_config.get_config(),
                       model_config_copy.get_config())