예제 #1
0
  def setUp(self):
    """Builds the Heart and Boston fixtures shared by the tests.

    For each dataset this stores an 80/20 train/test split (first 80% of the
    rows are used for training), the list of feature columns, and the list of
    per-feature configs on `self`.
    """
    super(CannedEstimatorsTest, self).setUp()
    self.eps = 0.001

    # Local shorthands for the constructors that are used repeatedly below.
    numeric = fc.numeric_column
    categorical = fc.categorical_column_with_vocabulary_list
    feature = configs.FeatureConfig

    # ------------------------------------------------------------------
    # UCI Statlog (Heart) dataset. The label is stored in column 'target'.
    # ------------------------------------------------------------------
    heart_csv_path = tf.keras.utils.get_file(
        'heart.csv',
        'http://storage.googleapis.com/download.tensorflow.org/data/heart.csv')
    heart_frame = pd.read_csv(heart_csv_path)
    heart_labels = heart_frame.pop('target')
    heart_split = int(len(heart_frame) * 0.8)
    self.heart_train_x = heart_frame[:heart_split]
    self.heart_test_x = heart_frame[heart_split:]
    self.heart_train_y = heart_labels[:heart_split]
    self.heart_test_y = heart_labels[heart_split:]

    # Heart feature columns:
    #   age
    #   sex
    #   cp        chest pain type (4 values)
    #   trestbps  resting blood pressure
    #   chol      serum cholesterol in mg/dl
    #   fbs       fasting blood sugar > 120 mg/dl
    #   restecg   resting electrocardiographic results (values 0,1,2)
    #   thalach   maximum heart rate achieved
    #   exang     exercise induced angina
    #   oldpeak   ST depression induced by exercise relative to rest
    #   slope     the slope of the peak exercise ST segment
    #   ca        number of major vessels (0-3) colored by fluoroscopy
    #   thal      3 = normal; 6 = fixed defect; 7 = reversible defect
    self.heart_feature_columns = [
        numeric('age', default_value=-1),
        categorical('sex', [0, 1]),
        numeric('cp'),
        numeric('trestbps', default_value=-1),
        numeric('chol'),
        categorical('fbs', [0, 1]),
        categorical('restecg', [0, 1, 2]),
        numeric('thalach'),
        categorical('exang', [0, 1]),
        numeric('oldpeak'),
        categorical('slope', [0, 1, 2]),
        numeric('ca'),
        categorical('thal', ['normal', 'fixed', 'reversible']),
    ]

    # Heart feature configs. Individual models may use any subset of these.
    chol_regularizers = [
        configs.RegularizerConfig(name='calib_hessian', l2=1e-4),
    ]
    self.heart_feature_configs = [
        feature(
            name='age',
            lattice_size=3,
            pwl_calibration_num_keypoints=5,
            monotonicity=1,
            pwl_calibration_clip_max=100),
        feature(
            name='cp',
            pwl_calibration_num_keypoints=4,
            pwl_calibration_input_keypoints='uniform',
            monotonicity='increasing'),
        feature(
            name='chol',
            pwl_calibration_input_keypoints=[126.0, 210.0, 247.0, 286.0, 564.0],
            monotonicity=1,
            pwl_calibration_clip_min=130,
            pwl_calibration_clamp_min=True,
            pwl_calibration_clamp_max=True,
            regularizer_configs=chol_regularizers),
        feature(name='fbs', monotonicity=[(0, 1)]),
        feature(
            name='trestbps',
            pwl_calibration_num_keypoints=5,
            monotonicity='decreasing'),
        feature(
            name='thalach',
            pwl_calibration_num_keypoints=5,
            monotonicity=-1),
        feature(name='restecg', monotonicity=[(0, 1), (0, 2)]),
        feature(name='exang', monotonicity=[(0, 1)]),
        feature(
            name='oldpeak',
            pwl_calibration_num_keypoints=5,
            monotonicity=1),
        feature(name='slope', monotonicity=[(0, 1), (1, 2)]),
        feature(
            name='ca',
            pwl_calibration_num_keypoints=4,
            monotonicity='increasing'),
        feature(
            name='thal',
            monotonicity=[('normal', 'fixed'), ('normal', 'reversible')]),
    ]

    # ------------------------------------------------------------------
    # UCI Boston dataset.
    # ------------------------------------------------------------------
    boston_raw = load_boston()
    boston_frame = pd.DataFrame(
        boston_raw.data, columns=boston_raw.feature_names)
    # CHAS is used as a categorical column below, so store it as integers.
    boston_frame['CHAS'] = boston_frame['CHAS'].astype(np.int32)
    boston_labels = pd.Series(boston_raw.target)
    boston_split = int(len(boston_frame) * 0.8)
    self.boston_train_x = boston_frame[:boston_split]
    self.boston_test_x = boston_frame[boston_split:]
    self.boston_train_y = boston_labels[:boston_split]
    self.boston_test_y = boston_labels[boston_split:]

    # Boston feature columns:
    #   CRIM     per capita crime rate by town
    #   ZN       proportion of residential land zoned for lots over
    #            25,000 sq.ft
    #   INDUS    proportion of non-retail business acres per town
    #   CHAS     Charles River dummy variable (= 1 if tract bounds river)
    #   NOX      nitric oxides concentration (parts per 10 million)
    #   RM       average number of rooms per dwelling
    #   AGE      proportion of owner-occupied units built prior to 1940
    #   DIS      weighted distances to five Boston employment centres
    #   RAD      index of accessibility to radial highways
    #   TAX      full-value property-tax rate per $10,000
    #   PTRATIO  pupil-teacher ratio by town
    #   B        1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
    #   LSTAT    % lower status of the population
    #   Target   Median value of owner-occupied homes in $1000's
    self.boston_feature_columns = [
        numeric('CRIM'),
        numeric('ZN'),
        numeric('INDUS'),
        categorical('CHAS', [0, 1]),
        numeric('NOX'),
        numeric('RM'),
        numeric('AGE'),
        numeric('DIS'),
        numeric('RAD'),
        numeric('TAX'),
        numeric('PTRATIO'),
        numeric('B'),
        numeric('LSTAT'),
    ]

    # Boston feature configs. Individual models may use any subset of these.
    self.boston_feature_configs = [
        feature(
            name='CRIM',
            lattice_size=3,
            monotonicity=-1,
            pwl_calibration_convexity=1),
        feature(
            name='ZN',
            pwl_calibration_input_keypoints=[0.0, 25.0, 50.0, 75.0, 100.0],
            monotonicity=1,
            reflects_trust_in=[
                configs.TrustConfig(feature_name='RM', trust_type='trapezoid'),
            ]),
        feature(
            name='INDUS',
            pwl_calibration_input_keypoints='uniform',
            pwl_calibration_always_monotonic=False,
            reflects_trust_in=[
                configs.TrustConfig(
                    feature_name='RM',
                    trust_type='edgeworth',
                    direction='negative'),
            ],
            regularizer_configs=[
                configs.RegularizerConfig(name='calib_wrinkle', l2=1e-4),
            ]),
        feature(name='CHAS'),
        feature(name='NOX'),
        feature(
            name='RM',
            monotonicity='increasing',
            pwl_calibration_convexity='concave'),
        feature(name='AGE', monotonicity=-1),
        feature(name='DIS', lattice_size=3, unimodality=1),
        feature(name='RAD'),
        feature(name='TAX'),
        feature(name='PTRATIO', monotonicity='decreasing'),
        feature(name='B'),
        feature(
            name='LSTAT',
            monotonicity=-1,
            dominates=[
                configs.DominanceConfig(
                    feature_name='AGE', dominance_type='monotonic'),
            ]),
    ]
예제 #2
0
  def setUp(self):
    """Prepares the UCI Statlog (Heart) data and feature configs for the tests.

    Fetches the heart CSV through `tf.keras.utils.get_file`, splits the rows
    80/20 into train and test, converts every feature to a float array in the
    order given by `self.heart_feature_names`, records the label bounds, and
    builds `self.heart_feature_configs` with explicitly computed input
    keypoints.
    """
    super(PremadeTest, self).setUp()

    # UCI Statlog (Heart) dataset.
    heart_csv_file = tf.keras.utils.get_file(
        'heart.csv', 'http://storage.googleapis.com/applied-dl/heart.csv')
    heart_df = pd.read_csv(heart_csv_file)
    heart_train_size = int(len(heart_df) * 0.8)
    heart_train_dataframe = heart_df[:heart_train_size]
    heart_test_dataframe = heart_df[heart_train_size:]

    # Features:
    # - age
    # - sex
    # - cp        chest pain type (4 values)
    # - trestbps  resting blood pressure
    # - chol      serum cholesterol in mg/dl
    # - fbs       fasting blood sugar > 120 mg/dl
    # - restecg   resting electrocardiographic results (values 0,1,2)
    # - thalach   maximum heart rate achieved
    # - exang     exercise induced angina
    # - oldpeak   ST depression induced by exercise relative to rest
    # - slope     the slope of the peak exercise ST segment
    # - ca        number of major vessels (0-3) colored by fluoroscopy
    # - thal      3 = normal; 6 = fixed defect; 7 = reversible defect
    #
    # This ordering of feature names will be the exact same order that we
    # construct our model to expect.
    self.heart_feature_names = [
        'age', 'sex', 'cp', 'chol', 'fbs', 'trestbps', 'thalach', 'restecg',
        'exang', 'oldpeak', 'slope', 'ca', 'thal'
    ]
    feature_name_indices = {
        name: index for index, name in enumerate(self.heart_feature_names)
    }
    # This is the vocab list and mapping we will use for the 'thal' categorical
    # feature.
    thal_vocab_list = ['normal', 'fixed', 'reversible']
    thal_map = {category: i for i, category in enumerate(thal_vocab_list)}

    def convert_thal_features(thal_features):
      """Maps thal category names to their vocabulary index.

      Values that are not in the vocabulary are passed through unchanged.
      Note that two examples in the test set are already converted.
      """
      return np.array(
          [thal_map.get(feature, feature) for feature in thal_features])

    def extract_features(dataframe, label_name='target'):
      """Returns (features, labels) extracted from `dataframe`.

      `features` is a list with one float array per name in
      `self.heart_feature_names` (in that order); `labels` is the float array
      taken from the `label_name` column.
      """
      features = []
      for feature_name in self.heart_feature_names:
        if feature_name == 'thal':
          features.append(
              convert_thal_features(
                  dataframe[feature_name].values).astype(float))
        else:
          features.append(dataframe[feature_name].values.astype(float))
      labels = dataframe[label_name].values.astype(float)
      return features, labels

    self.heart_train_x, self.heart_train_y = extract_features(
        heart_train_dataframe)
    self.heart_test_x, self.heart_test_y = extract_features(
        heart_test_dataframe)

    # Let's define our label minimum and maximum.
    self.heart_min_label = float(np.min(self.heart_train_y))
    self.heart_max_label = float(np.max(self.heart_train_y))
    # Our lattice models may have predictions above 1.0 due to numerical errors.
    # We can subtract this small epsilon value from our output_max to make sure
    # we do not predict values outside of our label bound.
    self.numerical_error_epsilon = 1e-5

    def compute_quantiles(features,
                          num_keypoints=10,
                          clip_min=None,
                          clip_max=None,
                          missing_value=None):
      """Returns `num_keypoints` quantile keypoints of the unique feature values.

      Args:
        features: Array of feature values.
        num_keypoints: Number of evenly spaced quantiles in [0, 1] to compute.
        clip_min: If given, values are clipped from below to it and it is added
          as a candidate value.
        clip_max: If given, values are clipped from above to it and it is added
          as a candidate value.
        missing_value: If given, this value is dropped before the quantiles are
          computed.

      Returns:
        Float array of quantiles picked with the 'nearest' rule.
      """
      # Clip min and max if desired.
      if clip_min is not None:
        features = np.maximum(features, clip_min)
        features = np.append(features, clip_min)
      if clip_max is not None:
        features = np.minimum(features, clip_max)
        features = np.append(features, clip_max)
      # Make features unique.
      unique_features = np.unique(features)
      # Remove missing values if specified.
      if missing_value is not None:
        unique_features = np.delete(unique_features,
                                    np.where(unique_features == missing_value))
      # Compute and return quantiles over unique non-missing feature values.
      quantile_positions = np.linspace(0., 1., num=num_keypoints)
      try:
        # NumPy >= 1.22 names this option `method`; the older `interpolation`
        # keyword is deprecated there.
        keypoints = np.quantile(
            unique_features, quantile_positions, method='nearest')
      except TypeError:
        # Older NumPy releases only understand `interpolation`.
        keypoints = np.quantile(
            unique_features, quantile_positions, interpolation='nearest')
      return keypoints.astype(float)

    self.heart_feature_configs = [
        configs.FeatureConfig(
            name='age',
            lattice_size=3,
            monotonicity='increasing',
            # We must set the keypoints manually.
            pwl_calibration_num_keypoints=5,
            pwl_calibration_input_keypoints=compute_quantiles(
                self.heart_train_x[feature_name_indices['age']],
                num_keypoints=5,
                clip_max=100),
            # Per feature regularization.
            regularizer_configs=[
                configs.RegularizerConfig(name='calib_wrinkle', l2=0.1),
            ],
        ),
        configs.FeatureConfig(
            name='sex',
            num_buckets=2,
        ),
        configs.FeatureConfig(
            name='cp',
            monotonicity='increasing',
            # Keypoints that are uniformly spaced.
            pwl_calibration_num_keypoints=4,
            pwl_calibration_input_keypoints=np.linspace(
                np.min(self.heart_train_x[feature_name_indices['cp']]),
                np.max(self.heart_train_x[feature_name_indices['cp']]),
                num=4),
        ),
        configs.FeatureConfig(
            name='chol',
            monotonicity='increasing',
            # Explicit input keypoints initialization.
            pwl_calibration_input_keypoints=[126.0, 210.0, 247.0, 286.0, 564.0],
            # Calibration can be forced to span the full output range
            # by clamping.
            pwl_calibration_clamp_min=True,
            pwl_calibration_clamp_max=True,
            # Per feature regularization.
            regularizer_configs=[
                configs.RegularizerConfig(name='calib_hessian', l2=1e-4),
            ],
        ),
        configs.FeatureConfig(
            name='fbs',
            # Partial monotonicity: output(0) <= output(1)
            monotonicity=[(0, 1)],
            num_buckets=2,
        ),
        configs.FeatureConfig(
            name='trestbps',
            monotonicity='decreasing',
            pwl_calibration_num_keypoints=5,
            pwl_calibration_input_keypoints=compute_quantiles(
                self.heart_train_x[feature_name_indices['trestbps']],
                num_keypoints=5),
        ),
        configs.FeatureConfig(
            name='thalach',
            monotonicity='decreasing',
            pwl_calibration_num_keypoints=5,
            pwl_calibration_input_keypoints=compute_quantiles(
                self.heart_train_x[feature_name_indices['thalach']],
                num_keypoints=5),
        ),
        configs.FeatureConfig(
            name='restecg',
            # Partial monotonicity:
            # output(0) <= output(1), output(0) <= output(2)
            monotonicity=[(0, 1), (0, 2)],
            num_buckets=3,
        ),
        configs.FeatureConfig(
            name='exang',
            # Partial monotonicity: output(0) <= output(1)
            monotonicity=[(0, 1)],
            num_buckets=2,
        ),
        configs.FeatureConfig(
            name='oldpeak',
            monotonicity='increasing',
            pwl_calibration_num_keypoints=5,
            pwl_calibration_input_keypoints=compute_quantiles(
                self.heart_train_x[feature_name_indices['oldpeak']],
                num_keypoints=5),
        ),
        configs.FeatureConfig(
            name='slope',
            # Partial monotonicity:
            # output(0) <= output(1), output(1) <= output(2)
            monotonicity=[(0, 1), (1, 2)],
            num_buckets=3,
        ),
        configs.FeatureConfig(
            name='ca',
            monotonicity='increasing',
            pwl_calibration_num_keypoints=4,
            pwl_calibration_input_keypoints=compute_quantiles(
                self.heart_train_x[feature_name_indices['ca']],
                num_keypoints=4),
        ),
        configs.FeatureConfig(
            name='thal',
            # Partial monotonicity:
            # output(normal) <= output(fixed)
            # output(normal) <= output(reversible)
            monotonicity=[('normal', 'fixed'), ('normal', 'reversible')],
            num_buckets=3,
            # We must specify the vocabulary list in order to later set the
            # monotonicities since we used names and not indices.
            vocabulary_list=thal_vocab_list,
        ),
    ]
    premade_lib.set_categorical_monotonicities(self.heart_feature_configs)
예제 #3
0
    def test_updates(self):
        """Exercises `configs.apply_updates` on a calibrated lattice config.

        A mix of updates to existing entries, inserts of new entries and
        unrelated keys is applied; the call is expected to return 7. The
        resulting feature names, global regularizer names and individual
        settings are then checked.
        """
        # Initial model config: one global regularizer and four features.
        global_regularizers = [
            configs.RegularizerConfig(name='torsion', l2=2e-3),
        ]
        feature_a = configs.FeatureConfig(
            name='feature_a',
            pwl_calibration_input_keypoints='quantiles',
            pwl_calibration_num_keypoints=8,
            monotonicity=1,
            pwl_calibration_clip_max=100,
        )
        feature_b = configs.FeatureConfig(
            name='feature_b',
            lattice_size=3,
            unimodality='valley',
            pwl_calibration_input_keypoints='uniform',
            pwl_calibration_num_keypoints=5,
            pwl_calibration_clip_min=130,
            pwl_calibration_convexity='convex',
            regularizer_configs=[
                configs.RegularizerConfig(name='calib_hessian', l2=3e-3),
            ],
        )
        feature_c = configs.FeatureConfig(
            name='feature_c',
            pwl_calibration_input_keypoints=[0.0, 0.5, 1.0],
            reflects_trust_in=[
                configs.TrustConfig(feature_name='feature_a'),
                configs.TrustConfig(feature_name='feature_b', direction=-1),
            ],
        )
        feature_d = configs.FeatureConfig(
            name='feature_d',
            num_buckets=3,
            vocabulary_list=['a', 'b', 'c'],
            default_value=-1,
        )
        model_config = configs.CalibratedLatticeConfig(
            output_min=0,
            regularizer_configs=global_regularizers,
            feature_configs=[feature_a, feature_b, feature_c, feature_d])

        # (key, value) pairs; values are given both as numbers and as strings.
        numeric_updates = [
            ('output_max', 1.0),  # update
            ('regularizer__torsion__l2', 0.004),  # update
            ('regularizer__calib_hessian__l1', 0.005),  # insert
            ('feature__feature_a__lattice_size', 3),  # update
            ('feature__feature_e__lattice_size', 4),  # insert
        ]
        string_updates = [
            ('unrelated_hparams_not_affecting_config', 'unrelated'),
            ('feature__feature_a__regularizer__calib_wrinkle__l1',
             '0.6'),  # insert
            ('feature__feature_b__regularizer__calib_hessian__l1',
             '0.7'),  # update
            ('yet__another__unrelated_config', '4'),
        ]
        updates = numeric_updates + string_updates
        applied_count = configs.apply_updates(model_config, updates)
        self.assertEqual(applied_count, 7)

        lookup_feature = model_config.feature_config_by_name
        lookup_regularizer = model_config.regularizer_config_by_name

        # Direct attribute edits through the by-name lookup.
        lookup_feature('feature_a').monotonicity = 'none'
        lookup_feature('feature_f').num_buckets = 4  # insert

        # Feature configs present after the updates.
        actual_features = []
        for cfg in model_config.feature_configs:
            actual_features.append(cfg.name)
        self.assertCountEqual(actual_features, [
            'feature_a', 'feature_b', 'feature_c', 'feature_d', 'feature_e',
            'feature_f'
        ])

        # Global regularizers present after the updates.
        actual_regularizers = []
        for reg in model_config.regularizer_configs:
            actual_regularizers.append(reg.name)
        self.assertCountEqual(actual_regularizers,
                              ['torsion', 'calib_hessian'])

        # Individual values.
        self.assertEqual(model_config.output_max, 1.0)
        self.assertEqual(lookup_feature('feature_a').lattice_size, 3)
        self.assertEqual(
            lookup_feature('feature_b').pwl_calibration_convexity, 'convex')
        self.assertEqual(lookup_feature('feature_e').lattice_size, 4)
        self.assertEqual(lookup_regularizer('torsion').l2, 0.004)
        self.assertEqual(lookup_regularizer('calib_hessian').l1, 0.005)

        wrinkle_on_a = lookup_feature('feature_a').regularizer_config_by_name(
            'calib_wrinkle')
        self.assertEqual(wrinkle_on_a.l1, 0.6)
        hessian_on_b = lookup_feature('feature_b').regularizer_config_by_name(
            'calib_hessian')
        self.assertEqual(hessian_on_b.l1, 0.7)
예제 #4
0
import copy
import json

from absl import logging
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow_lattice.python import configs
from tensorflow_lattice.python import premade
from tensorflow_lattice.python import premade_lib

# Feature configs for two numerical features and one categorical feature.
# The numerical features share the same settings, but each one gets its own
# freshly created keypoints array.
unspecified_feature_configs = [
    configs.FeatureConfig(
        name=numerical_name,
        lattice_size=2,
        pwl_calibration_input_keypoints=np.linspace(0.0, 1.0, num=10),
    ) for numerical_name in ('numerical_1', 'numerical_2')
] + [
    configs.FeatureConfig(
        name='categorical',
        lattice_size=2,
        num_buckets=2,
        vocabulary_list=['0.0', '1.0'],
        monotonicity=[('0.0', '1.0')],
    ),
]
예제 #5
0
 def test_from_config(self):
   """Checks the get_config/from_config round trip of the model configs.

   For CalibratedLatticeEnsembleConfig, CalibratedLatticeConfig and
   CalibratedLinearConfig, a config is rebuilt from its own `get_config()`
   output and the two `get_config()` dictionaries are compared.
   """
   # Feature configs shared by all three model configs below.
   feature_configs = [
       configs.FeatureConfig(
           name='feature_a',
           pwl_calibration_input_keypoints='quantiles',
           pwl_calibration_num_keypoints=8,
           monotonicity=1,
           pwl_calibration_clip_max=100,
       ),
       configs.FeatureConfig(
           name='feature_b',
           lattice_size=3,
           unimodality='valley',
           pwl_calibration_input_keypoints='uniform',
           pwl_calibration_num_keypoints=5,
           pwl_calibration_clip_min=130,
           pwl_calibration_convexity='convex',
           regularizer_configs=[
               # Spelled 'calib_hessian', matching the other regularizer
               # names used in this test.
               configs.RegularizerConfig(name='calib_hessian', l2=3e-3),
           ],
       ),
       configs.FeatureConfig(
           name='feature_c',
           pwl_calibration_input_keypoints=[0.0, 0.5, 1.0],
           reflects_trust_in=[
               configs.TrustConfig(feature_name='feature_a'),
               configs.TrustConfig(feature_name='feature_b', direction=-1),
           ],
           dominates=[
               configs.DominanceConfig(
                   feature_name='feature_d', dominance_type='monotonic'),
           ],
       ),
       configs.FeatureConfig(
           name='feature_d',
           num_buckets=3,
           vocabulary_list=['a', 'b', 'c'],
           default_value=-1,
       ),
   ]
   # First we test CalibratedLatticeEnsembleConfig
   model_config = configs.CalibratedLatticeEnsembleConfig(
       feature_configs=feature_configs,
       lattices=[['feature_a', 'feature_b'], ['feature_c', 'feature_d']],
       separate_calibrators=True,
       regularizer_configs=[
           configs.RegularizerConfig('torsion', l2=1e-4),
       ],
       output_min=0.0,
       output_max=1.0,
       output_calibration=True,
       output_calibration_num_keypoints=5,
       output_initialization=[0.0, 1.0])
   model_config_copy = configs.CalibratedLatticeEnsembleConfig.from_config(
       model_config.get_config(), tfl_custom_objects)
   self.assertDictEqual(model_config.get_config(),
                        model_config_copy.get_config())
   # Next we test CalibratedLatticeConfig
   model_config = configs.CalibratedLatticeConfig(
       feature_configs=feature_configs,
       regularizer_configs=[
           configs.RegularizerConfig('torsion', l2=1e-4),
       ],
       output_min=0.0,
       output_max=1.0,
       output_calibration=True,
       output_calibration_num_keypoints=8,
       output_initialization='quantiles')
   model_config_copy = configs.CalibratedLatticeConfig.from_config(
       model_config.get_config(), tfl_custom_objects)
   self.assertDictEqual(model_config.get_config(),
                        model_config_copy.get_config())
   # Last we test CalibratedLinearConfig
   model_config = configs.CalibratedLinearConfig(
       feature_configs=feature_configs,
       regularizer_configs=[
           configs.RegularizerConfig('calib_hessian', l2=1e-4),
       ],
       use_bias=True,
       output_min=0.0,
       output_max=None,
       output_calibration=True,
       output_initialization='uniform')
   model_config_copy = configs.CalibratedLinearConfig.from_config(
       model_config.get_config(), tfl_custom_objects)
   self.assertDictEqual(model_config.get_config(),
                        model_config_copy.get_config())