def test_identification(self):
    """Check that identify_type labels each feature in the sample data as expected."""
    data = preprocessing_util.read_data()
    # Map every feature name to the type inferred from its raw values.
    detected = {
        feature: identify_types.identify_type(vals)
        for feature, vals in data.items()
    }

    # Examples through manual inspection
    self.assertEqual(detected[identify_types.BINARY], identify_types.BINARY)
    self.assertEqual(detected['normal'], identify_types.CONTINUOUS)
    self.assertEqual(detected['boxcox'], identify_types.CONTINUOUS)
    # We don't yet know the quantile type
    self.assertEqual(detected[identify_types.QUANTILE], identify_types.CONTINUOUS)
    self.assertEqual(detected[identify_types.ENUM], identify_types.ENUM)
    self.assertEqual(detected[identify_types.PROBABILITY], identify_types.PROBABILITY)
def test_identification(self):
    """Each known feature id in the sample data should get its expected type."""
    identified = {
        name: identify_types.identify_type(values)
        for name, values in read_data().items()
    }

    # Examples through manual inspection
    expectations = [
        (BINARY_FEATURE_ID, identify_types.BINARY),
        (CONTINUOUS_FEATURE_ID, identify_types.CONTINUOUS),
        # We don't yet know the boxcox type
        (BOXCOX_FEATURE_ID, identify_types.CONTINUOUS),
        # We don't yet know the quantile type
        (QUANTILE_FEATURE_ID, identify_types.CONTINUOUS),
        (ENUM_FEATURE_ID, identify_types.ENUM),
        (PROBABILITY_FEATURE_ID, identify_types.PROBABILITY),
    ]
    for feature_id, expected_type in expectations:
        self.assertEqual(identified[feature_id], expected_type)
def test_identification(self):
    """identify_type should correctly label every feature in the test data."""
    type_by_feature = {}
    for feature_id, feature_values in read_data().items():
        type_by_feature[feature_id] = identify_types.identify_type(feature_values)

    # Examples through manual inspection
    self.assertEqual(type_by_feature[BINARY_FEATURE_ID], identify_types.BINARY)
    self.assertEqual(type_by_feature[CONTINUOUS_FEATURE_ID], identify_types.CONTINUOUS)
    # We don't yet know the boxcox type
    self.assertEqual(type_by_feature[BOXCOX_FEATURE_ID], identify_types.CONTINUOUS)
    # We don't yet know the quantile type
    self.assertEqual(type_by_feature[QUANTILE_FEATURE_ID], identify_types.CONTINUOUS)
    self.assertEqual(type_by_feature[ENUM_FEATURE_ID], identify_types.ENUM)
    self.assertEqual(type_by_feature[PROBABILITY_FEATURE_ID], identify_types.PROBABILITY)
def identify_parameter(
    feature_name,
    values,
    max_unique_enum_values=DEFAULT_MAX_UNIQUE_ENUM,
    quantile_size=DEFAULT_MAX_QUANTILE_SIZE,
    quantile_k2_threshold=DEFAULT_QUANTILE_K2_THRESHOLD,
    skip_box_cox=False,
    skip_quantiles=False,
    feature_type=None,
):
    """Derive normalization parameters for one feature from sample values.

    Identifies the feature type (unless ``feature_type`` is supplied), then,
    for continuous features, chooses between plain standardization, a Box-Cox
    transform, and quantile normalization based on normality (K^2) tests.

    Args:
        feature_name: Name used only in log messages here.
        values: Sample values for the feature (used as a numpy array;
            presumably 1-D — TODO confirm with callers).
        max_unique_enum_values: Forwarded to identify_types.identify_type.
        quantile_size: Number of quantile buckets when quantile
            normalization is chosen.
        quantile_k2_threshold: K^2 threshold above which the data is
            treated as non-normal.
        skip_box_cox: When True, never emit a Box-Cox transform.
        skip_quantiles: When True, never emit quantile normalization.
        feature_type: Optional pre-identified type; skips auto-detection.

    Returns:
        A NormalizationParameters instance; a no-op feature when a
        continuous feature is constant; or None when the standard
        deviation is not finite.
    """
    if feature_type is None:
        feature_type = identify_types.identify_type(values, max_unique_enum_values)

    # Defaults correspond to an identity transform.
    boxcox_lambda = None
    boxcox_shift = 0.0
    mean = 0.0
    stddev = 1.0
    possible_values = None
    quantiles = None
    assert feature_type in [
        identify_types.CONTINUOUS,
        identify_types.PROBABILITY,
        identify_types.BINARY,
        identify_types.ENUM,
        identify_types.CONTINUOUS_ACTION,
    ], "unknown type {}".format(feature_type)
    assert (len(values) >= MINIMUM_SAMPLES_TO_IDENTIFY
            ), "insufficient information to identify parameter"

    min_value = np.min(values)
    max_value = np.max(values)

    if feature_type == identify_types.CONTINUOUS:
        # A constant feature carries no information; normalize it away.
        if min_value == max_value:
            return no_op_feature()
        # normaltest K^2 statistic: lower means closer to normal.
        k2_original, p_original = stats.normaltest(values)

        # shift can be estimated but not in scipy
        boxcox_shift = float(min_value * -1)
        # Box-Cox requires strictly positive input, hence the shift + margin.
        candidate_values, lmbda = stats.boxcox(
            np.maximum(values + boxcox_shift, BOX_COX_MARGIN))
        k2_boxcox, p_boxcox = stats.normaltest(candidate_values)
        logger.info(
            "Feature stats. Original K2: {} P: {} Boxcox K2: {} P: {}".format(
                k2_original, p_original, k2_boxcox, p_boxcox))
        if lmbda < 0.9 or lmbda > 1.1:
            # Lambda is far enough from 1.0 to be worth doing boxcox
            if k2_original > k2_boxcox * 10 and k2_boxcox <= quantile_k2_threshold:
                # The boxcox output is significantly more normally distributed
                # than the original data and is normal enough to apply
                # effectively.
                stddev = np.std(candidate_values, ddof=1)
                # Unclear whether this happens in practice or not
                if (np.isfinite(stddev) and stddev < BOX_COX_MAX_STDDEV
                        and not np.isclose(stddev, 0)):
                    values = candidate_values
                    boxcox_lambda = float(lmbda)
        if boxcox_lambda is None or skip_box_cox:
            boxcox_shift = None
            boxcox_lambda = None
        if boxcox_lambda is not None:
            feature_type = identify_types.BOXCOX
        if (boxcox_lambda is None and k2_original > quantile_k2_threshold
                and (not skip_quantiles)):
            # Still non-normal after (or without) Box-Cox: fall back to
            # quantile normalization over quantile_size + 1 cut points.
            feature_type = identify_types.QUANTILE
            quantiles = (np.unique(
                mquantiles(
                    values,
                    np.arange(quantile_size + 1, dtype=np.float64) /
                    float(quantile_size),
                    alphap=0.0,
                    betap=1.0,
                )).astype(float).tolist())
            logger.info(
                "Feature is non-normal, using quantiles: {}".format(quantiles))

    if (feature_type == identify_types.CONTINUOUS
            or feature_type == identify_types.BOXCOX
            or feature_type == identify_types.CONTINUOUS_ACTION):
        # Standardize: subtract the mean, divide by the std. dev.
        # (clamped to at least 1.0).
        mean = float(np.mean(values))
        values = values - mean
        stddev = max(float(np.std(values, ddof=1)), 1.0)
        if not np.isfinite(stddev):
            logger.info(
                "Std. dev not finite for feature {}".format(feature_name))
            return None
        values /= stddev

    if feature_type == identify_types.ENUM:
        # Enum features record their distinct integer values.
        possible_values = np.unique(values.astype(int)).tolist()

    return NormalizationParameters(
        feature_type,
        boxcox_lambda,
        boxcox_shift,
        mean,
        stddev,
        possible_values,
        quantiles,
        min_value,
        max_value,
    )
def identify_parameter(
    values,
    max_unique_enum_values=DEFAULT_MAX_UNIQUE_ENUM,
    quantile_size=DEFAULT_MAX_QUANTILE_SIZE,
    quantile_k2_threshold=DEFAULT_QUANTILE_K2_THRESHOLD,
):
    """Derive normalization parameters for one feature from sample values.

    Identifies the feature type, then, for continuous features, chooses
    between plain standardization, a Box-Cox transform, and quantile
    normalization based on normality (K^2) tests.

    Args:
        values: Sample values for the feature (used as a numpy array;
            presumably 1-D — TODO confirm with callers).
        max_unique_enum_values: Forwarded to identify_types.identify_type.
        quantile_size: Number of quantile cut points when quantile
            normalization is chosen.
        quantile_k2_threshold: K^2 threshold above which the data is
            treated as non-normal.

    Returns:
        A NormalizationParameters instance describing the transform.
    """
    feature_type = identify_types.identify_type(values, max_unique_enum_values)

    # Defaults correspond to an identity transform.
    boxcox_lambda = None
    boxcox_shift = 0
    mean = 0
    stddev = 1
    possible_values = None
    quantiles = None
    assert feature_type in [
        identify_types.CONTINUOUS, identify_types.PROBABILITY,
        identify_types.BINARY, identify_types.ENUM, identify_types.QUANTILE
    ], "unknown type {}".format(feature_type)
    assert len(
        values
    ) >= MINIMUM_SAMPLES_TO_IDENTIFY, "insufficient information to identify parameter"

    min_value = np.min(values)
    max_value = np.max(values)

    if feature_type == identify_types.CONTINUOUS:
        assert min_value < max_value, "Binary feature marked as continuous"
        # normaltest K^2 statistic: lower means closer to normal.
        k2_original, p_original = stats.normaltest(values)

        # shift can be estimated but not in scipy
        boxcox_shift = float(min_value * -1)
        # Box-Cox requires strictly positive input, hence the shift + margin.
        candidate_values, lmbda = stats.boxcox(
            np.maximum(values + boxcox_shift, BOX_COX_MARGIN))
        k2_boxcox, p_boxcox = stats.normaltest(candidate_values)
        logger.info(
            "Feature stats. Original K2: {} P: {} Boxcox K2: {} P: {}".format(
                k2_original, p_original, k2_boxcox, p_boxcox))
        if lmbda < 0.9 or lmbda > 1.1:
            # Lambda is far enough from 1.0 to be worth doing boxcox
            if k2_original > k2_boxcox * 10 and k2_boxcox <= quantile_k2_threshold:
                # The boxcox output is significantly more normally distributed
                # than the original data and is normal enough to apply
                # effectively.
                stddev = np.std(candidate_values, ddof=1)
                # Unclear whether this happens in practice or not
                if np.isfinite(stddev) and stddev < BOX_COX_MAX_STDDEV and \
                        not np.isclose(stddev, 0):
                    values = candidate_values
                    boxcox_lambda = float(lmbda)
        if boxcox_lambda is None:
            boxcox_shift = None
        if boxcox_lambda is None and k2_original > quantile_k2_threshold:
            # Still non-normal after Box-Cox was rejected: fall back to
            # quantile normalization.
            feature_type = identify_types.QUANTILE
            quantiles = mquantiles(
                values,
                np.arange(quantile_size, dtype=np.float32) /
                float(quantile_size)).astype(float).tolist()
            logger.info(
                "Feature is non-normal, using quantiles: {}".format(quantiles))

    if feature_type == identify_types.CONTINUOUS:
        # Standardize: subtract the mean, divide by the std. dev.
        # (falls back to 1 when the std. dev. is degenerate).
        mean = float(np.mean(values))
        values = values - mean
        stddev = float(np.std(values, ddof=1))
        if np.isclose(stddev, 0) or not np.isfinite(stddev):
            stddev = 1
        values /= stddev

    if feature_type == identify_types.ENUM:
        # Enum features record their distinct values as floats.
        possible_values = np.unique(values).astype(float).tolist()

    return NormalizationParameters(feature_type, boxcox_lambda, boxcox_shift,
                                   mean, stddev, possible_values, quantiles)
def identify_parameter(
    feature_name,
    values,
    max_unique_enum_values=DEFAULT_MAX_UNIQUE_ENUM,
    quantile_size=DEFAULT_MAX_QUANTILE_SIZE,
    quantile_k2_threshold=DEFAULT_QUANTILE_K2_THRESHOLD,
    skip_box_cox=False,
    skip_quantiles=False,
    feature_type=None,
):
    """Derive normalization parameters for one feature from sample values.

    Identifies the feature type (unless ``feature_type`` is supplied), then,
    for continuous features, chooses between plain standardization, a Box-Cox
    transform, and quantile normalization based on normality (K^2) tests.

    Args:
        feature_name: Name used only in log messages here.
        values: Sample values for the feature (used as a numpy array;
            presumably 1-D — TODO confirm with callers).
        max_unique_enum_values: Forwarded to identify_types.identify_type.
        quantile_size: Number of quantile buckets when quantile
            normalization is chosen.
        quantile_k2_threshold: K^2 threshold above which the data is
            treated as non-normal.
        skip_box_cox: When True, never emit a Box-Cox transform.
        skip_quantiles: When True, never emit quantile normalization.
        feature_type: Optional pre-identified type; skips auto-detection.

    Returns:
        A NormalizationParameters instance; a no-op feature when a
        continuous feature is constant; or None when the standard
        deviation is not finite.
    """
    if feature_type is None:
        feature_type = identify_types.identify_type(values, max_unique_enum_values)

    # Defaults correspond to an identity transform.
    boxcox_lambda = None
    boxcox_shift = 0.0
    mean = 0.0
    stddev = 1.0
    possible_values = None
    quantiles = None
    assert feature_type in [
        identify_types.CONTINUOUS,
        identify_types.PROBABILITY,
        identify_types.BINARY,
        identify_types.ENUM,
        identify_types.CONTINUOUS_ACTION,
    ], "unknown type {}".format(feature_type)
    assert (
        len(values) >= MINIMUM_SAMPLES_TO_IDENTIFY
    ), "insufficient information to identify parameter"

    min_value = np.min(values)
    max_value = np.max(values)

    if feature_type == identify_types.CONTINUOUS:
        # A constant feature carries no information; normalize it away.
        if min_value == max_value:
            return no_op_feature()
        # normaltest K^2 statistic: lower means closer to normal.
        k2_original, p_original = stats.normaltest(values)

        # shift can be estimated but not in scipy
        boxcox_shift = float(min_value * -1)
        # Box-Cox requires strictly positive input, hence the shift + margin.
        candidate_values, lmbda = stats.boxcox(
            np.maximum(values + boxcox_shift, BOX_COX_MARGIN)
        )
        k2_boxcox, p_boxcox = stats.normaltest(candidate_values)
        logger.info(
            "Feature stats. Original K2: {} P: {} Boxcox K2: {} P: {}".format(
                k2_original, p_original, k2_boxcox, p_boxcox
            )
        )
        if lmbda < 0.9 or lmbda > 1.1:
            # Lambda is far enough from 1.0 to be worth doing boxcox
            if k2_original > k2_boxcox * 10 and k2_boxcox <= quantile_k2_threshold:
                # The boxcox output is significantly more normally distributed
                # than the original data and is normal enough to apply
                # effectively.
                stddev = np.std(candidate_values, ddof=1)
                # Unclear whether this happens in practice or not
                if (
                    np.isfinite(stddev)
                    and stddev < BOX_COX_MAX_STDDEV
                    and not np.isclose(stddev, 0)
                ):
                    values = candidate_values
                    boxcox_lambda = float(lmbda)
        if boxcox_lambda is None or skip_box_cox:
            boxcox_shift = None
            boxcox_lambda = None
        if boxcox_lambda is not None:
            feature_type = identify_types.BOXCOX
        if (
            boxcox_lambda is None
            and k2_original > quantile_k2_threshold
            and (not skip_quantiles)
        ):
            # Still non-normal after (or without) Box-Cox: fall back to
            # quantile normalization over quantile_size + 1 cut points.
            feature_type = identify_types.QUANTILE
            quantiles = (
                np.unique(
                    mquantiles(
                        values,
                        np.arange(quantile_size + 1, dtype=np.float64)
                        / float(quantile_size),
                        alphap=0.0,
                        betap=1.0,
                    )
                )
                .astype(float)
                .tolist()
            )
            logger.info("Feature is non-normal, using quantiles: {}".format(quantiles))

    if (
        feature_type == identify_types.CONTINUOUS
        or feature_type == identify_types.BOXCOX
        or feature_type == identify_types.CONTINUOUS_ACTION
    ):
        # Standardize: subtract the mean, divide by the std. dev.
        # (clamped to at least 1.0).
        mean = float(np.mean(values))
        values = values - mean
        stddev = max(float(np.std(values, ddof=1)), 1.0)
        if not np.isfinite(stddev):
            logger.info("Std. dev not finite for feature {}".format(feature_name))
            return None
        values /= stddev

    if feature_type == identify_types.ENUM:
        # Enum features record their distinct integer values.
        possible_values = np.unique(values.astype(int)).tolist()

    return NormalizationParameters(
        feature_type,
        boxcox_lambda,
        boxcox_shift,
        mean,
        stddev,
        possible_values,
        quantiles,
        min_value,
        max_value,
    )