def test_replace_missing_values_with_median(self):
    """Checks replace_missing_values with the median replacement method.

    Missing values of feature F should be replaced with the median F-value.
    Both outputs are verified: the feature table (compared as a matrix) and
    the replacement dictionary (its key set and its stored medians).
    """

    median_method = feature_trans.MEDIAN_VALUE_REPLACEMENT_METHOD
    actual_feature_table, actual_replacement_dict = (
        feature_trans.replace_missing_values(
            FEATURE_TABLE, replacement_method=median_method))

    # Feature values must match the expected matrix within tolerance.
    matrices_match = numpy.allclose(
        FEATURE_MATRIX_MISSING_TO_MEDIAN, actual_feature_table.to_numpy(),
        atol=TOLERANCE)
    self.assertTrue(matrices_match)

    # The dictionary must have exactly the expected keys.
    actual_keys = set(actual_replacement_dict.keys())
    expected_keys = set(REPLACEMENT_DICT_MEDIAN.keys())
    self.assertTrue(actual_keys == expected_keys)

    # The medians stored in the dictionary must match within tolerance.
    medians_key = feature_trans.ORIGINAL_MEDIANS_KEY
    medians_match = numpy.allclose(
        actual_replacement_dict[medians_key],
        REPLACEMENT_DICT_MEDIAN[medians_key],
        atol=TOLERANCE)
    self.assertTrue(medians_match)
def _preprocess_data_for_learning(
        input_table, feature_names, learning_phase, replace_missing,
        standardize, transform_via_svd,
        fraction_of_explained_variance_for_svd=
        DEFAULT_EXP_VARIANCE_FRACTION_FOR_SVD,
        replacement_method=feature_trans.MEAN_VALUE_REPLACEMENT_METHOD,
        replacement_dict_for_training_data=None,
        standardization_dict_for_training_data=None,
        svd_dict_for_training_data=None):
    """Pre-processes data for input to any machine-learning algorithm.

    "Input" means data for training, validation, or testing.  All three
    phases must be pre-processed in the same way.  For training, leave every
    `*dict_for_training_data` argument as None; the dictionaries are computed
    on the fly.  For validation and testing, pass the dictionaries that were
    created for the training data; nothing is recomputed in those phases.

    Exactly one transformation is applied, chosen in this order of
    precedence: SVD, then standardization, then missing-value replacement.
    transform_via_svd = True implies replace_missing = standardize = True
    (data are standardized and missing values replaced as part of the SVD
    transformation).  Likewise, standardize = True implies
    replace_missing = True.

    :param input_table: pandas DataFrame, where each row is one example (data
        point).
    :param feature_names: 1-D list with names of features (predictor
        variables).  Each feature must be a column of input_table.
    :param learning_phase: Learning phase ("training", "validation", or
        "testing").
    :param replace_missing: Boolean flag.  If True, missing values of feature
        F will be replaced with the mean or median F-value.
    :param standardize: Boolean flag.  If True, each feature will be
        standardized to z-scores.
    :param transform_via_svd: Boolean flag.  If True, will transform features
        to empirical orthogonal functions (EOFs), using singular-value
        decomposition (SVD).
    :param fraction_of_explained_variance_for_svd: [used only if
        transform_via_svd = True and learning_phase == "training"] Determines
        number of modes (transformed features) to keep.  Modes are selected in
        descending order of explained variance, until cumulative explained
        variance >= `fraction_of_explained_variance_for_svd` of variance in
        the full dataset.
    :param replacement_method: [used only if missing-value replacement is the
        transformation applied and learning_phase == "training"] See doc for
        `feature_transformation.replace_missing_values`.
    :param replacement_dict_for_training_data: [used only if missing-value
        replacement is the transformation applied and learning_phase !=
        "training"] Dictionary created earlier for training data.  See doc for
        `feature_transformation.replace_missing_values`.
    :param standardization_dict_for_training_data: [used only if standardize =
        True or transform_via_svd = True] Ignored for the "training" phase
        (means and standard deviations are computed from the training data
        themselves).  Otherwise, must be the dictionary created earlier for
        training data.
    :param svd_dict_for_training_data: [used only if transform_via_svd = True]
        Ignored for the "training" phase (SVD parameters are computed from the
        training data themselves).  Otherwise, must be the dictionary created
        earlier for training data.
    :return: transformed_input_table: Same as input_table, except that
        feature columns may have been transformed.  Non-feature columns are
        carried over unchanged.  If SVD transformation was used, feature names
        (ergo, column names) are different.
    :return: transformed_feature_names: 1-D list with names of transformed
        features (predictor variables).  Each transformed feature is a column
        of transformed_input_table.
    :return: replacement_dict_for_training_data: None if standardize = True or
        transform_via_svd = True.  Otherwise, see documentation for
        `feature_transformation.replace_missing_values`.  For the "training"
        phase this dictionary was just created; for other phases it is the
        input dictionary.
    :return: standardization_dict_for_training_data: None if missing-value
        replacement was the only transformation applied.  Otherwise, see
        documentation for `feature_transformation.standardize_features`.  For
        the "training" phase this dictionary was just created; for other
        phases it is the input dictionary.
    :return: svd_dict_for_training_data: None if an operation other than SVD
        was applied.  Otherwise, see documentation for
        `feature_transformation.perform_svd`.  For the "training" phase this
        dictionary was just created; for other phases it is the input
        dictionary.

    If all three flags are False, the five inputs (table, feature names and
    the three dictionaries) are returned exactly as passed in.
    """

    for this_flag in (replace_missing, standardize, transform_via_svd):
        error_checking.assert_is_boolean(this_flag)

    # Nothing requested: hand the inputs straight back.
    if not replace_missing and not standardize and not transform_via_svd:
        return (input_table, feature_names,
                replacement_dict_for_training_data,
                standardization_dict_for_training_data,
                svd_dict_for_training_data)

    _check_learning_phase(learning_phase)
    _check_input_data_for_learning(
        input_table=input_table, feature_names=feature_names,
        target_name=None)

    is_training = learning_phase == TRAINING_PHASE

    if transform_via_svd:
        if is_training:
            # Fit SVD on the training data, then keep only the leading modes
            # needed to reach the requested explained-variance fraction.
            standardization_dict_for_training_data, unfiltered_svd_dict = (
                feature_trans.perform_svd(input_table[feature_names]))

            svd_dict_for_training_data = (
                feature_trans.filter_svd_by_explained_variance(
                    unfiltered_svd_dict,
                    fraction_of_variance_to_keep=
                    fraction_of_explained_variance_for_svd))

        svd_output = feature_trans.transform_features_via_svd(
            feature_table=input_table[feature_names],
            standardization_dict=standardization_dict_for_training_data,
            svd_dictionary=svd_dict_for_training_data)

        transformed_input_table = _rename_svd_transformed_features(
            pandas.DataFrame(svd_output))

        output_dictionaries = (
            None, standardization_dict_for_training_data,
            svd_dict_for_training_data)

    elif standardize:
        # Training passes None so that parameters are computed on the fly;
        # other phases re-use the training dictionary.
        dict_to_apply = (
            None if is_training else standardization_dict_for_training_data)

        transformed_input_table, dict_from_helper = (
            feature_trans.standardize_features(
                feature_table=input_table[feature_names],
                standardization_dict=dict_to_apply))

        if is_training:
            standardization_dict_for_training_data = dict_from_helper

        output_dictionaries = (
            None, standardization_dict_for_training_data, None)

    else:
        # Only replace_missing is True here (the all-False case has already
        # returned above).
        if is_training:
            transformed_input_table, replacement_dict_for_training_data = (
                feature_trans.replace_missing_values(
                    feature_table=input_table[feature_names],
                    replacement_method=replacement_method,
                    replacement_dict=None))
        else:
            # replacement_method is deliberately not passed here; values come
            # from the training dictionary.
            transformed_input_table, _ = feature_trans.replace_missing_values(
                feature_table=input_table[feature_names],
                replacement_dict=replacement_dict_for_training_data)

        output_dictionaries = (replacement_dict_for_training_data, None, None)

    # Record feature names before non-feature columns are appended.
    transformed_feature_names = list(transformed_input_table)

    # Carry non-feature columns over as raw arrays (`.values`), so that they
    # are attached by position rather than aligned on the index.
    for column_name in list(input_table):
        if column_name in feature_names:
            continue

        transformed_input_table = transformed_input_table.assign(
            **{column_name: input_table[column_name].values})

    return (
        (transformed_input_table, transformed_feature_names) +
        output_dictionaries)