Example #1
def write_narr_grids_to_file(frontal_grid_table, pickle_file_name):
    """Writes one or more NARR* grids to file.

    * NARR = North American Regional Reanalysis

    :param frontal_grid_table: pandas DataFrame with the following columns.
        Each row is one valid time.
    frontal_grid_table.unix_time_sec: Valid time.
    frontal_grid_table.warm_front_row_indices: length-W numpy array with row
        indices (integers) of grid cells intersected by a warm front.
    frontal_grid_table.warm_front_column_indices: Same as above, except for
        columns.
    frontal_grid_table.cold_front_row_indices: length-C numpy array with row
        indices (integers) of grid cells intersected by a cold front.
    frontal_grid_table.cold_front_column_indices: Same as above, except for
        columns.

    :param pickle_file_name: Path to output file.
    """

    error_checking.assert_columns_in_dataframe(frontal_grid_table,
                                               REQUIRED_GRID_COLUMNS)

    file_system_utils.mkdir_recursive_if_necessary(file_name=pickle_file_name)
    pickle_file_handle = open(pickle_file_name, 'wb')
    pickle.dump(frontal_grid_table[REQUIRED_GRID_COLUMNS], pickle_file_handle)
    pickle_file_handle.close()
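A quick usage sketch (hedged: the file path is illustrative, and `REQUIRED_GRID_COLUMNS` must match the column names below):

import numpy
import pandas

# Each index cell holds a 1-D integer numpy array, per the docstring above,
# so these columns end up with object dtype.
frontal_grid_table = pandas.DataFrame({
    'unix_time_sec': [1100000000],
    'warm_front_row_indices': [numpy.array([10, 11])],
    'warm_front_column_indices': [numpy.array([20, 21])],
    'cold_front_row_indices': [numpy.array([30])],
    'cold_front_column_indices': [numpy.array([40])],
})

write_narr_grids_to_file(frontal_grid_table, '/tmp/frontal_grids.p')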
Example #2
def write_station_metadata_to_processed_file(station_metadata_table,
                                             csv_file_name):
    """Writes metadata for weather stations to file.

    This is considered a "processed file," as opposed to a "raw file".  A "raw
    file" is one taken directly from another database, in the native format of
    said database.  For examples, see
    `hfmetar_io.read_station_metadata_from_raw_file` and
    `ok_mesonet_io.read_station_metadata_from_raw_file`.

    :param station_metadata_table: pandas DataFrame with the following columns.
    station_metadata_table.station_id: String ID for station.
    station_metadata_table.station_name: Verbose name for station.
    station_metadata_table.latitude_deg: Latitude (deg N).
    station_metadata_table.longitude_deg: Longitude (deg E).
    station_metadata_table.elevation_m_asl: Elevation (metres above sea level).
    station_metadata_table.utc_offset_hours [optional]: Local time minus UTC.
    :param csv_file_name: Path to output file.
    """

    error_checking.assert_columns_in_dataframe(
        station_metadata_table, REQUIRED_STATION_METADATA_COLUMNS)

    file_system_utils.mkdir_recursive_if_necessary(file_name=csv_file_name)
    station_metadata_table.to_csv(csv_file_name,
                                  header=True,
                                  columns=STATION_METADATA_COLUMNS,
                                  index=False)
Example #3
def write_processed_file(tornado_table, csv_file_name):
    """Writes tornado reports to CSV file.

    This is considered a "processed file," as opposed to a "raw file" (one taken
    directly from the Storm Events database).  Raw files with tornado reports
    are handled by storm_events_io.py.

    :param tornado_table: pandas DataFrame with the following columns.
    tornado_table.start_time_unix_sec: Start time.
    tornado_table.end_time_unix_sec: End time.
    tornado_table.start_latitude_deg: Latitude (deg N) of start point.
    tornado_table.start_longitude_deg: Longitude (deg E) of start point.
    tornado_table.end_latitude_deg: Latitude (deg N) of end point.
    tornado_table.end_longitude_deg: Longitude (deg E) of end point.
    tornado_table.fujita_rating: F-scale or EF-scale rating (integer from
        0...5).
    tornado_table.width_metres: Tornado width (metres).

    :param csv_file_name: Path to output file.
    """

    error_checking.assert_columns_in_dataframe(tornado_table,
                                               MANDATORY_COLUMNS)
    file_system_utils.mkdir_recursive_if_necessary(file_name=csv_file_name)

    tornado_table.to_csv(csv_file_name,
                         header=True,
                         columns=MANDATORY_COLUMNS,
                         index=False)
Example #4
def _check_input_data_for_learning(
        input_table, feature_names, target_name=None):
    """Checks input data (to machine-learning model) for errors.

    :param input_table: pandas DataFrame, where each row is one example (data
        point).
    :param feature_names: 1-D list with names of features (predictor variables).
        Each feature must be a column of input_table.
    :param target_name: Name of target variable (predictand).  Must be a column
        of input_table, and all values must be 0 or 1.  If None, only the
        feature columns are checked.
    """

    error_checking.assert_is_string_list(feature_names)
    error_checking.assert_is_numpy_array(
        numpy.array(feature_names), num_dimensions=1)

    if target_name is None:
        error_checking.assert_columns_in_dataframe(input_table, feature_names)
        return

    error_checking.assert_is_string(target_name)
    error_checking.assert_columns_in_dataframe(
        input_table, feature_names + [target_name])

    target_values = input_table[target_name].values
    error_checking.assert_is_integer_numpy_array(target_values)
    error_checking.assert_is_geq_numpy_array(target_values, 0)
    error_checking.assert_is_leq_numpy_array(target_values, 1)
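A minimal sketch of the contract (column names here are hypothetical):

import numpy
import pandas

input_table = pandas.DataFrame({
    'temperature_kelvins': [280.5, 290.1, 285.0],
    'u_wind_m_s01': [-3.2, 0.0, 5.5],
    'tornado_flag': numpy.array([0, 1, 0]),  # integer 0/1 target
})

# Passes silently; would raise if a column were missing, the target were
# non-integer, or any target value fell outside {0, 1}.
_check_input_data_for_learning(
    input_table, feature_names=['temperature_kelvins', 'u_wind_m_s01'],
    target_name='tornado_flag')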
Example #5
def write_processed_file(wind_table, csv_file_name=None, write_mode='w'):
    """Writes wind observations to file.

    This is considered a "processed file," as opposed to a "raw file".  A "raw
    file" is one taken directly from another database, in the native format of
    said database.  For examples, see `madis_io.read_winds_from_raw_file` and
    `ok_mesonet_io.read_winds_from_raw_file`.

    :param wind_table: pandas DataFrame with the following columns.
    wind_table.station_id: String ID for station.
    wind_table.station_name: Verbose name for station.
    wind_table.latitude_deg: Latitude (deg N).
    wind_table.longitude_deg: Longitude (deg E).
    wind_table.elevation_m_asl: Elevation (metres above sea level).
    wind_table.unix_time_sec: Valid time in Unix format.
    wind_table.u_wind_m_s01: u-wind (metres per second).
    wind_table.v_wind_m_s01: v-wind (metres per second).

    :param csv_file_name: Path to output file.
    :param write_mode: Any mode string accepted by the built-in function
        `open` (e.g., 'w' to overwrite or 'a' to append).
    """

    error_checking.assert_columns_in_dataframe(wind_table, WIND_COLUMNS)
    file_system_utils.mkdir_recursive_if_necessary(file_name=csv_file_name)

    # Write the header only when starting fresh, i.e. when the target file
    # does not yet exist or when write_mode will overwrite it.
    write_header = not os.path.isfile(csv_file_name) or 'w' in write_mode
    wind_table.to_csv(csv_file_name,
                      header=write_header,
                      columns=WIND_COLUMNS,
                      index=False,
                      mode=write_mode)
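The header logic above lets this function build one CSV across many calls; a hedged sketch (`first_wind_table` and `second_wind_table` are hypothetical DataFrames with the documented columns):

# The first call writes the header; the append call adds rows without
# repeating it.
write_processed_file(first_wind_table, csv_file_name='/tmp/winds.csv',
                     write_mode='w')
write_processed_file(second_wind_table, csv_file_name='/tmp/winds.csv',
                     write_mode='a')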
Example #6
    def test_assert_columns_in_dataframe_true(self):
        """Checks assert_columns_in_dataframe.

        In this case, input is pandas DataFrame with all desired columns.
        """

        error_checking.assert_columns_in_dataframe(DATAFRAME,
                                                   COLUMNS_IN_DATAFRAME)
Example #7
    def test_assert_columns_in_dataframe_missing_columns(self):
        """Checks assert_columns_in_dataframe.

        In this case, input is pandas DataFrame but is missing one of the
        desired columns.
        """

        with self.assertRaises(KeyError):
            error_checking.assert_columns_in_dataframe(
                DATAFRAME, FAKE_COLUMNS_IN_DATAFRAME)
Example #8
def read_normalization_params_from_file(pickle_file_name):
    """Reads normalization parameters from Pickle file.

    :param pickle_file_name: Path to input file.
    :return: radar_table_no_height: See doc for `write_normalization_params`.
    :return: radar_table_with_height: Same.
    :return: sounding_table_no_height: Same.
    :return: sounding_table_with_height: Same.
    """

    # TODO(thunderhoser): Move this to normalization.py or something.
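    # Successive pickle.load calls below must mirror the pickle.dump order
    # used by the writer (Example #18): radar tables first, then soundings.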
    pickle_file_handle = open(pickle_file_name, 'rb')
    radar_table_no_height = pickle.load(pickle_file_handle)
    radar_table_with_height = pickle.load(pickle_file_handle)
    sounding_table_no_height = pickle.load(pickle_file_handle)
    sounding_table_with_height = pickle.load(pickle_file_handle)
    pickle_file_handle.close()

    error_checking.assert_columns_in_dataframe(
        radar_table_no_height, NORMALIZATION_COLUMNS_NO_HEIGHT)
    error_checking.assert_columns_in_dataframe(
        radar_table_with_height, NORMALIZATION_COLUMNS_WITH_HEIGHT)
    error_checking.assert_columns_in_dataframe(
        sounding_table_no_height, NORMALIZATION_COLUMNS_NO_HEIGHT)
    error_checking.assert_columns_in_dataframe(
        sounding_table_with_height, NORMALIZATION_COLUMNS_WITH_HEIGHT)

    return (radar_table_no_height, radar_table_with_height,
            sounding_table_no_height, sounding_table_with_height)
Example #9
def read_polylines_from_file(pickle_file_name):
    """Reads one or more frontal polylines from Pickle file.

    :param pickle_file_name: Path to input file.
    :return: front_table: See documentation for `write_polylines_to_file`.
    """

    pickle_file_handle = open(pickle_file_name, 'rb')
    front_table = pickle.load(pickle_file_handle)
    pickle_file_handle.close()

    error_checking.assert_columns_in_dataframe(front_table,
                                               REQUIRED_POLYLINE_COLUMNS)
    return front_table
Example #10
def read_file(pickle_file_name):
    """Reads tracking data from Pickle file.

    :param pickle_file_name: Path to input file.
    :return: storm_object_table: See documentation for `write_file`.
    """

    pickle_file_handle = open(pickle_file_name, 'rb')
    storm_object_table = pickle.load(pickle_file_handle)
    pickle_file_handle.close()

    error_checking.assert_columns_in_dataframe(
        storm_object_table, REQUIRED_COLUMNS)

    return storm_object_table
Example #11
def read_storm_to_winds_table(pickle_file_name):
    """Reads linkages (storm-to-wind associations) from Pickle file.

    :param pickle_file_name: Path to input file.
    :return: storm_to_winds_table: pandas DataFrame with columns documented in
        write_storm_to_winds_table.
    """

    pickle_file_handle = open(pickle_file_name, 'rb')
    storm_to_winds_table = pickle.load(pickle_file_handle)
    pickle_file_handle.close()

    error_checking.assert_columns_in_dataframe(storm_to_winds_table,
                                               REQUIRED_COLUMNS_TO_WRITE)
    return storm_to_winds_table
Example #12
def _read_intermediate_results(temp_file_name):
    """Reads intermediate best-track results for a subset of storm objects.

    :param temp_file_name: Path to intermediate file.
    :return: storm_object_table: See documentation for
        _write_intermediate_results.
    """

    pickle_file_handle = open(temp_file_name, 'rb')
    storm_object_table = pickle.load(pickle_file_handle)
    pickle_file_handle.close()

    error_checking.assert_columns_in_dataframe(storm_object_table,
                                               INTERMEDIATE_COLUMNS)
    return storm_object_table
Example #13
def read_narr_grids_from_file(pickle_file_name):
    """Reads one or more NARR* grids from file.

    * NARR = North American Regional Reanalysis

    :param pickle_file_name: Path to input file.
    :return: frontal_grid_table: See documentation for
        `write_narr_grids_to_file`.
    """

    pickle_file_handle = open(pickle_file_name, 'rb')
    frontal_grid_table = pickle.load(pickle_file_handle)
    pickle_file_handle.close()

    error_checking.assert_columns_in_dataframe(frontal_grid_table,
                                               REQUIRED_GRID_COLUMNS)
    return frontal_grid_table
Example #14
def read_processed_file(pickle_file_name):
    """Reads tracking data from file.

    This file should contain both polygons and track statistics for one time
    step and one tracking scale.

    :param pickle_file_name: Path to input file.
    :return: storm_object_table: See documentation for write_processed_file.
    """

    pickle_file_handle = open(pickle_file_name, 'rb')
    storm_object_table = pickle.load(pickle_file_handle)
    pickle_file_handle.close()

    error_checking.assert_columns_in_dataframe(storm_object_table,
                                               MANDATORY_COLUMNS)
    return storm_object_table
Example #15
def read_file(pickle_file_name):
    """Reads normalization parameters from Pickle file.

    :param pickle_file_name: Path to input file.
    :return: norm_table_no_height: See doc for `write_file`.
    :return: norm_table_with_height: Same.
    """

    pickle_file_handle = open(pickle_file_name, 'rb')
    norm_table_no_height = pickle.load(pickle_file_handle)
    norm_table_with_height = pickle.load(pickle_file_handle)
    pickle_file_handle.close()

    error_checking.assert_columns_in_dataframe(norm_table_no_height,
                                               TABLE_COLUMNS)
    error_checking.assert_columns_in_dataframe(norm_table_with_height,
                                               TABLE_COLUMNS)

    return norm_table_no_height, norm_table_with_height
Example #16
def read_normalization_params_from_file(pickle_file_name):
    """Reads normalization parameters from Pickle file.

    :param pickle_file_name: Path to input file.
    :return: radar_table_no_height: See doc for `write_normalization_params`.
    :return: radar_table_with_height: Same.
    :return: sounding_table_no_height: Same.
    :return: sounding_table_with_height: Same.
    """

    if not os.path.isfile(pickle_file_name):
        pickle_file_name = pickle_file_name.replace('/condo/swatwork/ralager',
                                                    '/scratch/ralager')

    if not os.path.isfile(pickle_file_name):
        pickle_file_name = pickle_file_name.replace('/scratch/ralager',
                                                    '/glade/scratch/ryanlage')

    if not os.path.isfile(pickle_file_name):
        pickle_file_name = pickle_file_name.replace('/glade/scratch/ryanlage',
                                                    '/glade/work/ryanlage')

    if not os.path.isfile(pickle_file_name):
        pickle_file_name = pickle_file_name.replace('/glade/work/ryanlage',
                                                    '/condo/swatwork/ralager')

    if not os.path.isfile(pickle_file_name):
        pickle_file_name = pickle_file_name.replace(
            '/condo/swatwork/ralager', '/condo/swatcommon/common')

    if not os.path.isfile(pickle_file_name):
        pickle_file_name = pickle_file_name.replace('/condo/swatwork/ralager',
                                                    '/scratch/ralager')

    # TODO(thunderhoser): Move this to normalization.py or something.
    pickle_file_handle = open(pickle_file_name, 'rb')
    radar_table_no_height = pickle.load(pickle_file_handle)
    radar_table_with_height = pickle.load(pickle_file_handle)
    sounding_table_no_height = pickle.load(pickle_file_handle)
    sounding_table_with_height = pickle.load(pickle_file_handle)
    pickle_file_handle.close()

    error_checking.assert_columns_in_dataframe(
        radar_table_no_height, NORMALIZATION_COLUMNS_NO_HEIGHT)
    error_checking.assert_columns_in_dataframe(
        radar_table_with_height, NORMALIZATION_COLUMNS_WITH_HEIGHT)
    error_checking.assert_columns_in_dataframe(
        sounding_table_no_height, NORMALIZATION_COLUMNS_NO_HEIGHT)
    error_checking.assert_columns_in_dataframe(
        sounding_table_with_height, NORMALIZATION_COLUMNS_WITH_HEIGHT)

    return (radar_table_no_height, radar_table_with_height,
            sounding_table_no_height, sounding_table_with_height)
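The replace-and-retest cascade above could be collapsed into a loop over known prefixes; a hedged sketch (the prefix list mirrors the replacements above and is not a documented configuration):

import os

KNOWN_PREFIXES = [
    '/condo/swatwork/ralager', '/scratch/ralager', '/glade/scratch/ryanlage',
    '/glade/work/ryanlage', '/condo/swatcommon/common',
]

def _first_existing_path(file_name):
    """Returns the first prefix-swapped variant of `file_name` that exists."""
    if os.path.isfile(file_name):
        return file_name

    for old_prefix in KNOWN_PREFIXES:
        if old_prefix not in file_name:
            continue
        for new_prefix in KNOWN_PREFIXES:
            candidate = file_name.replace(old_prefix, new_prefix)
            if os.path.isfile(candidate):
                return candidate

    return file_name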
Example #17
def write_polylines_to_file(front_table, pickle_file_name):
    """Writes one or more frontal polylines to Pickle file.

    :param front_table: pandas DataFrame with the following columns.  Each row
        is one front.
    front_table.front_type: Type of front (examples: "warm", "cold").
    front_table.unix_time_sec: Valid time.
    front_table.latitudes_deg: 1-D numpy array of latitudes (deg N) along front.
    front_table.longitudes_deg: 1-D numpy array of longitudes (deg E) along
        front.
    :param pickle_file_name: Path to output file.
    """

    error_checking.assert_columns_in_dataframe(front_table,
                                               REQUIRED_POLYLINE_COLUMNS)

    file_system_utils.mkdir_recursive_if_necessary(file_name=pickle_file_name)
    pickle_file_handle = open(pickle_file_name, 'wb')
    pickle.dump(front_table[REQUIRED_POLYLINE_COLUMNS], pickle_file_handle)
    pickle_file_handle.close()
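A hedged usage sketch.  Note that the body pickles `front_table[REQUIRED_POLYLINE_COLUMNS]`, so any extra columns are silently dropped:

import numpy
import pandas

front_table = pandas.DataFrame({
    'front_type': ['warm', 'cold'],
    'unix_time_sec': [1100000000, 1100010800],
    'latitudes_deg': [numpy.array([40.0, 41.5]),
                      numpy.array([35.0, 36.0, 37.5])],
    'longitudes_deg': [numpy.array([265.0, 266.0]),
                       numpy.array([250.0, 251.0, 252.5])],
    'quality_flag': [True, False],  # hypothetical extra column; not written
})

write_polylines_to_file(front_table, '/tmp/fronts.p')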
Example #18
def write_normalization_params(pickle_file_name, radar_table_no_height,
                               radar_table_with_height,
                               sounding_table_no_height,
                               sounding_table_with_height):
    """Writes normalization parameters to Pickle file.

    :param pickle_file_name: Path to output file.
    :param radar_table_no_height: Single-indexed pandas DataFrame.  Each index
        is a field name (accepted by `radar_utils.check_field_name`).  Must
        contain the following columns.
    radar_table_no_height.mean_value: Mean value for the given field.
    radar_table_no_height.standard_deviation: Standard deviation.
    radar_table_no_height.min_value: Minimum value.
    radar_table_no_height.max_value: Maximum value.

    :param radar_table_with_height: Double-indexed pandas DataFrame.  Each index
        is a tuple with (field_name, height_m_agl), where `field_name` is
        accepted by `radar_utils.check_field_name` and `height_m_agl` is in
        metres above ground level.  Must contain the following columns.
    radar_table_with_height.mean_value: Mean value for the given field.
    radar_table_with_height.standard_deviation: Standard deviation.

    :param sounding_table_no_height: Single-indexed pandas DataFrame.  Each
        index is a field name (accepted by `soundings.check_field_name`).
        Columns should be the same as in `radar_table_no_height`.
    :param sounding_table_with_height: Double-indexed pandas DataFrame.  Each
        index is a tuple with (field_name, height_m_agl), where `field_name` is
        accepted by `soundings.check_field_name` and `height_m_agl` is in metres
        above ground level.  Columns should be the same as in
        `radar_table_with_height`.
    """

    # TODO(thunderhoser): Move this to normalization.py or something.
    error_checking.assert_columns_in_dataframe(
        radar_table_no_height, NORMALIZATION_COLUMNS_NO_HEIGHT)
    error_checking.assert_columns_in_dataframe(
        radar_table_with_height, NORMALIZATION_COLUMNS_WITH_HEIGHT)
    error_checking.assert_columns_in_dataframe(
        sounding_table_no_height, NORMALIZATION_COLUMNS_NO_HEIGHT)
    error_checking.assert_columns_in_dataframe(
        sounding_table_with_height, NORMALIZATION_COLUMNS_WITH_HEIGHT)

    file_system_utils.mkdir_recursive_if_necessary(file_name=pickle_file_name)
    pickle_file_handle = open(pickle_file_name, 'wb')
    pickle.dump(radar_table_no_height, pickle_file_handle)
    pickle.dump(radar_table_with_height, pickle_file_handle)
    pickle.dump(sounding_table_no_height, pickle_file_handle)
    pickle.dump(sounding_table_with_height, pickle_file_handle)
    pickle_file_handle.close()
Example #19
def write_file(pickle_file_name, norm_table_no_height, norm_table_with_height):
    """Writes normalization parameters to Pickle file.

    :param pickle_file_name: Path to output file.
    :param norm_table_no_height: pandas DataFrame created by `finalize_params`,
        containing one set of params for each variable.  This table should be
        single-indexed (field name only).
    :param norm_table_with_height: pandas DataFrame created by
        `finalize_params`, containing one set of params for each
        variable/height.  This table should be double-indexed (field name, then
        height in metres above ground level).
    """

    error_checking.assert_columns_in_dataframe(norm_table_no_height,
                                               TABLE_COLUMNS)
    error_checking.assert_columns_in_dataframe(norm_table_with_height,
                                               TABLE_COLUMNS)

    file_system_utils.mkdir_recursive_if_necessary(file_name=pickle_file_name)
    pickle_file_handle = open(pickle_file_name, 'wb')
    pickle.dump(norm_table_no_height, pickle_file_handle)
    pickle.dump(norm_table_with_height, pickle_file_handle)
    pickle_file_handle.close()
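A hedged round-trip sketch pairing `write_file` with the `read_file` in Example #15 (assumes `TABLE_COLUMNS` is ['mean_value', 'standard_deviation']; the real list comes from `finalize_params`):

import pandas

# Single index: one row per field.
norm_table_no_height = pandas.DataFrame(
    {'mean_value': [280.0], 'standard_deviation': [10.0]},
    index=['temperature_kelvins'])

# Double index: one row per (field, height in metres AGL) pair.
norm_table_with_height = pandas.DataFrame(
    {'mean_value': [15.0], 'standard_deviation': [5.0]},
    index=pandas.MultiIndex.from_tuples([('reflectivity_dbz', 1000)]))

write_file('/tmp/norm_params.p', norm_table_no_height, norm_table_with_height)
table_no_height, table_with_height = read_file('/tmp/norm_params.p')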
Example #20
def check_statistic_table(statistic_table, require_storm_objects=True):
    """Ensures that pandas DataFrame contains shape statistics.

    :param statistic_table: pandas DataFrame.
    :param require_storm_objects: Boolean flag.  If True, statistic_table must
        contain columns "storm_id" and "unix_time_sec".  If False,
        statistic_table does not need these columns.
    :return: statistic_column_names: 1-D list containing names of columns with
        shape statistics.
    :raises: ValueError: if statistic_table does not contain any columns with
        shape statistics.
    """

    statistic_column_names = get_statistic_columns(statistic_table)
    if statistic_column_names is None:
        raise ValueError(
            'statistic_table does not contain any column with shape '
            'statistics.')

    if require_storm_objects:
        error_checking.assert_columns_in_dataframe(statistic_table,
                                                   STORM_COLUMNS_TO_KEEP)

    return statistic_column_names
Example #21
def check_label_table(label_table, require_storm_objects=True):
    """Ensures that pandas DataFrame contains labels.

    :param label_table: pandas DataFrame.
    :param require_storm_objects: Boolean flag.  If True, label_table must
        contain columns "storm_id" and "unix_time_sec".  If False, label_table
        does not need these columns.
    :return: label_column_names: 1-D list containing names of columns with
        regression or classification labels.
    :raises: ValueError: if label_table does not contain any columns with
        regression or classification labels.
    """

    label_column_names = get_label_columns(label_table)
    if label_column_names is None:
        raise ValueError(
            'label_table does not contain any column with regression or '
            'classification labels.')

    if require_storm_objects:
        error_checking.assert_columns_in_dataframe(label_table,
                                                   MANDATORY_COLUMNS)

    return label_column_names
Example #22
def check_feature_table(feature_table, require_storm_objects=True):
    """Ensures that pandas DataFrame contains features and labels.

    feature_table must contain one or more feature columns.
    feature_table must contain either 1 or 2 label columns.  If 2 columns, there
    must be one regression label L_r and one classification label L_c, where L_r
    is the regression version of L_c.

    :param feature_table: pandas DataFrame.
    :param require_storm_objects: Boolean flag.  If True, feature_table must
        contain columns "storm_id" and "unix_time_sec".  If False, feature_table
        does not need these columns.
    :return: feature_column_names: 1-D list containing names of columns with
        features.
    :return: regression_label_column_name: Name of column with regression label.
        If there is no regression label, this will be None.
    :return: classification_label_column_name: Name of column with
        classification label.  If there is no classification label, this will
        be None.
    :raises: ValueError: if feature_table does not contain any feature columns.
    :raises: ValueError: if feature_table does not contain a valid combination
        of label columns (see rules above).
    """

    feature_column_names = radar_stats.get_statistic_columns(feature_table)

    shape_stat_column_names = shape_stats.get_statistic_columns(feature_table)
    if shape_stat_column_names:
        if feature_column_names:
            feature_column_names += shape_stat_column_names
        else:
            feature_column_names = shape_stat_column_names

    sounding_stat_column_names = soundings.get_sounding_stat_columns(
        feature_table)
    if sounding_stat_column_names:
        if feature_column_names:
            feature_column_names += sounding_stat_column_names
        else:
            feature_column_names = sounding_stat_column_names

    if feature_column_names is None:
        raise ValueError(
            'feature_table does not contain any columns with features '
            '(predictor variables).')

    regression_label_column_names = labels.get_regression_label_columns(
        feature_table)
    if regression_label_column_names and len(
            regression_label_column_names) == 1:
        regression_label_column_name = regression_label_column_names[0]
    else:
        regression_label_column_name = None

    classification_label_column_names = labels.get_classification_label_columns(
        feature_table)
    if classification_label_column_names and len(
            classification_label_column_names) == 1:
        classification_label_column_name = classification_label_column_names[0]
    else:
        classification_label_column_name = None

    if regression_label_column_name and classification_label_column_name:
        classification_param_dict = labels.column_name_to_label_params(
            classification_label_column_name)

        this_regression_label_column_name = (
            labels.get_column_name_for_regression_label(
                min_lead_time_sec=classification_param_dict[
                    labels.MIN_LEAD_TIME_NAME],
                max_lead_time_sec=classification_param_dict[
                    labels.MAX_LEAD_TIME_NAME],
                min_distance_metres=classification_param_dict[
                    labels.MIN_DISTANCE_NAME],
                max_distance_metres=classification_param_dict[
                    labels.MAX_DISTANCE_NAME],
                percentile_level=classification_param_dict[
                    labels.PERCENTILE_LEVEL_NAME]))

        if this_regression_label_column_name != regression_label_column_name:
            regression_label_column_name = None
            classification_label_column_name = None

    if not (regression_label_column_name or classification_label_column_name):
        error_string = (
            '\n\n' + str(regression_label_column_names) +
            str(classification_label_column_names) + '\n\nfeature_table ' +
            'should contain one regression-label column, one classification-'
            'label column, or a classification-label column with the '
            'corresponding regression-label column.  Instead, contains label '
            'columns listed above.')
        raise ValueError(error_string)

    if require_storm_objects:
        error_checking.assert_columns_in_dataframe(
            feature_table, STORM_TO_WIND_COLUMNS_TO_KEEP)

    return (feature_column_names, regression_label_column_name,
            classification_label_column_name)
Example #23
def _transform_each_marginal_to_uniform(new_feature_table,
                                        orig_feature_table=None):
    """Transforms marginal distribution of each feature to uniform distribution.

    This method transforms data in `new_feature_table` only.

    If `orig_feature_table` is None, the transformation for feature "x" in the
    [i]th example will be based on the percentile score of
    new_feature_table["x"].values[i] in new_feature_table["x"].values.

    If `orig_feature_table` is specified, the transformation for feature "x" in
    the [i]th example will be based on the percentile score of
    new_feature_table["x"].values[i] in orig_feature_table["x"].values.

    P = number of original examples
    Q = number of new examples
    M = number of features

    :param new_feature_table: pandas DataFrame with Q rows and M columns.
        Column names are feature names.
    :param orig_feature_table: pandas DataFrame with P rows and M columns.
        Column names are feature names.
    :return: transformed_new_feature_table: Same as input, except that the
        marginal distribution of each column is uniform.
    """

    # TODO(thunderhoser): I could probably make this faster for cases where
    # `orig_feature_table` is specified.

    feature_names = list(new_feature_table)
    new_feature_matrix = new_feature_table[feature_names].to_numpy()

    if orig_feature_table is not None:
        error_checking.assert_columns_in_dataframe(orig_feature_table,
                                                   feature_names)
        orig_feature_matrix = orig_feature_table[feature_names].to_numpy()

    num_features = len(feature_names)
    num_new_examples = new_feature_matrix.shape[0]
    transformed_new_feature_table = None

    for j in range(num_features):
        new_indices_to_use = numpy.where(
            numpy.invert(numpy.isnan(new_feature_matrix[:, j])))[0]
        transformed_values = numpy.full(num_new_examples, 0.5)

        if orig_feature_table is None:
            these_ranks = scipy.stats.rankdata(
                new_feature_matrix[new_indices_to_use, j], method='average')
            transformed_values[new_indices_to_use] = (these_ranks /
                                                      len(new_indices_to_use))
        else:
            orig_indices_to_use = numpy.where(
                numpy.invert(numpy.isnan(orig_feature_matrix[:, j])))[0]

            for i in new_indices_to_use:
                transformed_values[i] = scipy.stats.percentileofscore(
                    orig_feature_matrix[orig_indices_to_use, j],
                    new_feature_matrix[i, j],
                    kind='weak') / 100

        if transformed_new_feature_table is None:
            transformed_new_feature_table = pandas.DataFrame.from_dict(
                {feature_names[j]: transformed_values})
        else:
            transformed_new_feature_table = (
                transformed_new_feature_table.assign(
                    **{feature_names[j]: transformed_values}))

    return transformed_new_feature_table
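A tiny demonstration of the rank-based branch (no `orig_feature_table`): valid values map to rank divided by the number of non-NaN values, and NaNs map to 0.5.

import numpy
import pandas

new_feature_table = pandas.DataFrame({'x': [3.0, numpy.nan, 1.0, 2.0]})
print(_transform_each_marginal_to_uniform(new_feature_table))
# Expected 'x' values: 1.0, 0.5, 1/3, 2/3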
Example #24
    def test_assert_columns_in_dataframe_tuple(self):
        """Checks assert_columns_in_dataframe when input is tuple."""

        with self.assertRaises(TypeError):
            error_checking.assert_columns_in_dataframe(
                REAL_NUMBER_TUPLE, FAKE_COLUMNS_IN_DATAFRAME)
Example #25
    def test_assert_columns_in_dataframe_numpy_array(self):
        """Checks assert_columns_in_dataframe when input is numpy array."""

        with self.assertRaises(TypeError):
            error_checking.assert_columns_in_dataframe(
                REAL_NUMPY_ARRAY, FAKE_COLUMNS_IN_DATAFRAME)
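Taken together, Examples #6, #7, #24, and #25 pin down the checker's contract: TypeError for anything that is not a pandas DataFrame, KeyError for a DataFrame missing a desired column, and silence otherwise.  A minimal sketch consistent with that behaviour (not the library's actual implementation):

import pandas

def assert_columns_in_dataframe(input_table, column_names):
    """Raises TypeError for non-DataFrames; KeyError for missing columns."""
    if not isinstance(input_table, pandas.DataFrame):
        raise TypeError('input_table is not a pandas DataFrame.')

    missing_columns = [c for c in column_names if c not in input_table.columns]
    if missing_columns:
        raise KeyError('DataFrame is missing columns: {0:s}'.format(
            str(missing_columns)))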