示例#1
0
def robust_scaler(data, center=True, reduce=True, q_range=(25.0, 75.0), return_robust_scaler=False, rebuild_df=False):
    """
    Scale features using statistics that are robust to outliers.

    A RobustScaler is fitted on ``data`` and used to transform it. This scaler removes
    the median and scales the data according to the quantile range
    (defaults to IQR: Interquartile Range).

    :param data: unscaled data (numpy array or dataframe); it must expose ``columns`` and
                 ``index`` (i.e. be a dataframe) when ``rebuild_df`` is enabled
    :param center: forwarded to RobustScaler as ``with_centering`` (median removal)
    :param reduce: forwarded to RobustScaler as ``with_scaling`` (division by the quantile range)
    :param q_range: forwarded to RobustScaler as ``quantile_range`` (lower and upper percentiles)
    :param return_robust_scaler: truthy value which enables returning the fitted RobustScaler instance
    :param rebuild_df: truthy value which enables rebuilding a dataframe (same columns and index
                       as ``data``) around the scaled values

    :return: scaled data (numpy array, or dataframe when ``rebuild_df`` is set); when
             ``return_robust_scaler`` is set, a ``(scaled data, RobustScaler instance)`` tuple
    """
    rbt_scaler = RobustScaler(with_centering=center, with_scaling=reduce, quantile_range=q_range)
    x_scaled = rbt_scaler.fit_transform(data)
    # Flags are evaluated by truthiness (not ``is True``) so that numpy booleans or
    # 0/1 integers behave exactly like the builtin True/False, and the two options
    # are handled independently of each other.
    if rebuild_df:
        x_scaled = pd.DataFrame(x_scaled, columns=data.columns, index=data.index)
    if return_robust_scaler:
        return x_scaled, rbt_scaler
    return x_scaled
示例#2
0
def min_max_scaler(data, return_min_max_scaler=False, rebuild_df=False, feature_range=(0, 1)):
    """
    Transform features by scaling each feature to a given range.

    A MinMaxScaler is fitted on ``data`` and used to transform it.
    The transformation is given by:

    X_std = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))
    X_scaled = X_std * (max - min) + min

    where ``(min, max)`` is ``feature_range``.

    :param data: unscaled data (numpy array or dataframe); it must expose ``columns`` and
                 ``index`` (i.e. be a dataframe) when ``rebuild_df`` is enabled
    :param return_min_max_scaler: truthy value which enables returning the fitted MinMaxScaler instance
    :param rebuild_df: truthy value which enables rebuilding a dataframe (same columns and index
                       as ``data``) around the scaled values
    :param feature_range: ``(min, max)`` target range forwarded to MinMaxScaler; the default
                          ``(0, 1)`` is the scaler's own default, so existing callers are unaffected

    :return: scaled data (numpy array, or dataframe when ``rebuild_df`` is set); when
             ``return_min_max_scaler`` is set, a ``(scaled data, MinMaxScaler instance)`` tuple
    """
    m_scaler = MinMaxScaler(feature_range=feature_range)
    x_scaled = m_scaler.fit_transform(data)
    # Flags are evaluated by truthiness (not ``is True``) so that numpy booleans or
    # 0/1 integers behave exactly like the builtin True/False, and the two options
    # are handled independently of each other.
    if rebuild_df:
        x_scaled = pd.DataFrame(x_scaled, columns=data.columns, index=data.index)
    if return_min_max_scaler:
        return x_scaled, m_scaler
    return x_scaled
示例#3
0
def standard_scaler(data,
                    center=True,
                    reduce=True,
                    return_std_scaler=False,
                    rebuild_df=False):
    """
    Standardize features by removing the mean and scaling to unit variance (z = (x - u) / s)

    A StandardScaler is fitted on ``data`` and used to transform it.

    :param data: unscaled data (numpy array or dataframe); it must expose ``columns`` and
                 ``index`` (i.e. be a dataframe) when ``rebuild_df`` is enabled
    :param center: forwarded to StandardScaler as ``with_mean`` (mean = 0)
    :param reduce: forwarded to StandardScaler as ``with_std`` (standard deviation = 1)
    :param return_std_scaler: truthy value which enables returning the fitted StandardScaler instance
    :param rebuild_df: truthy value which enables rebuilding a dataframe (same columns and index
                       as ``data``) around the scaled values

    :return: scaled data (numpy array, or dataframe when ``rebuild_df`` is set); when
             ``return_std_scaler`` is set, a ``(scaled data, StandardScaler instance)`` tuple
    """
    std_scaler = StandardScaler(with_mean=center, with_std=reduce)
    x_scaled = std_scaler.fit_transform(data)
    # Flags are evaluated by truthiness (not ``is True``) so that numpy booleans or
    # 0/1 integers behave exactly like the builtin True/False, and the two options
    # are handled independently of each other.
    if rebuild_df:
        x_scaled = pd.DataFrame(x_scaled,
                                columns=data.columns,
                                index=data.index)
    if return_std_scaler:
        return x_scaled, std_scaler
    return x_scaled
示例#4
0
def get_features_importance(labels, coefs, abs_coefs=False, non_zero_coefs=False, sort=True, verbose=False):
    """
    Build a feature importance dataframe from feature labels and their coefficients.

    :param labels: feature names (one per coefficient)
    :param coefs: coefficients associated with the features (same length as ``labels``)
    :param abs_coefs: replace every coefficient by its absolute value
    :param non_zero_coefs: drop the features whose coefficient is equal to 0
    :param sort: sort features by decreasing coefficient and reset the index
    :param verbose: print the number of kept features and the reduction ratio

    :return: dataframe with columns 'feature', 'coefficient', 'coefficient_frequency'
             (coefficient divided by the sum of the kept coefficients) and
             'cumulative_coefficient_frequency' (running sum of the frequencies, in row order)
    """
    # Build feature importance dataframe
    fimp_df = pd.DataFrame({'feature': labels, 'coefficient': coefs})
    # Get positive coefficients
    if abs_coefs:
        fimp_df['coefficient'] = np.abs(fimp_df['coefficient'])
    # Filter zero coefficients; copy so that the columns added below are written to an
    # independent dataframe rather than to a slice of the original one
    # (avoids pandas' SettingWithCopyWarning when sort is disabled)
    if non_zero_coefs:
        fimp_df = fimp_df[fimp_df['coefficient'] != 0].copy()
    # Sort features (get most important features at the head)
    if sort:
        fimp_df = fimp_df.sort_values('coefficient', ascending=False).reset_index(drop=True)
    # Print selected features and reduction ratio
    if verbose:
        # Get features count
        total_features = len(labels)
        # Get filtered features count (zero coefficients removed)
        selected_features = fimp_df.shape[0]
        # No feature at all means nothing was removed: report 0% instead of dividing by zero
        if total_features:
            reduction_ratio = (1 - (selected_features / total_features)) * 100
        else:
            reduction_ratio = 0.0
        print('{0}/{1} features selected, reduction of {2:.1f}%'.format(selected_features,
                                                                        total_features,
                                                                        reduction_ratio))
    # Compute coefficient frequency in order to calculate cumulative feature importance
    # NOTE: frequencies are only meaningful when the kept coefficients have a non-zero sum
    coefficients_sum = fimp_df['coefficient'].sum()
    fimp_df['coefficient_frequency'] = fimp_df['coefficient'] / coefficients_sum
    fimp_df['cumulative_coefficient_frequency'] = fimp_df['coefficient_frequency'].cumsum()
    return fimp_df
示例#5
0
def reverse_standardization(data_scaled, scaler, rebuild_df=False):
    """
    Undo a scaling transformation by calling the scaler's ``inverse_transform``.

    :param data_scaled: scaled data; it must expose ``columns`` and ``index``
                        (i.e. be a dataframe) when ``rebuild_df`` is enabled
    :param scaler: fitted object exposing ``inverse_transform``
                   (e.g. StandardScaler or FunctionTransformer instance)
    :param rebuild_df: boolean value which enables wrapping the unscaled values in a dataframe
                       carrying the columns and index of ``data_scaled``

    :return: unscaled data, as returned by ``scaler.inverse_transform`` or as a dataframe
             when ``rebuild_df`` is enabled
    """
    restored = scaler.inverse_transform(data_scaled)
    # Fast path: hand back the raw result of the inverse transformation
    if not rebuild_df:
        return restored
    # Re-attach the labels of the scaled input to the unscaled values
    return pd.DataFrame(restored, index=data_scaled.index, columns=data_scaled.columns)