Example #1
def categories_ord_cols(df, ord_cols):
    '''Returns the categories of the ordinal columns in such a way they can be displayed to the front-end

    Parameters:
    -----------
    df: pd.DataFrame
    ord_cols: list of str, containing the names of the ordinal columns
    '''
    result = []
    df_new = make_missing_np_nan(df)
    for col in ord_cols:
        col_result = {"name": col, "values": []}

        i = 1
        categories = list(set(df_new[col]))
        if pd.isnull(categories).any():
            idx = np.where(pd.isnull(categories))[0][0]
            del categories[idx]
        for category in categories:
            col_result['values'].append({
                "name": category,
                "id": i,
                "fixed": "false"
            })
            i += 1

        result.append(col_result)

    return result
Example #2
def MICE_imputation(df, categorical=False, nr_iter=3):
    '''Returns a list of dataframes in which missing values are imputed using MICE (multiple imputation)

    Parameters:
    -----------
    df: pd.DataFrame
    categorical: boolean, if set to True, the returned dataframe will contain the original category values
                 (as opposed to their integer index)
    nr_iter: int, the number of imputations to be generated

    Returns:
    --------
    result: list of pd.DataFrame, one imputed copy of the numerical columns per MICE iteration
    '''
    df_new = df.copy()

    df_new = make_missing_np_nan(df_new)

    missing, unique = imputation_heuristic_column(df, 0.99)
    df_new = delete_cols(df_new, missing)
    df_new = delete_cols(df_new, unique)

    cat_cols, date_cols, num_cols = type_cols(df_new)
    df_new = df_new[num_cols]

    columns = df_new.columns

    result = [0] * nr_iter
    for i in range(nr_iter):
        imputer = IterativeImputer(sample_posterior=True)
        imputed = imputer.fit_transform(df_new)
        df_imputed = pd.DataFrame(imputed, columns=columns)
        result[i] = df_imputed

    return result
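The core of the loop above is scikit-learn's IterativeImputer with sample_posterior=True, so the idea can be exercised without the project's helper functions. A minimal, self-contained sketch on made-up data (all names here are illustrative, not part of the project):

import numpy as np
import pandas as pd
from sklearn.experimental import enable_iterative_imputer  # noqa: F401, activates IterativeImputer
from sklearn.impute import IterativeImputer

# toy numeric frame with missing values (illustrative only)
toy = pd.DataFrame({"a": [1.0, 2.0, np.nan, 4.0],
                    "b": [2.0, np.nan, 6.0, 8.0]})

# sample_posterior=True draws imputed values from the predictive distribution,
# so repeated fits produce different plausible completions (the "multiple" in MICE)
imputations = []
for _ in range(3):
    imputer = IterativeImputer(sample_posterior=True)
    imputations.append(pd.DataFrame(imputer.fit_transform(toy), columns=toy.columns))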
Example #3
def KNN_imputation(df, k=5):
    '''Imputes the missing values in a dataframe using K-Nearest Neighbor

    Parameters:
    -----------
    df: pd.DataFrame
    k: int, number of neighboring samples to use for imputation

    Returns:
    --------
    df_result: pd.DataFrame where the missing values are imputed using KNN
    '''
    df_new = df.copy()

    df_new = make_missing_np_nan(df_new)

    missing, unique = imputation_heuristic_column(df, 1)
    df_new = delete_cols(df_new, missing)
    df_new = delete_cols(df_new, unique)

    cat_cols, date_cols, num_cols = type_cols(df_new)
    df_new = df_new[num_cols]

    columns = df_new.columns

    imputer = KNNImputer(n_neighbors=k)
    imputed = imputer.fit_transform(df_new)
    df_imputed = pd.DataFrame(imputed, columns=columns)

    not_imputed_cols = cat_cols + date_cols
    df_result = pd.concat([df_imputed, df[not_imputed_cols]], axis=1)

    return df_result
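KNN_imputation delegates the actual imputation to scikit-learn's KNNImputer. A minimal standalone sketch of that step on toy data (names chosen for illustration only):

import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer

toy = pd.DataFrame({"a": [1.0, 2.0, np.nan, 4.0],
                    "b": [1.0, np.nan, 3.0, 4.0]})

# each missing entry is replaced by the average of that feature over the
# k nearest rows, with distances computed on the features that are present
imputer = KNNImputer(n_neighbors=2)
imputed = pd.DataFrame(imputer.fit_transform(toy), columns=toy.columns)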
Example #4
def regression_imputation(df):
    '''Returns the dataframe where missing values are imputed using IterativeImputer and BayesianRidge()
    This is a regression imputation method.

    Parameters:
    -----------
    df: pd.DataFrame

    Returns:
    --------
    df_result: pd.DataFrame where the missing values are imputed using regression imputation
    '''
    df_new = df.copy()

    df_new = make_missing_np_nan(df_new)

    missing, unique = imputation_heuristic_column(df, 0.99)

    df_new = delete_cols(df_new, missing)
    df_new = delete_cols(df_new, unique)

    cat_cols, date_cols, num_cols = type_cols(df_new)
    df_new = df_new[num_cols]

    columns = df_new.columns

    imputer = IterativeImputer(random_state=0)
    imputed = imputer.fit_transform(df_new)
    df_imputed = pd.DataFrame(imputed, columns=columns)

    not_imputed_cols = cat_cols + date_cols
    df_result = pd.concat([df_imputed, df[not_imputed_cols]], axis=1)

    return df_result
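When no estimator is passed, scikit-learn's IterativeImputer defaults to BayesianRidge, which is what makes this a regression imputation. A small sketch that makes the default explicit (toy data, illustrative names):

import numpy as np
import pandas as pd
from sklearn.experimental import enable_iterative_imputer  # noqa: F401, activates IterativeImputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import BayesianRidge

toy = pd.DataFrame({"a": [1.0, 2.0, np.nan, 4.0],
                    "b": [2.0, 4.0, 6.0, np.nan]})

# passing the default estimator explicitly; equivalent to IterativeImputer(random_state=0)
imputer = IterativeImputer(estimator=BayesianRidge(), random_state=0)
imputed = pd.DataFrame(imputer.fit_transform(toy), columns=toy.columns)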
Example #5
def median_imputation(df):
    '''Imputes the missing values in a data frame using median imputation

    Parameters:
    -----------
    df: pd.DataFrame

    Returns:
    --------
    df_result: pd.DataFrame where the missing values are imputed using the median 
    '''

    imp_median = SimpleImputer(missing_values=np.nan, strategy='median')

    df = make_missing_np_nan(df)
    cat_cols, date_cols, num_cols = type_cols(df)
    df_new = df[num_cols]

    columns = df_new.columns
    df_imputed = imp_median.fit_transform(df_new)
    df_imputed = pd.DataFrame(df_imputed, columns=columns)

    not_imputed_cols = cat_cols + date_cols
    df_result = pd.concat([df_imputed, df[not_imputed_cols]], axis=1)

    return df_result
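The SimpleImputer call used here can be exercised on its own; a minimal sketch with made-up data shows why the median is a reasonable default for skewed columns:

import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer

toy = pd.DataFrame({"a": [1.0, 2.0, np.nan, 100.0]})

# the median (2.0) is robust to the outlier 100.0, unlike the mean (~34.3)
imp_median = SimpleImputer(missing_values=np.nan, strategy='median')
imputed = pd.DataFrame(imp_median.fit_transform(toy), columns=toy.columns)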
Example #6
def get_outliers_info(df, outlier_method):
  '''Produces, for the front-end, the detected outliers, their three-sigma thresholds, the detection scores, and the corresponding outlier plot

  Parameters:
  ----------
  df: pd.DataFrame
  outlier_method: str, corresponding to one of the following outlier detection methods:
    ['LOF', 'IF', 'SVM', 'KNN', 'VAE']
  '''
  df_new = make_missing_np_nan(df)
  df_new = df_new.dropna()
  cat_cols, date_cols, num_cols = type_cols(df_new)
  df_new = df_new[num_cols]
  
  if outlier_method == "LOF":
    outliers = detect_outliers_LOF(df_new)
  elif outlier_method == "IF":
    outliers = detect_outliers_IF(df_new)
  elif outlier_method == "SVM":
    outliers = detect_outliers_SVM(df_new)
  elif outlier_method == "KNN":
    outliers = detect_outliers_KNN(df_new)
  elif outlier_method == "VAE":
    outliers = detect_outliers_VAE(df_new)

  return outliers, three_sigma(outliers), detection_scores(outliers), outlier_scores_plot(outliers)
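The detect_outliers_* helpers are defined elsewhere in the project and are not shown here. Assuming the 'LOF' branch wraps scikit-learn's LocalOutlierFactor (an assumption, not confirmed by this snippet), the underlying call looks roughly like this:

import numpy as np
from sklearn.neighbors import LocalOutlierFactor

# toy numeric matrix with one obvious outlier (illustrative only)
X = np.array([[1.0, 1.1], [0.9, 1.0], [1.1, 0.9], [8.0, 8.0]])

lof = LocalOutlierFactor(n_neighbors=2)
labels = lof.fit_predict(X)             # -1 marks outliers, 1 marks inliers
scores = -lof.negative_outlier_factor_  # higher score = more outlying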
Example #7
File: plots.py Project: scoopmans/thesis
def line_plot(df, target=False, color=False, plot_range=None):
    '''Returns a lineplot of a prespecified column

    Parameters:
    -----------
    df: pd.DataFrame
    target: str, name of the column, default is the last column of the dataframe (which is usually the target variable)
    color: str, name of a column used to split the data into separately colored lines, one per unique value (in combination with the target)
    plot_range: list with 2 values, indicating the range of the x-axis
    '''
    if not target:
        target = df.columns[-1]  #sets it to the "target" variable on default

    df = make_missing_np_nan(df, replace_with='nan')

    if color:
        df_small = df[[target, color]]
        fig = go.Figure()

        for val in set(df_small[color]):
            df_new = df_small[df_small[color] == val]
            unique_vals, unique_vals_counts = np.unique(
                df_new[target].tolist(), return_counts=True)
            fig.add_trace(
                go.Scatter(x=unique_vals,
                           y=unique_vals_counts,
                           mode='lines',
                           name=val))

        fig.update_layout(xaxis={
            'range': plot_range,
            'title': target
        },
                          yaxis={'title': 'count'})
        return fig

    else:
        unique_vals, unique_vals_counts = np.unique(
            df[target].tolist(), return_counts=True)
        fig = go.Figure()
        fig.add_trace(
            go.Scatter(x=unique_vals,
                       y=unique_vals_counts,
                       mode='lines',
                       name=target))
        fig.update_layout(xaxis={
            'range': plot_range,
            'title': target
        },
                          yaxis={'title': 'count'})

        return fig
Example #8
def nr_rows_missing(df):
    '''Returns the number of rows with missing values

    Parameters:
    -----------
    df: pd.DataFrame

    Returns:
    --------
    int: number of rows with missing values
    '''
    df_all_nan = make_missing_np_nan(df)
    return len(df_all_nan[df_all_nan.isnull().any(axis=1)])
Example #9
File: report.py Project: scoopmans/thesis
def inference(df, fast=True):
    ''' Returns a dictionary with information about each column in a dataframe; the information includes:
        - detected data type                          -> data_type
        - nr. of missing values                       -> nr_missing
        - % of values that are missing                -> pct_missing
        - nr. of unique values                        -> nr_unique
        - % of values that are unique                 -> pct_unique
        - distribution plot of the column             -> distribution_plot

    Parameters:
    -----------
    df: pd.DataFrame
    fast: bool, indicates whether to run fast (less accurate) or slow (more accurate) data type detection inference
    '''

    df = make_missing_np_nan(df) #set all missing value encodings to np.nan

    if fast:
        data_types = detect_datatypes(df)
    else:
        data_types = detect_datatypes_ptype(df)

    columns_missing = df.columns[df.isna().any()].tolist()

    result_inference = {}
    for idx,col in enumerate(df.columns):
        unique_vals, unique_vals_counts = np.unique([str(int_element) for int_element in df[col].tolist()], return_counts=True)
        nr_missing = df[col].isnull().sum()

        if col in columns_missing:
            result_inference[col] = {'data_type': str(data_types[col]),
                                     'nr_missing': int(nr_missing),
                                     'pct_missing': str('{:.1f}%'.format(nr_missing / len(df) * 100)),
                                     'nr_unique': int(len(unique_vals)),
                                     'pct_unique': str('{:.1f}%'.format(len(unique_vals) / len(df) * 100)),
                                     'distribution_plot': distribution_plot(col, df[col], str(data_types[col]), unique_vals, unique_vals_counts)
                                     }
        else:
            result_inference[col] = {'data_type': str(data_types[col]),
                                     'nr_missing': 0,
                                     'pct_missing': '0.0%',
                                     'nr_unique': int(len(unique_vals)),
                                     'pct_unique': str('{:.1f}%'.format(len(unique_vals) / len(df) * 100)),
                                     'distribution_plot': distribution_plot(col, df[col], str(data_types[col]), unique_vals, unique_vals_counts)
                                     }
    return result_inference
Example #10
def RF_imputation(df, fast=True):
    '''Returns the dataframe where missing values are imputed using Random Forest Imputation (sklearn)
    If fast is True, ExtraTreesRegressor is used instead of RandomForestRegressor for increased speed.

    Parameters:
    -----------
    df: pd.DataFrame
    fast: boolean, if set to True, ExtraTreesRegressor is used in preference to RandomForestRegressor

    Returns:
    --------
    df_result: pd.DataFrame where the missing values are imputed using Random Forest (MissForest)
    '''

    df_new = df.copy()

    df_new = make_missing_np_nan(df_new)

    missing, unique = imputation_heuristic_column(df, 0.99)
    df_new = delete_cols(df_new, missing)
    df_new = delete_cols(df_new, unique)

    #categorical and datetime columns cannot be imputed, so are removed from the imputation dataframe
    cat_cols, date_cols, num_cols = type_cols(df_new)
    df_new = df_new[num_cols]

    columns = df_new.columns

    if fast:
        imputer = IterativeImputer(random_state=0,
                                   estimator=ExtraTreesRegressor(
                                       n_estimators=10, random_state=0))
    else:
        imputer = IterativeImputer(random_state=0,
                                   estimator=RandomForestRegressor(
                                       n_estimators=10, random_state=0))

    imputed = imputer.fit_transform(df_new)
    df_imputed = pd.DataFrame(imputed, columns=columns)

    #categorical and datetime columns are added back
    not_imputed_cols = cat_cols + date_cols
    df_result = pd.concat([df_imputed, df[not_imputed_cols]], axis=1)

    return df_result
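Compared with the other IterativeImputer-based functions above, the only moving part here is the estimator. A minimal sketch of the tree-based variant on toy data (illustrative names):

import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.experimental import enable_iterative_imputer  # noqa: F401, activates IterativeImputer
from sklearn.impute import IterativeImputer

toy = pd.DataFrame({"a": [1.0, 2.0, 3.0, np.nan],
                    "b": [10.0, 20.0, np.nan, 40.0]})

# tree ensembles can capture non-linear relations between columns;
# ExtraTreesRegressor trades some accuracy for speed relative to RandomForestRegressor
imputer = IterativeImputer(random_state=0,
                           estimator=ExtraTreesRegressor(n_estimators=10, random_state=0))
imputed = pd.DataFrame(imputer.fit_transform(toy), columns=toy.columns)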
Example #11
def placeholder_imputation(df, col, placeholder):
    '''Returns the data frame with the missing values of the specified columns replaced by a placeholder

    Parameters:
    -----------
    df: pd.DataFrame
    col: str, corresponding to a categorical column in the data frame
    placeholder: str, placeholder value to replace the missing values with

    Returns:
    --------
    pd.DataFrame: where the missing values for the specified columns are replaced by a placeholder
    '''

    df = make_missing_np_nan(df)
    df[col] = df[col].fillna(placeholder)

    return df
Example #12
def DL_imputation(df, categorical=True):
    '''Returns the dataframe where missing values are imputed using DataWig
    
    Parameters:
    -----------
    df: pd.DataFrame
    categorical: boolean, if set to True, the returned dataframe will contain the original category values
                 (as opposed to their integer index)

    Returns:
    --------
    df_result: pd.DataFrame where the missing values are imputed using DataWig
    '''

    df_new = df.copy()

    df_new = make_missing_np_nan(df_new)

    missing, unique = imputation_heuristic_column(df, 0.99)
    df_new = delete_cols(df_new, missing)
    df_new = delete_cols(df_new, unique)

    cat_cols, date_cols, num_cols = type_cols(df_new)
    df_new = df_new[num_cols]

    columns = df_new.columns

    imputer = simple_imputer.SimpleImputer(input_columns=['1'],
                                           output_column='2')
    imputed = imputer.complete(df_new)
    df_imputed = pd.DataFrame(imputed, columns=columns)

    not_imputed_cols = cat_cols + date_cols
    df_result = pd.concat([df_imputed, df[not_imputed_cols]], axis=1)

    return df_result
Example #13
def __init__(self, data):
    self.data = make_missing_np_nan(data)