示例#1
0
def regression_imputation(df):
    '''Impute missing numeric values with scikit-learn's IterativeImputer.

    The imputer is used with its default estimator (BayesianRidge according to
    the scikit-learn documentation), i.e. each numeric column is regressed on
    the other numeric columns. This is a single regression imputation.

    Columns listed by imputation_heuristic_column (called with threshold 0.99)
    are removed before imputing. Categorical and datetime columns are not
    imputed: they are taken from the input frame and appended to the right of
    the imputed numeric columns.

    Parameters:
    -----------
    df: pd.DataFrame

    Returns:
    --------
    df_result: pd.DataFrame holding the imputed numeric columns followed by the
               non-imputed categorical and datetime columns
    '''
    # Work on a copy so the helper calls below do not touch the caller's frame.
    df_new = make_missing_np_nan(df.copy())

    missing, unique = imputation_heuristic_column(df, 0.99)
    df_new = delete_cols(df_new, missing)
    df_new = delete_cols(df_new, unique)

    cat_cols, date_cols, num_cols = type_cols(df_new)
    df_new = df_new[num_cols]

    imputer = IterativeImputer(random_state=0)
    imputed = imputer.fit_transform(df_new)

    # Keep the original row labels: pd.concat(axis=1) aligns on the index, so a
    # fresh RangeIndex would misalign (and duplicate) rows whenever df does not
    # carry the default 0..n-1 index.
    df_imputed = pd.DataFrame(imputed,
                              columns=df_new.columns,
                              index=df_new.index)

    not_imputed_cols = cat_cols + date_cols
    df_result = pd.concat([df_imputed, df[not_imputed_cols]], axis=1)

    return df_result
示例#2
0
def MICE_imputation(df, categorical=False, nr_iter=3):
    '''Generate several imputed versions of the numeric columns (MICE).

    Each imputation is produced by a fresh IterativeImputer created with
    sample_posterior=True and no fixed random_state, so the generated frames
    are expected to differ from each other and between calls.

    Columns listed by imputation_heuristic_column (called with threshold 0.99)
    are removed before imputing. Only the numeric columns are imputed and
    returned; categorical and datetime columns are not part of the result.

    Parameters:
    -----------
    df: pd.DataFrame
    categorical: boolean, currently unused by this function; kept so existing
                 callers keep working
    nr_iter: int, the number of imputations to be generated

    Returns:
    --------
    result: list of nr_iter pd.DataFrame, each containing the imputed numeric
            columns with the row index of the input
    '''
    # Work on a copy so the helper calls below do not touch the caller's frame.
    df_new = make_missing_np_nan(df.copy())

    missing, unique = imputation_heuristic_column(df, 0.99)
    df_new = delete_cols(df_new, missing)
    df_new = delete_cols(df_new, unique)

    _, _, num_cols = type_cols(df_new)
    df_new = df_new[num_cols]

    result = []
    for _ in range(nr_iter):
        imputer = IterativeImputer(sample_posterior=True)
        imputed = imputer.fit_transform(df_new)
        # Carry over the row labels so each imputed frame can be aligned with
        # the original data.
        result.append(
            pd.DataFrame(imputed, columns=df_new.columns, index=df_new.index))

    return result
示例#3
0
def median_imputation(df):
    '''Impute the missing values of the numeric columns with the column median.

    Categorical and datetime columns are not imputed; they are appended to the
    right of the imputed numeric columns after make_missing_np_nan has been
    applied to them.

    Parameters:
    -----------
    df: pd.DataFrame

    Returns:
    --------
    df_result: pd.DataFrame where the missing numeric values are imputed using
               the median, followed by the non-imputed columns
    '''
    imp_median = SimpleImputer(missing_values=np.nan, strategy='median')

    # Copy first, like the sibling imputation functions, so the helper cannot
    # alter the caller's frame.
    df_new = make_missing_np_nan(df.copy())
    cat_cols, date_cols, num_cols = type_cols(df_new)
    df_num = df_new[num_cols]

    imputed = imp_median.fit_transform(df_num)
    # Keep the original row labels: pd.concat(axis=1) aligns on the index, so a
    # fresh RangeIndex would misalign rows for frames with a non-default index.
    df_imputed = pd.DataFrame(imputed,
                              columns=df_num.columns,
                              index=df_num.index)

    not_imputed_cols = cat_cols + date_cols
    df_result = pd.concat([df_imputed, df_new[not_imputed_cols]], axis=1)

    return df_result
示例#4
0
def KNN_imputation(df, k=5):
    '''Impute the missing numeric values using K-Nearest Neighbor (KNNImputer).

    Columns listed by imputation_heuristic_column (called with threshold 1)
    are removed before imputing. Categorical and datetime columns are not
    imputed: they are taken from the input frame and appended to the right of
    the imputed numeric columns.

    Parameters:
    -----------
    df: pd.DataFrame
    k: int, number of neighboring samples to use for imputation

    Returns:
    --------
    df_result: pd.DataFrame where the missing numeric values are imputed using
               KNN, followed by the non-imputed columns
    '''
    # Work on a copy so the helper calls below do not touch the caller's frame.
    df_new = make_missing_np_nan(df.copy())

    missing, unique = imputation_heuristic_column(df, 1)
    df_new = delete_cols(df_new, missing)
    df_new = delete_cols(df_new, unique)

    cat_cols, date_cols, num_cols = type_cols(df_new)
    df_new = df_new[num_cols]

    imputer = KNNImputer(n_neighbors=k)
    imputed = imputer.fit_transform(df_new)

    # Keep the original row labels: pd.concat(axis=1) aligns on the index, so a
    # fresh RangeIndex would misalign rows for frames with a non-default index.
    df_imputed = pd.DataFrame(imputed,
                              columns=df_new.columns,
                              index=df_new.index)

    not_imputed_cols = cat_cols + date_cols
    df_result = pd.concat([df_imputed, df[not_imputed_cols]], axis=1)

    return df_result
示例#5
0
def make_date_season(df, date_col=None, country_col=None):
    '''Add a 'season' column holding the season number of a datetime column.

    Season numbers as documented by the original author (the actual mapping
    is performed by season_to_num):
        winter = 0
        spring = 1
        summer = 2
        autumn = 3

    The column is added to df in place and df is returned.

    Parameters:
    -----------
    df: pd.DataFrame
    date_col: str, name of the datetime column; when omitted, the first
              datetime column reported by type_cols(df) is used
    country_col: str, optional name of a column whose value is passed to
                 get_season together with the date

    Returns:
    --------
    df: the input pd.DataFrame with the additional 'season' column

    Raises:
    -------
    ValueError: if date_col is omitted and df has no datetime column
    '''
    if not date_col:
        # Inspect the frame that was passed in, not an unrelated global.
        _, date_cols, _ = type_cols(df)
        if len(date_cols) == 0:
            raise ValueError(
                'no datetime column found in df; pass date_col explicitly')
        date_col = date_cols[0]

    if country_col:
        df['season'] = df.apply(
            lambda row: get_season(row[date_col], row[country_col]), axis=1)
    else:
        df['season'] = df[date_col].apply(lambda x: get_season(x))

    df['season'] = df['season'].apply(lambda x: season_to_num(x))
    return df
示例#6
0
def get_outliers_info(df, outlier_method):
    '''For the frontend: run one outlier detector and derive its summaries.

    Rows containing missing values are dropped and only the numeric columns
    are passed to the selected detector.

    Parameters:
    ----------
    df: pd.DataFrame
    outlier_method: str, corresponding to one of the following outlier
        detection methods: ['LOF', 'IF', 'SVM', 'KNN', 'VAE']

    Returns:
    --------
    tuple of four elements:
        outliers: the result of the selected detect_outliers_* function
        three_sigma(outliers)
        detection_scores(outliers)
        outlier_scores_plot(outliers)

    Raises:
    -------
    ValueError: if outlier_method is not one of the supported names
    '''
    supported_methods = ('LOF', 'IF', 'SVM', 'KNN', 'VAE')
    # Fail early with a clear message; previously an unknown name only
    # surfaced as an unbound local variable at the return statement.
    if outlier_method not in supported_methods:
        raise ValueError(
            'unknown outlier_method {!r}; expected one of {}'.format(
                outlier_method, list(supported_methods)))

    df_new = make_missing_np_nan(df)
    df_new = df_new.dropna()
    cat_cols, date_cols, num_cols = type_cols(df_new)
    df_new = df_new[num_cols]

    if outlier_method == "LOF":
        outliers = detect_outliers_LOF(df_new)
    elif outlier_method == "IF":
        outliers = detect_outliers_IF(df_new)
    elif outlier_method == "SVM":
        outliers = detect_outliers_SVM(df_new)
    elif outlier_method == "KNN":
        outliers = detect_outliers_KNN(df_new)
    else:
        # Only 'VAE' can reach this branch after the validation above.
        outliers = detect_outliers_VAE(df_new)

    return (outliers, three_sigma(outliers), detection_scores(outliers),
            outlier_scores_plot(outliers))
示例#7
0
def make_date_week_number(df, date_col=None):
    '''Add a 'week_number' column with the ISO week of the year.

    The value is element [1] of isocalendar() for every entry of the datetime
    column. The column is added to df in place and df is returned.

    Parameters:
    -----------
    df: pd.DataFrame
    date_col: str, name of the datetime column; when omitted, the first
              datetime column reported by type_cols(df) is used

    Returns:
    --------
    df: the input pd.DataFrame with the additional 'week_number' column

    Raises:
    -------
    ValueError: if date_col is omitted and df has no datetime column
    '''
    if not date_col:
        # Inspect the frame that was passed in, not an unrelated global.
        _, date_cols, _ = type_cols(df)
        if len(date_cols) == 0:
            raise ValueError(
                'no datetime column found in df; pass date_col explicitly')
        date_col = date_cols[0]

    df['week_number'] = df[date_col].apply(lambda x: x.isocalendar()[1])
    return df
示例#8
0
def make_date_hour(df, date_col=None):
    '''Add an 'hour' column with the hour of the day of a datetime column.

    The column is added to df in place and df is returned.

    Parameters:
    -----------
    df: pd.DataFrame
    date_col: str, name of the datetime column; when omitted, the first
              datetime column reported by type_cols(df) is used

    Returns:
    --------
    df: the input pd.DataFrame with the additional 'hour' column

    Raises:
    -------
    ValueError: if date_col is omitted and df has no datetime column
    '''
    if not date_col:
        # Inspect the frame that was passed in, not an unrelated global.
        _, date_cols, _ = type_cols(df)
        if len(date_cols) == 0:
            raise ValueError(
                'no datetime column found in df; pass date_col explicitly')
        date_col = date_cols[0]

    df['hour'] = df[date_col].apply(lambda x: x.hour)
    return df
示例#9
0
def make_date_month(df, date_col=None):
    '''Add a 'month' column with the month number of a datetime column.

    The column is added to df in place and df is returned.

    Parameters:
    -----------
    df: pd.DataFrame
    date_col: str, name of the datetime column; when omitted, the first
              datetime column reported by type_cols(df) is used

    Returns:
    --------
    df: the input pd.DataFrame with the additional 'month' column

    Raises:
    -------
    ValueError: if date_col is omitted and df has no datetime column
    '''
    if not date_col:
        # Inspect the frame that was passed in, not an unrelated global.
        _, date_cols, _ = type_cols(df)
        if len(date_cols) == 0:
            raise ValueError(
                'no datetime column found in df; pass date_col explicitly')
        date_col = date_cols[0]

    df['month'] = df[date_col].apply(lambda x: x.month)
    return df
示例#10
0
def RF_imputation(df, fast=True):
    '''Impute missing numeric values with a tree-ensemble IterativeImputer.

    The IterativeImputer uses either ExtraTreesRegressor (fast=True) or
    RandomForestRegressor (fast=False) as estimator, both with 10 trees and
    random_state=0.

    Columns listed by imputation_heuristic_column (called with threshold 0.99)
    are removed before imputing. Categorical and datetime columns are not
    imputed: they are taken from the input frame and appended to the right of
    the imputed numeric columns.

    Parameters:
    -----------
    df: pd.DataFrame
    fast: boolean, if set to True, ExtraTreesRegressor is used in preference of
          RandomForestRegressor

    Returns:
    --------
    df_result: pd.DataFrame where the missing numeric values are imputed using
               the selected tree ensemble, followed by the non-imputed columns
    '''
    # Work on a copy so the helper calls below do not touch the caller's frame.
    df_new = make_missing_np_nan(df.copy())

    missing, unique = imputation_heuristic_column(df, 0.99)
    df_new = delete_cols(df_new, missing)
    df_new = delete_cols(df_new, unique)

    # categorical and datetime columns cannot be imputed, so are removed from
    # the imputation dataframe
    cat_cols, date_cols, num_cols = type_cols(df_new)
    df_new = df_new[num_cols]

    estimator_cls = ExtraTreesRegressor if fast else RandomForestRegressor
    imputer = IterativeImputer(random_state=0,
                               estimator=estimator_cls(n_estimators=10,
                                                       random_state=0))

    imputed = imputer.fit_transform(df_new)
    # Keep the original row labels: pd.concat(axis=1) aligns on the index, so a
    # fresh RangeIndex would misalign rows for frames with a non-default index.
    df_imputed = pd.DataFrame(imputed,
                              columns=df_new.columns,
                              index=df_new.index)

    # categorical and datetime columns are added back
    not_imputed_cols = cat_cols + date_cols
    df_result = pd.concat([df_imputed, df[not_imputed_cols]], axis=1)

    return df_result
示例#11
0
def weekend_or_not(df, date_col=None):
    '''Add a 'weekend' column indicating whether a date falls on the weekend.

    The weekday number of every entry (x.weekday()) is passed to the weekend
    helper. The column is added to df in place and df is returned; no
    temporary helper column is left behind on the frame.

    Parameters:
    -----------
    df: pd.DataFrame
    date_col: str, name of the datetime column; when omitted, the first
              datetime column reported by type_cols(df) is used

    Returns:
    --------
    df: the input pd.DataFrame with the additional 'weekend' column

    Raises:
    -------
    ValueError: if date_col is omitted and df has no datetime column
    '''
    if not date_col:
        # Inspect the frame that was passed in, not an unrelated global.
        _, date_cols, _ = type_cols(df)
        if len(date_cols) == 0:
            raise ValueError(
                'no datetime column found in df; pass date_col explicitly')
        date_col = date_cols[0]

    # Keep the weekday numbers in a local Series instead of a temporary
    # 'weekday_temp' column, which used to remain on the caller's frame.
    weekdays = df[date_col].apply(lambda x: x.weekday())
    df['weekend'] = weekdays.apply(lambda day: weekend(day))

    return df
示例#12
0
def make_date_weekday(df, date_col=None, temp=False):
    '''Add a column with the day of the week (monday = 0, sunday = 6).

    The column is added to df in place and df is returned.

    Parameters:
    -----------
    df: pd.DataFrame
    date_col: str, name of the datetime column; when omitted, the first
              datetime column reported by type_cols(df) is used
    temp: boolean, if set to True the result is stored in 'weekday_temp'
          instead of 'weekday'

    Returns:
    --------
    df: the input pd.DataFrame with the additional weekday column

    Raises:
    -------
    ValueError: if date_col is omitted and df has no datetime column
    '''
    if not date_col:
        # Inspect the frame that was passed in, not an unrelated global.
        _, date_cols, _ = type_cols(df)
        if len(date_cols) == 0:
            raise ValueError(
                'no datetime column found in df; pass date_col explicitly')
        date_col = date_cols[0]

    target_col = 'weekday_temp' if temp else 'weekday'
    df[target_col] = df[date_col].apply(lambda x: x.weekday())

    return df
示例#13
0
def DL_imputation(df, categorical=True):
    '''Impute missing numeric values using DataWig's SimpleImputer.complete.

    Columns listed by imputation_heuristic_column (called with threshold 0.99)
    are removed before imputing. Only the numeric columns are passed to the
    imputer; categorical and datetime columns are taken from the input frame
    and appended to the right of the imputed numeric columns.

    Parameters:
    -----------
    df: pd.DataFrame
    categorical: boolean, currently unused by this function; kept so existing
                 callers keep working

    Returns:
    --------
    df_result: pd.DataFrame where the missing numeric values are imputed using
               DataWig, followed by the non-imputed columns
    '''
    # Work on a copy so the helper calls below do not touch the caller's frame.
    df_new = make_missing_np_nan(df.copy())

    missing, unique = imputation_heuristic_column(df, 0.99)
    df_new = delete_cols(df_new, missing)
    df_new = delete_cols(df_new, unique)

    cat_cols, date_cols, num_cols = type_cols(df_new)
    df_new = df_new[num_cols]

    columns = df_new.columns

    # NOTE(review): '1' and '2' look like placeholder column names; only
    # complete() is called, on the whole frame. Confirm against the DataWig
    # API whether this instance configuration has any effect.
    imputer = simple_imputer.SimpleImputer(input_columns=['1'],
                                           output_column='2')
    imputed = imputer.complete(df_new)
    df_imputed = pd.DataFrame(imputed, columns=columns)

    not_imputed_cols = cat_cols + date_cols
    df_result = pd.concat([df_imputed, df[not_imputed_cols]], axis=1)

    return df_result
示例#14
0
def get_imputation_scores(df):
    '''Score every available imputation technique on the numeric columns of df.

    Each technique is evaluated through imputation_error, so the best
    performing imputation method can be selected by the caller. KNN is only
    evaluated when the frame holds fewer than 10000 cells (rows * columns).

    Parameters:
    -----------
    df: pd.DataFrame

    Returns:
    --------
    result: dict mapping each evaluated technique ('mean', 'median',
            'regression', optionally 'KNN', and 'RF') to the score returned by
            imputation_error (the NRMSE, per the original documentation)
    '''
    _, _, numeric_names = type_cols(df)
    numeric_frame = df[numeric_names]

    n_rows, n_cols = df.shape
    methods = ['mean', 'median', 'regression']
    if n_rows * n_cols < 10000:
        methods.append('KNN')
    methods.append('RF')

    return {
        method: imputation_error(numeric_frame, imputation=method)
        for method in methods
    }