示例#1
0
    def create_dataset_remove_corr_feats(self, target_var, filter_val,
                                         corr_threshold, feats_ignore):
        """Drop selected highly correlated columns and store the result.

        A copy of ``self.df`` is one-hot encoded with ``pd.get_dummies`` and
        its column names are mapped through ``f.remove_space``.  The output
        of ``f.corr_feats`` is wrapped in a DataFrame whose column 0 is read
        as a feature and column 1 as an iterable of features correlated with
        it.  A correlated feature is selected for removal when
        ``len(f.match_strings(feature, correlated))`` exceeds
        ``f.min_len_col`` of ``self.df`` without the ``feats_ignore``
        columns.  The encoded frame minus the selected columns is stored in
        ``self.dim_red_by_corr_df``; nothing is returned.

        Args:
            target_var: Column label that must exist in the encoded frame
                (checked before the column names are normalised).  It is
                not used otherwise.
            filter_val: Kept for interface compatibility.  The filtered
                subset it used to select was never consumed.
            corr_threshold: Threshold forwarded to ``f.corr_feats``.
            feats_ignore: Column label, or list of labels, of ``self.df``
                excluded before calling ``f.min_len_col``.

        Raises:
            KeyError: If ``target_var`` is not a column of the encoded
                frame, or if a label to drop does not exist.
        """
        df = self.df.copy()
        x_df_dum = pd.get_dummies(df)

        # The original built an unused ``target_var == filter_val`` subset
        # here; only its side effect (failing on an unknown target column)
        # is preserved.
        if target_var not in x_df_dum.columns:
            raise KeyError(target_var)

        x_df_dum.columns = x_df_dum.columns.map(f.remove_space)

        highly_corr = pd.DataFrame(
            f.corr_feats(x_df_dum, x_df_dum.columns, corr_threshold))
        print('Highly correlated features description more than pearsonsr',
              corr_threshold)

        # Loop-invariant: ``df`` is not modified below, so compute it once.
        # Assumes f.min_len_col has no side effects -- TODO confirm.
        min_len = f.min_len_col(df.drop(columns=feats_ignore))

        corr_lst = []
        # NOTE(review): the last row is skipped, exactly as the original
        # ``range(len(index) - 1)`` did.  Confirm this is intended and not
        # an off-by-one before changing it.
        for row in highly_corr.iloc[:-1].itertuples(index=False, name=None):
            feat, correlated_feats = row[0], row[1]
            for candidate in correlated_feats:
                if len(f.match_strings(feat, candidate)) > min_len:
                    corr_lst.append(candidate)

        # Order-preserving de-duplication that also works when nothing was
        # selected (``pd.DataFrame([])[0]`` raised KeyError in that case).
        corr_lst = list(dict.fromkeys(corr_lst))
        print(corr_lst)

        # ``drop`` returns a new frame, so ``x_df_dum`` needs no extra copy.
        self.dim_red_by_corr_df = x_df_dum.drop(columns=corr_lst)
# Rows of the dummy-encoded frame whose TARGET column equals 1.
x_df_Default_dum = x_df_dum[x_df_dum['TARGET'].eq(1)]


# In[11]:


# Pairwise column correlations, computed twice: once over the whole
# dummy-encoded frame and once over the rows kept in x_df_Default_dum.
x_corr = x_df_dum.corr(method='pearson')
x_corr_default = x_df_Default_dum.corr(method='pearson')


# In[12]:


# Threshold forwarded to f.corr_feats; its result is wrapped in a DataFrame.
corr_threshold = 0.6
get_highly_corr_feats = pd.DataFrame(
    f.corr_feats(x_df_dum, x_df_dum.columns, corr_threshold)
)
print('Highly correlated features description more than pearsonsr',
      corr_threshold)
# Bare expression: presumably left over from a notebook cell, where it
# displays the frame; it has no effect when run as a plain script.
get_highly_corr_feats


# ##### EXPLORATORY DATA ANALYSIS

# ##### TARGET

# In[13]:


# Correlation of every column with TARGET, scaled by 100 and sorted from
# highest to lowest; only the entries greater than 5 are kept.
val = x_corr['TARGET'].sort_values(ascending=False).mul(100)
val = val[val > 5]