def test_clean_df_y_nan(self):
    """Compare clean_dataframe output with the expected frame for a NaN target.

    The input has a NaN in the target column 'fourth' on its first row; the
    expected frame holds only the remaining two rows, as floats.
    """
    column_names = ['first', 'second', 'third', 'fourth']
    raw = pd.DataFrame(
        [[1, 2, 3, np.nan], [5, 6, 7, 8], [9, 10, 11, 12]],
        columns=column_names)
    expected = pd.DataFrame(
        [[5.0, 6.0, 7.0, 8.0], [9.0, 10.0, 11.0, 12.0]],
        columns=column_names)

    cleaned, _sample_limit = clean_dataframe(raw, 'fourth', percent_data=1)

    # Indexes are reset on both sides so only the cell values are compared,
    # not whatever row labels survive the cleaning.
    actual_dict = cleaned.reset_index(drop=True).to_dict()
    expected_dict = expected.reset_index(drop=True).to_dict()
    assert actual_dict == expected_dict
def test_clean_df_y_x_inf(self):
    """Compare clean_dataframe output with the expected frame for inf/NaN input.

    The input has an inf in the target column 'fourth' (first row) and NaNs
    in the feature columns 'second' and 'third'. The expected frame omits the
    first row, carries 10.5 in place of the missing 'third' value and has an
    extra boolean 'third_was_null' column placed before the target.
    """
    df = pd.DataFrame(
        [[1, np.nan, 3, np.inf],
         [5, 6, 7, 8],
         [9, 10, np.nan, 12],
         [13, 14, 15, 16]],
        columns=['first', 'second', 'third', 'fourth'])
    df_clean_sample = pd.DataFrame(
        [[5.0, 6.0, 7.0, False, 8.0],
         [9.0, 10.0, 10.5, True, 12.0],
         [13.0, 14.0, 15.0, False, 16.0]],
        columns=['first', 'second', 'third', 'third_was_null', 'fourth'])
    df_clean_sample = df_clean_sample.reset_index(drop=True)

    df_cleaned, _ = clean_dataframe(df, 'fourth', percent_data=1)
    # Sort before resetting so the row order is deterministic for comparison.
    df_cleaned = df_cleaned.sort_index().reset_index(drop=True)

    # Printed so the frames appear in the captured output when the
    # dictionary comparison below fails.
    print(df)
    print(df_clean_sample)
    print(df_cleaned)

    # NOTE(review): this assertion used to compare df_cleaned with itself and
    # therefore could never fail. The expected values (e.g. the 10.5 fill)
    # have not been verified against clean_dataframe -- confirm them if this
    # comparison fails.
    assert df_cleaned.to_dict() == df_clean_sample.to_dict()
def test_clean_dataframe(self):
    """Compare clean_dataframe output with the expected frame, target 'first'.

    The input has an inf in 'second' (first row) and a NaN in 'third' (sixth
    row). The expected frame carries 3.5 and 4.4 in those cells, adds the
    boolean columns 'second_was_inf' and 'third_was_null', and places the
    target column 'first' last.
    """
    # np.nan is used instead of the np.NAN alias, which NumPy 2.0 removed.
    df1 = pd.DataFrame(
        [[1, np.inf, 3],
         [2, 3, 4],
         [3, 4, 5],
         [2, 3, 4],
         [3, 4, 5],
         [2, 3, np.nan],
         [3, 4, 5],
         [2, 3, 4],
         [3, 4, 5],
         [2, 3, 4],
         [3, 4, 5]],
        columns=['first', 'second', 'third'])
    y_var_name = 'first'

    expected = pd.DataFrame(
        [[3.5, 3, True, False, 1],
         [3, 4, False, False, 2],
         [4, 5, False, False, 3],
         [3, 4, False, False, 2],
         [4, 5, False, False, 3],
         [3, 4.4, False, True, 2],
         [4, 5, False, False, 3],
         [3, 4, False, False, 2],
         [4, 5, False, False, 3],
         [3, 4, False, False, 2],
         [4, 5, False, False, 3]],
        columns=[
            'second', 'third', 'second_was_inf', 'third_was_null', 'first'
        ]).reset_index(drop=True)

    # percent_data=None is what this test has always passed (a local
    # `percent_data = 1` existed but was never used).
    # NOTE(review): confirm None, rather than 1, is the intended value.
    df2, _ = clean_dataframe(df1, y_var_name, percent_data=None)

    self.assertTrue(df2.reset_index(drop=True).equals(expected))
corr_matrix=True, # scatter_matrix=True, #doesn't work IF categorical values of 4 groups in X. bootstrap_coefs=True, partial_dep=True, plot_alphas=True, plot_predicted_vs_actuals_flag=True, plot_coefs_flag=True, feature_importances=True, actual_vs_predicted=True, plot_predicteds_vs_actuals=True, residuals=True, univariates=True, compare_models=True, ROC=True, ) # autoregression.compare_predictions(iris_df,'sepal_length', # feature_importances=False # ) from autoregression import clean_dataframe df_titanic_cleaned, sample_limit = clean_dataframe(df_titanic, 'Survived', percent_data=1) df_titanic_transformed = pipeline.transform(df_titanic_cleaned) print(df_titanic_transformed.columns) print(fit_models) print(fit_models[0].predict_proba(df_titanic_transformed))