###############################################################################
from io_helper import IOHelper
from statistics_helper import StatisticsHelper
from data_helper import DataHelper
from config_helper import ConfigHelper


def _analyse_columns(frame, dataset_name, suffix=""):
    """Compute, plot and store statistics for every column of *frame*.

    For each column the feature statistics are obtained from
    ``StatisticsHelper.get_feature_stats``, the distribution is drawn with
    ``StatisticsHelper.draw_feature_distribution`` and the statistics are
    handed to ``IOHelper.store_analysis``.

    Args:
        frame: Dataset exposing ``.columns`` and column access by name.
        dataset_name: Name passed through to ``IOHelper.store_analysis``.
        suffix: Text appended to the column name when labelling the plot and
            the stored analysis. An empty suffix leaves the name untouched.
    """
    for column in frame.columns:
        values = frame[column]
        # Only concatenate when there is something to append, so the
        # un-suffixed pass uses the column name exactly as it is.
        label = column + suffix if suffix else column
        feature_stats = StatisticsHelper.get_feature_stats(values)
        StatisticsHelper.draw_feature_distribution(values, label)
        IOHelper.store_analysis(feature_stats, label, dataset_name)


if __name__ == "__main__":
    dataset_name = ConfigHelper.analysis_dataset
    train_data = IOHelper.read_dataset(dataset_name)

    # First pass: the dataset exactly as it was read.
    _analyse_columns(train_data, dataset_name)

    # The return value is discarded and train_data is re-read below, so the
    # helper presumably fills the gaps in place -- TODO confirm.
    DataHelper.fill_missing_data(train_data, is_train=True)

    # Second pass: same analysis after filling, stored under "<col>_filled".
    _analyse_columns(train_data, dataset_name, suffix="_filled")
from config_helper import ConfigHelper
from metrics_helper import MetricsHelper

# NOTE(review): IOHelper and DataHelper are used below but no import for them
# is visible on this line, and MetricsHelper is imported but not used in the
# visible part. The script may be truncated on both ends -- confirm against
# the full file.

if __name__ == "__main__":
    # ---- Training set -------------------------------------------------------
    train_data = IOHelper.read_dataset("train")
    train_X, train_y = DataHelper.extract_feature_labels(train_data)
    # Flag forwarded to the two column-removal steps below.
    predef = ConfigHelper.use_predefined_cols
    # The next three calls discard their return values, so they presumably
    # modify train_X in place -- TODO confirm in DataHelper.
    DataHelper.add_nan_indication_cols(train_X)
    DataHelper.remove_high_nan_rate_cols(train_X, predef)
    DataHelper.remove_small_variance_cols(train_X, predef)
    # Only train_y is rebound here; train_X is presumably filtered in place so
    # that features and labels stay aligned -- verify.
    train_y = DataHelper.remove_high_nan_rate_rows(train_X, train_y)
    DataHelper.fill_missing_data(train_X, is_train=True)
    # Unlike the steps above, this one returns the frame that is used from
    # here on.
    train_X = DataHelper.split_categorical_cols(train_X, is_train=True)
    DataHelper.scale_continuous_cols(train_X, is_train=True)
    # NOTE(review): the meaning of the second positional argument (None) is
    # not visible from here -- check select_best_features' signature.
    train_X = DataHelper.select_best_features(train_X, None, train_y, ConfigHelper.max_nb_features, is_train=True)

    # ---- Test set -----------------------------------------------------------
    test_X = IOHelper.read_dataset("test")
    DataHelper.add_nan_indication_cols(test_X)
    # A literal True is passed here instead of `predef`; presumably this makes
    # the test set reuse the column choices made for training -- confirm.
    DataHelper.remove_high_nan_rate_cols(test_X, True)
    DataHelper.remove_small_variance_cols(test_X, True)
    DataHelper.fill_missing_data(test_X, is_train=False)