Пример #1
0
###############################################################################

from io_helper import IOHelper
from statistics_helper import StatisticsHelper
from data_helper import DataHelper
from config_helper import ConfigHelper

def _analyze_features(data, dataset_name, name_suffix=None):
    """Compute, plot and store per-column statistics for every column of *data*.

    For each column name in ``data.columns`` the column is fetched with
    ``data[col]``, passed to ``StatisticsHelper.get_feature_stats`` and
    ``StatisticsHelper.draw_feature_distribution``, and the resulting stats
    are handed to ``IOHelper.store_analysis``.

    Args:
        data: Table-like object exposing ``.columns`` and ``data[col]``
            (presumably a pandas DataFrame -- confirm against IOHelper).
        dataset_name: Dataset identifier forwarded to
            ``IOHelper.store_analysis``.
        name_suffix: Optional string appended to each column name to form
            the label used for the plot and the stored analysis. When
            ``None`` the column name is used unchanged.
    """
    for col in data.columns:
        series = data[col]

        # Only concatenate when a suffix is requested, so that the plain run
        # passes the column name through untouched (whatever its type).
        label = col if name_suffix is None else col + name_suffix

        stats = StatisticsHelper.get_feature_stats(series)
        StatisticsHelper.draw_feature_distribution(series, label)
        IOHelper.store_analysis(stats, label, dataset_name)


if __name__ == "__main__":

    dataset_name = ConfigHelper.analysis_dataset

    train_data = IOHelper.read_dataset(dataset_name)

    # First pass: analyse the dataset exactly as it was read.
    _analyze_features(train_data, dataset_name)

    # The return value of fill_missing_data is ignored, so the second pass
    # only differs if it modifies train_data in place -- presumably it does;
    # confirm against DataHelper.
    DataHelper.fill_missing_data(train_data, is_train=True)

    # Second pass: same analysis, stored under "<column>_filled".
    _analyze_features(train_data, dataset_name, name_suffix="_filled")
from config_helper import ConfigHelper
from metrics_helper import MetricsHelper

# Script entry point: preprocess the "train" dataset, then start applying the
# same preprocessing steps to the "test" dataset.
# NOTE(review): the statement order below is presumably significant (each
# DataHelper step operates on the output of the previous ones) -- do not
# reorder without checking DataHelper.
if __name__ == "__main__":

    # Load the training set and separate it into features and labels.
    train_data = IOHelper.read_dataset("train")
    train_X, train_y = DataHelper.extract_feature_labels(train_data)

    # Config flag forwarded to the column-removal helpers for the train set.
    predef = ConfigHelper.use_predefined_cols

    # These three calls discard their return values, so they are presumably
    # in-place operations on train_X -- TODO confirm in DataHelper.
    DataHelper.add_nan_indication_cols(train_X)
    DataHelper.remove_high_nan_rate_cols(train_X, predef)
    DataHelper.remove_small_variance_cols(train_X, predef)

    # Only train_y is rebound here; train_X is presumably filtered in place
    # so that features and labels stay aligned -- TODO confirm.
    train_y = DataHelper.remove_high_nan_rate_rows(train_X, train_y)
    DataHelper.fill_missing_data(train_X, is_train=True)
    # split_categorical_cols and select_best_features return a new object
    # that replaces train_X; scale_continuous_cols' result is not used.
    train_X = DataHelper.split_categorical_cols(train_X, is_train=True)
    DataHelper.scale_continuous_cols(train_X, is_train=True)
    # The second positional argument is None in this training call; its
    # meaning is not visible here (presumably the test features -- verify
    # against select_best_features' signature).
    train_X = DataHelper.select_best_features(train_X,
                                              None,
                                              train_y,
                                              ConfigHelper.max_nb_features,
                                              is_train=True)

    # Load the test set; no label extraction is performed on it here.
    test_X = IOHelper.read_dataset("test")

    # Same column steps as for the train set, but the second argument is
    # hard-coded to True instead of ConfigHelper.use_predefined_cols.
    DataHelper.add_nan_indication_cols(test_X)
    DataHelper.remove_high_nan_rate_cols(test_X, True)
    DataHelper.remove_small_variance_cols(test_X, True)

    # Fill missing values in the test set with is_train=False.
    DataHelper.fill_missing_data(test_X, is_train=False)