Пример #1
0
def main():
    """Run the label-noise experiment over every configured dataset.

    For each dataset returned by ``ConfigHelper.get_datasets()`` the metrics
    are reset, the data is read and split into features and labels, and then
    ``ConfigHelper.nb_executions`` executions are performed. Every execution
    draws a train/test split, injects noise into the training labels at each
    level in ``ConfigHelper.noise_levels`` and, for every entry yielded by
    ``ConfigHelper.get_classifiers()``, fits the chosen classifier, scores it
    on the test split and appends one row to ``MetricsHelper.metrics``.

    After all executions of a dataset the collected metrics are stored under
    the name ``"final_" + set_name``. The elapsed wall-clock time of each
    execution is printed.
    """

    def _rows(source, idxs):
        # All four subsets are selected without copying, as a view request.
        return DataHelper.select_rows(source, idxs, copy=False)

    for set_name in ConfigHelper.get_datasets():
        # Metrics are accumulated per dataset, so start from a clean slate.
        MetricsHelper.reset_metrics()

        dataset, target = IOHelper.read_dataset(set_name)
        features, targets = DataHelper.extract_feature_labels(dataset, target)
        DataHelper.create_label_mapping(targets)
        feature_cap = DataHelper.calculate_max_nb_features(features)

        for run in range(ConfigHelper.nb_executions):
            started_at = time.time()
            print("Execution " + str(run))

            fit_idxs, eval_idxs = DataHelper.split_in_sets(features, targets)

            fit_X = _rows(features, fit_idxs)
            fit_y = _rows(targets, fit_idxs)
            eval_X = _rows(features, eval_idxs)
            eval_y = _rows(targets, eval_idxs)

            for level in ConfigHelper.noise_levels:
                corrupted_idxs, corrupted_y = DataHelper.insert_noise(
                    fit_y, level)

                for clf_name, base_clf, cleaning in \
                        ConfigHelper.get_classifiers():
                    selection = ConfigHelper.choose_algorithm(
                        base_clf, cleaning, fit_X, corrupted_y,
                        corrupted_idxs, feature_cap)

                    # The result is consumed positionally: rate, threshold,
                    # training X, training y, classifier, and the two
                    # filtered counts.
                    (rate, threshold, picked_X, picked_y, model,
                     true_filtered, false_filtered) = [
                         selection[pos] for pos in range(7)]

                    model.fit(picked_X, picked_y)
                    predicted = model.predict(eval_X)
                    score = MetricsHelper.calculate_error_score(
                        eval_y, predicted)

                    row = [set_name, run, level, clf_name, rate, threshold,
                           score, true_filtered, false_filtered]
                    MetricsHelper.metrics.append(row)

            elapsed = time.time() - started_at
            print(str(elapsed))

        frame = MetricsHelper.convert_metrics_to_frame()
        IOHelper.store_results(frame, "final_" + set_name)
import random as rnd
from numpy import random as rnp

# Seed the stdlib `random` and NumPy global random generators with fixed
# values before any project module is imported.
rnd.seed(2789)
rnp.seed(3056)
########################################

from io_helper import IOHelper
from data_helper import DataHelper
from config_helper import ConfigHelper
from metrics_helper import MetricsHelper

if __name__ == "__main__":

    train_data = IOHelper.read_dataset("train")
    train_X, train_y = DataHelper.extract_feature_labels(train_data)

    predef = ConfigHelper.use_predefined_cols

    DataHelper.add_nan_indication_cols(train_X)
    DataHelper.remove_high_nan_rate_cols(train_X, predef)
    DataHelper.remove_small_variance_cols(train_X, predef)

    train_y = DataHelper.remove_high_nan_rate_rows(train_X, train_y)
    DataHelper.fill_missing_data(train_X, is_train=True)
    train_X = DataHelper.split_categorical_cols(train_X, is_train=True)
    DataHelper.scale_continuous_cols(train_X, is_train=True)
    train_X = DataHelper.select_best_features(train_X,
                                              None,
                                              train_y,
                                              ConfigHelper.max_nb_features,
from numpy import random as rnp
# Seed the `rnd` and `rnp` (numpy.random) global generators with fixed values.
rnd.seed(2789)
rnp.seed(3056)
########################################

import time

from io_helper import IOHelper
from data_helper import DataHelper
from config_helper import ConfigHelper
from metrics_helper import MetricsHelper

if __name__ == "__main__":

    data = IOHelper.read_dataset("train")
    feats, labels = DataHelper.extract_feature_labels(data)

    predef = ConfigHelper.use_predefined_cols

    DataHelper.add_nan_indication_cols(feats)
    DataHelper.remove_high_nan_rate_cols(feats, predef)
    DataHelper.remove_small_variance_cols(feats, predef)

    for e in xrange(ConfigHelper.nb_executions):
        print "Execution: " + str(e + 1)

        MetricsHelper.reset_metrics()

        for f, (train_idxs,
                val_idxs) in enumerate(ConfigHelper.k_fold_cv(labels)):
            start_time = time.time()