示例#1
0
def compare_tree_options(seed,
                         data,
                         tree_types,
                         parent_dir,
                         options,
                         data_amount,
                         prepend_attempt=""):
    """Run every tree type twice, without and with tree options, and collect the stats.

    For each key in ``tree_types`` the matching builder from
    ``all_tree_options`` is called with the caller-supplied
    ``options['tree-options']`` (``{}`` when absent) to produce the tree
    options for that run. ``classifier_and_record`` is then called once with
    only the training options and once with the full options.

    Args:
        seed: Forwarded to ``classifier_and_record``.
        data: Forwarded to ``classifier_and_record``.
        tree_types: Iterable of keys looked up in ``all_tree_options``.
        parent_dir: Forwarded to ``classifier_and_record``.
        options: Dict read for ``'training-options'`` and ``'tree-options'``.
            It is no longer modified by this function.
        data_amount: Forwarded to ``classifier_and_record``.
        prepend_attempt: Forwarded to ``classifier_and_record``.

    Returns:
        Dict keyed by tree type. Each value is a dict with the keys
        ``'no-options'`` and ``'with-options'`` holding the respective
        ``classifier_and_record`` return values.
    """
    training_options_map = {
        'training-options': util.myget(options, 'training-options')
    }
    # Read the caller's tree options once. Previously each iteration stored the
    # builder's output back into options['tree-options'], so the next tree
    # type's builder received the previous builder's output instead of the
    # caller's settings.
    base_tree_options = util.myget(options, 'tree-options', {})

    combined_stats = {}
    for t in tree_types:
        build_tree_options = util.myget(all_tree_options, t)

        # Shallow copy so the caller's dict is left untouched.
        run_options = dict(options)
        run_options['tree-options'] = build_tree_options(base_tree_options)

        no_opts_stats = classifier_and_record(seed, data, t, parent_dir,
                                              data_amount,
                                              training_options_map,
                                              prepend_attempt)
        with_opts_stats = classifier_and_record(seed, data, t, parent_dir,
                                                data_amount, run_options,
                                                prepend_attempt)

        combined_stats[t] = {
            'no-options': no_opts_stats,
            'with-options': with_opts_stats,
        }

    return combined_stats
示例#2
0
def wine_net(seed, r, nn_type, options=None):
    """Train and evaluate nets on the wine-quality data and record the results.

    Args:
        seed: Passed to ``np.random.seed`` (once up front and again before
            every training run); also used in the attempt directory name and
            forwarded to ``write_results``.
        r: Stored as ``'perc_sample'`` in the options given to
            ``util.prep_data``.
        nn_type: ``'cat'`` selects the categorical CSV with ``'Reviews'`` as
            the target and ``'quality'`` excluded. Any other value selects the
            combined CSV with ``'quality'`` as the target.
        options: Optional dict. ``'class_types'`` (default ``['net']``) and
            ``'layers'`` (default ``[(5, 6)]``) are read from it.

    Returns:
        Dict mapping ``'<class type>_train'`` and ``'<class type>_test'`` to
        the results returned by ``train_and_test``. The keys do not include
        the layer, so with several layers only the last layer's results for a
        class type are kept.
    """
    # Compare by value: `is` tests object identity and only matches a string
    # literal by accident of interning.
    if nn_type == 'cat':
        filename = '../resources/winequality/winequality-combined-cat.csv'
        y_key = 'Reviews'
        split_options = {'exclude-keys': ['quality']}
    else:
        filename = '../resources/winequality/winequality-combined.csv'
        y_key = 'quality'
        split_options = {'exclude-keys': []}
    split_options['perc_sample'] = r

    export_dir = 'nn-exports/wine/'
    seed_dir = 'attempts/' + export_dir + str(seed) + '/'
    np.random.seed(seed)
    options = util.default(options, {})
    class_types = util.myget(options, 'class_types', ['net'])
    layers = util.myget(options, 'layers', [(5, 6)])
    data = util.get_data(filename)
    split_data = util.prep_data(data, y_key, split_options)

    net_results = {}
    for t in class_types:
        for layer in layers:
            # Re-seed before every run so each one starts from the same state.
            np.random.seed(seed)
            start_time = datetime.datetime.now()
            train_results, test_results = train_and_test(
                seed_dir, split_data, t, layer)
            net_results[t + '_train'] = train_results
            net_results[t + '_test'] = test_results
            total_time = datetime.datetime.now() - start_time
            write_results(seed, t, export_dir, train_results['error-perc'],
                          test_results['error-perc'], total_time,
                          len(split_data[0]), layer, nn_type + '-')
    # Return the collected results, matching breast_cancer_net; the function
    # previously ended with `pass` and discarded them.
    return net_results
示例#3
0
def breast_cancer_net(seed, r, options=None):
    """Train and evaluate nets on the breast-cancer data and record the results.

    Args:
        seed: Passed to ``np.random.seed`` (once up front and again before
            every training run); also used in the attempt directory name and
            forwarded to ``write_results``.
        r: Passed to ``util.prep_data`` as ``'perc_sample'``.
        options: Optional dict. ``'class_types'`` (default ``['net']``) and
            ``'layers'`` (default ``[(5, 6)]``) are read from it.

    Returns:
        Dict mapping ``'<class type>_train'`` and ``'<class type>_test'`` to
        the results returned by ``train_and_test``. The keys do not include
        the layer, so a later layer overwrites an earlier one for the same
        class type.
    """
    export_dir = 'nn-exports/breast_cancer/'
    seed_dir = ''.join(['attempts/', export_dir, str(seed), '/'])
    np.random.seed(seed)

    options = util.default(options, {})
    class_types = util.myget(options, 'class_types', ['net'])
    layers = util.myget(options, 'layers', [(5, 6)])

    raw = util.get_data('../resources/breast-cancer/wdbc.data.csv')
    with_split_feature = util.split_feature(raw, 'diagnosis')
    prep_options = {
        'exclude-keys': ['id', 'diagnosis_B', 'diagnosis'],
        'perc_sample': r
    }
    split_data = util.prep_data(with_split_feature, 'diagnosis_M',
                                prep_options)

    net_results = {}
    for class_type in class_types:
        train_key = class_type + '_train'
        test_key = class_type + '_test'
        for layer in layers:
            # Same seed before every run.
            np.random.seed(seed)
            began = datetime.datetime.now()
            outcome = train_and_test(seed_dir, split_data, class_type, layer)
            train_results, test_results = outcome
            net_results[train_key] = train_results
            net_results[test_key] = test_results
            elapsed = datetime.datetime.now() - began
            write_results(seed, class_type, export_dir,
                          train_results['error-perc'],
                          test_results['error-perc'], elapsed,
                          len(split_data[0]), layer)
    return net_results
示例#4
0
def classify(seed, data, tree_type, amount_data, options=None):
    """Seed NumPy, split ``data`` and fit a ``tree_type`` classifier on the training part.

    Args:
        seed: Passed to ``np.random.seed``.
        data: Forwarded to ``prep_data``.
        tree_type: Callable invoked with the tree options as keyword
            arguments; the result must provide ``fit``.
        amount_data: Forwarded to ``prep_data``.
        options: Read for ``'tree-options'`` (default ``{}``) and indexed
            directly for ``'training-options'``, so despite the ``None``
            default it has to be supplied.

    Returns:
        Tuple ``(classifier, x_train, x_test, y_train, y_test, n_test)`` where
        the last five items are what ``prep_data`` returned.
    """
    np.random.seed(seed)
    tree_kwargs = util.myget(options, 'tree-options', {})

    training_options = options['training-options']
    target_key = util.myget(training_options, 'y-key')
    split = prep_data(data, target_key, training_options, amount_data)
    x_train, x_test, y_train, y_test, n_test = split

    model = tree_type(**tree_kwargs)
    model.fit(x_train, y_train)

    return model, x_train, x_test, y_train, y_test, n_test
示例#5
0
def decision_classifier_options(options=None):
    """Build a fixed settings dict, filling two entries from ``options``.

    ``'max-leaf-nodes'`` and ``'max-depth'`` are looked up in ``options`` via
    ``util.myget`` and stored under ``'max_leaf_nodes'`` and ``'max_depth'``.
    All other entries are constants.

    Args:
        options: Source of the two optional limits; may be ``None`` as far as
            this function is concerned (handling is delegated to
            ``util.myget``).

    Returns:
        A new dict of settings. Presumably used as classifier keyword
        arguments; confirm against the caller.
    """
    leaf_limit = util.myget(options, 'max-leaf-nodes')
    depth_limit = util.myget(options, 'max-depth')
    # Key order is kept as-is.
    return dict(
        class_weight=None,
        criterion='gini',
        max_depth=depth_limit,
        max_leaf_nodes=leaf_limit,
        min_samples_leaf=5,
        min_samples_split=2,
        min_weight_fraction_leaf=0.0,
        presort=False,
        random_state=None,
        splitter='random',
    )
示例#6
0
def write_results(seed,
                  tree_type,
                  data_dir,
                  data_amount,
                  n_test,
                  error_train,
                  error_test,
                  time,
                  options=None,
                  prepend_attempt=""):
    """Append one result row to the combined CSV and to the per-tree CSV.

    Both file names start with ``data_dir + prepend_attempt + attempt``, where
    ``attempt`` is a name defined outside this function. The combined file
    ends in ``'-results-comb.csv'``; the per-tree file ends in
    ``'-results-<tree_type><suffix>.csv'``. The suffix is ``'-with-opts'``
    when ``options`` has a non-``None`` ``'tree-options'`` entry and
    ``'-no-opts'`` otherwise.

    Args:
        seed, data_amount, n_test, error_train, error_test, time: Written to
            the row unchanged.
        tree_type: String naming the tree; used in the row and the file name.
        data_dir: Path prefix for both files.
        options: Optional dict; its ``str()`` is the last column of the row.
        prepend_attempt: Inserted between ``data_dir`` and ``attempt`` in the
            file names.
    """
    has_tree_options = util.myget(options, 'tree-options',
                                  None) is not None
    sub = '-with-opts' if has_tree_options else '-no-opts'
    fields = [
        attempt, tree_type + sub, seed, data_amount, n_test, error_train,
        error_test, time,
        str(options)
    ]

    prefix = data_dir + prepend_attempt + attempt
    targets = (
        prefix + '-results-comb.csv',
        prefix + '-results-' + tree_type + sub + '.csv',
    )
    for path in targets:
        # newline='' lets the csv module control line endings itself; without
        # it, platforms that translate '\n' end up with an extra '\r' per row.
        with open(path, 'a', newline='') as f:
            csv.writer(f).writerow(fields)
示例#7
0
def prep_data(data, y_key, options=None, amount_data=0.67):
    """Split ``data`` into train/test features and targets.

    Args:
        data: Object indexed with ``.loc`` and exposing ``.columns``.
        y_key: Column used as the target; always left out of the features.
        options: Optional dict; ``'exclude-keys'`` (default ``[]``) lists
            further columns to leave out of the features.
        amount_data: Fraction kept for training; ``1 - amount_data`` is
            passed to ``train_test_split`` as ``test_size``.

    Returns:
        Tuple ``(x_train, x_test, y_train, y_test, len(x_train))``.
    """
    options = util.default(options, {})
    # Copy before adding y_key: appending to the list stored in `options`
    # would grow the caller's 'exclude-keys' on every call.
    exclude_keys = list(util.myget(options, 'exclude-keys', []))
    exclude_keys.append(y_key)

    x = data.loc[:, data.columns.difference(exclude_keys)]
    y = data.loc[:, y_key]

    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=1 - amount_data)

    # NOTE(review): the last item is the training-set size, yet callers bind
    # it to `n_test`; confirm whether len(x_test) was intended.
    return x_train, x_test, y_train, y_test, len(x_train)
示例#8
0
def classifier_and_record(seed,
                          data,
                          t,
                          parent_dir,
                          data_amount,
                          options=None,
                          prepend_attempt=""):
    """Create and train a classifier, then save its chart and performance stats.

    The classifier type is looked up in ``all_tree_types`` under ``t`` and
    trained via ``classify``. Output file names come from formatting the
    ``'image-name'`` template in ``options['training-options']`` with
    ``(kind, t, 'basic' | 'option', extension)``. ``'basic'`` is used when
    ``options`` has no ``'tree-options'``.

    Args:
        seed: Forwarded to ``classify`` and ``write_results``.
        data: Forwarded to ``classify``.
        t: Key identifying the tree type. No tree chart is saved for
            ``'ada'``.
        parent_dir: Forwarded to ``write_results`` as the output directory.
        data_amount: Forwarded to ``classify`` and ``write_results``.
        options: Dict read for ``'tree-options'`` and ``'training-options'``.
        prepend_attempt: Forwarded to ``write_results``.

    Returns:
        List ``[train_results, test_results]`` as returned by ``performance``
        for the training and test sets.
    """
    start_time = datetime.datetime.now()
    tree_type = util.myget(all_tree_types, t)
    tree_options = util.myget(options, 'tree-options')
    fn = 'basic' if tree_options is None else 'option'

    # Every output name is built from the same template; look it up once.
    name_template = util.myget(util.myget(options, 'training-options'),
                               'image-name')

    tree_filename = name_template.format('tree', t, fn, 'svg')
    classifier, x_train, x_test, y_train, y_test, n_test = classify(
        seed, data, tree_type, data_amount, options)
    # Compare by value: `is not` tests object identity and only works for
    # string literals by accident of interning.
    if t != 'ada':
        charts.save_tree_chart(tree_filename, classifier)

    train_results = performance(
        classifier, x_train, y_train,
        name_template.format('confusion-matrix-train', t, fn, 'csv'))
    test_results = performance(
        classifier, x_test, y_test,
        name_template.format('confusion-matrix-test', t, fn, 'csv'))

    total_time = datetime.datetime.now() - start_time
    write_results(seed, t, parent_dir, data_amount, n_test,
                  train_results['error-perc'], test_results['error-perc'],
                  total_time, options, prepend_attempt)
    return [train_results, test_results]