def compare_tree_options(seed, data, tree_types, parent_dir, options, data_amount, prepend_attempt=""):
    """Train every tree type with and without tree options and collect both results.

    For each entry of ``tree_types`` the classifier is trained twice through
    ``classifier_and_record``: once with only the ``'training-options'`` and
    once with the tree options produced by the matching builder looked up in
    ``all_tree_options``.

    Args:
        seed: Forwarded unchanged to ``classifier_and_record``.
        data: Forwarded unchanged to ``classifier_and_record``.
        tree_types: Iterable of keys that are looked up in ``all_tree_options``.
        parent_dir: Forwarded unchanged to ``classifier_and_record``.
        options: Mapping read for ``'training-options'`` and ``'tree-options'``.
            It is no longer modified by this function.
        data_amount: Forwarded unchanged to ``classifier_and_record``.
        prepend_attempt: Forwarded unchanged to ``classifier_and_record``.

    Returns:
        A dict mapping each tree type to
        ``{'no-options': <result>, 'with-options': <result>}``.
    """
    training_options_map = {
        'training-options': util.myget(options, 'training-options')
    }
    # Read the caller-supplied tree options once. Previously the built options
    # were written back into ``options`` inside the loop, so every builder
    # after the first received the previous builder's output instead of the
    # caller's settings.
    raw_tree_options = util.myget(options, 'tree-options', {})

    combined_stats = {}
    for t in tree_types:
        build_tree_options = util.myget(all_tree_options, t)
        # Shallow copy so the caller's mapping keeps its original
        # 'tree-options' entry.
        tree_run_options = dict(options)
        tree_run_options['tree-options'] = build_tree_options(raw_tree_options)

        no_opts_stats = classifier_and_record(seed, data, t, parent_dir,
                                              data_amount,
                                              training_options_map,
                                              prepend_attempt)
        with_opts_stats = classifier_and_record(seed, data, t, parent_dir,
                                                data_amount, tree_run_options,
                                                prepend_attempt)
        combined_stats[t] = {
            'no-options': no_opts_stats,
            'with-options': with_opts_stats,
        }
    return combined_stats
def wine_net(seed, r, nn_type, options=None):
    """Train and evaluate nets on the wine-quality data and log one row per run.

    Args:
        seed: Seed passed to ``np.random.seed`` (once up front and again before
            every training run) and used to build the per-seed attempt
            directory.
        r: Stored as ``'perc_sample'`` in the options handed to
            ``util.prep_data``.
        nn_type: ``'cat'`` selects the categorical CSV with ``'Reviews'`` as
            the target and ``'quality'`` excluded; any other value selects the
            combined CSV with ``'quality'`` as the target.
        options: Optional mapping; ``'class_types'`` (default ``['net']``) and
            ``'layers'`` (default ``[(5, 6)]``) are read from it.

    Returns:
        A dict with ``<class_type>_train`` / ``<class_type>_test`` entries.
        Entries are keyed by class type only, so with several layer settings
        the last one wins.
    """
    # Compare by value: ``is`` against a string literal tests object identity
    # and only works when the interpreter happens to intern both strings.
    if nn_type == 'cat':
        filename = '../resources/winequality/winequality-combined-cat.csv'
        y_key = 'Reviews'
        split_options = {'exclude-keys': ['quality']}
    else:
        filename = '../resources/winequality/winequality-combined.csv'
        y_key = 'quality'
        split_options = {'exclude-keys': []}
    split_options['perc_sample'] = r

    export_dir = 'nn-exports/wine/'
    seed_dir = 'attempts/' + export_dir + str(seed) + '/'
    np.random.seed(seed)

    options = util.default(options, {})
    class_types = util.myget(options, 'class_types', ['net'])
    layers = util.myget(options, 'layers', [(5, 6)])

    data = util.get_data(filename)
    split_data = util.prep_data(data, y_key, split_options)

    net_results = {}
    for t in class_types:
        for l in layers:
            # Re-seed so every (class type, layers) run starts from the same
            # random state.
            np.random.seed(seed)
            start_time = datetime.datetime.now()
            train_results, test_results = train_and_test(
                seed_dir, split_data, t, l)
            net_results[t + '_train'] = train_results
            net_results[t + '_test'] = test_results
            total_time = datetime.datetime.now() - start_time
            write_results(seed, t, export_dir, train_results['error-perc'],
                          test_results['error-perc'], total_time,
                          len(split_data[0]), l, nn_type + '-')
    # The results were previously built and then dropped; returning them
    # matches breast_cancer_net and does not affect callers that ignore the
    # return value.
    return net_results
def breast_cancer_net(seed, r, options=None):
    """Train and evaluate nets on the breast-cancer CSV and log one row per run.

    Args:
        seed: Seed passed to ``np.random.seed`` (once up front and again before
            every training run) and used to build the per-seed attempt
            directory.
        r: Stored as ``'perc_sample'`` in the options handed to
            ``util.prep_data``.
        options: Optional mapping; ``'class_types'`` (default ``['net']``) and
            ``'layers'`` (default ``[(5, 6)]``) are read from it.

    Returns:
        A dict with ``<class_type>_train`` / ``<class_type>_test`` entries.
        Entries are keyed by class type only, so with several layer settings
        the last one wins.
    """
    export_dir = 'nn-exports/breast_cancer/'
    seed_dir = 'attempts/' + export_dir + str(seed) + '/'
    np.random.seed(seed)

    options = util.default(options, {})
    class_types = util.myget(options, 'class_types', ['net'])
    layers = util.myget(options, 'layers', [(5, 6)])

    raw = util.get_data('../resources/breast-cancer/wdbc.data.csv')
    # presumably expands 'diagnosis' into diagnosis_M / diagnosis_B indicator
    # columns -- TODO confirm against util.split_feature
    expanded = util.split_feature(raw, 'diagnosis')
    prep_options = {
        'exclude-keys': ['id', 'diagnosis_B', 'diagnosis'],
        'perc_sample': r,
    }
    split_data = util.prep_data(expanded, 'diagnosis_M', prep_options)

    net_results = {}
    for class_type in class_types:
        for layer in layers:
            # Same random state at the start of every run.
            np.random.seed(seed)
            started = datetime.datetime.now()
            train_results, test_results = train_and_test(
                seed_dir, split_data, class_type, layer)
            net_results[class_type + '_train'] = train_results
            net_results[class_type + '_test'] = test_results
            elapsed = datetime.datetime.now() - started
            write_results(seed, class_type, export_dir,
                          train_results['error-perc'],
                          test_results['error-perc'], elapsed,
                          len(split_data[0]), layer)
    return net_results
def classify(seed, data, tree_type, amount_data, options=None):
    """Seed NumPy, split ``data`` and fit a ``tree_type`` estimator on the training part.

    Args:
        seed: Passed to ``np.random.seed`` before anything else runs.
        data: Handed to ``prep_data`` for splitting.
        tree_type: Callable invoked with the tree options as keyword arguments;
            the result must provide ``fit(x, y)``.
        amount_data: Forwarded to ``prep_data``.
        options: Mapping that must contain ``'training-options'`` (subscripted
            directly, so ``None`` fails here); ``'tree-options'`` is optional
            and defaults to ``{}``.

    Returns:
        ``(classifier, x_train, x_test, y_train, y_test, n_test)`` where the
        last five values are exactly what ``prep_data`` returned.
    """
    np.random.seed(seed)
    estimator_kwargs = util.myget(options, 'tree-options', {})

    training_options = options['training-options']
    target_key = util.myget(training_options, 'y-key')
    split = prep_data(data, target_key, training_options, amount_data)
    x_train, x_test, y_train, y_test, n_test = split

    model = tree_type(**estimator_kwargs)
    model.fit(x_train, y_train)
    return model, x_train, x_test, y_train, y_test, n_test
def decision_classifier_options(options=None):
    """Build the keyword-argument dict for a decision-tree classifier.

    Only ``'max-leaf-nodes'`` and ``'max-depth'`` are taken from ``options``
    (via ``util.myget``); every other entry is a fixed value.

    Args:
        options: Optional mapping holding the two tunable settings.

    Returns:
        A new dict of estimator keyword arguments.
    """
    # NOTE(review): if these kwargs go to scikit-learn's DecisionTreeClassifier,
    # 'presort' is not accepted by newer releases -- confirm the pinned version.
    leaf_limit = util.myget(options, 'max-leaf-nodes')
    depth_limit = util.myget(options, 'max-depth')
    return dict(
        class_weight=None,
        criterion='gini',
        max_depth=depth_limit,
        max_leaf_nodes=leaf_limit,
        min_samples_leaf=5,
        min_samples_split=2,
        min_weight_fraction_leaf=0.0,
        presort=False,
        random_state=None,
        splitter='random',
    )
def write_results(seed, tree_type, data_dir, data_amount, n_test, error_train, error_test, time, options=None, prepend_attempt=""):
    """Append one result row to the combined CSV and to the per-tree-type CSV.

    Both files live under ``data_dir`` and their names start with
    ``prepend_attempt`` followed by ``attempt`` (read from module scope,
    defined outside this function).

    Args:
        seed: Written to the row.
        tree_type: Written to the row (with the options suffix) and used in the
            per-type file name.
        data_dir: Directory prefix; concatenated as-is, so it must end with a
            path separator.
        data_amount: Written to the row.
        n_test: Written to the row.
        error_train: Written to the row.
        error_test: Written to the row.
        time: Written to the row.
        options: Optional mapping; a non-``None`` ``'tree-options'`` entry
            selects the ``-with-opts`` suffix, otherwise ``-no-opts``. Its
            ``str()`` is the last column of the row.
        prepend_attempt: Optional prefix for both file names.
    """
    has_tree_options = util.myget(options, 'tree-options', None) is not None
    sub = '-with-opts' if has_tree_options else '-no-opts'
    fields = [
        attempt, tree_type + sub, seed, data_amount, n_test, error_train,
        error_test, time,
        str(options)
    ]

    prefix = data_dir + prepend_attempt + attempt
    targets = (
        prefix + '-results-comb.csv',
        prefix + '-results-' + tree_type + sub + '.csv',
    )
    for path in targets:
        # newline='' lets the csv module control line endings; without it,
        # platforms that translate '\n' emit a blank line after every row.
        with open(path, 'a', newline='') as f:
            csv.writer(f).writerow(fields)
def prep_data(data, y_key, options=None, amount_data=0.67):
    """Split ``data`` into train/test features and targets.

    Args:
        data: Table indexed with ``.loc`` and exposing ``.columns``
            (presumably a pandas DataFrame -- confirm against callers).
        y_key: Column used as the target; always left out of the features.
        options: Optional mapping; ``'exclude-keys'`` lists further columns to
            leave out of the features. It is not modified.
        amount_data: Fraction kept for training; ``1 - amount_data`` is passed
            to ``train_test_split`` as ``test_size``.

    Returns:
        ``(x_train, x_test, y_train, y_test, len(x_train))``. The last element
        is the number of training rows, although ``classify`` in this file
        binds it to a variable named ``n_test``.
    """
    options = util.default(options, {})
    # Build a new list instead of appending in place: the list from ``options``
    # belongs to the caller and used to gain another ``y_key`` on every call.
    exclude_keys = list(util.myget(options, 'exclude-keys', [])) + [y_key]

    x = data.loc[:, data.columns.difference(exclude_keys)]
    y = data.loc[:, y_key]
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=1 - amount_data)
    return x_train, x_test, y_train, y_test, len(x_train)
def classifier_and_record(seed, data, t, parent_dir, data_amount, options=None, prepend_attempt=""):
    """Create and train a classifier, then save charts and stats.

    Args:
        seed: Forwarded to ``classify`` and ``write_results``.
        data: Forwarded to ``classify``.
        t: Tree-type key looked up in ``all_tree_types``; also used in output
            file names. No tree chart is saved when it equals ``'ada'``.
        parent_dir: Forwarded to ``write_results`` as the output directory.
        data_amount: Forwarded to ``classify`` and ``write_results``.
        options: Mapping with ``'training-options'`` (whose ``'image-name'`` is
            a format template taking four positional fields) and an optional
            ``'tree-options'`` entry.
        prepend_attempt: Forwarded to ``write_results``.

    Returns:
        ``[train_results, test_results]`` as returned by ``performance`` for
        the training and test sets.
    """
    start_time = datetime.datetime.now()
    tree_type = util.myget(all_tree_types, t)

    tree_options = util.myget(options, 'tree-options')
    # Output names distinguish runs without tree options from runs with them.
    fn = 'basic' if tree_options is None else 'option'
    image_name = util.myget(
        util.myget(options, 'training-options'), 'image-name')
    tree_filename = image_name.format('tree', t, fn, 'svg')

    classifier, x_train, x_test, y_train, y_test, n_test = classify(
        seed, data, tree_type, data_amount, options)

    # Compare by value: ``is not`` against a string literal tests object
    # identity and only works when both strings happen to be interned.
    if t != 'ada':
        charts.save_tree_chart(tree_filename, classifier)

    train_results = performance(
        classifier, x_train, y_train,
        image_name.format('confusion-matrix-train', t, fn, 'csv'))
    test_results = performance(
        classifier, x_test, y_test,
        image_name.format('confusion-matrix-test', t, fn, 'csv'))

    total_time = datetime.datetime.now() - start_time
    write_results(seed, t, parent_dir, data_amount, n_test,
                  train_results['error-perc'], test_results['error-perc'],
                  total_time, options, prepend_attempt)
    return [train_results, test_results]