def nbest(metric, folder):
    # sweep over N = 500, 600, ..., 4900 best-scoring features
    nbest_coefs = np.arange(500, 4999, 100)
    metric = sorted(metric, key=lambda x: x[0])
    metric_f1 = []
    metric_n_feat = []
    print(folder + ': classifying N BEST')
    timer.set_new()
    for i in range(len(nbest_coefs)):
        frame.progress((i + 1) / len(nbest_coefs))
        # indexes of the N features with the highest scores
        indexes_metric = [x[1] for x in metric[-nbest_coefs[i]:]]
        metric_data = trim.trim_data(data, indexes_metric)
        metric_data_valid = trim.trim_data(data_valid, indexes_metric)
        metric_f1.append(
            metrics.f1_score(labels_valid, classify(metric_data, metric_data_valid, labels)))
        metric_n_feat.append(len(indexes_metric))
    print(' DONE in ' + timer.get_diff_str())
    dump.dump_object(nbest_coefs, folder + '/nbest/svm/coefs.dump')
    dump.dump_object(metric_f1, folder + '/nbest/svm/f1.dump')
    dump.dump_object(metric_n_feat, folder + '/nbest/svm/feat.dump')
    # pick the N with the best validation F1 and dump its feature indexes
    metric_cls = [(nbest_coefs[i], metric_f1[i]) for i in range(len(nbest_coefs))]
    metric_coef_max = max(metric_cls, key=lambda x: x[1])[0]
    indexes_metric = [x[1] for x in metric[-metric_coef_max:]]
    # for the Euler diagram
    dump.dump_object(indexes_metric, folder + '/nbest/max/indexes.dump')
def information_gain(data, labels, dumped=False):
    import numpy as np
    import util.dump as dump

    def get_features(data_set):
        # transpose the data set: one list per feature column
        n = len(data_set[0])
        return [[row[j] for row in data_set] for j in range(n)]

    def possibilities(feature):
        # empirical probabilities of the (non-zero-count) feature values
        counts = np.bincount(feature)
        return np.asarray(counts[np.nonzero(counts)] / float(len(feature)))

    def entropy(feature):
        p = possibilities(feature)
        return -np.sum(p * np.log2(p))

    def spec_cond_entropy(x, y, xi):
        # H(y | x = xi)
        new_y = [y[i] for i in range(len(y)) if x[i] == xi]
        return entropy(new_y)

    def cond_entropy(x, y):
        # H(y | x); assumes the values of x are the contiguous integers 0..k
        p = possibilities(x)
        return sum([p[xi] * spec_cond_entropy(x, y, xi) for xi in range(len(p))])

    def cond_entropy_full(x, y):
        from util.frame import progress
        print('Information gain: computing conditional entropy:')
        feat_len = len(x)
        result = []
        for i in range(feat_len):
            result.append(cond_entropy(x[i], y))
            if i % 10 == 0:
                progress((i + 1) / feat_len)
        progress(1)
        return np.asarray(result)

    features = get_features(data)
    if not dumped:
        h_y_x = cond_entropy_full(features, labels)
        dump.dump_object(h_y_x, 'ig/hyx.dump')
    else:
        h_y_x = dump.load_object('ig/hyx.dump')
    # IG(x_i) = H(y) - H(y | x_i), paired with the feature index
    info_gain = entropy(labels) - h_y_x
    return [(info_gain[i], i) for i in range(len(info_gain))]
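# Hand-checked toy computation (not part of the repo's data) illustrating the
# quantity computed above, IG(x) = H(y) - H(y | x):
#   x = [0, 0, 1, 1], y = [0, 1, 1, 1]
#   H(y)       = -(1/4) * log2(1/4) - (3/4) * log2(3/4) ≈ 0.811
#   H(y | x=0) = 1.0 (y is split 50/50),  H(y | x=1) = 0.0 (y is constant)
#   H(y | x)   = 0.5 * 1.0 + 0.5 * 0.0   = 0.5
#   IG(x)      = 0.811 - 0.5             ≈ 0.311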
def pearson(data, labels, dumped=False):
    import numpy as np
    import util.dump as dump
    import math
    import warnings
    import scipy.stats as stats
    warnings.filterwarnings('ignore')

    def get_features(data_set):
        # transpose the data set: one list per feature column
        n = len(data_set[0])
        return [[row[j] for row in data_set] for j in range(n)]

    def feature_correlation(x, y):
        # Pearson's r = cov(x, y) / (std(x) * std(y))
        n = range(len(x))
        x_avg = sum(x) / len(x)
        y_avg = sum(y) / len(y)
        cov = sum([(x[i] - x_avg) * (y[i] - y_avg) for i in n])
        x_dev = math.sqrt(sum([(x[i] - x_avg) ** 2 for i in n]))
        y_dev = math.sqrt(sum([(y[i] - y_avg) ** 2 for i in n]))
        return cov / (x_dev * y_dev)

    def correlation(x, y):
        from util.frame import progress
        print('Pearson: computing correlation coefficients:')
        feat_len = len(x)
        result = []
        for i in range(feat_len):
            result.append(feature_correlation(x[i], y))
            if i % 10 == 0:
                progress((i + 1) / feat_len)
        progress(1)
        return np.asarray(result)

    features = get_features(data)
    if not dumped:
        ro = correlation(features, labels)
        dump.dump_object(ro, 'pearson/ro.dump')
    else:
        ro = dump.load_object('pearson/ro.dump')
    # two-sided p-value of the t-test for r with v = n - 2 degrees of freedom:
    # t = r * sqrt(v) / sqrt(1 - r^2)
    v = len(labels) - 2
    p = []
    for i in range(len(ro)):
        t = ro[i] * math.sqrt(v) / math.sqrt(1 - ro[i] ** 2)
        p.append((stats.t.sf(np.abs(t), v) * 2, i))
    return p
def spearman(data, labels, dumped=False):
    import numpy as np
    import util.dump as dump
    import math
    import warnings
    import scipy.stats as stats
    warnings.filterwarnings('ignore')

    def get_features(data_set):
        # transpose the data set: one list per feature column
        n = len(data_set[0])
        return [[row[j] for row in data_set] for j in range(n)]

    def feature_correlation(x, y):
        # Spearman's rho = 1 - 6 * sum(d^2) / (n * (n^2 - 1)),
        # where d is the difference between the ranks of x and y (ties get the max rank)
        n = len(x)
        rank_x = np.asarray(stats.rankdata(x, method='max'))
        rank_y = np.asarray(stats.rankdata(y, method='max'))
        sum_d_2 = sum((rank_x - rank_y) ** 2)
        return 1 - 6 * sum_d_2 / (n * (n ** 2 - 1))

    def correlation(x, y):
        from util.frame import progress
        print('Spearman: computing correlation coefficients:')
        feat_len = len(x)
        result = []
        for i in range(feat_len):
            result.append(feature_correlation(x[i], y))
            if i % 10 == 0:
                progress((i + 1) / feat_len)
        progress(1)
        return np.asarray(result)

    features = get_features(data)
    if not dumped:
        ro = correlation(features, labels)
        dump.dump_object(ro, 'spearman/ro.dump')
    else:
        ro = dump.load_object('spearman/ro.dump')
    # two-sided p-value of the t-test for rho with v = n - 2 degrees of freedom
    n = len(labels)
    v = n - 2
    p = []
    for i in range(len(ro)):
        t = ro[i] * math.sqrt(v) / math.sqrt(1 - ro[i] ** 2)
        p.append((stats.t.sf(np.abs(t), v) * 2, i))
    return p
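# A minimal sketch of how these scoring functions plug into the rest of the
# pipeline (the real call sites live elsewhere in the repo, so the exact order
# and the dump paths other than 'ig/ig.dump' are assumptions):
#
#   ig = information_gain(data, labels)    # list of (score, feature index)
#   dump.dump_object(ig, 'ig/ig.dump')     # read back by the IG sweep below
#   nbest(ig, 'ig')                        # N-best sweep over the same scores
#
#   pearson(...) and spearman(...) return (p-value, feature index) pairs and
#   can be fed to nbest(...) in the same way.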
def classify(x, x_val, y):
    import sklearn.svm as svm
    predict = run_classifier(x, y, x_val, svm.LinearSVC())
    return predict


data = dump.load_object('data.dump')
data_valid = dump.load_object('data_valid.dump')
labels = dump.load_object('labels.dump')
labels_valid = dump.load_object('labels_valid.dump')

# baseline F1 on the full feature set
score = metrics.f1_score(labels_valid, classify(data, data_valid, labels))
print(score)
print()
dump.dump_object(score, 'score.dump')

# INFO GAIN
if INFO_GAIN:
    ig = dump.load_object('ig/ig.dump')
    ig_coefs = np.arange(0.1, 0.91, 0.01)
    ig_f1 = []
    ig_n_feat = []
    print('Information Gain: classifying on different coefficients')
    timer.set_new()
    for i in range(len(ig_coefs)):
        frame.progress((i + 1) / len(ig_coefs))
        # keep only the features whose information gain exceeds the threshold
        trimmed_ig = [x for x in ig if x[0] > ig_coefs[i]]
        indexes_ig = [x[1] for x in trimmed_ig]
        ig_data = trim.trim_data(data, indexes_ig)
        ig_data_valid = trim.trim_data(data_valid, indexes_ig)
def classify():
    predicted = network.predict(images_test)
    predicted = get_predicted(predicted)
    return accuracy_score(test_labels[1], predicted)


network = NeuralNetwork(1, 1, 1)
images_train = images_to_np_array(train_images[2])
labels_train = labels_to_np_array(train_labels[1])

cycles = 10
print('Training...')
progress(0)
timer = Timer()
# sweep over the number of images sampled per training cycle;
# stats_x / stats_y are assumed to already hold results of earlier runs
rang = list(range(150, 250, 10))
for j in range(len(rang)):
    if rang[j] not in stats_x:
        np.random.seed(1)
        network = NeuralNetwork(image_size[0] * image_size[1], 300, 10)
        for i in range(cycles):
            randoms = np.random.randint(0, 60000, rang[j])
            network.train(images_train[randoms], labels_train[randoms], 0.1)
            progress((j * cycles + i + 1) / (cycles * len(rang)))
        stats_x.append(rang[j])
        stats_y.append(classify())
progress(1)
dump_object((stats_x, stats_y), 'stoch-n-images-stat.dump')
print(' DONE in ', timer.get_diff_str())

import pylab as pt

pt.plot(stats_x, stats_y, color='red')
pt.show()
test_images_file = reader.read_images('mnist/t10k-images-idx3-ubyte')
test_data = images_to_np_array(test_images_file[2])
test_labels = np.asarray(test_labels_file[1])
print('DONE in ' + timer.get_diff_str())

# timer.set_new()
# coef = information_gain(train_data, train_labels)
# print(' DONE in ' + timer.get_diff_str())
# dump_object(coef, 'spearman.dump')

import pylab as pt

# pixel indexes, sorted by ascending information gain
ig = [x[1] for x in sorted(load_object('ig.dump'))]

# visualise the n best pixels on a 28x28 image
y = np.zeros((28, 28, 3))
n = 100
features = ig[-n:]
for i in features:
    y[i // 28][i % 28] = [1, 1, 1]
pt.imshow(y)
pt.show()

# keep only the selected feature columns and dump the reduced data sets
fs_data = train_data.T[features].T
fs_labels = train_labels
fs_test_data = test_data.T[features].T
fs_test_labels = test_labels

dump_object(n, 'fs_size.dump')
dump_object(fs_data, 'fs_train_data.dump')
dump_object(fs_labels, 'fs_train_labels.dump')
dump_object(fs_test_data, 'fs_test_data.dump')
dump_object(fs_test_labels, 'fs_test_labels.dump')
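# A minimal sketch of how the dumps above might be consumed when training on the
# reduced feature set (only the file names are taken from the dumps above; the
# network shape and the exact call are assumptions):
#
#   n = load_object('fs_size.dump')
#   fs_data = load_object('fs_train_data.dump')
#   fs_labels = load_object('fs_train_labels.dump')
#   network = NeuralNetwork(n, 300, 10)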
rang_test = len(images_test)


def classify():
    predicted = network.predict(images_test)
    predicted = get_predicted(predicted)
    return accuracy_score(test_labels[1], predicted)


network = NeuralNetwork(1, 1, 1)
images_train = images_to_np_array(train_images[2])
labels_train = labels_to_np_array(train_labels[1])

cycles = 1000
num = 150
print('Training...')
progress(0)
timer = Timer()
rang = list(range(200, 300, 20))
for j in range(len(rang)):
    np.random.seed(1)
    network = NeuralNetwork(image_size[0] * image_size[1], rang[j], 10)
    for i in range(cycles):
        randoms = np.random.randint(0, 60000, num)
        network.train(images_train[randoms], labels_train[randoms])
        progress((j * cycles + i + 1) / (cycles * len(rang)))
    stats_x.append(rang[j])
    stats_y.append(classify())
dump_object((stats_x, stats_y), 'stoch-hidden-stat.dump')
print(' DONE in ', timer.get_diff_str())

import pylab as pt

pt.plot(stats_x, stats_y, color='red')
pt.show()
else:
    images_train = images_to_np_array(train_images[2])
    labels_train = labels_to_np_array(train_labels[1])

    stats = []
    if NETWORK_CONTINUE:
        network = load_object('network.dump')
        stats = load_object('stats.dump')
    else:
        network = NeuralNetwork(image_size[0] * image_size[1], 10, 10)

    rang_train = len(images_train)
    print('Training...')
    cycles = 0  # training disabled here; only the previously recorded stats are plotted
    timer = Timer()
    progress(0)
    for i in range(cycles):
        network.train(images_train, labels_train)
        dump_object(network, 'network.dump')
        dump_object(stats, 'stats.dump')
        progress((i + 1) / cycles)
        stats.append(classify())
    print(' DONE in ', timer.get_diff_str())

    import pylab as pt

    # smooth the accuracy curve with a window of `step` cycles
    x, y = [0], [0]
    step = 25
    for i in range(len(stats) // step):
        x.append(i * step + step)
        selection = stats[i * step:i * step + step]
        y.append(sum(selection) / step)
    pt.plot(range(len(stats)), stats)
    pt.plot(x, y, color='red', linewidth=3)
    pt.show()
labels_train = labels_to_np_array(train_labels[1])

stats = []
if NETWORK_CONTINUE:
    network = load_object('stoch-network.dump')
    stats = load_object('stoch-stats.dump')
else:
    network = NeuralNetwork(image_size[0] * image_size[1], 300, 10, layers=1)

rang_train = len(images_train)
print('Training...')
cycles = 100
num = 240
timer = Timer()
progress(0)
for i in range(cycles):
    randoms = np.random.randint(0, 60000, num)
    network.train(images_train[randoms], labels_train[randoms], 0.1)
    if network.cycles % network.step == 0:
        stats.append(classify())
    progress((i + 1) / cycles)
print(' DONE in ', timer.get_diff_str())

classify_print()
dump_object(network, 'stoch-network.dump')
dump_object(stats, 'stoch-stats.dump')
print(network.cycles)

import pylab as pt

pt.plot(np.arange(len(stats)) * network.step, stats)
pt.grid()
pt.show()
pt.plot(np.arange(len(network.stats)) * network.step, network.stats)
pt.show()