Example #1
def nbest(metric, folder):
    nbest_coefs = np.arange(500, 4999, 100)
    # metric is a list of (score, feature_index) pairs; sort ascending by score
    # so that metric[-n:] below holds the n best-scoring features
    metric = sorted(metric, key=lambda x: x[0])
    metric_f1 = []
    metric_n_feat = []
    print(folder + ': classifying N BEST')
    timer.set_new()
    for i in range(len(nbest_coefs)):
        frame.progress((i + 1) / len(nbest_coefs))
        indexes_metric = [x[1] for x in metric[-nbest_coefs[i]:]]
        metric_data = trim.trim_data(data, indexes_metric)
        metric_data_valid = trim.trim_data(data_valid, indexes_metric)
        metric_f1.append(
            metrics.f1_score(labels_valid,
                             classify(metric_data, metric_data_valid, labels)))
        metric_n_feat.append(len(indexes_metric))
    print(' DONE in ' + timer.get_diff_str())
    dump.dump_object(nbest_coefs, folder + '/nbest/svm/coefs.dump')
    dump.dump_object(metric_f1, folder + '/nbest/svm/f1.dump')
    dump.dump_object(metric_n_feat, folder + '/nbest/svm/feat.dump')

    metric_cls = [(nbest_coefs[i], metric_f1[i])
                  for i in range(len(nbest_coefs))]
    metric_coef_max = max(metric_cls, key=lambda x: x[1])[0]
    indexes_metric = [x[1]
                      for x in metric[-metric_coef_max:]]  # for the Euler diagram
    dump.dump_object(indexes_metric, folder + '/nbest/max/indexes.dump')
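
trim.trim_data is a project helper whose definition is not shown on this page; judging from the calls above, it keeps only the selected feature columns of a sample-major data set. A minimal sketch under that assumption (the name and call signature come from the source, the body is guessed):

def trim_data(data, indexes):
    # keep only the columns listed in indexes, preserving row order
    return [[row[i] for i in indexes] for row in data]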
Example #2
def information_gain(data, labels, dumped=False):
    import numpy as np

    def get_features(data_set):
        # transpose the sample-major data set into a list of feature columns
        n = len(data_set[0])
        return [[i[j] for i in data_set] for j in range(n)]

    def possibilities(feature):
        counts = np.bincount(feature)
        return np.asarray(counts[np.nonzero(counts)] / float(len(feature)))

    def entropy(feature):
        p = possibilities(feature)
        return -np.sum(p * np.log2(p))

    def spec_cond_entropy(x, y, xi):
        new_y = [y[i] for i in range(len(y)) if x[i] == xi]
        return entropy(new_y)

    def cond_entropy(x, y):
        # pair each observed value of x with its probability so the weights stay
        # aligned even when some values in 0..max(x) never occur
        values, counts = np.unique(x, return_counts=True)
        p = counts / float(len(x))
        return sum(p[i] * spec_cond_entropy(x, y, values[i])
                   for i in range(len(values)))

    def cond_entropy_full(x, y):
        from util.frame import progress
        print('Information gain: computing conditional entropy:')
        feat_len = len(x)
        result = []
        for i in range(feat_len):
            result.append(cond_entropy(x[i], y))
            if i % 10 == 0:
                progress((i + 1) / feat_len)
        progress(1)
        return np.asarray(result)

    import util.dump as dump
    features = get_features(data)
    h_y_x = []
    if not dumped:
        h_y_x = cond_entropy_full(features, labels)
        dump.dump_object(h_y_x, 'ig/hyx.dump')
    else:
        h_y_x = dump.load_object('ig/hyx.dump')
    info_gain = entropy(labels) - h_y_x
    result = [(info_gain[i], i) for i in range(len(info_gain))]
    return result
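
A minimal usage sketch with toy data (the arrays below are invented for illustration; the real script loads data and labels from dump files, and note that the call also writes ig/hyx.dump as a side effect):

data = [[0, 1, 1],
        [1, 1, 0],
        [0, 0, 1],
        [1, 0, 0]]  # four samples, three binary features
labels = [1, 1, 0, 0]
scores = information_gain(data, labels)
# scores is a list of (gain, feature_index) pairs; sort descending to rank features
ranking = sorted(scores, reverse=True)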
Example #3
def pearson(data, labels, dumped=False):
    import numpy as np
    import util.dump as dump
    import math
    import warnings
    import scipy.stats as stats
    warnings.filterwarnings('ignore')

    def get_features(data_set):
        # transpose the sample-major data set into a list of feature columns
        n = len(data_set[0])
        return [[i[j] for i in data_set] for j in range(n)]

    def feature_correlation(x, y):
        # plain Pearson r: covariance divided by the product of the deviations
        idx = range(len(x))
        x_avg = sum(x) / len(x)
        y_avg = sum(y) / len(y)
        cov = sum((x[i] - x_avg) * (y[i] - y_avg) for i in idx)
        x_dev = math.sqrt(sum((x[i] - x_avg) ** 2 for i in idx))
        y_dev = math.sqrt(sum((y[i] - y_avg) ** 2 for i in idx))
        return cov / (x_dev * y_dev)

    def correlation(x, y):
        from util.frame import progress
        print('Pearson: computing correlation coefficients:')
        feat_len = len(x)
        result = []
        for i in range(feat_len):
            result.append(feature_correlation(x[i], y))
            if i % 10 == 0:
                progress((i + 1) / feat_len)
        progress(1)
        return np.asarray(result)

    features = get_features(data)
    ro = []
    if not dumped:
        ro = correlation(features, labels)
        dump.dump_object(ro, 'pearson/ro.dump')
    else:
        ro = dump.load_object('pearson/ro.dump')
    v = len(labels) - 2  # degrees of freedom for the t-test on r
    p = []
    for i in range(len(ro)):
        # t = r * sqrt(n - 2) / sqrt(1 - r^2); two-sided p from Student's t
        t = ro[i] * math.sqrt(v) / math.sqrt(1 - ro[i] ** 2)
        p.append((stats.t.sf(np.abs(t), v) * 2, i))
    return p
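
As a sanity check, scipy exposes the same statistic and p-value in one call; a short comparison sketch (the arrays are invented for illustration):

import numpy as np
import scipy.stats as stats

x = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
y = np.array([2.1, 3.9, 6.2, 8.1, 9.8])
r, p = stats.pearsonr(x, y)  # Pearson r and two-sided p-value
# p agrees with 2 * stats.t.sf(abs(r * np.sqrt(3) / np.sqrt(1 - r ** 2)), 3) for n = 5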
Example #4
def spearman(data, labels, dumped=False):
    import numpy as np
    import util.dump as dump
    import math
    import warnings
    import scipy.stats as stats
    warnings.filterwarnings('ignore')

    def get_features(data_set):
        # transpose the sample-major data set into a list of feature columns
        n = len(data_set[0])
        return [[i[j] for i in data_set] for j in range(n)]

    def feature_correlation(x, y):
        # Spearman's rho via the 1 - 6 * sum(d^2) / (n * (n^2 - 1)) shortcut;
        # the formula is only exact when the data contain no ties
        n = len(x)
        rank_x = np.asarray(stats.rankdata(x, method='max'))
        rank_y = np.asarray(stats.rankdata(y, method='max'))
        sum_d_2 = sum((rank_x - rank_y) ** 2)
        return 1 - 6 * sum_d_2 / (n * (n ** 2 - 1))

    def correlation(x, y):
        from util.frame import progress
        print('Spearman: computing correlation coefficients:')
        feat_len = len(x)
        result = []
        for i in range(feat_len):
            result.append(feature_correlation(x[i], y))
            if i % 10 == 0:
                progress((i + 1) / feat_len)
        progress(1)
        return np.asarray(result)

    features = get_features(data)
    ro = []
    if not dumped:
        ro = correlation(features, labels)
        dump.dump_object(ro, 'spearman/ro.dump')
    else:
        ro = dump.load_object('spearman/ro.dump')
    n = len(labels)
    v = n - 2  # degrees of freedom for the t-test on rho
    p = []
    for i in range(len(ro)):
        # same significance test as for Pearson, applied to the rank correlation
        t = ro[i] * math.sqrt(v) / math.sqrt(1 - ro[i] ** 2)
        p.append((stats.t.sf(np.abs(t), v) * 2, i))
    return p
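
scipy also ships a Spearman correlation with tie handling built in, which is useful for cross-checking feature_correlation; a short sketch (arrays invented for illustration):

import numpy as np
import scipy.stats as stats

x = np.array([1, 2, 3, 4, 5])
y = np.array([5, 6, 7, 8, 7])  # contains a tie, where the shortcut formula drifts
rho, p = stats.spearmanr(x, y)  # rank correlation and two-sided p-value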
Example #5
def classify(x, x_val, y):
    import sklearn.svm as svm
    # train a linear SVM on the training split and predict the validation split
    predict = run_classifier(x, y, x_val, svm.LinearSVC())
    return predict


data = dump.load_object('data.dump')
data_valid = dump.load_object('data_valid.dump')
labels = dump.load_object('labels.dump')
labels_valid = dump.load_object('labels_valid.dump')

score = metrics.f1_score(labels_valid, classify(data, data_valid, labels))
print(score)
print()
dump.dump_object(score, 'score.dump')

# INFO GAIN
if INFO_GAIN:
    ig = dump.load_object('ig/ig.dump')
    ig_coefs = np.arange(0.1, 0.91, 0.01)
    ig_f1 = []
    ig_n_feat = []
    print('Information Gain: classifying on different coefficients')
    timer.set_new()
    for i in range(len(ig_coefs)):
        frame.progress((i + 1) / len(ig_coefs))
        trimmed_ig = [x for x in ig if x[0] > ig_coefs[i]]
        indexes_ig = [x[1] for x in trimmed_ig]
        ig_data = trim.trim_data(data, indexes_ig)
        ig_data_valid = trim.trim_data(data_valid, indexes_ig)
        # score the trimmed feature set, mirroring the nbest loop in Example #1
        ig_f1.append(
            metrics.f1_score(labels_valid,
                             classify(ig_data, ig_data_valid, labels)))
        ig_n_feat.append(len(indexes_ig))
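
run_classifier is a project helper whose definition is not on this page; from the call in classify it evidently fits the supplied estimator on the training split and predicts on the validation split. A minimal sketch under that assumption (only the name and argument order come from the source):

def run_classifier(x, y, x_val, clf):
    clf.fit(x, y)  # train the supplied scikit-learn estimator
    return clf.predict(x_val)  # predict labels for the validation data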
Example #6
    def classify():
        predicted = network.predict(images_test)
        predicted = get_predicted(predicted)
        return accuracy_score(test_labels[1], predicted)

    network = NeuralNetwork(1, 1, 1)
    images_train = images_to_np_array(train_images[2])
    labels_train = labels_to_np_array(train_labels[1])

    cycles = 10
    print('Training...')
    progress(0)
    timer = Timer()
    rang = list(range(150, 250, 10))
    for j in range(len(rang)):
        if rang[j] not in stats_x:  # skip sample sizes that were already computed
            np.random.seed(1)
            network = NeuralNetwork(image_size[0] * image_size[1], 300, 10)
            for i in range(cycles):
                randoms = np.random.randint(0, 60000, rang[j])
                network.train(images_train[randoms], labels_train[randoms],
                              0.1)
                # i % 1 == 0 was always true, so report progress every cycle
                progress((j * cycles + i + 1) / (cycles * len(rang)))
            stats_x.append(rang[j])
            stats_y.append(classify())
    progress(1)
    dump_object((stats_x, stats_y), 'stoch-n-images-stat.dump')
    print(' DONE in ', timer.get_diff_str())
import pylab as pt
pt.plot(stats_x, stats_y, color='red')
pt.show()
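
get_predicted is defined elsewhere in the project; since accuracy_score expects plain class labels, it presumably turns the network's per-class outputs into label indices. A sketch under that assumption (the name comes from the source, the body is guessed):

import numpy as np

def get_predicted(outputs):
    # pick the class with the highest activation for each sample
    return np.argmax(outputs, axis=1)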
Example #7
    test_images_file = reader.read_images('mnist/t10k-images-idx3-ubyte')
    test_data = images_to_np_array(test_images_file[2])
    test_labels = np.asarray(test_labels_file[1])
    print('DONE in ' + timer.get_diff_str())
    # timer.set_new()
    # coef = information_gain(train_data, train_labels)
    # print(' DONE in ' + timer.get_diff_str())
    # dump_object(coef, 'spearman.dump')
    import pylab as pt

    ig = [x[1] for x in sorted(load_object('ig.dump'))]

    # light up the top-n pixels on a blank 28x28 RGB image to see which
    # features were selected
    y = np.zeros((28, 28, 3))
    n = 100
    features = ig[-n:]  # indexes of the n features with the highest gain
    for i in features:
        y[i // 28][i % 28] = [1, 1, 1]
    pt.imshow(y)
    pt.show()

    fs_data = train_data.T[features].T
    fs_labels = train_labels

    fs_test_data = test_data.T[features].T
    fs_test_labels = test_labels

    dump_object(n, 'fs_size.dump')
    dump_object(fs_data, 'fs_train_data.dump')
    dump_object(fs_labels, 'fs_train_labels.dump')
    dump_object(fs_test_data, 'fs_test_data.dump')
    dump_object(fs_test_labels, 'fs_test_labels.dump')
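
The double transpose above selects feature columns; NumPy fancy indexing on the second axis gives the same result more directly (a stylistic alternative, not from the source):

fs_data = train_data[:, features]  # equivalent to train_data.T[features].T
fs_test_data = test_data[:, features]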
Example #8
    rang_test = len(images_test)

    def classify():
        predicted = network.predict(images_test)
        predicted = get_predicted(predicted)
        return accuracy_score(test_labels[1], predicted)

    network = NeuralNetwork(1, 1, 1)
    images_train = images_to_np_array(train_images[2])
    labels_train = labels_to_np_array(train_labels[1])

    cycles = 1000
    num = 150
    print('Training...')
    progress(0)
    timer = Timer()
    rang = list(range(200, 300, 20))
    for j in range(len(rang)):
        # reset the seed so every hidden-layer size sees the same minibatches
        np.random.seed(1)
        network = NeuralNetwork(image_size[0] * image_size[1], rang[j], 10)
        for i in range(cycles):
            randoms = np.random.randint(0, 60000, num)
            network.train(images_train[randoms], labels_train[randoms])
            progress((j * cycles + i + 1) / (cycles * len(rang)))
        stats_x.append(rang[j])
        stats_y.append(classify())
    dump_object((stats_x, stats_y), 'stoch-hidden-stat.dump')
    print(' DONE in ', timer.get_diff_str())
import pylab as pt
pt.plot(stats_x, stats_y, color='red')
pt.show()
Example #9
else:
    images_train = images_to_np_array(train_images[2])
    labels_train = labels_to_np_array(train_labels[1])
    stats = []
    if NETWORK_CONTINUE:
        network = load_object('network.dump')
        stats = load_object('stats.dump')
    else:
        network = NeuralNetwork(image_size[0] * image_size[1], 10, 10)
    rang_train = len(images_train)
    print('Training...')
    cycles = 0  # training is disabled here; raise this to actually run the loop
    timer = Timer()
    progress(0)
    for i in range(cycles):
        network.train(images_train, labels_train)
        stats.append(classify())  # record accuracy before checkpointing
        dump_object(network, 'network.dump')
        dump_object(stats, 'stats.dump')
        progress((i + 1) / cycles)
    print(' DONE in ', timer.get_diff_str())
    import pylab as pt
    x, y = [0], [0]
    step = 25
    # block-average the accuracy curve over disjoint windows of 25 cycles
    for i in range(len(stats) // step):
        x.append(i * step + step)
        selection = stats[i * step:i * step + step]
        y.append(sum(selection) / step)
    pt.plot(range(len(stats)), stats)
    pt.plot(x, y, color='red', linewidth=3)
    pt.show()
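
The red curve above is a block average over disjoint windows of 25 cycles; a closely related sliding running mean can be written with NumPy (a sketch, not from the source):

import numpy as np

window = 25
smoothed = np.convolve(stats, np.ones(window) / window, mode='valid')
pt.plot(np.arange(window, len(stats) + 1), smoothed, color='red', linewidth=3)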
Example #10
    labels_train = labels_to_np_array(train_labels[1])
    stats = []
    if NETWORK_CONTINUE:
        network = load_object('stoch-network.dump')
        stats = load_object('stoch-stats.dump')
    else:
        network = NeuralNetwork(image_size[0] * image_size[1], 300, 10, layers=1)
    rang_train = len(images_train)

    print('Training...')
    cycles = 100
    num = 240
    timer = Timer()
    progress(0)
    for i in range(cycles):
        # draw a fresh random minibatch of num images each cycle
        randoms = np.random.randint(0, 60000, num)
        network.train(images_train[randoms], labels_train[randoms], 0.1)
        if network.cycles % network.step == 0:
            stats.append(classify())  # sample accuracy every network.step cycles
        progress((i + 1) / cycles)
    print(' DONE in ', timer.get_diff_str())
    classify_print()
    dump_object(network, 'stoch-network.dump')
    dump_object(stats, 'stoch-stats.dump')
    print(network.cycles)
    import pylab as pt
    pt.plot(np.arange(len(stats)) * network.step, stats)
    pt.grid()
    pt.show()
    pt.plot(np.arange(len(network.stats)) * network.step, network.stats)
    pt.show()