예제 #1
0
def data_analysis():
    """Display a fixed set of CIFAR-10 test images together with their colour cubes.

    The stored predictions of the first entry of ``models`` are loaded and their
    prediction ratings computed, but the plot itself only depends on the
    hard-coded image ids below.
    """
    # The train/validation splits are loaded but not used by the active code path.
    train_split = dt.get_data('cifar10', (0, 20000))
    validation_split = dt.get_data('cifar10', (40000, 50000))
    test_split = dt.get_data('cifar10', (50000, 60000))
    test_images, test_labels = test_split[0], test_split[1]

    for arch in models[:1]:
        # Predictions come from a CSV log rather than from re-running the model.
        # NOTE(review): 50 / False look like epoch count / data augmentation flag -- confirm
        weights_name = mt.weight_file_name(arch, 'cifar10-2-5', 50, False)
        predictions = t_log.load_predictions(weights_name, file_path=csv_path)

        labels = list(map(int, test_labels))
        ratings = metrics.prediction_ratings(predictions, labels)

        # No entropies are computed any more, so this ordering is empty.
        entropies = []
        entropy_order = np.argsort(entropies)

        picked_ids = [21, 3767, 9176, 730, 5905]
        plotting.show_imgs(picked_ids, 'cdc entropy examples', test_images, showColorCube=True)
예제 #2
0
def mt_noise_test():
    """Apply increasing multiplicative noise to one training image and display it.

    For every noise level from 5 to 195 (step 10) a fresh 32x32x3 matrix of
    uniform random values ``r`` is drawn and image 1 of the training split is
    scaled by ``1 + (r - 0.5) * noise_level / 100``, clipped to [0, 255] and
    cast back to uint8. The result is written back into the training data, so
    the noise of successive levels accumulates. The model names are printed
    afterwards; no training is performed here.
    """
    np.random.seed(0)
    tr_data = dt.get_data('cifar10', (0, 40000))
    val_data = dt.get_data('cifar10', (40000, 50000))
    train_images = tr_data[0]

    for noise_level in xrange(5, 200, 10):
        for img_idx in [1]:
            # Three independent 32x32 planes, re-ordered to height x width x channel
            planes = [np.random.random((32, 32)) for _ in xrange(3)]
            noise_mat = np.swapaxes(planes, 0, 2)
            print(train_images[img_idx].shape)
            print(noise_mat.shape)

            # Widen to uint16 before scaling, then clip back into the 8-bit range
            gain = 1 + (noise_mat - 0.5) * noise_level / 100
            noisy = np.clip(train_images[img_idx].astype('uint16') * gain, 0, 255)
            train_images[img_idx] = noisy.astype('uint8')
            plotting.imshow(train_images[img_idx])

        for arch in models:
            print('Training', arch)
예제 #3
0
def colorcube_analysis():
    """Compare the colour density cubes of the best- and worst-rated test images.

    For every model in ``models`` the CIFAR-10 test images are ranked by
    prediction rating. One cube is fed with the ``top_n`` highest-rated images,
    another with the ``top_n`` lowest-rated ones. Both cubes and their
    difference are plotted; the difference plot is also saved.
    """
    def build_cube(images, image_ids):
        # Accumulate the given images into a normalized resolution-4 cube
        cube = metrics_color.ColorDensityCube(resolution=4)
        for image_id in image_ids:
            cube.feed(images[image_id])
        cube.normalize()
        return cube

    for arch in models:
        test_split = dt.get_data('cifar10', (50000, 60000))
        images = test_split[0]
        top_n = 2000

        weights_name = mt.weight_file_name(arch, 'cifar10-2-5', 50, False)
        # The model is loaded even though the predictions are read from the CSV log.
        model = mt.load_by_name(weights_name, images.shape[1:], h5_path + weights_name)
        predictions = t_log.load_predictions(weights_name, file_path=csv_path)

        labels = [int(label) for label in test_split[1]]
        ratings = metrics.prediction_ratings(predictions, labels)
        ranking = np.argsort(ratings)  # ascending: worst-rated ids first

        cc_high = build_cube(images, ranking[-top_n:])
        cc_high.plot_cube()

        cc_low = build_cube(images, ranking[:top_n])

        # The difference is taken before the low cube is plotted, as in the original order
        cc_diff = cc_high.substract(cc_low, 'value')
        cc_low.plot_cube()

        cc_diff.normalize()
        diff_title = 'Color cube analysis difference ({} images/series)'.format(top_n)
        cc_diff.plot_cube(title=diff_title, normalize=True, save=True)
예제 #4
0
def color_domain_test():
    """Print how many images fall into each region of the max-colour domain cube.

    The cube returned by ``dt.cifar10_maxcolor_domains`` is walked cell by
    cell. Each cell's size is recorded in a ``g x g x g`` array and added to a
    running total; both are printed at the end.
    """
    all_data_orig = dt.get_data('cifar10', (0, 20000))
    g = 4
    n_images = 5
    images_cube = dt.cifar10_maxcolor_domains(granularity=g)
    images_cube_sizes = np.zeros((g, g, g))
    total = 0

    for x in xrange(g):
        for y in xrange(g):
            for z in xrange(g):
                region_ids = images_cube[x][y][z]
                region_size = len(region_ids)
                images_cube_sizes[x, y, z] = region_size
                total += region_size

                sample_ids = region_ids[:n_images]
                # The sample holds at most n_images ids, so this threshold is never
                # exceeded and the preview below stays disabled.
                if len(sample_ids) <= 10000:
                    continue

                print(sample_ids)
                _fig, axes = plt.subplots(1, n_images, figsize=(n_images, 4),
                                          subplot_kw={'xticks': (), 'yticks': ()})
                for position, img_id in enumerate(sample_ids):
                    axis = axes[position]
                    axis.imshow(all_data_orig[0][img_id], vmin=0, vmax=1)
                    axis.set_title("id#" + str(img_id))
                plt.show()

    print(images_cube_sizes)
    print('total', total)
예제 #5
0
def colorfulness_analysis(model='densenet121', top_n=2500):
    """
    Compare the colorfulness of the best- and worst-rated CIFAR-10 test images.
    See the metrics_color.colorfulness() function for more details on the attribute
    :param model: Name of the model whose stored predictions are used to compute the prediction scores
    :param top_n: Number of images taken from each end of the score ranking
    :return: None, the two series are shown as a box plot
    """
    # Stored predictions and ground-truth labels for the test split
    test_split = dt.get_data('cifar10', (50000, 60000))
    images = test_split[0]
    weights_name = mt.weight_file_name(model, 'cifar10-2-5', 50, False)
    predictions = t_log.load_predictions(weights_name, file_path=csv_path)
    labels = [int(label) for label in test_split[1]]

    # Ascending ranking: lowest scores first, highest scores last
    ranking = np.argsort(metrics.prediction_ratings(predictions, labels))

    best_series, worst_series = [], []
    print(len(ranking))
    for rank in xrange(top_n):
        best_series.append(metrics_color.colorfulness(images[ranking[-rank - 1]]))
        worst_series.append(metrics_color.colorfulness(images[ranking[rank]]))

    plot_title = 'Colorfulness analysis ({} images/series)'.format(top_n)
    plotting.box_plot(best_series, worst_series,
                      name_s1='high prediction scores',
                      name_s2='low prediction scores',
                      y_label='Colorfulness',
                      title=plot_title)
예제 #6
0
def show_ids():
    """Display two hard-coded lists of CIFAR-10 test images, the "easy" ids first."""
    test_split = dt.get_data('cifar10', (50000, 60000))
    hard = [9746, 9840, 9853, 9901, 9910, 9923, 9924, 9926, 9960, 9982]
    easy = [9929, 9935, 9939, 9945, 9952, 9966, 9971, 9992, 9997, 9999]

    # Same display order as two separate loops: every easy id, then every hard id
    for img_id in easy + hard:
        plotting.imshow(test_split[0][img_id])

    print('done')
예제 #7
0
def check_acc():
    """Print the test accuracy computed from the stored densenet121 predictions.

    The first ten predicted classes are printed as a quick sanity check before
    the accuracy itself.
    """
    arch = 'densenet121'
    test_split = dt.get_data('cifar10', (50000, 60000))

    weights_name = mt.weight_file_name(arch, 'cifar10-2-5', 50, False)
    class_scores = t_log.load_predictions(weights_name, file_path=csv_path)

    # The predicted class is the index of the largest score in each row
    predicted = np.argmax(class_scores, axis=1)
    print(predicted[:10])

    expected = [int(label) for label in test_split[1]]
    print(metrics.accuracy(predicted, expected))
예제 #8
0
def epochs_accuracy_test():
    """Train the first model for several epoch budgets and track per-image correctness.

    For each epoch budget the model is trained with ``mt.train2`` and evaluated
    on the test split. Every test image accumulates a 0/1 history, one entry
    per budget. Images that are always right are reported as "easy" and images
    that are never right as "hard". The distribution of correct counts and the
    20 most frequent correctness patterns are printed as well.
    """
    tr_data = dt.get_data('cifar10', (0, 40000))
    val_data = dt.get_data('cifar10', (40000, 50000))
    test_data = dt.get_data('cifar10', (50000, 60000))
    arch = models[0]
    epochs = [1, 2, 3, 4, 5, 6, 7, 10, 20, 40, 200]

    # One 0/1 history per test image, filled in the order of `epochs`
    correctness = [[] for _ in xrange(len(test_data[0]))]
    for n_epochs in epochs:
        print('###->', n_epochs, 'epochs')
        trained_model, _trained_name = mt.train2(arch, tr_data, val_data, n_epochs, False,
                                                 'cifar10_0445_epochsacc-5_', path=h5_path)
        acc, predicted_classes, _ = dt.predict_and_acc(trained_model, test_data)
        for img_idx, history in enumerate(correctness):
            history.append(1 if predicted_classes[img_idx] == test_data[1][img_idx] else 0)

        print('Test accuracy = ', acc)

    correctness_tot = [np.sum(history) for history in correctness]
    easy_imgs = []
    hard_imgs = []
    for img_idx, n_right in enumerate(correctness_tot):
        if n_right == len(epochs):
            easy_imgs.append(img_idx)
        if n_right == 0:
            hard_imgs.append(img_idx)

    # How many images were right 0, 1, ... len(epochs) times
    totals, total_counts = np.unique(correctness_tot, return_counts=True)
    n_correct = dict(zip(totals, total_counts))

    # Frequency of each exact correctness pattern, least frequent first
    patterns, pattern_counts = np.unique([str(history) for history in correctness], return_counts=True)
    correct_shapes = dict(zip(patterns, pattern_counts))
    sorted_cs = sorted(correct_shapes.items(), key=lambda item: item[1])
    print(n_correct)
    print(sorted_cs[-20:])

    # A negative slice start beyond the list length simply yields the whole list
    print('Easy images ids: ', easy_imgs[-10:])
    print('Hard images ids: ', hard_imgs[-10:])
예제 #9
0
def cifar_color_domains_test():
    """Train every model and print its per-colour-domain accuracy cube.

    For each entry of ``models`` the data splits are reloaded, the model is
    trained with ``mt.train2``, and the cube returned by
    ``metrics_color.color_domains_accuracy`` is printed together with the
    result of ``dt.cube_cardinals`` applied to it.
    """
    for arch in models:
        train_split = dt.get_data('cifar10', (0, 20000))
        validation_split = dt.get_data('cifar10', (20000, 30000))
        test_split = dt.get_data('cifar10', (30000, 60000))
        # Formatted copy of the test split; not used by the calls below
        f_test_data = dt.format_data(test_split, 10)

        trained_model, _trained_name = mt.train2(arch, train_split, validation_split, 50, False,
                                                 'cifar10-2-5', path=h5_path)

        accuracy_cube = metrics_color.color_domains_accuracy(trained_model)
        print('cube', accuracy_cube)
        print('Sizes', dt.cube_cardinals(accuracy_cube))
예제 #10
0
def train_student_model(features_all, features_model, model_name, path, batch=50, n_epochs=300,
                learning_rate=1.0, model_type='sklearn', save=False, normalize=True):
    """Load the dataset, build the feature matrix and train the student model.

    :param features_all: passed to ``get_data`` to select what is read
    :param features_model: passed to ``process_data`` as the model's features
    :param model_name: forwarded to ``train_model``
    :param path: location handed to ``get_data``
    :param batch: forwarded to ``train_model``
    :param n_epochs: forwarded to ``train_model``
    :param learning_rate: forwarded to ``train_model``
    :param model_type: forwarded to ``train_model``
    :param save: forwarded to ``train_model``
    :param normalize: forwarded to ``process_data``
    """
    print("Reading data...")
    dataset = get_data(features_all, path=path)

    # Only the inputs, targets and feature names are needed for training
    _frame, inputs, targets, names = process_data(dataset=dataset,
                                                  features_model=features_model,
                                                  normalize=normalize)

    training_options = dict(batch=batch, n_epochs=n_epochs, learning_rate=learning_rate,
                            model_type=model_type, save=save)
    train_model(x=inputs, y=targets, model_name=model_name, feature_names=names, **training_options)
예제 #11
0
def car_example():
    """Show the first two images of a hard-coded id list with their colour cubes."""
    test_split = dt.get_data('cifar10', (50000, 60000))
    cars = [6983, 3678, 3170, 1591]

    # Only the first two ids are displayed
    for car_id in cars[:2]:
        image = test_split[0][car_id]
        cube = metrics_color.ColorDensityCube(resolution=4)
        cube.feed(image)
        plotting.imshow(image)
        cube.plot_cube()
예제 #12
0
def histogram_analysis():
    """Plot histograms of the best- and worst-rated densenet121 test images.

    The test images are ranked by prediction rating. The 2000 highest-rated and
    the 2000 lowest-rated images are passed to ``plotting.plot_hists`` as two
    series.
    """
    arch = 'densenet121'
    test_split = dt.get_data('cifar10', (50000, 60000))
    images = test_split[0]
    top_n = 2000

    weights_name = mt.weight_file_name(arch, 'cifar10-2-5', 50, False)
    predictions = t_log.load_predictions(weights_name, file_path=csv_path)
    labels = [int(label) for label in test_split[1]]

    # Ascending ranking: index -1 is the best-rated image, index 0 the worst
    ranking = np.argsort(metrics.prediction_ratings(predictions, labels))
    best_images = [images[ranking[-rank - 1]] for rank in xrange(top_n)]
    worst_images = [images[ranking[rank]] for rank in xrange(top_n)]

    plotting.plot_hists(best_images, 'high scores', worst_images, 'low scores', plotting.cs_bgr, title=' ')
예제 #13
0
def train_college_model(features_all, features_student, features_model, model_name, path, batch=100, n_epochs=1000,
                learning_rate=0.000005, model_type='sklearn', save=False, normalize=True):
    """Train the college model on targets derived from the student features.

    The dataset is processed twice. The first pass uses ``features_student``
    and feeds ``compute_college_scores`` to obtain the training targets. The
    second pass uses ``features_model`` to obtain the inputs and feature names
    given to ``train_model``.

    :param features_all: passed to ``get_data`` to select what is read
    :param features_student: features used to compute the college scores
    :param features_model: features used as inputs of the trained model
    :param model_name: forwarded to ``train_model``
    :param path: location handed to ``get_data``
    :param batch: forwarded to ``train_model``
    :param n_epochs: forwarded to ``train_model``
    :param learning_rate: forwarded to ``train_model``
    :param model_type: used for the score computation and forwarded to ``train_model``
    :param save: forwarded to ``train_model``
    :param normalize: forwarded to both ``process_data`` calls
    """
    print("Reading data...")
    dataset = get_data(features_all, path=path)

    # Pass 1: student features -> college scores, which become the targets
    _frame, student_inputs, student_targets, _names = process_data(dataset=dataset,
                                                                   features_model=features_student,
                                                                   normalize=normalize)
    college_targets = compute_college_scores(model_type, student_inputs, student_targets)

    # Pass 2: model features -> training inputs (the targets of this pass are discarded)
    _frame, model_inputs, _, model_feature_names = process_data(dataset=dataset,
                                                                features_model=features_model,
                                                                normalize=normalize)

    train_model(x=model_inputs, y=college_targets, model_name=model_name,
                feature_names=model_feature_names, batch=batch, n_epochs=n_epochs,
                learning_rate=learning_rate, model_type=model_type, save=save)
예제 #14
0
def check_rgb():
    """Flatten two channels of one test image and compare histograms with the original.

    Channels 1 and 2 of a copy of test image 9960 are overwritten with the
    constant 1. The copy is printed and shown, then plotted against the
    original with ``plotting.plot_hists``.
    """
    test_split = dt.get_data('cifar10', (50000, 60000))
    original = test_split[0][9960]

    # np.array() makes a copy, so the test split itself is left untouched
    flattened = np.array(original)
    for channel in (1, 2):
        flattened[:, :, channel] = np.ones((32, 32))

    print(np.array(original).shape)
    print(flattened)
    plotting.imshow(flattened)
    # NOTE(review): the 'red' label presumes channel 0 is red -- confirm the channel order
    plotting.plot_hists([original], 'normal', [flattened], 'red', plotting.cs_bgr)
예제 #15
0
def confusion(model='densenet121'):
    """Print the confusion matrix and the mean prediction rating per predicted class.

    The stored predictions of ``model`` on the CIFAR-10 test split are loaded.
    After the confusion matrix, the function prints the number of images
    assigned to each predicted class, followed by the mean prediction rating of
    each class (``nan`` for a class that was never predicted).

    :param model: name of the model whose stored predictions are analysed
    """
    # Load test data and model results
    test_data = dt.get_data('cifar10', (50000, 60000))
    model_name0 = mt.weight_file_name(model, 'cifar10-2-5', 50, False)
    y_predicted = t_log.load_predictions(model_name0, file_path=csv_path)
    predicted_classes = np.argmax(y_predicted, axis=1)
    true_classes = [int(k) for k in test_data[1]]

    print('Confusion Matrix for Total Test Data')
    print(sk_metrics.confusion_matrix(true_classes, predicted_classes))

    scores = metrics.prediction_ratings(y_predicted, true_classes)

    # Start from empty buckets: seeding each bucket with 0.0 would drag every mean down.
    n_classes = 10
    prediction_scores = [[] for _ in range(n_classes)]
    for k, predicted_class in enumerate(predicted_classes):
        prediction_scores[predicted_class].append(scores[k])

    # The buckets are ragged, so report their sizes instead of an array shape
    print([len(class_scores) for class_scores in prediction_scores])
    for class_scores in prediction_scores:
        # A class that was never predicted has no ratings to average
        print(float(np.mean(class_scores)) if class_scores else float('nan'))
예제 #16
0
def check_entropy():
    """Show test images at selected positions of the colour-entropy ranking.

    The colour-cube entropy of every test image is computed and the images are
    ranked in ascending order. The images at six fixed ranks are displayed, and
    the entropy and label of each are printed.
    """
    test_split = dt.get_data('cifar10', (50000, 60000))
    images, labels = test_split[0], test_split[1]

    entropies = [metrics_color.entropy_cc(image) for image in images]
    ranking = np.argsort(entropies)

    # From the lowest-entropy image (rank 0) to the highest (rank 9999)
    for rank in (0, 100, 1000, 9000, 9900, 9999):
        img_id = ranking[rank]
        plotting.imshow(images[img_id])
        print(entropies[img_id], labels[img_id])
예제 #17
0
def check_pr():
    """Print prediction ratings for two hand-picked sets of test images.

    Uses the stored densenet121 predictions. The rating and raw prediction at
    ranks 2500 and 2501 of the ascending rating order are printed first. Then,
    for every id in the "easy" and "hard" sets, the rating is printed along
    with whether the predicted class is correct.
    """
    arch = 'densenet121'
    weights_name = mt.weight_file_name(arch, 'cifar10-2-5', 50, False)
    predictions = t_log.load_predictions(weights_name, file_path=csv_path)

    test_split = dt.get_data('cifar10', (50000, 60000))
    id_sets = (
        ('easy', [9929, 9935, 9939, 9945, 9952, 9966, 9971, 9992, 9997, 9999]),
        ('hard', [9746, 9840, 9853, 9901, 9910, 9923, 9924, 9926, 9960, 9982]),
    )
    labels = [int(label) for label in test_split[1]]

    ratings = metrics.prediction_ratings(predictions, labels)
    ranking = np.argsort(ratings)

    for rank in (2500, 2501):
        ranked_id = ranking[rank]
        print(ratings[ranked_id], predictions[ranked_id])

    for set_name, img_ids in id_sets:
        print(set_name)
        for img_id in img_ids:
            rating = metrics.prediction_rating(predictions[img_id], labels[img_id])
            is_correct = np.argmax(predictions[img_id]) == labels[img_id]
            print(img_id, '- pr:', rating, ' - correct?: ', is_correct)
예제 #18
0
def entropy_cc_analysis():
    """Box-plot the colour entropy of the best- versus worst-rated test images.

    Uses the stored densenet121 predictions to rank the CIFAR-10 test images by
    prediction rating. ``metrics_color.entropy_cc(image, 8)`` is evaluated for
    the 2000 highest-rated and the 2000 lowest-rated images.
    """
    arch = 'densenet121'
    test_split = dt.get_data('cifar10', (50000, 60000))
    images = test_split[0]
    top_n = 2000

    weights_name = mt.weight_file_name(arch, 'cifar10-2-5', 50, False)
    predictions = t_log.load_predictions(weights_name, file_path=csv_path)
    labels = [int(label) for label in test_split[1]]

    # Ascending ranking: worst-rated ids first, best-rated ids last
    ranking = np.argsort(metrics.prediction_ratings(predictions, labels))

    best_entropies, worst_entropies = [], []
    print(len(ranking))
    for rank in xrange(top_n):
        best_entropies.append(metrics_color.entropy_cc(images[ranking[-rank - 1]], 8))
        worst_entropies.append(metrics_color.entropy_cc(images[ranking[rank]], 8))

    plot_title = 'Color entropy analysis ({} images/series)'.format(top_n)
    plotting.box_plot(best_entropies, worst_entropies,
                      name_s1='high prediction scores',
                      name_s2='low prediction scores',
                      y_label='Color entropy',
                      title=plot_title)
def color_domains_accuracy(model, granularity=4, n=1, data_range=(50000, 60000)):
    """Compute the accuracy of ``model`` separately for each colour-domain region.

    :param model: object exposing ``predict``; it is called on the stacked inputs of a region
    :param granularity: number of cells along each axis of the cube
    :param n: forwarded to ``dt.cifar10_nth_maxcolor_domains``
    :param data_range: index range of the CIFAR-10 images to evaluate
    :return: ``granularity``-cubed float array of accuracies. Regions holding
        at most one image are assigned ``None``.
        NOTE(review): NumPy is expected to store that as NaN -- confirm.
    """
    g = granularity
    images_cube = dt.cifar10_nth_maxcolor_domains(granularity=g, n=n, data_range=data_range)
    scores_cube = np.zeros((g, g, g))
    formatted_inputs, formatted_targets = dt.format_data(dt.get_data('cifar10', data_range), 10)

    for x in xrange(g):
        for y in xrange(g):
            for z in xrange(g):
                region_ids = images_cube[x][y][z]
                if len(region_ids) > 1:
                    region_inputs = np.array([formatted_inputs[k] for k in region_ids])
                    region_targets = [formatted_targets[k] for k in region_ids]
                    class_scores = model.predict(region_inputs)
                    # Both predictions and targets are reduced to class indices via argmax
                    region_acc = metrics.accuracy(np.argmax(class_scores, axis=1),
                                                  np.argmax(region_targets, axis=1))
                else:
                    region_acc = None
                scores_cube[x][y][z] = region_acc
    return scores_cube
예제 #20
0
def generate_ranking(features_all, features_model, path, model_type='sklearn', normalize=True):
    """Score every college, average the scores per college name and write a ranking.

    The dataset may contain several rows (years) per college. All scores that
    belong to the same ``INSTNM`` are averaged, and the colleges are written to
    ``scores.csv`` in descending order of that average.

    :param features_all: passed to ``get_data`` to select what is read
    :param features_model: passed to ``process_data`` as the model's features
    :param path: location handed to ``get_data``
    :param model_type: forwarded to ``compute_college_scores``
    :param normalize: forwarded to ``process_data``
    """
    # Merge data over multiple years
    print("Reading data...")
    dataset = get_data(features_all, path=path)

    df, x, y, feature_names = process_data(dataset=dataset,
                                           features_model=features_model,
                                           normalize=normalize)

    # College score
    college_score = compute_college_scores(model_type, x, y)

    college_name = df['INSTNM'].values
    college_name = college_name.reshape((-1, 1))

    # Concatenate college scores and names
    scores = np.concatenate((college_name, college_score), axis=1)

    # We might have data over several years, so group the scores by college name.
    # The grouping key must be the name (not the score), otherwise repeated
    # colleges overwrite their earlier entries instead of accumulating.
    scores_by_name = {}
    for i in range(scores.shape[0]):
        scores_by_name.setdefault(scores[i, 0], []).append(float(scores[i, 1]))
    scores_dict = {name: np.mean(values) for name, values in scores_by_name.items()}

    # Write scores into a file, best college first
    scores_sorted = sorted(scores_dict.items(), key=lambda item: item[1], reverse=True)
    scores_file = 'scores.csv'
    with open(scores_file, 'w') as csv_file:
        writer = csv.writer(csv_file)
        for row in scores_sorted:
            writer.writerow(row)
    print("Scores are saved in %s" % scores_file)
예제 #21
0
def main():
    """Run the cross-validated comparison of methods from the command line.

    Parses the CLI options, loads the training data and the word embeddings,
    and for every cross-validation fold trains one ``models.NeuralnetModel``
    per method listed in ``--method_compare``. Train/test performance is
    accumulated per method and per measure, as is generalized-set and clean-set
    performance when those evaluations are enabled. The CSV result files in the
    results folder are rewritten after every trained model.
    """
    parser = argparse.ArgumentParser()

    # General arguments:
    parser.add_argument('--relations', default='impl', help='Relationship type. OPTIONS: impl, expl')
    parser.add_argument('--model_type', default='PIX', help='OPTIONS: REG, PIX')
    parser.add_argument('--n_side_pixl', default=15, type=int, help='Number of pixels as output of PIX')
    # nargs='+' makes a command-line value arrive as a list, like the default does;
    # without it a single string would be iterated character by character below.
    parser.add_argument('--method_compare', default=['emb', 'rnd', 'onehot', 'ctrl'], nargs='+',
                        help='Methods to compare. OPTIONS: init, rnd, onehot, ctrl')
    parser.add_argument('--n_folds', default=10, type=int, help='Number of cross-validation folds')
    parser.add_argument('--eval_generalized_set', default='words',
                        help='Whether we evaluate in a generalized set or not. If so, instances are left out for training. '
                             'OPTIONS: None, triplets, words')
    parser.add_argument('--eval_clean_set', default=None,
                        help='Whether we evaluate in a clean set (equal to the generalized ones, but without keeping the model from '
                             'seeing these words/triplets during training). BE CAREFUL! Do not use the same list as in generalized above '
                             '(because you will not find any tuple for the clean set if you have removed them first!) '
                             'OPTIONS: None, triplets, words')
    parser.add_argument('--save_indiv_predictions', default=False, type=bool_str,
                        help='To store model predictions (they can be heavy, especially in PIX). '
                             'Useful to visualize them afterwards.')
    parser.add_argument('--save_model', default=False, type=bool_str,
                        help='To store the models (e.g., to explore weights afterwards)')

    args = parser.parse_args()

    if args.model_type == 'REG':
        perf_measures = ['R2', 'acc_y', 'F1_y', 'Pear_x', 'Pear_y', 'IoU_t']
    elif args.model_type == 'PIX':
        perf_measures = ['acc_y', 'F1_y', 'Pear_x', 'Pear_y', 'max_acc_px']
    else:
        # Fail early instead of hitting an undefined perf_measures after the data is loaded
        parser.error("unknown --model_type %r (expected 'REG' or 'PIX')" % args.model_type)

    # Create folder for results
    saveFolder = wd.get_folder_name(args)

    # Get default params
    par_learning = pt.get_default_params(args.model_type)

    # --- Read data --- #
    TRAIN = rd.load_training_data('../training_data/TRAINING_DATA-' + args.relations + '.csv')
    TRAIN['subj_ctr_x'], TRAIN['obj_ctr_x'] = dt.mirror_x(TRAIN['subj_ctr_x'], TRAIN['obj_ctr_x'])
    words, EMB = rd.readDATA('../embeddings/glove_words.csv')

    # --- GENERALIZED and CLEAN triplets or words --- #
    enforce_gen, clean_eval = {}, {}
    enforce_gen['eval'], clean_eval['eval'] = args.eval_generalized_set, args.eval_clean_set
    enforce_gen['triplets'], enforce_gen['words'] = dt.get_enforce_gen(enforce_gen['eval'])
    clean_eval['triplets'], clean_eval['words'] = dt.get_enforce_gen(clean_eval['eval'])
    eval_gen = enforce_gen['eval'] is not None

    print('Getting training data...')
    (X, X_extra, y, y_pixl, X_extra_enf_gen, X_enf_gen, y_enf_gen, y_enf_gen_pixl, rel_ids, OBJ_ctr_sd,
     OBJ_ctr_sd_enf_gen, EMBEDDINGS, TRAIN_relevant) = dt.get_data(args.model_type, TRAIN, words, EMB,
                                                                   enforce_gen, args.n_side_pixl)

    # Get folds
    kf = dt.get_folds(X['subj'].shape[0], args.n_folds)

    # --- INITIALIZE performance measures --- #
    PERF = {'train': {}, 'test': {}}
    PERF_clean = {'train': {}, 'test': {}}
    PERF_enf_gen = {}
    for method in args.method_compare:
        PERF['train'][method] = {meas: [] for meas in perf_measures}
        PERF['test'][method] = {meas: [] for meas in perf_measures}
        PERF_clean['train'][method] = {meas: [] for meas in perf_measures}
        PERF_clean['test'][method] = {meas: [] for meas in perf_measures}
        PERF_enf_gen[method] = {meas: [] for meas in perf_measures}

    for fold_count, (train_idx, test_idx) in enumerate(kf):  # FOLDS loop

        # --- TRAIN and TEST data (splits) --- #
        # This aux function isn't elegant, but we don't want to triplicate y_pixl with train and test splits.
        # Takes too much memory
        (X_train, X_test, X_extra_train, X_extra_test, y_train, y_test, OBJ_ctr_sd_train,
         OBJ_ctr_sd_test) = dt.aux_get_train_test_splits(X, X_extra, y, OBJ_ctr_sd, train_idx, test_idx)
        aux_train_idx = train_idx if args.model_type == 'PIX' else 0
        aux_test_idx = test_idx if args.model_type == 'PIX' else 0

        # --- get CLEAN_train and CLEAN_test INDICES --- #
        # The clean evaluation only runs when it is requested AND both index sets are non-empty.
        # len() is used so the check works for lists and arrays alike.
        has_clean = False
        if clean_eval['eval'] is not None:
            idx_clean_train, idx_clean_test = dt.get_CLEAN_train_test_idx(TRAIN_relevant, train_idx, test_idx,
                                                                          clean_eval)
            has_clean = len(idx_clean_test) > 0 and len(idx_clean_train) > 0

        for method in args.method_compare:  # METHODS LOOP

            print('=========================================')
            print('=======>> ' + method + ' <<=======')
            print('=========================================')

            # Initialize model object
            model = models.NeuralnetModel(args, method, par_learning)

            # --- LEARN the model --- #
            model.method_learn(X_train, X_extra_train, y_train, y_pixl[aux_train_idx], EMBEDDINGS)

            # --- PREDICT --- #
            # NOTE(review): y_train is also passed when predicting on the test and generalized sets -- confirm
            y_pred_train = model.model_predict(X_train, X_extra_train, y_train)
            y_pred_test = model.model_predict(X_test, X_extra_test, y_train)
            if eval_gen:
                y_pred_enf_gen = model.model_predict(X_enf_gen, X_extra_enf_gen, y_train)

            # --- EVALUATE performance --- #
            PERF_DICT_test = et.evaluate_perf(y_test, y_pixl[aux_test_idx], y_pred_test, OBJ_ctr_sd_test,
                                              perf_measures, args.model_type)
            PERF_DICT_train = et.evaluate_perf(y_train, y_pixl[aux_train_idx], y_pred_train, OBJ_ctr_sd_train,
                                               perf_measures, args.model_type)

            if has_clean:
                aux_idx_clean_train = idx_clean_train if args.model_type == 'PIX' else 0
                aux_idx_clean_test = idx_clean_test if args.model_type == 'PIX' else 0
                PERF_DICT_clean_train = et.evaluate_perf(y_train[idx_clean_train],
                                                         y_pixl[aux_train_idx][aux_idx_clean_train],
                                                         y_pred_train[idx_clean_train],
                                                         OBJ_ctr_sd_train[idx_clean_train],
                                                         perf_measures, args.model_type)
                PERF_DICT_clean_test = et.evaluate_perf(y_test[idx_clean_test],
                                                        y_pixl[aux_test_idx][aux_idx_clean_test],
                                                        y_pred_test[idx_clean_test],
                                                        OBJ_ctr_sd_test[idx_clean_test],
                                                        perf_measures, args.model_type)
            if eval_gen:
                PERF_DICT_enf_gen = et.evaluate_perf(y_enf_gen, y_enf_gen_pixl, y_pred_enf_gen, OBJ_ctr_sd_enf_gen,
                                                     perf_measures, args.model_type)

            # --- append --- #
            # Results are stored under the method that was just trained, not under the
            # variable left over from the initialization loop.
            for meas in perf_measures:
                PERF['train'][method][meas].append(PERF_DICT_train[meas])
                PERF['test'][method][meas].append(PERF_DICT_test[meas])
                if has_clean:
                    PERF_clean['train'][method][meas].append(PERF_DICT_clean_train[meas])
                    PERF_clean['test'][method][meas].append(PERF_DICT_clean_test[meas])
                    print('method==> ' + method + ' || ' + meas + '_CLEAN_ts= ' + str(PERF_DICT_clean_test[meas]) +
                          ' | ' + meas + '_CLEAN_tr= ' + str(PERF_DICT_clean_train[meas]))
                if eval_gen:
                    PERF_enf_gen[method][meas].append(PERF_DICT_enf_gen[meas])
                    print('method==> ' + method + ' || ' + meas + '_GEN= ' + str(PERF_DICT_enf_gen[meas]))
                print('method==> ' + method + ' || ' + meas + '_test= ' + str(PERF_DICT_test[meas]) +
                      ' | ' + meas + '_train= ' + str(PERF_DICT_train[meas]))

            # -- write individual predictions -- #
            if args.save_indiv_predictions:
                indiv_predDir = saveFolder + '/INDIV_' + method + '_fld_' + str(fold_count + 1) + '.csv'
                wd.write_indiv_predictions(y_pred_test, OBJ_ctr_sd_test, args.model_type, 0.1, indiv_predDir)
                if eval_gen:
                    wd.write_indiv_predictions(y_pred_enf_gen, OBJ_ctr_sd_enf_gen, args.model_type, 0.1,
                                               indiv_predDir.replace('INDIV', 'INDIV-GEN'))
                if has_clean:
                    wd.write_indiv_predictions(y_pred_test[idx_clean_test], OBJ_ctr_sd_test[idx_clean_test],
                                               args.model_type, 0.1,
                                               indiv_predDir.replace('INDIV', 'INDIV-CLEAN_TST'))

            # -- store model weights -- #
            # Strings must be compared by value; identity ("is not") only works by accident.
            if args.save_model and method != 'ctrl':
                import h5py
                model.keras_model.save_weights(saveFolder + '/MODEL_' + method + '_fld_' + str(fold_count + 1) + '.h5')

            # --- write results --- #
            wd.write_results_all(PERF, args.method_compare, perf_measures, saveFolder + '/TRAIN-TEST.csv')
            if eval_gen:
                wd.write_results_enf_gen(PERF_enf_gen, args.method_compare, perf_measures, saveFolder + '/GEN.csv')
            if has_clean:
                wd.write_results_all(PERF_clean, args.method_compare, perf_measures, saveFolder + '/CLEAN.csv')
예제 #22
0
# Age bounds for the analysis. Only minage is applied in the visible part of this script.
minage = 8
maxage = 20

# Additional variables to use when imputing puberty
# TODO: include Bamako, subscap for boys, men_age for girls
# Keyed by the `female` flag used in the loop below (0 or 1).
others = {0: ["HT", "WT", "BMI"], 1: ["HT", "WT"]}

# Fit a Gaussian process model separately to females and males.
# This part of the loop only loads, filters and transforms the data for each sex.
for female in 0, 1:

    # Skip the variables that are not processed for this sex.
    if female == 1 and impvar in ("log2T_use_Z", ):
        continue
    if female == 0 and impvar in ("Breast_Stage_Use_Z", "Menarche"):
        continue

    # Load the target variable plus the date column and the sex-specific covariates,
    # and log the size of what was loaded.
    dx[female] = get_data(female, impvar, others=["datecomb"] + others[female])
    outf.write("Loaded %d x %d values\n" % tuple(dx[female].shape))
    outf.write("%d distinct people in initial data\n\n" %
               dx[female].ID.unique().size)

    # Keep only the rows at or above the minimum age and log what remains.
    dx[female] = dx[female].loc[dx[female].Age >= minage, :]
    outf.write("Retained %d x %d values at or above age %d\n" %
               (tuple(dx[female].shape) + (minage, )))
    outf.write("%d distinct people after requiring age at or above %d\n\n" %
               (dx[female].ID.unique().size, minage))

    # Not converted for some reason, stored as seconds, convert to days
    # NOTE(review): this line only parses the column with pd.to_datetime; any conversion
    # to days presumably happens elsewhere (e.g. in xform) -- confirm
    dx[female].datecomb = pd.to_datetime(dx[female].datecomb)

    # Apply xform to each person's rows (grouped by ID).
    dx[female] = dx[female].groupby("ID").apply(xform)
예제 #23
0
def color_region_finetuning():
    """Compare region-targeted fine-tuning against reference fine-tuning runs.

    For each architecture in ``models`` a base model is trained, ``g``
    reference fine-tuned models are trained (or reloaded when their weights
    file already exists), and then one fine-tuned model is trained or
    reloaded for every colour region that holds more than 100 images.  For
    each region-specific model the accuracy on its own region is printed,
    together with its difference to the mean and to the maximum of the
    reference runs.
    """
    g = 4
    ft_epochs = 30
    ft_data_augmentation = True

    images_cube = dt.cifar10_maxcolor_domains(granularity=g, data_range=(50000, 60000))
    region_sizes = dt.cube_cardinals(images_cube)
    tr_data = dt.get_data('cifar10', (0, 20000))
    val_data = dt.get_data('cifar10', (40000, 50000))
    ft_data = dt.get_data('cifar10', (20000, 40000))
    # train_data_ref and f_test_data are not read below; the calls are kept
    # exactly as they were.
    train_data_ref = dt.get_data('cifar10', (20000, 30000))
    train_data_ref2 = dt.get_data('cifar10', (30000, 40000))
    test_data = dt.get_data('cifar10', (50000, 60000))
    f_test_data = dt.format_data(test_data, 10)

    def report_val_accuracy(model):
        # Print the validation accuracy of a model reloaded from disk.
        score = dt.predict_and_acc(model, val_data)
        print('Val accuracy:', score[0])

    def approx_test_accuracy(scores_cube):
        # Per-region accuracies weighted by region population.
        return np.nansum(scores_cube * np.array(region_sizes) / float(10000))

    for m in models:
        # cr = color region, 0-2 for tr data / 4-5 for val data
        _, model_name0 = mt.train2(m, tr_data, val_data, 50, False, 'cr_0245', path=h5_path)
        base_weights = model_name0 + '.h5'

        # --- reference fine-tuning runs ---
        scores_cubes = []
        for ref_idx in xrange(g):
            ref_tag = 'ft_2345_ref' + str(ref_idx + 4)
            ref_weights = h5_path + mt.ft_weight_file_name(
                model_name0, ft_data_augmentation, ft_epochs, ref_tag) + '.h5'
            print('*-> ' + ref_weights)

            if not mt.model_state_exists(ref_weights):
                ref_training_set = [np.concatenate((tr_data[0], train_data_ref2[0])),
                                    np.concatenate((tr_data[1], train_data_ref2[1]))]
                assert len(ref_training_set[0]) == 30000
                ref_model, _ = mt.train2(m, ref_training_set, val_data, ft_epochs, ft_data_augmentation,
                                         ref_tag, h5_path, weights_file=base_weights)
            else:
                ref_model = mt.load_by_name(model_name0, ft_data[0].shape[1:], ref_weights)
                report_val_accuracy(ref_model)

            ref_scores = metrics_color.color_domains_accuracy(ref_model, g)
            print('(Approx) Test accuracy', approx_test_accuracy(ref_scores))
            scores_cubes.append(ref_scores)

        avg_ref_score_cube = np.nanmean(scores_cubes, axis=0)
        max_ref_score_cube = np.max(scores_cubes, axis=0)

        # --- one fine-tuning run per sufficiently populated colour region ---
        for x, y, z in np.ndindex(g, g, g):
            size = region_sizes[x][y][z]
            if not size > 100:
                continue

            region_id = str(x) + str(y) + str(z)
            print('#--> Region ' + region_id + ' (' + str(size) + ' images)')
            exp_tag = 'ft_2445_r' + region_id + '_cr_1' + 'exp'
            exp_weights = h5_path + mt.ft_weight_file_name(
                model_name0, ft_data_augmentation, ft_epochs, nametag=exp_tag) + '.h5'

            if mt.model_state_exists(exp_weights):
                exp_model = mt.load_by_name(model_name0, ft_data[0].shape[1:], exp_weights)
                report_val_accuracy(exp_model)
            else:
                ft_data_args = metrics_color.finetune_by_region((x, y, z), ft_data, 10000, g)
                ft_data_selected = dt.get_finetune_data(tr_data, ft_data, ft_data_args)
                assert len(ft_data_selected[0]) == 30000
                exp_model, _ = mt.train2(m, ft_data_selected, val_data, ft_epochs,
                                         ft_data_augmentation, exp_tag,
                                         h5_path, weights_file=base_weights)

            exp_scores = metrics_color.color_domains_accuracy(exp_model, g)
            print('  -  Region accuracy = ' + str(exp_scores[x][y][z]))
            print('  -  (Approx) Test accuracy = ', approx_test_accuracy(exp_scores))

            # Gain of the region-specific model over the reference runs.
            gain_vs_avg = np.subtract(exp_scores, avg_ref_score_cube)
            print('  -  Region score (avg ref) = ' + str(float(gain_vs_avg[x][y][z])))
            gain_vs_max = np.subtract(exp_scores, max_ref_score_cube)
            print('  -  Region score (max ref) = ' + str(float(gain_vs_max[x][y][z])))
            print('           ~           ')
예제 #24
0
def bug_feature_detection():
    """Train a regressor on a classifier's prediction ratings and evaluate it.

    For every architecture in ``models``: a classifier is trained and its
    prediction ratings are computed on images 30000-60000; a regression
    model derived from a second, one-epoch classifier is trained on the
    first 20000 of those (image, rating) pairs and validated on the last
    10000.  Summary statistics of the true and predicted ratings are
    printed, followed by how often thresholding the predicted rating agrees
    with whether the classifier's guess was right.
    """

    def print_stats(title, values):
        # Mean / std / max / min summary under a heading.
        print(title)
        print('Mean', np.mean(values))
        print('Std', np.std(values))
        print('Max', np.max(values))
        print('Min', np.min(values))

    def print_diff(title, diffs):
        # Mean and maximum of a list of absolute differences.
        print(title)
        print('Mean ', np.mean(diffs))
        print('Max ', max(diffs))

    for m in models:
        tr_data = dt.get_data('cifar10', (0, 20000))
        val_data = dt.get_data('cifar10', (20000, 30000))
        test_data = dt.get_data('cifar10', (30000, 60000))

        # Classifier whose per-image prediction ratings become the regression target.
        model0, _ = mt.train2(m, tr_data, val_data, 50, False, tag='cifar10-2-5', path=h5_path)
        acc, predicted_classes, y_predicted = dt.predict_and_acc(model0, test_data)
        print('acc', acc)

        true_classes = [int(label) for label in test_data[1]]
        pr = metrics.prediction_ratings(y_predicted, true_classes)

        # Regression model derived from a briefly trained classifier.
        model2, _ = mt.train2(m, tr_data, val_data, 1, False, tag='cifar10-0223', path=h5_path)
        model1 = mt.reg_from_(model2, m)
        print('Reg model created')

        X_test, _ = test_data
        reg_tr_data = X_test[0:20000], pr[0:20000]
        reg_val_data = X_test[20000:30000], pr[20000:30000]
        model1, _ = mt.train_reg(model1, m, reg_tr_data, reg_val_data, '', 50, False, path=h5_path)

        formatted_test_data = dt.format_data(reg_val_data, 10)
        y_true = pr[20000:30000]
        print_stats('Ground truth values:', y_true)

        y_predicted1 = model1.predict(formatted_test_data[0])
        n_guesses = len(y_predicted1)
        y_predicted2 = [row[0] for row in y_predicted1]
        print_stats('Prediction values:', y_predicted2)

        y_predicted3 = y_predicted2 / np.linalg.norm(y_predicted2)
        print_stats('Norm Prediction values:', y_predicted3)

        # Absolute error of the raw and of the norm-scaled predictions.
        n_compared = min(10000, len(y_predicted))
        diff2 = [abs(y_predicted2[k] - y_true[k]) for k in xrange(n_compared)]
        diff3 = [abs(y_predicted3[k] - y_true[k]) for k in xrange(n_compared)]
        print_diff('Difference:', diff2)
        print_diff('Difference Norm:', diff3)

        # R/W guess prediction
        opti_thr = float(np.sort(y_predicted2)[int(acc*10000)])
        print('opti_thr', opti_thr)
        thresholds = (0.6, 0.7, 0.777, 0.8, 0.9, opti_thr)

        for thr in thresholds:
            # A guess counts as right when "predicted rating above threshold"
            # matches "classifier predicted the true class".
            n_right_guesses = sum(
                1 for k in xrange(n_guesses)
                if (y_predicted1[k][0] > thr) == (test_data[1][20000+k] == predicted_classes[20000+k]))

            print('acc for reg for true/false with thr of ' + str(thr) + ': ' + str(float(n_right_guesses)/n_guesses))

        print('           ~           ')
예제 #25
0
파일: KNN.py 프로젝트: oldsheep2019/EP-FPG
import numpy as np
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor

from data_tools import get_data

if __name__ == '__main__':
    # Evaluate k-nearest-neighbour classification for several values of k,
    # repeating each evaluation `test_num` times and averaging the accuracy.

    data_set_name = 'WIL'
    # data_set_name = 'BLE'

    test_num = 3
    for neighbor_num in (1, 3, 5, 11):
        if data_set_name != 'WIL':
            # Only the 'WIL' data set is handled here.
            continue

        acc_list = []
        for _ in range(test_num):
            train_data, train_label, test_data, test_label = get_data(data_set_name)

            neigh = KNeighborsClassifier(n_neighbors=neighbor_num)
            neigh.fit(train_data, train_label)
            predict_label = neigh.predict(test_data)

            # A sample is an error when the arg-max of the predicted label
            # differs from the arg-max of the expected label.
            test_sample_num = test_data.shape[0]
            err_cnt = sum(
                1 for i in range(test_sample_num)
                if np.argmax(predict_label[i]) != np.argmax(test_label[i]))

            acc = 1 - (err_cnt / test_sample_num)
            print('kNN test accuracy = %.1f%%' % (acc * 100))
            acc_list.append(acc)

        mean_acc = sum(acc_list) / len(acc_list)
        print('\nkNN mean test accuracy (for k = %d) = %.2f%%\n' %
              (neighbor_num, mean_acc * 100))
예제 #26
0
import pandas as pd
import os

# Variable whose imputed values are plotted; it also selects the input CSV
# files and names the output PDF.
#impvar = "Breast_Stage_Z"
impvar = "log2T_use_Z"

pdf = PdfPages("plot_imputed_%s.pdf" % impvar)

# Plot only a few curves to avoid overplotting
# (the first five imputed data sets are read from imputed_data_puberty/).
di = []
for j in range(5):
    di.append(pd.read_csv(os.path.join("imputed_data_puberty", "%s_imp_%d.csv" % (impvar, j))))

for female in False, True:

    dx = get_data(female, impvar)

    # Distinct subject ids as plain Python ints.  The builtin `int` is used
    # because the `np.int` alias (which was the same type) was removed from
    # NumPy and raises AttributeError on current releases.
    idx = dx.ID.unique().astype(int).tolist()

    jj = 0
    for id0 in idx:

        # NOTE(review): `df` is not defined in the visible part of this
        # script -- confirm where it comes from.
        vv = df.loc[df.ID == id0, :]
        v0 = dx.loc[dx.ID == id0, :]

        # Start a fresh figure for this subject.
        plt.clf()
        plt.title("ID=%d" % id0)
        plt.grid(True)

        plt.plot(vv.Age, vv[impvar], 'o', color='purple')
예제 #27
0
파일: nn.py 프로젝트: jackyzha0/vybe
# Training hyper-parameters and layer sizes.
epochs = 1200
learning_rate = 0.005
num_features = 193
n_hidden_units_one = 256
n_hidden_units_two = 512
n_hidden_units_three = 1024

# Checkpoint location: a 'ckpt' entry under the current working directory.
savepath = os.getcwd() + '/ckpt'

### NN Setup
# NOTE(review): the note below mentions 26 features while num_features above
# is 193 -- confirm which one describes the current input.
"""
Input Dims: 26 (features) x 501 (time length)
Output Dims: (num classes)
"""

# Load the training set and the test set; the third value returned for the
# test set is discarded.
db, db_size, occ = data_tools.get_data()
t_db, t_db_size, _ = data_tools.get_data(test=True)
print(t_db_size)

# Shuffled pool of sample indices 0..db_size-1, consumed by get_indices().
index = np.arange(db_size)
np.random.shuffle(index)


def get_indices(batchsize):
    """Return the next ``batchsize`` sample indices from the shuffled pool.

    The module-level ``index`` array is the pool of indices not yet handed
    out.  When fewer than ``batchsize`` entries remain, the pool is rebuilt
    as a fresh random permutation of ``0 .. db_size - 1`` (the leftover
    entries are discarded).  The returned entries are then removed from the
    pool, so consecutive calls never repeat an index until the pool is
    rebuilt.

    Args:
        batchsize: number of indices requested.

    Returns:
        A NumPy array holding the first ``batchsize`` entries of the pool.
    """
    global index
    if index.size < batchsize:
        # Pool exhausted: start over with a new permutation.
        index = np.arange(db_size)
        np.random.shuffle(index)
    ret = index[:batchsize]
    # Remove exactly the entries that were just handed out.  Slicing with
    # [:-batchsize] dropped the tail instead, which repeated the head on the
    # next call and emptied the pool when batchsize was 0.
    index = index[batchsize:].copy()
    return ret
예제 #28
0
# Ages to impute (integers 1 .. maxage inclusive)
imp_ages = np.arange(1, maxage + 1)

# Storage for results, indexed by the `female` flag (0 or 1)
dx = [None, None]
preg = [None, None]
rslt = [None, None]

outf.write("Imputing %s\n\n" % impvar)

# Fit a Gaussian process model separately to females and males.
# NOTE(review): the statements visible here only load and filter the data
# for each sex; the model fit itself is presumably further down.
for female in 0, 1:

    outf.write("female=%d\n\n" % female)

    # Load the data for this sex and log its size to the report file.
    dx[female] = get_data(female, impvar, others=["SBP_MEAN"])
    outf.write("Loaded %d x %d values\n" % tuple(dx[female].shape))
    outf.write("%d distinct people in initial data\n\n" %
               dx[female].ID.unique().size)

    # Drop people with no SBP data
    # x counts, per ID, the rows that have a non-missing SBP_MEAN; IDs with
    # no such row are absent from x and get a missing n_SBP_mean after the
    # outer merge, so the `> 0` filter below removes them.
    x = dx[female][["ID", "SBP_MEAN"]].dropna().groupby("ID").size()
    x = pd.DataFrame(x, columns=["n_SBP_mean"])
    dx[female] = pd.merge(dx[female],
                          x,
                          left_on="ID",
                          right_on="ID",
                          how='outer')
    dx[female] = dx[female].loc[dx[female].n_SBP_mean > 0, :]
    # The SBP columns were only needed for the filter; remove them and then
    # drop every row that still has a missing value.
    dx[female] = dx[female].drop(["SBP_MEAN", "n_SBP_mean"], axis=1)
    dx[female] = dx[female].dropna()
예제 #29
0
def pr_on_fair_distribution(models=('densenet121',), top_n=100, res=4):
    """Plot mean prediction ratings over a colour-balanced image selection.

    A colour density cube is computed for every image in the range
    50000-60000 of the 'cifar10' data.  For each cell of the
    ``res x res x res`` colour cube the ``top_n`` images with the highest
    density in that cell are selected (the same image may be selected for
    several cells).  For every model the stored predictions are loaded and
    turned into prediction ratings; the mean rating of each cell's images is
    printed and plotted as a cube, together with the colour cube of all the
    selected images.

    Args:
        models: iterable of model identifiers passed to
            ``mt.weight_file_name``.  The default is an immutable tuple so
            the default value cannot be shared and mutated between calls.
        top_n: number of images kept per colour cell.
        res: resolution of the colour cube along each axis.
    """
    test_data = dt.get_data('cifar10', (50000, 60000))
    true_classes = [int(k) for k in test_data[1]]

    # One colour density cube per image.
    densities = []
    for img in test_data[0]:
        cc = metrics_color.ColorDensityCube(res)
        cc.feed(img)
        densities.append(cc.get_cube())

    # Move the image axis last so that densities_lists[i][j][k] holds the
    # density of colour cell (i, j, k) for every image.  This single
    # transpose is the same axis order the three chained swapaxes produced.
    densities_lists = np.transpose(densities, (1, 2, 3, 0))
    densities_cube = np.empty((res, res, res), dtype=object)

    # For each color keep the ids of the top_n most dense images in this
    # color (same image can be in 2 colors).
    for i in xrange(res):
        for j in xrange(res):
            for k in xrange(res):
                density_list = densities_lists[i][j][k].tolist()
                densities_cube[i][j][k] = np.argsort(density_list)[-top_n:]

    # Per model analysis
    for m in models:
        # Load model predictions and derive the prediction ratings.
        model_name0 = mt.weight_file_name(m, 'cifar10-2-5', 50, False)
        y_predicted = t_log.load_predictions(model_name0, file_path=csv_path)
        pr = metrics.prediction_ratings(y_predicted, true_classes)

        # For each color get the mean prediction rating of its top_n images.
        score_cube = np.zeros((res, res, res))
        global_cc = metrics_color.ColorDensityCube(resolution=res)
        for i in xrange(res):
            for j in xrange(res):
                for k in xrange(res):
                    densities_args = densities_cube[i][j][k].tolist()
                    pr_most_dense = []
                    # Per-colour cube: only useful for the optional
                    # per-colour plot (ijk_cc.plot_cube()).
                    ijk_cc = metrics_color.ColorDensityCube(res)
                    for a in densities_args:
                        pr_most_dense.append(pr[a])
                        ijk_cc.feed(test_data[0][a])
                        global_cc.feed(test_data[0][a])
                    ijk_cc.normalize()

                    mean_pr = np.mean(pr_most_dense)
                    score_cube[i][j][k] = mean_pr
                    print(mean_pr)
                    # Debug aid: show the first selected images of this colour with
                    # plotting.show_imgs(densities_args[:10], title, test_data[0],
                    #                    showColorCube=True, resolution=4)

        global_cc.normalize()
        global_cc.plot_cube(title='Fair distributed dataset ColorCube')

        sc = metrics_color.ColorDensityCube(resolution=res, cube=score_cube)
        sc.normalize()
        sc.plot_cube(title='Scores per color for ' + m)