Example #1
def main(argv):

    input_filename_x = 'train_data.csv'
    input_filename_y = 'train_labels.csv'    

    test_input_filename = 'test_data.csv'

    svc_model_filename = 'svc_classif.pkl'
    lr_model_filename = 'lr_classif.pkl'
    rfc_model_filename = 'rfc_classif.pkl'
    rfc_feat_imp_filename = 'rfc_feat_imp.png'
    model_comp_result_chart_filename = 'method_comp_res.png'
    
    io = lib.io.IO()
    viz = lib.viz.Viz()
    
    # Read data
    X_o, y_o = io.read_data(input_filename_x, input_filename_y)
    test_x = io.read_data(test_input_filename, None)

    print "There are " + str(len(X_o)) + " samples in the train set."
    print "There are " + str(len(test_x)) + " samples in the test set."

    SVC_ll, SVC_a, RFC_ll, RFC_a, LR_ll, LR_a = [], [], [], [], [], []

    comps = [10, 20, 50, 100, 150, 200, 264]
    for s in comps:
        print("Amount of components: %d"%s)
        cl = lib.cl.CL(io, viz)
        X = copy.deepcopy(X_o)
        y = copy.deepcopy(y_o)
        
        test_x = np.matrix(test_x)
        test_ids = range(1, len(test_x)+1)    
        
        # Remove outliers
        X, y = cl.lof(np.matrix(X), np.matrix(y))
        
        # Shuffle
        X, y = io.shuffle(X, y)

        # PCA
        X = cl.pca(np.matrix(X), components=s, filename=None).tolist()
        # test_x = cl.pca(np.matrix(test_x), components=s, filename=None).tolist()

        val_ids, val_x, val_y = io.pick_set(X, y, 726)
        train_ids, train_x, train_y = io.pick_set(X, y, 3200)
    
        # Train
        cl.lr_cl_train(train_x, train_y, filename=lr_model_filename)

        # Validate
        ll, a = cl.lr_cl_val(val_x, val_y)
        LR_ll.append(ll)
        LR_a.append(a)

    # Draw some results
    viz.cross_results(comps, LR_ll, LR_a, 'pca_cross_val.png')
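
The cl.pca call above reduces the feature matrix to s principal components. A
minimal sketch of such a helper, assuming scikit-learn's PCA (the method name,
signature, and the optional plot hook are assumptions about lib.cl.CL, not its
actual code):

import numpy as np
from sklearn.decomposition import PCA

def pca(X, components=None, filename=None):
    # Fit PCA and project X onto the first `components` principal components.
    model = PCA(n_components=components)
    X_reduced = model.fit_transform(np.asarray(X))
    if filename is not None:
        # A scree plot of model.explained_variance_ratio_ could be saved here.
        pass
    return X_reduced
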
Example #2
def main(argv):

    input_filename_x = 'train_data.csv'
    input_filename_y = 'train_labels.csv'
    test_input_filename = 'test_data.csv'

    lr_model_filename = 'lr2_classif.pkl'
    model_comp_result_chart_filename = 'method_comp_res.png'

    io = lib.io.IO()
    viz = lib.viz.Viz()
    cl = lib.cl.CL(io, viz)

    # Read data
    X, y = io.read_data(input_filename_x, input_filename_y)
    test_x = io.read_data(test_input_filename, None)
    X_ = copy.deepcopy(X)
    y_ = copy.deepcopy(y)

    print "There are " + str(len(X)) + " samples in the train set."
    print "There are " + str(len(test_x)) + " samples in the test set."

    test_x = np.matrix(test_x)
    test_ids = range(1, len(test_x) + 1)

    # Remove outliers
    X, y = cl.lof(np.matrix(X), np.matrix(y))

    # Shuffle
    X, y = io.shuffle(X, y)

    # PCA
    #X = cl.pca(np.matrix(X), 'pca_explained_variance.png').tolist()
    #test_x = cl.pca(np.matrix(test_x), None).tolist()

    # Split data to train and validation set
    val_ids, val_x, val_y = io.pick_set(X, y, 726)
    train_ids, train_x, train_y = io.pick_set(X, y, 3200)

    # Train
    # cl.lr_cl_train(train_x, train_y, filename=lr_model_filename)
    cl.lr_cl_load(lr_model_filename)

    # Validate
    cl.lr_cl_val(val_x, val_y)

    # predict
    pred_class, pred_proba = cl.lr_cl_pred(test_x)

    # Output
    io.write_classes('classes_lr2_result.csv', test_ids, pred_class)
    io.write_probabilities('probabilities_lr2_result.csv', test_ids,
                           pred_proba)
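
The cl.lof step above drops outliers before training. A sketch of what it
might do, assuming scikit-learn's LocalOutlierFactor; the real lib.cl.CL
implementation may differ:

import numpy as np
from sklearn.neighbors import LocalOutlierFactor

def lof(X, y, n_neighbors=20):
    # fit_predict returns 1 for inliers and -1 for outliers;
    # keep only the inlier rows of X and y.
    X = np.asarray(X)
    y = np.asarray(y).ravel()
    mask = LocalOutlierFactor(n_neighbors=n_neighbors).fit_predict(X) == 1
    return X[mask], y[mask]
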
Example #3
def run(data_path, grid_search_path, ensemble_output_path, score_output_path,
        number_of_partitions, number_of_iterations, best_proportion,
        used_proportion):

    # Read partitioned input data
    data = read_partitioned_data(data_path, number_of_iterations,
                                 number_of_partitions)

    # Read true values from the partitioned data set
    true_values = get_true_values(data)

    # Read the grid search results as input data
    results = read_data(grid_search_path)

    # Construct the ensemble based on the results of the grid search and the
    # proportion parameters passed to this script
    ensemble = construct_ensemble(results, best_proportion, used_proportion)

    # Retrieve the classification results from the ensemble based on a
    # majority vote
    predicted_values = ensemble_vote(ensemble)

    # Score the classification results of the ensemble against the true values
    result = Result()
    result.add_values(true_values, predicted_values)
    result.calculate()

    # Output the ensemble into the specified file
    write_data(ensemble_output_path, ensemble)

    # Output the ensemble score into the specified file
    write_data(score_output_path, result)
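
ensemble_vote is not shown in this example; a plausible sketch, assuming each
ensemble member carries a list of per-sample predictions (the predictions
attribute is an assumption, not the script's actual data layout):

from collections import Counter

def ensemble_vote(ensemble):
    # For every sample, pick the class predicted most often across members.
    member_predictions = [member.predictions for member in ensemble]
    return [Counter(votes).most_common(1)[0][0]
            for votes in zip(*member_predictions)]
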
Example #4
def main(argv):

    input_filename_x = 'train_data.csv'
    input_filename_y = 'train_labels.csv'    

    test_input_filename = 'test_data.csv'
    
    io = lib.io.IO()
    viz = lib.viz.Viz()
    cl = lib.cl.CL(io, viz)
    
    # Read data
    X, y = io.read_data(input_filename_x, input_filename_y)
    test_x = io.read_data(test_input_filename, None)
    
    print "There are " + str(len(X)) + " samples in the train set."
    print "There are " + str(len(test_x)) + " samples in the test set."
    
    viz.label_hist(y, 'label_hist.png')
Example #5
def run(input_path, output_path):

    # Read the results as input data
    results = read_data(input_path)

    # Retrieve the best result
    best_result = sorted(results, key=lambda k: k.average_f1(),
                         reverse=True)[0]

    # Output the score into the specified file
    write_data(output_path, best_result)
Example #6
def run(input_path, output_path):

    # Read the integrated score results
    scores = read_data(input_path)
    print(scores)  # TODO: Remove

    # Visualise the scores as a heatmap
    pass  # TODO

    # Save the heatmap to the specified destination
    pass  # TODO
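
One possible way to fill in the two TODOs above, assuming scores is a nested
{time_window: {frequency_band: value}} dict as produced by the integration
script and that matplotlib is available; a sketch, not the author's
implementation:

import matplotlib.pyplot as plt

def plot_score_heatmap(scores, output_path):
    # Arrange the nested dict into a 2-D grid and render it as a heatmap.
    rows = sorted(scores.keys())
    cols = sorted(next(iter(scores.values())).keys())
    grid = [[scores[r][c] for c in cols] for r in rows]
    fig, ax = plt.subplots()
    im = ax.imshow(grid, aspect='auto')
    ax.set_xticks(range(len(cols)))
    ax.set_xticklabels(cols)
    ax.set_yticks(range(len(rows)))
    ax.set_yticklabels(rows)
    fig.colorbar(im, ax=ax)
    fig.savefig(output_path)
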
Example #7
def main(argv):

    input_filename_x = 'train_data.csv'
    input_filename_y = 'train_labels.csv'
    test_input_filename = 'test_data.csv'

    lr_model_filename = 'lr2_classif.pkl'
    model_comp_result_chart_filename = 'method_comp_res.png'

    io = lib.io.IO()
    viz = lib.viz.Viz()
    cl = lib.cl.CL(io, viz)

    # Read data
    X, y = io.read_data(input_filename_x, input_filename_y)
    test_x = io.read_data(test_input_filename, None)
    X_ = copy.deepcopy(X)
    y_ = copy.deepcopy(y)

    print "There are " + str(len(X)) + " samples in the train set."
    print "There are " + str(len(test_x)) + " samples in the test set."

    test_x = np.matrix(test_x)
    test_ids = range(1, len(test_x) + 1)

    cl.lr_cl_load(lr_model_filename)

    # predict
    pred_class, pred_proba = cl.lr_cl_pred(X)

    viz.plot_confusion_matrix(y, pred_class, np.arange(1, 11))
    viz.plot_confusion_matrix(y,
                              pred_class,
                              np.arange(1, 11),
                              normalize=True,
                              filename='confusion_matrix_norm.png')
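
A minimal sketch of what viz.plot_confusion_matrix might look like, assuming
scikit-learn's confusion_matrix and matplotlib; the actual lib.viz.Viz code
may differ:

import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

def plot_confusion_matrix(y_true, y_pred, labels, normalize=False,
                          filename='confusion_matrix.png'):
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    if normalize:
        # Normalise each row so it sums to one (per-class recall view).
        cm = cm.astype(float) / cm.sum(axis=1, keepdims=True)
    fig, ax = plt.subplots()
    im = ax.imshow(cm)
    fig.colorbar(im, ax=ax)
    ax.set_xlabel('Predicted label')
    ax.set_ylabel('True label')
    fig.savefig(filename)
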
Example #8
def run(input_path):

    # Read the data set
    data = read_data(input_path)

    # Print header
    print('--------------------------------------------------')
    print('Descriptive analysis')
    print('--------------------------------------------------')

    # Analyse keys
    keys = sorted(data.keys())
    print('There are ' + str(len(keys)) + ' keys in total:')
    print('  ' + str(keys))
    print('')

    # Analyse subjects
    print('There are ' + str(len(set(data['subjects']))) + ' unique ' +
          'subjects that the tests were performed on.')
    print('Their frequencies within the data set are the following:')
    output_frequencies(data['subjects'])
    print('')

    # Analyse Brodmann areas
    print('There are ' + str(len(set(data['areas']))) + ' ' +
          'unique Brodmann areas used in the tests.')
    print('Their frequencies within the data set are the following:')
    output_frequencies(data['areas'])
    print('')

    # Analyse image categories
    print('There are ' + str(len(set(data['image_category']))) + ' unique ' +
          'image categories that the images have been classified into.')
    print('Their frequencies within the data set are the following:')
    output_frequencies(data['image_category'])
    print('')

    # Analyse tests
    print('In total, there were ' + str(len(data['subjects'])) + ' tests ' +
          'performed on each of the ' + str(len(data['image_category'])) +
          ' images. For each of these test and image pairs, there is an ' +
          'integer denoting the neural response in the specified Brodmann ' +
          'area of the specified patient after showing them the specified ' +
          'image.')
    print('--------------------------------------------------')
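
The output_frequencies helper used above is not shown; a minimal sketch using
collections.Counter, assuming it simply prints each distinct value with its
count:

from collections import Counter

def output_frequencies(values):
    # Print every distinct value together with how often it occurs.
    for value, count in sorted(Counter(values).items()):
        print('  ' + str(value) + ': ' + str(count))
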
Example #9
def run(raw_input_path, output_path_recall, output_path_precision,
        output_path_f1, time_windows, frequency_bands):

    # Convert time windows to integers
    time_windows = [int(time_window) for time_window in time_windows]

    # Initialise the integrated score data dictionaries
    integrated_recall = {}
    integrated_precision = {}
    integrated_f1 = {}
    for time_window in time_windows:
        integrated_recall[time_window] = {}
        integrated_precision[time_window] = {}
        integrated_f1[time_window] = {}
        for frequency_band in frequency_bands:
            integrated_recall[time_window][frequency_band] = None
            integrated_precision[time_window][frequency_band] = None
            integrated_f1[time_window][frequency_band] = None

    # Read the recall, precision, and F1 scores from the input files into the
    # integrated data dictionaries

    # Iterate through each time window and frequency band pair
    for time_window in time_windows:
        for frequency_band in frequency_bands:

            # Construct the input file path
            input_path = raw_input_path.replace('TIMEWINDOW', str(time_window))\
                .replace('FREQUENCYBAND', frequency_band)

            # Read the input file
            input_data = read_data(input_path)

            # Add the recall, precision, and F1 scores from the data into
            # the integrated data dictionaries
            integrated_recall[time_window][frequency_band] = \
                input_data.average_recall()
            integrated_precision[time_window][frequency_band] = \
                input_data.average_precision()
            integrated_f1[time_window][frequency_band] = \
                input_data.average_f1()

    # Output the integrated scores into the specified files
    write_data(output_path_recall, integrated_recall)
    write_data(output_path_precision, integrated_precision)
    write_data(output_path_f1, integrated_f1)
Example #10
def main(argv):

    X, y = [], []

    io = lib.io.IO()
    
    # Read data
    X, y = io.read_data()    
    X, y = numpy.matrix(X), numpy.matrix(y).T
    print(X.shape)
    print(y.shape)

    sys.exit()

    # NOTE: everything below is unreachable because of the sys.exit() above.
    n, bins, patches = plt.hist(errors)

    plt.ylabel('Frequency')
    plt.xlabel('MSE')

    # Save figure
    plt.savefig('xx.png')    
Example #11
def run(input_path, output_path, classes):

    # Read the input data set from the specified input path
    input_data = read_data(input_path)

    # Change the list of classes to a set
    classes = set(classes)

    # Construct the output data set, filtering to only have the selected classes
    output_data = {
        'subjects': input_data['subjects'],
        'areas': input_data['areas'],
        'image_category': [],
        'neural_responses': []
    }
    for i in range(len(input_data['image_category'])):
        if input_data['image_category'][i] in classes:
            for field in ['image_category', 'neural_responses']:
                output_data[field].append(input_data[field][i])

    # Write the output data set to the specified output path
    write_data(output_path, output_data)
Example #12
def main(argv):

    input_filename_x = 'train_data.csv'
    input_filename_y = 'train_labels.csv'

    test_input_filename = 'test_data.csv'

    svc_model_filename = 'svc_classif.pkl'
    # lr_model_filename = 'lr_classif.pkl'
    lr_model_filename = 'classif.pkl'
    rfc_model_filename = 'rfc_classif.pkl'
    rfc_feat_imp_filename = 'rfc_feat_imp.png'
    model_comp_result_chart_filename = 'method_comp_res.png'

    io = lib.io.IO()
    viz = lib.viz.Viz()
    cl = lib.cl.CL(io, viz)

    # Read data
    X, y = io.read_data(input_filename_x, input_filename_y)
    test_x = io.read_data(test_input_filename, None)
    X_ = copy.deepcopy(X)
    y_ = copy.deepcopy(y)

    print "There are " + str(len(X)) + " samples in the train set."
    print "There are " + str(len(test_x)) + " samples in the test set."

    test_x = np.matrix(test_x)
    test_ids = range(1, len(test_x) + 1)

    # PCA etc.
    X = cl.pca(np.matrix(X), 'pca_explained_variance.png').tolist()
    # test_x = cl.pca(test_x, None).tolist()

    val_ids, val_x, val_y = io.pick_set(X, y, 1063)
    _, no_pca_val_x, no_pca_val_y = io.pick_set(X_, y_, 1063)
    #train_ids, train_x, train_y = io.pick_set(X, y, 4000)
    #_, no_pca_train_x, no_pca_train_y = io.pick_set(X_, y_, 4000)

    # Train
    # cl.svc_cl_train(train_x, train_y, filename=svc_model_filename)
    # cl.lr_cl_train(train_x, train_y, filename=lr_model_filename)
    #cl.rfc_cl_train(no_pca_train_x, no_pca_train_y,
    #                filename=rfc_model_filename,
    #                feat_imp_plot_filename=rfc_feat_imp_filename)
    cl.svc_cl_load(svc_model_filename)
    cl.lr_cl_load(lr_model_filename)
    cl.rfc_cl_load(rfc_model_filename)

    # validate
    results = {}
    results['SVC'] = cl.svc_cl_val(val_x, val_y)
    results['Logistic Regression'] = cl.lr_cl_val(no_pca_val_x, no_pca_val_y)
    results['Random Forest Classifier'] = cl.rfc_cl_val(no_pca_val_x,
                                                        no_pca_val_y)

    # Draw some results
    viz.model_comp_results(results, model_comp_result_chart_filename)

    # predict
    # pred_class, pred_proba = cl.svc_cl_pred(test_x)
    pred_class, pred_proba = cl.rfc_cl_pred(test_x)
    # pred_class, pred_proba = cl.lr_cl_pred(test_x)

    # Output
    io.write_classes('classes_sub_result.csv', test_ids, pred_class)
    io.write_probabilities('probabilities_sub_result.csv', test_ids,
                           pred_proba)
Example #13
def main(argv):

    input_filename_x = 'train_data.csv'
    input_filename_y = 'train_labels.csv'

    test_input_filename = 'test_data.csv'

    svc_model_filename = 'svc_classif.pkl'
    lr_model_filename = 'lr_classif.pkl'
    rfc_model_filename = 'rfc_classif.pkl'
    rfc_feat_imp_filename = 'rfc_feat_imp.png'
    model_comp_result_chart_filename = 'method_comp_res.png'

    io = lib.io.IO()
    viz = lib.viz.Viz()

    # Read data
    X_o, y_o = io.read_data(input_filename_x, input_filename_y)
    test_x = io.read_data(test_input_filename, None)

    print "There are " + str(len(X_o)) + " samples in the train set."
    print "There are " + str(len(test_x)) + " samples in the test set."

    SVC_ll, SVC_a, RFC_ll, RFC_a, LR_ll, LR_a = [], [], [], [], [], []

    for s in range(50):
        print("Iteration %d" % s)
        cl = lib.cl.CL(io, viz)
        X = copy.deepcopy(X_o)
        y = copy.deepcopy(y_o)
        X_ = copy.deepcopy(X_o)
        y_ = copy.deepcopy(y_o)

        test_x = np.matrix(test_x)
        test_ids = range(1, len(test_x) + 1)

        # Remove outliers
        X, y = cl.lof(np.matrix(X), np.matrix(y))
        X_, y_ = cl.lof(np.matrix(X_), np.matrix(y_))

        # Shuffle
        X, y = io.shuffle(X, y)
        X_, y_ = io.shuffle(X_, y_)

        # PCA
        X = cl.pca(np.matrix(X), 'pca_explained_variance.png').tolist()
        test_x = cl.pca(np.matrix(test_x), None).tolist()

        val_ids, val_x, val_y = io.pick_set(X, y, 726)
        _, no_pca_val_x, no_pca_val_y = io.pick_set(X_, y_, 726)
        train_ids, train_x, train_y = io.pick_set(X, y, 3200)
        _, no_pca_train_x, no_pca_train_y = io.pick_set(X_, y_, 3200)

        # Train
        cl.svc_cl_train(train_x, train_y, filename=svc_model_filename)
        cl.lr_cl_train(train_x, train_y, filename=lr_model_filename)
        cl.rfc_cl_train(no_pca_train_x,
                        no_pca_train_y,
                        filename=rfc_model_filename,
                        feat_imp_plot_filename=rfc_feat_imp_filename)

        # validate
        ll, a = cl.svc_cl_val(val_x, val_y)
        SVC_ll.append(ll)
        SVC_a.append(a)

        ll, a = cl.lr_cl_val(val_x, val_y)
        LR_ll.append(ll)
        LR_a.append(a)

        ll, a = cl.rfc_cl_val(no_pca_val_x, no_pca_val_y)
        RFC_ll.append(ll)
        RFC_a.append(a)

    # Draw some results
    results = {
        'SVC': (sum(SVC_ll) / len(SVC_ll), sum(SVC_a) / len(SVC_a)),
        'Logistic Regression': (sum(LR_ll) / len(LR_ll),
                                sum(LR_a) / len(LR_a)),
        'Random Forest Classifier': (sum(RFC_ll) / len(RFC_ll),
                                     sum(RFC_a) / len(RFC_a))
    }
    viz.model_comp_results(results, model_comp_result_chart_filename)

    # predict
    pred_class, pred_proba = cl.lr_cl_pred(test_x)

    # Output
    io.write_classes('classes_sub_result.csv', test_ids, pred_class)
    io.write_probabilities('probabilities_sub_result.csv', test_ids,
                           pred_proba)
Example #14
    options, args = parser.parse_args()

    input_filename_x = 'train_data.csv'
    input_filename_y = 'train_labels.csv'
    test_input_filename = 'test_data.csv'

    model_filename = 'model.yaml'
    weights_filename = 'weights.h5'

    io = lib.io.IO()
    viz = lib.viz.Viz()
    cl = lib.cl.CL(io, viz)

    # Read data
    print "Reading train data..."
    X, y = io.read_data(input_filename_x, input_filename_y)
    y = io.shift_v(y, shift=-1)

    print "Reading test data..."
    test_x = io.read_data(test_input_filename, None)

    print "There are " + str(len(X)) + " samples in the train set."
    print "There are " + str(len(test_x)) + " samples in the test set."

    test_x = np.matrix(test_x)
    test_ids = range(1, len(test_x) + 1)

    # load from file
    if options.load_path is not None:
        model = load_model(options.load_path + '/' + model_filename,
                           options.load_path + '/' + weights_filename)
Example #15
def main(argv):

    input_filename_x = 'train_data.csv'
    input_filename_y = 'train_labels.csv'    

    test_input_filename = 'test_data.csv'

    svc_model_filename = 'svc_classif.pkl'
    lr_model_filename = 'lr_classif.pkl'
    rfc_model_filename = 'rfc_classif.pkl'
    rfc_feat_imp_filename = 'rfc_feat_imp.png'
    model_comp_result_chart_filename = 'method_comp_res.png'
    
    nn_model_filename = 'kf-nn1.pkl'
    
    io = lib.io.IO()
    viz = lib.viz.Viz()    
    nn = lib.nn.NN(io, viz)
    
    # Read data
    print "Reading train data..."
    X, y = io.read_data(input_filename_x, input_filename_y)
    y = io.shift_v(y, shift=-1)

    print "Reading test data..."
    test_x = io.read_data(test_input_filename, None)
    
    print "There are " + str(len(X)) + " samples in the train set."
    print "There are " + str(len(test_x)) + " samples in the test set."
    
    test_x = np.matrix(test_x)
    test_ids = range(1, len(test_x)+1)    
    
    # Split data to train and validation set
    # mini_batches
    num_of_batches = 5
    ids, batches_x, batches_y = io.split_data(X, y, num_of_batches)
    
    #val_ids, val_x, val_y = io.pick_set(X, y, 563)
    #train_ids, train_x, train_y = io.pick_set(X, y, 3800)
       
    # Train
    training_errors = []
    validation_errors = []
    model_sizes = [3, 9, 27, 81, 243]
    
    for nn2 in model_sizes:
        nn1 = nn2*2
        
        avg_error_validation = 0
        avg_error_train = 0
        for batch_num in range(num_of_batches):
            nn.initialize(batches_x[0].shape[1], nn1=nn1, nn2=nn2)
            val_x = batches_x[batch_num]
            val_y = batches_y[batch_num]            

            # Train on every batch except the held-out validation batch
            for train_batch_num in range(num_of_batches):
                if train_batch_num == batch_num:
                    continue
                nn.train(batches_x[train_batch_num],
                         batches_y[train_batch_num],
                         val_x, val_y,
                         training_steps=1000, plot_prefix='k-fold-')

            # Calculate the average training error with the optimal weights,
            # keeping a running (incremental) mean over the training batches
            train_error = 0
            batches_seen = 0
            for train_batch_num in range(num_of_batches):
                if train_batch_num == batch_num:
                    continue
                c = nn.get_cost(batches_x[train_batch_num],
                                batches_y[train_batch_num])
                batches_seen += 1
                train_error += (c - train_error) / batches_seen

            avg_error_train += ((train_error - avg_error_train)
                                / (batch_num + 1))

            # Validate
            error_validation = nn.get_cost(val_x, val_y)
            avg_error_validation += ((error_validation - avg_error_validation)
                                     / (batch_num + 1))

            # Output
            print('Batch ' + str(batch_num) + ' validation error: ' +
                  str(error_validation))
            print('AVG validation error after validation batch ' +
                  str(batch_num) + ': ' + str(avg_error_validation))
            print(' ')
            print('-----')

        validation_errors.append(avg_error_validation)
        training_errors.append(avg_error_train)
        
    nn.save_nn(nn_model_filename)
            
    # Draw some results
    # viz.model_comp_results(results, model_comp_result_chart_filename)    
    viz.model_size_comp(model_sizes, validation_errors, training_errors,
                        'nn1_model_size_comp.png')
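
The avg += (x - avg) / (n + 1) updates in the k-fold loop above are the
standard incremental mean; a standalone check that it matches the batch mean:

values = [0.7, 0.4, 0.9, 0.5]
avg = 0.0
for n, x in enumerate(values):
    avg += (x - avg) / (n + 1)
assert abs(avg - sum(values) / len(values)) < 1e-12
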
Example #16
def run(input_path, output_path, cv_amount, use_even_distribution):

    # Read the data set
    data = read_data(input_path)

    # Find the number of images in the data set
    number_of_images = len(data['image_category'])

    # Find all image classes in the data set
    classes = sorted(set(data['image_category']))

    # Initialise the list of partitioned indices
    partitioned_indices = [[] for i in range(cv_amount)]

    # If even distribution is set to be used, partition data within each class
    # separately and merge the resulting partitions into the partitioned
    # indices list, so the image class distribution in each partition would be
    # roughly the same.
    if use_even_distribution:

        # Construct a list of image indices corresponding to each image class
        indices = {}
        for image_class in classes:
            indices[image_class] = []
        for i in range(number_of_images):
            indices[data['image_category'][i]].append(i)

        # Randomly split each of these lists into k nearly equal parts, and
        # merge them by partitions
        for image_class in classes:

            # Partition the indices list for the current image class into k
            # nearly equal parts
            partitions_list = partition_list(indices[image_class], cv_amount)

            # Shuffle the partition list to ensure that cumulative partitions
            # after merging by partitions are roughly of equal size
            shuffle(partitions_list)

            # Merge the partitioned indices list for the current image class
            # into the general partitioned indices list by partitions
            for i in range(cv_amount):
                partitioned_indices[i] += partitions_list[i]

    # If even distribution is not set to be used, partition data randomly.
    else:

        # Partition the indices list into k nearly equal parts
        partitioned_indices = partition_list(list(range(number_of_images)),
                                             cv_amount)

    # Sort all of the partitions
    for partition in partitioned_indices:
        partition.sort()

    # Partition data
    partitions = []
    for i in range(cv_amount):
        partitions.append({
            'subjects': data['subjects'],
            'areas': data['areas'],
            'image_category': [data['image_category'][j]
                               for j in partitioned_indices[i]],
            'neural_responses': [data['neural_responses'][j]
                                 for j in partitioned_indices[i]]
        })

    # Save partitioned data
    for i in range(cv_amount):
        write_data(add_suffix_to_path(output_path, '-', i + 1), partitions[i])
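
partition_list is assumed here to shuffle a list of indices and split it into
k nearly equal parts; a sketch under that assumption (the real helper may
differ in detail):

from random import shuffle

def partition_list(items, k):
    # Shuffle a copy of the items, then slice it into k nearly equal parts,
    # handing the remainder out one element at a time to the first parts.
    items = list(items)
    shuffle(items)
    base, extra = divmod(len(items), k)
    partitions, start = [], 0
    for i in range(k):
        size = base + (1 if i < extra else 0)
        partitions.append(items[start:start + size])
        start += size
    return partitions
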
Example #17
def main(argv):

    input_filename_x = 'train_data.csv'
    input_filename_y = 'train_labels.csv'

    test_input_filename = 'test_data.csv'

    svc_model_filename = 'svc_classif.pkl'
    lr_model_filename = 'lr_classif.pkl'
    rfc_model_filename = 'rfc_classif.pkl'
    rfc_feat_imp_filename = 'rfc_feat_imp.png'
    model_comp_result_chart_filename = 'method_comp_res.png'

    nn_model_filename = 'nn1.pkl'

    io = lib.io.IO()
    viz = lib.viz.Viz()
    nn = lib.nn.NN(io, viz)
    cl = lib.cl.CL(io, viz)

    # Read data
    print "Reading train data..."
    X, y = io.read_data(input_filename_x, input_filename_y)
    y = io.shift_v(y, shift=-1)

    print "Reading test data..."
    test_x = io.read_data(test_input_filename, None)

    print "There are " + str(len(X)) + " samples in the train set."
    print "There are " + str(len(test_x)) + " samples in the test set."

    test_x = np.matrix(test_x)
    test_ids = range(1, len(test_x) + 1)

    # PCA etc.
    X = cl.pca(np.matrix(X), 'pca_explained_variance.png').tolist()
    test_x = cl.pca(test_x, None).tolist()

    # Split data to train and validation set
    # mini_batches
    #ids, batches_x, batches_y = io.split_data(X, y, 100, 100)

    val_ids, val_x, val_y = io.pick_set(X, y, 563)
    train_ids, train_x, train_y = io.pick_set(X, y, 3800)

    nn.initialize(train_x.shape[1], nn1=18, nn2=9,
                  alpha=0.01)  #, filename=nn_model_filename)

    # Train
    pred, proba, acc = nn.predict(train_x, train_y)
    print("Train set classification accuray before training: %.4f" % acc)

    nn.train(train_x, train_y, val_x, val_y, training_steps=100000)
    nn.save_nn(nn_model_filename)

    # validate
    pred, proba, acc = nn.predict(train_x, train_y)
    print("Train set classification accuracy after training: %.4f" % acc)
    pred, proba, acc = nn.predict(val_x, val_y)
    print("Validation set classification accuracy after training: %.4f" % acc)

    # Draw some results
    # viz.model_comp_results(results, model_comp_result_chart_filename)

    # predict
    pred_class, pred_proba, _ = nn.predict(test_x)
    pred_class = io.shift_v(pred_class, shift=1)

    # Output
    io.write_classes('nn_classes_sub_result.csv', test_ids, pred_class)
    io.write_probabilities('nn_probabilities_sub_result.csv', test_ids,
                           pred_proba)