예제 #1
0
    # Normalize the test features with mean_X / std_X, which are defined
    # outside the visible lines.
    # NOTE(review): presumably these statistics come from the train set -- confirm.
    music_test.X = z_norm_by_feature(music_test.X, mean_X, std_X)

    # Balancing train data (currently disabled).
    # print "Balacing train data."
    # music_train.balance_data_oversampling_smote_regular()

    # Set train parameters.
    # lambdav = 0.00001
    lambdav = 0
    # alpha = 0.0000001
    # iterations = 1000000
    # NOTE(review): alpha and iterations are only referenced by the
    # commented-out gradient descent calls below.
    alpha = 0.1
    iterations = 1200

    # print "Solving normal equation."
    theta = solve_normal_equation(music_train.X, music_train.y, lambdav)

    # NOTE(review): this message is printed although both gradient descent
    # calls are commented out; theta still comes from solve_normal_equation.
    print "Solving using gradient descent."
    # theta = gradient_descent(music_train.X, music_train.y, None, alpha, lambdav, iterations)
    #theta, J_history = gradient_descent_with_J_history(music_train.X, music_train.y, None, alpha, lambdav, iterations)
    #plot_history(J_history)

    # Print the cost of theta on the train, validation and test sets, in
    # that order.
    print "Computing cost."
    print compute_cost(music_train.X, music_train.y, theta, lambdav)
    print compute_cost(music_validation.X, music_validation.y, theta, lambdav)
    print compute_cost(music_test.X, music_test.y, theta, lambdav)

    # NOTE(review): delta_year is overwritten with 5 inside the inner loop
    # before it reaches compute_accuracy, so the range(10) value is only
    # printed and every outer iteration repeats the same work.
    for delta_year in range(10):
        print delta_year

        print "Computing train accuracy."
        for year2 in range(2000, 2010):
            # NOTE(review): year1 is not assigned anywhere in the visible
            # lines; confirm it is defined earlier, otherwise this raises
            # NameError.
            print year1, year2
            delta_year = 5
            # Boolean masks splitting the train labels at year1, flattened
            # to 1-D so they can be used as row selectors.
            less_year = music_train.y <= year1
            less_year.shape = (len(music_train.y))
            greater_year = music_train.y > year1
            greater_year.shape = (len(music_train.y))

            # Copy of the train labels where entries <= year1 are replaced
            # by year1 and the remaining entries by year2, reshaped to a
            # column vector.
            music_train_y_year_yes_or_not = np.array(music_train.y)
            music_train_y_year_yes_or_not[less_year] = year1
            music_train_y_year_yes_or_not[greater_year] = year2
            music_train_y_year_yes_or_not.shape = (
                len(music_train_y_year_yes_or_not), 1)

            # < year or > year classifier.
            # NOTE(review): theta_year_yes_or_not is not used in the
            # visible lines.
            theta_year_yes_or_not = solve_normal_equation(
                music_train.X, music_train_y_year_yes_or_not, 0)

            # < year classifier.
            # Fit on the rows with label <= year1 and print the accuracy
            # measured on those same rows.
            y = np.array(music_train.y[less_year])
            y.shape = (len(y), 1)
            X = music_train.X[np.where(less_year)]
            theta_year_less = solve_normal_equation(X, y, 0)
            print compute_accuracy(X, y, theta_year_less, delta_year)

            # > year classifier.
            # Same procedure for the rows with label > year1. The fit
            # indexes music_train.X with the mask directly while X uses
            # np.where on the same mask.
            y = np.array(music_train.y[greater_year])
            y.shape = (len(y), 1)
            X = music_train.X[np.where(greater_year)]
            theta_year_more = solve_normal_equation(
                music_train.X[greater_year], y, 0)
            print compute_accuracy(X, y, theta_year_more, delta_year)
 # Balance the train set; only the balance-cascade call is active, the
 # other balancing calls are left commented out.
 #music_train.balance_data_undersampling_cluster_centroids()
 #music_train.balance_data_undersampling_tomek_links()
 music_train.balance_data_ensemblesampling_balance_cascade()
 #music_train.balance_data_ensemblesampling_balance_cascade()
 # NOTE(review): before_balacing_size is assigned outside the visible lines.
 after_balacing_size = len(music_train.X)
 print "Before balacing size: " + str(before_balacing_size)
 print "After balacing size: " + str(after_balacing_size)
 
 # Set train parameters.
 lambdav = 0.0000000001
 # n is the length of the first row of X, i.e. the number of columns.
 n = len(music_train.X[0])
 
 print "Solving normal equation."
 
 # Get thetas to reduce data.
 # argsort orders the indices by ascending |theta|; reversing puts the
 # largest-magnitude coefficients first.
 theta = solve_normal_equation(music_train.X, music_train.y, lambdav)
 ordered_theta = np.argsort(np.abs(theta).reshape(len(theta)))
 ordered_theta = ordered_theta[::-1]
 
 # Initialize costs.
 J_history_train = np.zeros(n)
 J_history_validation = np.zeros(n)
 
 # Each iteration keeps the first (n - iteration) indices of ordered_theta,
 # i.e. drops one more of the smallest-|theta| columns, refits on the train
 # set and records the cost on the train and validation sets.
 # The fit uses lambdav while compute_cost receives 0 as its last argument
 # (presumably the regularization strength -- confirm).
 for iteration in range(n):
     theta = solve_normal_equation(music_train.X[:, ordered_theta[:(n - iteration)]], music_train.y, lambdav)
     J_history_train[iteration] = compute_cost(music_train.X[:, ordered_theta[:(n - iteration)]], music_train.y, theta, 0)
     J_history_validation[iteration] = compute_cost(music_validation.X[:, ordered_theta[:(n - iteration)]], music_validation.y, theta, 0)
     
     print "Theta size: " + str(n - iteration)
     print "J_train: %f" % J_history_train[iteration]
     print "J_validation: %f" % J_history_validation[iteration]