    input2 = pickle.load(open("msd_train_t2.pkl", "rb"))
    input3 = pickle.load(open("msd_train_t3.pkl", "rb"))
    input4 = pickle.load(open("msd_train_t4.pkl", "rb"))
    input5 = pickle.load(open("msd_train_t5.pkl", "rb"))
    # print(input1.shape[0])
    # input = pickle.load(open("msd_train.pkl", "rb"))

    maxval1 = crop_rock.find_second_max_value(input1)
    maxval2 = crop_rock.find_second_max_value(input2)
    maxval3 = crop_rock.find_second_max_value(input3)
    maxval4 = crop_rock.find_second_max_value(input4)
    maxval5 = crop_rock.find_second_max_value(input5)
    # print(maxval1)
    # maxval = crop_rock.find_second_max_value(input)
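    # Note: judging by the names, find_second_max_value presumably returns the
    # size of the second-largest genre class, so the dominant class (Rock) can
    # be cropped down to that count by drop_excess_rows below.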

    filtered1 = crop_rock.drop_excess_rows(input1, maxval1)
    filtered2 = crop_rock.drop_excess_rows(input2, maxval2)
    filtered3 = crop_rock.drop_excess_rows(input3, maxval3)
    filtered4 = crop_rock.drop_excess_rows(input4, maxval4)
    filtered5 = crop_rock.drop_excess_rows(input5, maxval5)
    # print(filtered1.shape[0])
    # filtered = crop_rock.drop_excess_rows(input, maxval)

    # Handle missing data: drop uncategorized tracks, then rows with NaNs
    filtered1 = filtered1[filtered1['Genre'] != 'UNCAT'].dropna()
    filtered2 = filtered2[filtered2['Genre'] != 'UNCAT'].dropna()
    filtered3 = filtered3[filtered3['Genre'] != 'UNCAT'].dropna()
    filtered4 = filtered4[filtered4['Genre'] != 'UNCAT'].dropna()
    filtered5 = filtered5[filtered5['Genre'] != 'UNCAT'].dropna()
    # print(filtered1)
    # filtered = filtered[filtered['Genre'] != 'UNCAT'].dropna()
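    # A more compact alternative (sketch; assumes all five fold pickles share
    # the same schema, so each can go through the same load/crop/clean steps):
    # filtered_folds = []
    # for i in range(1, 6):
    #     fold = pickle.load(open("msd_train_t%d.pkl" % i, "rb"))
    #     fold = crop_rock.drop_excess_rows(fold, crop_rock.find_second_max_value(fold))
    #     filtered_folds.append(fold[fold['Genre'] != 'UNCAT'].dropna())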
# Get rid of the rows that have missing values (NaN) or an UNCAT genre
df_full = df_full[df_full["Genre"] != "UNCAT"]
df_full = df_full.dropna()
y_full = df_full["Genre"]
X_full = df_full.drop(["Genre", "Track ID", "Year"], axis=1)
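# Optional sanity check: the genre distribution before balancing; Rock is
# expected to dominate here, which is what the crop step below compensates for.
# print(y_full.value_counts())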

# Split the 80% working set into 70% training and 30% validation data
# (train_test_split lives in sklearn.model_selection since scikit-learn 0.18;
# the old sklearn.cross_validation module was removed in 0.20)
from sklearn.model_selection import train_test_split
X_train, X_validation, y_train, y_validation = \
                            train_test_split(X_full, y_full, train_size=0.7, random_state=42)
print("DEBUG: data split")
df_train_toCrop = pd.concat([y_train, X_train], axis=1, join='inner')

# Crop the dataset
maxval = crop_rock.find_second_max_value(df_train_toCrop)
df_cropped = crop_rock.drop_excess_rows(df_train_toCrop, maxval)
y_cropped = df_cropped["Genre"]
X_cropped = df_cropped.drop(["Genre"], axis=1)
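# Rough sketch of what the crop step presumably amounts to (a guess from the
# names; the authoritative versions live in the crop_rock module):
#   find_second_max_value(df)  ~  df["Genre"].value_counts().iloc[1]
#   drop_excess_rows(df, n)    ~  cap every genre at n rows, i.e. trim the
#                                 dominant Rock class down to n
# Quick balance check after cropping:
# print(y_cropped.value_counts())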

# # Start LDA Classification
# print("Performing LDA Classification:")
# # (sklearn.lda.LDA was replaced by
# # sklearn.discriminant_analysis.LinearDiscriminantAnalysis in scikit-learn 0.17)
# from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
# clf = LinearDiscriminantAnalysis(solver='svd', shrinkage=None, n_components=None).fit(X_cropped, np.ravel(y_cropped))
#
# # Use the cropped (balanced) set to fit, then evaluate on the full training set
# y_train_predicted = clf.predict(X_train)
# print("Error rate for LDA on Training: ", ml_aux.get_error_rate(y_train, y_train_predicted))
# # ml_aux.plot_confusion_matrix(y_cropped, predicted, "CM on LDA cropped")
# # plt.show()
#
# y_validation_predicted = clf.predict(X_validation)
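# print("Error rate for LDA on Validation: ", ml_aux.get_error_rate(y_validation, y_validation_predicted))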