# Start timer: capture the current time.time() reading at the top of the run.
# Presumably subtracted from a later reading to report elapsed time -- the
# consumer is not visible in this part of the file.
start_time = time.time()

# Load the data
from income_data import X, y, X_train,  X_test, y_train, y_test

# Standardize the full feature matrix: the scaler is fitted on X and the
# scaled copy of X is what gets clustered below.
# NOTE(review): X_train_std and X_test_std are both derived from X here, not
# from X_train / X_test -- confirm this is intentional for the clustering run.
scaler = StandardScaler().fit(X)
X_train_std = scaler.transform(X)
X_test_std = scaler.transform(X)
X_toCluster, y_inputs = X_train_std, y

# Reduce dimensionality: project the scaled features onto 34 components
# (PCA, per the original note) and keep the projection as the clustering input.
projection = ProjectionAlgorithm(n_components=34)
X_toCluster = projection.fit_transform(X_toCluster)

######
# Run k-means for a range of cluster counts and record, for each fit, the
# k-means score and the mean silhouette coefficient.
# The loop index i covers range(minClusters, maxClusters) and every model is
# fitted with n_clusters = i + 1, so the cluster counts actually evaluated are
# minClusters + 1 .. maxClusters (2 .. 100 with the values below).
######
scores = []
silhouette_avg = []
BIC = []  # declared here but not filled by this loop
maxClusters = 100
minClusters = 1
for i in range(minClusters, maxClusters):
    kmeans = KMeans(n_clusters=i + 1, random_state=0)
    cluster_labels = kmeans.fit_predict(X_toCluster)
    scores.append(kmeans.score(X_toCluster))
    # Score the silhouette on X_toCluster, the data the labels were fitted on.
    # Scoring against the raw X would measure distances in a different feature
    # space than the one k-means partitioned.
    silhouette_avg.append(silhouette_score(X_toCluster, cluster_labels))
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from plot_learning_curve import drawLearningCurve

# Scale the data.
# Fit the scaler on the training split only and apply that same transform to
# both splits. Fitting on the full X would let test-set statistics leak into
# the features the classifier is trained on, inflating the test accuracy.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_std = scaler.transform(X_train)
X_test_std = scaler.transform(X_test)
X_toTransform = X_train_std

# Define the classifier: a two-step pipeline (projection, then a linear-kernel
# SVM) wrapped in a 3-fold grid search over the number of projected components.
N_FEATURES_OPTIONS = range(2, 46)  # candidate n_components values: 2 .. 45

# NOTE(review): gamma is passed together with kernel='linear'; if SVC is
# scikit-learn's, gamma is not used by the linear kernel -- confirm.
svm = SVC(kernel='linear', C=10, gamma=0.1, random_state=1)
pipe = Pipeline(steps=[
    ('reduce_dim', ProjectionAlgorithm()),
    ('classify', svm),
])

# The 'reduce_dim__' prefix routes the parameter to the pipeline step of that name.
parameters = dict(reduce_dim__n_components=N_FEATURES_OPTIONS)
clf = GridSearchCV(estimator=pipe, param_grid=parameters, cv=3)

# Run the classifier: fit the grid search on the scaled training data.
clf.fit(X_train_std, y_train)

# Identify training and test accuracy.
# Each split is predicted exactly once; the test-set predictions are reused
# for both the misclassification count and y_pred_test rather than calling
# clf.predict(X_test_std) a second time on identical input.
y_pred = clf.predict(X_test_std)
print('Misclassified samples: %d' % (y_test != y_pred).sum())
y_pred_train = clf.predict(X_train_std)
y_pred_test = y_pred
train_accuracy = accuracy_score(y_train, y_pred_train)