def SVM_Ranking_Model_Extraction_And_Encoding():
    """Train an RBF-SVM on the pre-built feature table and return the
    Top-N recommender produced by Accumulation().

    Reads "FeatureToTrainWithoutTester.csv" and relies on the project
    helpers transform_features() and Accumulation() defined elsewhere.
    Prints accuracy, confusion matrix and classification report for a
    held-out 40% test split.
    """
    # Pandas readin Training Samples
    Training_Table_Raw = pd.read_csv("FeatureToTrainWithoutTester.csv")
    Training_Table_Raw = Training_Table_Raw.drop(
        ['Unnamed: 0', 'Unnamed: 0.1', 'Dataset Start Time',
         'Dataset End Time', 'executionStartTime', 'Dataset Group',
         'Users Group'], axis=1)
    Training_Table = Training_Table_Raw.copy()
    # Feature Encoding (project helper; returns the encoded table)
    Training_Table = transform_features(Training_Table)
    # Training/Testing DataSet Split
    Train_Test_Split = Training_Table.copy()
    X, y = Train_Test_Split.drop('userName', axis=1), Train_Test_Split['userName']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)
    # SVM configuration: grid over gamma and C around the pipeline defaults
    parameters = {'clf__gamma': (0.01, 0.02, 0.1, 0.3, 1),
                  'clf__C': (0.1, 0.3, 1, 3, 10, 30)}
    pipeline = Pipeline([('clf', SVC(kernel='rbf', gamma=0.01, C=100,
                                     max_iter=100, probability=True))])
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=2, verbose=1,
                               scoring='accuracy')
    Grid_Fit = grid_search.fit(X_train, y_train)
    predictions = grid_search.predict(X_test)
    Top_N_Recommender = Accumulation(Training_Table_Raw, Training_Table, Grid_Fit)
    # Prediction Results
    # FIX: converted Python-2 print statements to print() calls, matching
    # the print() style already used elsewhere in this file.
    print('Accuracy:', accuracy_score(y_test, predictions))
    print('Confusion Matrix:', confusion_matrix(y_test, predictions))
    print('Classification Report:', classification_report(y_test, predictions))
    return Top_N_Recommender
def main():
    """End-to-end pre-processing and TF-IDF + RBF-SVM training pipeline.

    Joins the username table with the service log, builds the feature CSVs,
    then grid-searches an SVC over TF-IDF features and reports accuracy,
    confusion matrix and classification report. Relies on module-level
    helpers datasetgrouping and usergrouping.
    """
    # Data Pre-Processing: Join the username table and service log table
    df1 = pd.read_csv("NewForm1.csv")
    df2 = pd.read_csv("serviceExecutionLog_dataset2.csv")
    df3 = pd.merge(df1, df2, on=['userName', 'executionStartTime'], how='left')
    # Uppercase transformation
    # NOTE(review): str.upper raises on NaN introduced by the left join --
    # presumably 'model' always matches; confirm upstream.
    df3['model'] = df3['model'].map(str.upper)
    # Write out to csv file
    df3.to_csv("NewForm1WithExecutionTime.csv")
    # Data Pre-Processing: Join the Climate Dataset table to feature to train
    df4 = pd.read_csv("/Users/dennis/Documents/SVM-Tasks/Climate_Datasets.csv")
    # Encoding: Grouping
    df4['Dataset Group'] = df4['Dataset Group'].map(datasetgrouping)
    # Duplicate & Fillna
    df4['userName'] = df4['userName'].fillna('Unknown')
    df4['Users Group'] = df4['userName']
    df4['Users Group'] = df4['Users Group'].map(usergrouping)
    # Write out to FeaturesForTrain.csv
    df4.to_csv("FeaturesForTrain.csv")
    # Training/Testing Data and split Preparation
    # FIX: as_matrix() was removed from pandas; .values is the equivalent.
    X, y = df4.astype(str).map(str.strip), df4['userName'].values
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)
    # Pipeline building
    # BUG FIX: SVC's kernel must be a single string -- the original
    # kernel=['rbf', 'linear'] raises at fit time. probability=True is
    # required for the predict_proba call below.
    pipeline = Pipeline([
        ('vect', TfidfVectorizer(stop_words='english', lowercase=False)),
        ('clf', SVC(kernel='rbf', gamma=0.01, C=100, max_iter=100,
                    probability=True)),
    ])
    # Check the training data shape
    print(X_train.shape)
    # parameters setting
    parameters = {'clf__gamma': (0.01, 0.02, 0.1, 0.3, 1),
                  'clf__C': (0.1, 0.3, 1, 3, 10, 30)}
    # training with grid_search: parameters fillin
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=3, verbose=1,
                               scoring='accuracy')
    # training with grid_search with X_train data
    grid_search.fit(X_train, y_train)
    # BUG FIX: the original re-created GridSearchCV here, discarding the
    # fitted model, so the predictions below raised NotFittedError.
    # Predictions
    predictions = grid_search.predict(X_test)
    predictions_probability = grid_search.predict_proba(X_test)
    # Prediction Results
    print('Accuracy:', accuracy_score(y_test, predictions))
    print('Confusion Matrix:', confusion_matrix(y_test, predictions))
    print('Classification Report:', classification_report(y_test, predictions))
def classification(FV_N):
    """PCA dimension reduction & Random Forest classification.

    Grid-searches the number of PCA components and forest size on the
    feature matrix FV_N, plots the chosen hyper-parameters and cumulative
    explained variance, then reports cross-validated accuracy and a
    confusion-matrix heatmap. Relies on module-level Data_FRAMES,
    projectpath and Get_true_y().
    """
    pca = decomposition.PCA()
    RFC = RandomForestClassifier()
    estimators = [('reduce_dim', pca), ('Random_Forest', RFC)]
    pipe = Pipeline(estimators)
    # Hyper-parameter candidates for the grid search.
    # BUG FIX: the original reset cc to [] right after filling it, leaving
    # reduce_dim__n_components empty so GridSearchCV raised at fit time.
    cc = [70, 80, 90]
    nb_tree = [200, 200, 200]
    random_st = [0, 0, 0]
    params = dict(reduce_dim__n_components=cc,
                  Random_Forest__n_estimators=nb_tree,
                  Random_Forest__random_state=random_st)
    grid_search = GridSearchCV(pipe, param_grid=params)
    X = FV_N
    yr = Get_true_y(Data_FRAMES)
    # Align samples and labels to the same length, then persist the labels
    # once (the original saved/reloaded yr twice with no effect).
    X = X[:yr.shape[0]]
    yr = yr[:X.shape[0]]
    filename_yr = projectpath + 'io/Output/yr.npy'
    np.save(filename_yr, yr)
    grid_search.fit(X, yr)
    print(grid_search.best_estimator_)
    # Visualize the chosen number of PCA components.
    plt.figure()
    plt.axvline(
        grid_search.best_estimator_.named_steps['reduce_dim'].n_components,
        linestyle=':', label='n_components chosen')
    plt.legend(prop=dict(size=12))
    plt.show()
    # Visualize the chosen number of trees.
    plt.figure()
    plt.axvline(
        grid_search.best_estimator_.named_steps['Random_Forest'].n_estimators,
        linestyle=':', label='n_estimators chosen')
    plt.legend(prop=dict(size=12))
    plt.show()
    n_est_rdf = grid_search.best_estimator_.named_steps[
        'Random_Forest'].n_estimators
    n_compo_pca = grid_search.best_estimator_.named_steps[
        'reduce_dim'].n_components
    # Refit PCA with the selected component count and plot the cumulative
    # explained variance curve.
    pca = decomposition.PCA(n_components=n_compo_pca, svd_solver='auto')
    pca.fit(X)
    variance_Ratio = pca.explained_variance_ratio_
    plt.figure(1, figsize=(4, 3))
    plt.clf()
    plt.axes([.2, .2, .7, .7])
    plt.plot(pca.explained_variance_ratio_.cumsum(), linewidth=1)
    plt.axis('tight')
    plt.xlabel('n_components')
    plt.ylabel('Cumulative Explained variance')
    # Project the data and scatter-plot the first two components per class
    # (labels 1 and 2 are plotted as 'Astrocytes' and 'Neurons').
    M = pca.transform(X)
    plt.figure()
    plt.plot(M[yr == 1, 0], M[yr == 1, 1], 'or')
    plt.title('Astrocytes')
    plt.figure()
    plt.plot(M[yr == 2, 0], M[yr == 2, 1], 'ob')
    plt.title('Neurons')
    # Cross-validated accuracy with the selected forest size.
    # FIX: removed the original bare no-op expressions
    # (grid_search.predict(X), bare accuracy_score calls, X.shape, the
    # unused Data_FRAMES.loc lookup and the unused 'aa' list); the CV
    # accuracy is now actually printed instead of being discarded.
    RFC = RandomForestClassifier(n_estimators=n_est_rdf, random_state=0)
    predictedVAL = cross_val_predict(RFC, X, yr, n_jobs=-1)
    print(metrics.accuracy_score(yr, predictedVAL))
    Conf_Mat = confusion_matrix(yr, predictedVAL)
    import seaborn as sns
    sns.heatmap(Conf_Mat.T, square=True, annot=True, cbar=False)
    plt.xlabel('True label')
    plt.ylabel('predicted label')
    return ()
def SVM_Ranking_Model_Extraction_And_Encoding():
    """Train an RBF-SVM on the encoded feature table and emit SVM-Rank
    formatted training data.

    Writes "weight_set.csv", "SVM_Rank_Formatted_Training_data2.dat" and
    "SVM_Rank_Formatted_Training_data2.csv". Relies on project helpers
    transform_features(), feature_training() and feature_distance().
    """
    # Pandas readin Training Samples
    df = pd.read_csv("FeatureToTrainWithoutTester.csv")
    df2 = df.copy()
    df2 = df2.drop(['Dataset Start Time', 'Dataset End Time',
                    'executionStartTime', 'Dataset Group', 'Users Group'],
                   axis=1)
    # Feature Encoding (presumably writes Transform_features.csv, which is
    # re-read below -- confirm against transform_features).
    transform_features(df2)
    # BUG FIX: drop() is not in-place; the original discarded this result.
    df2 = df2.drop(['Unnamed: 0', 'Unnamed: 0.1'], axis=1)
    # Encoded Features
    df = pd.read_csv("Transform_features.csv")
    # Training/Testing DataSet Split
    df3 = df.copy()
    y = df3['userName']
    X = df3.drop(['userName'], axis=1)
    # NOTE(review): no real split is performed -- the model is evaluated on
    # its own training data, so the printed accuracy is optimistic. Kept
    # as-is to preserve the downstream SVM-Rank file contents.
    X_train, X_test, y_train, y_test = X, X, y, y
    # SVM configuration
    parameters = {'clf__gamma': (0.01, 0.02, 0.1, 0.3, 1),
                  'clf__C': (0.1, 0.3, 1, 3, 10, 30)}
    pipeline = Pipeline([('clf', SVC(kernel='rbf', gamma=0.01, C=100,
                                     max_iter=100, probability=True))])
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=2, verbose=1,
                               scoring='accuracy')
    result2 = grid_search.fit(X_train, y_train)
    # Feature columns for which a per-column relevance model is estimated.
    index = ['DatasetName', 'Agency', 'Instrument', 'Physical variable',
             'var', 'Units', 'Grid Dimension',
             'Variable Name in Web Interface', 'model']
    # Model Estimation: one relevance model per feature column.
    model = []
    for i in index:
        # Features' distance/relevant to category prediction
        model.append(feature_training(X_train, y_train, i))
    # Training data distance to single column PCA
    weight_set = numpy.zeros((len(X_train), len(index)))
    for j in range(0, len(X_train)):
        dict_index = 0
        for i in index:
            model_extraction = model[dict_index]
            sample = X_train[j:j + 1]
            weight = feature_distance(sample, i, model_extraction)
            weight_set[j, dict_index] = weight
            dict_index = dict_index + 1
            print("[INFO] Data Points: ", j, "Columns Iteration: ", dict_index)
            print("[INFO] Weight : ", weight)
        # Checkpoint the weight matrix every 100 samples.
        if j % 100 == 0:
            weight_set_file = pd.DataFrame(weight_set.copy())
            weight_set_file.to_csv("weight_set.csv")
    # Delivery: Training data with Label
    Training_matrix = pd.DataFrame(weight_set.copy())
    Training_matrix['Label'] = y_train
    # SVM Ranking Formatting: "<label> 1:<w1> 2:<w2> ... 9:<w9>" per row.
    SVM_Rank_Formatted_Training_data = Training_matrix.copy()
    for j in range(0, len(X_train)):
        for i in range(0, 9):
            # FIX: pandas removed the .ix indexer; use positional .iat.
            SVM_Rank_Formatted_Training_data.iat[j, i] = (
                str(i + 1) + ":"
                + str(SVM_Rank_Formatted_Training_data.iat[j, i]))
        # Column position 9 is the 'Label' column appended above.
        SVM_Rank_Formatted_Training_data.loc[j, 'Label'] = (
            str(int(SVM_Rank_Formatted_Training_data.iloc[j, 9])))
    # Columns Reorder: move 'Label' to the front.
    Rank_format_columns = SVM_Rank_Formatted_Training_data.columns.tolist()
    Rank_format_columns = Rank_format_columns[-1:] + Rank_format_columns[:-1]
    SVM_Rank_Formatted_Training_data = SVM_Rank_Formatted_Training_data[Rank_format_columns]
    # Write to CSV format
    SVM_Rank_Formatted_Training_data.to_csv(
        "SVM_Rank_Formatted_Training_data2.dat", index=False, sep=' ',
        index_label=False, header=False)
    SVM_Rank_Formatted_Training_data.to_csv("SVM_Rank_Formatted_Training_data2.csv")
    predictions = grid_search.predict(X_test)
    # Prediction Results
    # FIX: converted Python-2 print statements to print() calls.
    print('Accuracy:', accuracy_score(y_test, predictions))
    print('Confusion Matrix:', confusion_matrix(y_test, predictions))
    print('Classification Report:', classification_report(y_test, predictions))
# NOTE(review): randomforestclassifier() is presumably defined earlier in
# the notebook this cell came from -- not visible in this chunk.
randomforestclassifier()


# In[56]:

# NOTE(review): sklearn.grid_search was removed in scikit-learn 0.20; the
# modern import is sklearn.model_selection.GridSearchCV.
from sklearn import svm, grid_search


def svc_param_selection(X, y, nfolds):
    """Grid-search C and gamma for an RBF SVC with nfolds-fold CV.

    Returns the fitted GridSearchCV object and its best_params_ dict.
    """
    Cs = [0.001, 0.01, 0.1, 1, 1.1, 2, 3, 10]
    gammas = [0.001, 0.01, 0.1, 1]
    # kernels = ['linear', 'rbf', 'poly']
    # (FIX: replaced the original's typographic quotes in this comment.)
    param_grid = {'C': Cs, 'gamma': gammas}
    grid_search = GridSearchCV(svm.SVC(kernel='rbf'), param_grid, cv=nfolds)
    grid_search.fit(X, y)
    # FIX: removed the original bare "grid_search.best_params_" no-op line.
    return grid_search, grid_search.best_params_


grid_search, params = svc_param_selection(x_train, y_train, 10)
y_pred = grid_search.predict(x_test)
print('best param:', params)
# Re-fit a plain SVC with the best C/gamma for each kernel and compare.
kernals = "linear,rbf,poly"
kernals = kernals.split(',')
for kernel in kernals:
    svc = SVC(kernel=kernel, C=params['C'], gamma=params['gamma'])
    svc.fit(x_train, y_train)
    y_pred = svc.predict(x_test)
    print("kernal name:", kernel)
    print('Accuracy Score:')
    print(metrics.accuracy_score(y_test, y_pred))
# Accuracy of the previously-fitted hinge-loss SGD model on
# CountVectorizer features.
print(metrics.accuracy_score(Y_test, y_pred_class_svm),"SVM-SGD -countvectorizer")
# Linear SVM via SGD on TF-IDF features, for comparison.
# NOTE(review): n_iter was renamed max_iter in scikit-learn >= 0.19; this
# call only works on old sklearn versions.
svm_t = SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, n_iter=5, random_state=42)
svm_t.fit(X_train_tfidf, Y_train)
y_pred_svm_t = svm_t.predict(X_test_tfidf)
print(metrics.accuracy_score(Y_test, y_pred_svm_t),"SVM-SGD -tfidf")
#grid
print("grid")
# NOTE(review): sklearn.grid_search was removed in scikit-learn 0.20;
# modern code imports GridSearchCV from sklearn.model_selection.
from sklearn import svm, grid_search
# Grid over C, gamma and kernel on the document-term matrix features.
Cs = [0.001, 0.01, 0.1, 1, 10]
gammas = [0.001, 0.01, 0.1, 1]
param_grid = {'C': Cs, 'gamma' : gammas, 'kernel':('poly', 'rbf')}
grid_search = GridSearchCV(svm.SVC(), param_grid, cv=5)
grid_search.fit(X_train_dtm, Y_train)
print(grid_search.best_score_)
print(grid_search.best_params_)
y_grid_search_svm = grid_search.predict(X_test_dtm)
print(metrics.accuracy_score(Y_test,y_grid_search_svm),"grid search- SVM")
# NOTE(review): the triple-quoted block below comments out an alternate
# Naive-Bayes pipeline; it is NOT closed within this chunk and continues
# past it -- everything after the quotes is inert string content here.
'''
#X_train, X_test, y_train, y_test = train_test_split(corpus, labels, random_state=1,train_size=0.90)
#X_train_tfidf, vectorizer = generate_features(X_train)
#X_test_tfidf, vectorizer = generate_features(X_test)
from sklearn.naive_bayes import MultinomialNB,GaussianNB
from sklearn.metrics import accuracy_score
clf = MultinomialNB()
# grid_search.fit(X_train, Y_train) # print grid_search.grid_scores_ # print grid_search.best_estimator_ #Best estimator was C=0.5. #Narrowing down by order of magnitude. parameters = {'C':[0.45, 0.46, 0.47, 0.48, 0.49, 0.50, 0.51, 0.52, 0.53, 0.54, 0.55]} grid_search = grid_search.GridSearchCV(model_svm, parameters) grid_search.fit(X_train, Y_train) print grid_search.grid_scores_ print grid_search.best_estimator_ #Best estimator was C=0.45. Because we already compared 0.4 to 0.5 two searches above, and 0.5 was selected, we induce that 0.45 is the optimal value without searching between 0.40 and 0.45. #Returning model results with optimal 'C' value. expected = Y_test predicted = grid_search.predict(X_test) print classification_report(expected, predicted) print metrics.confusion_matrix(expected, predicted) print metrics.accuracy_score(expected, predicted) #Support Vector Machine: Model fit, transform, and testing with optimized 'C' value splits = cv.train_test_split(X_train_tfidf, dataset.target, test_size=0.2) X_train, X_test, Y_train, Y_test = splits model_svm = svm.LinearSVC(C=0.45, class_weight=None, dual=True, fit_intercept=True, intercept_scaling=1, loss='squared_hinge', max_iter=1000, multi_class='ovr', penalty='l2', random_state=None, tol=0.0001, verbose=0) model_svm.fit(X_train, Y_train)
# Write out to FeaturesForTrain.csv
df4.to_csv("FeaturesForTrain.csv")
# Training/Testing Data and split Preparation
# FIX: as_matrix() was removed from pandas; .values is the equivalent.
X, y = df4.astype(str).map(str.strip), df4['userName'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)
# Pipeline building
# BUG FIX: the original was missing the opening parenthesis of the
# ('vect', ...) tuple -- a syntax error.
pipeline = Pipeline([('vect', TfidfVectorizer()),
                     ('clf', LogisticRegression())])
# Check the training data shape
print(X_train.shape)
# parameters setting
# BUG FIX: LogisticRegression has no 'gamma' parameter -- GridSearchCV
# raises on clf__gamma; only C is tuned.
parameters = {'clf__C': (0.1, 0.3, 1, 3, 10, 30)}
# training with grid_search: parameters fillin
grid_search = GridSearchCV(pipeline, parameters, n_jobs=3, verbose=1,
                           scoring='accuracy')
# training with grid_search with X_train data
grid_search.fit(X_train, y_train)
# BUG FIX: the original re-created GridSearchCV here, discarding the
# fitted model, so predict() below raised NotFittedError.
# Predictions
predictions = grid_search.predict(X_test)
predictions_probability = grid_search.predict_proba(X_test)
# Prediction Results
print('Accuracy:', accuracy_score(y_test, predictions))
print('Confusion Matrix:', confusion_matrix(y_test, predictions))
print('Classification Report:', classification_report(y_test, predictions))
# TODO: Tune the hyper-parameters 'C' and 'kernel' (use rbf and linear). # Print the best params, using .best_params_, and print the best score, using .best_score_. # Get the training and test set accuracy values after hyperparameter tuning. # XXX Cs = [1, 10, 100] kernels = ['linear', 'rbf'] param_grid = {'C': Cs, 'kernel': kernels} grid_search = GridSearchCV(SVC(), param_grid=param_grid, cv=5) grid_search.fit(rescaledX, y_data) print("best params ", grid_search.best_params_) print("best score ", grid_search.best_score_) tuningpredict = grid_search.predict(X_test) print("Train Accuracy ", accuracy_score(y_train, grid_search.predict(X_train))) print("Test Accuracy ", accuracy_score(y_test, tuningpredict.round(), normalize=True)) svclassifier2 = SVC(kernel='linear', C=1) svmd2 = svclassifier2.fit(X_train, y_train) y_pred2 = svmd2.predict(X_test) print("Train Accuracy ", accuracy_score(y_train, svmd2.predict(X_train))) print("Test Accuracy ", accuracy_score(y_test, y_pred2.round(), normalize=True)) # XXX # TODO: Calculate the mean training score, mean testing score and mean fit time for the
# NOTE(review): this span is the tail of a parameter-grid dict literal
# (presumably "grid_test1 = {" on an earlier line outside this chunk),
# and it ends mid-expression at 'Survived': -- both edges are cut.
"criterion": ["gini", "entropy"],
"max_features": [sqrtfeat],
"max_depth": [5, 10, 25],
"min_samples_split": [2, 5, 10, minsampsplit]
}
forest = RandomForestClassifier(oob_score=1)
print("Hyperparameter optimization using GridSearchCV...")
grid_search = model_selection.GridSearchCV(forest, grid_test1, n_jobs=-1, cv=10)
grid_search.fit(X, y)
Y_pred = grid_search.predict(X_test)
# NOTE(review): score is computed on the training data (X, y), not on a
# held-out set -- optimistic.
print(grid_search.score(X, y))
#
# random_forest = RandomForestClassifier(oob_score=True, n_estimators=30000,max_depth=25, n_jobs=-1)
# random_forest.fit(X,y)
#
#
# Y_pred = random_forest.predict(X_test)
# print(random_forest.score(X, y))
ResultSubmission = pd.DataFrame({ 'PassengerId': list(X_test_original['PassengerId']), 'Survived':