def AllModels(file, in_columns, out_columns, train_size=2400):
    """Fit a set of classifiers on a CSV file and print their validation accuracy.

    The file is parsed with ``numpy.genfromtxt`` (comma-delimited). The first
    two parsed rows are dropped, then 50 further rows are trimmed from each
    end. Missing values in both the inputs and the targets are replaced by the
    column mean. The first ``train_size`` remaining rows are used for training
    and the rest for validation (a positional split, no shuffling).

    Printed, in order: accuracy of six baseline classifiers, a separator, the
    scores of a random forest and an extra-trees model, the accuracy of a
    hard-voting ensemble, a separator, and the fraction of validation samples
    whose true and predicted labels are both in {-3, -2, -1} or both in
    {1, 2, 3}.

    Args:
        file: Path or file-like object accepted by ``numpy.genfromtxt``.
        in_columns: Column index/indices used as model inputs.
        out_columns: Column index/indices used as the target.
            NOTE(review): presumably a one-element list, since the result is
            passed through ``Imputer`` before being used as labels - confirm.
        train_size: Number of leading rows used for training. Defaults to
            2400, the value that was previously hard-coded.

    Returns:
        Accuracy of the voting ensemble on the validation rows.
    """
    data = numpy.genfromtxt(file, delimiter=",", autostrip=True)
    trimmed = data[2:]

    # NOTE(review): the targets are mean-imputed as well, so a missing label
    # becomes the column mean rather than a class value - confirm intended.
    X = Imputer(missing_values='NaN', strategy='mean',
                axis=0).fit_transform(trimmed[50:-50, in_columns])
    Y = Imputer(missing_values='NaN', strategy='mean',
                axis=0).fit_transform(trimmed[50:-50, out_columns])

    X_train, X_validation = X[:train_size], X[train_size:]
    Y_train, Y_validation = Y[:train_size], Y[train_size:]

    # Baseline models, each with default settings; labels are padded so the
    # printed accuracies line up.
    baselines = [
        ('LR : ', LogisticRegression()),
        ('LDA: ', LinearDiscriminantAnalysis()),
        ('KNN: ', KNeighborsClassifier()),
        ('DT : ', DecisionTreeClassifier()),
        ('NB : ', GaussianNB()),
        ('SVM: ', SVC()),
    ]
    for label, estimator in baselines:
        estimator.fit(X_train, Y_train)
        predictions = estimator.predict(X_validation)
        print(label + str(accuracy_score(Y_validation, predictions)))
    print('--------------------')

    # Settings shared by every tree ensemble below; only max_features and
    # bootstrap differ between them.
    forest_params = dict(
        n_estimators=300, criterion='gini', max_depth=None,
        min_samples_split=2, min_samples_leaf=1,
        min_weight_fraction_leaf=0.0, max_leaf_nodes=None,
        min_impurity_decrease=0.0, min_impurity_split=None,
        oob_score=False, n_jobs=1, random_state=None, verbose=0,
        warm_start=False, class_weight=None)

    forest = RandomForestClassifier(max_features='auto', bootstrap=True,
                                    **forest_params)
    forest.fit(X_train, Y_train)
    print('rf: ' + str(forest.score(X_validation, Y_validation)))

    extra_trees = ExtraTreesClassifier(max_features='auto', bootstrap=False,
                                       **forest_params)
    extra_trees.fit(X_train, Y_train)
    print('et: ' + str(extra_trees.score(X_validation, Y_validation)))

    # Voting ensemble: four bootstrapped extra-trees models using 6, 12, 18
    # and 24 features per split, named '0'..'3'.
    # NOTE(review): presumably needs at least 24 input columns - confirm.
    voters = [
        (str(k), ExtraTreesClassifier(max_features=(k + 1) * 6,
                                      bootstrap=True, **forest_params))
        for k in range(4)
    ]
    # The same LDA estimator is registered under two names, giving it two of
    # the six votes.
    lda = LinearDiscriminantAnalysis()
    voters.append(('c', lda))
    voters.append(('d', lda))

    ensemble = VotingClassifier(estimators=voters, voting='hard')
    ensemble.fit(X_train, Y_train)
    y_pred = ensemble.predict(X_validation)
    ret = accuracy_score(Y_validation, y_pred)
    print(ret)

    cnf_matrix = confusion_matrix(Y_validation, y_pred,
                                  labels=[-3, -2, -1, 0, 1, 2, 3])
    # Adding to 0.0 keeps these numpy floats: true division under Python 2,
    # and an empty matrix yields nan instead of raising.
    total = 0.0 + cnf_matrix.sum()
    print('---------------')
    # Top-left 3x3 block: true and predicted label both negative.
    # Bottom-right 3x3 block: both positive. Label 0 (index 3) is excluded.
    same_sign = (0.0 + cnf_matrix[0:3, 0:3].sum()
                 + cnf_matrix[4:7, 4:7].sum())
    print(same_sign / total)
    return ret
# NOTE(review): `i` and `best` are bound before this fragment (not visible
# here); `i` is presumably one ranked list of feature names - confirm.
# Append each entry of `i` to `best` unless it is already present, so `best`
# keeps first-seen order without duplicates.
for j in i:
    if j in best:
        continue
    best.append(j)

# Default selection: the first 20 entries of `best`.
# NOTE(review): presumably X / X_test are pandas DataFrames indexed by column
# label - confirm.
X2 = X[best[:20]]
X_test2 = X_test[best[:20]]

# Building a loop to find the best model and feature selection
# (original author's note: results are lda with the 23 best features).
# `model` and `score` are parallel lists: model[k] is [name, feature_count]
# and score[k] is that model's score on the test data.
model = []
score = []
# The range stops at len(best) - 1, so the complete `best` list is never
# evaluated.
for i in range(10, len(best)):
    X2 = X[best[:i]]
    X_test2 = X_test[best[:i]]

    # Running the train and test data in LDA
    # (original author's note: this typically gives the best model).
    model.append(['lda', i])
    lda = LDA(n_components=2)
    lda_x_axis = lda.fit(X2, y).transform(X2)
    score.append(lda.score(X_test2, y_test, sample_weight=None))

    # Look at Decision Tree accuracy.
    model.append(['dt', i])
    dt = DecisionTreeClassifier(class_weight='balanced')
    dt.fit(X2, y)
    score.append(dt.score(X_test2, y_test))

    # Look at Random Forest accuracy.
    model.append(['rf', i])
    rf = RandomForestClassifier(class_weight='balanced')
    rf.fit(X2, y)
    score.append(rf.score(X_test2, y_test))