def FeatureImportance(X_train, X_test, y_train, y_test, n):
    """Select important features with an ExtraTrees model fitted on the
    training split only, and report the learned importances.

    Parameters
    ----------
    X_train, X_test : feature matrices for the two splits.
    y_train : training targets used to fit the selector.
    y_test : kept for interface compatibility; no longer used. The
        original refitted the selector on (X_test, y_test) — data
        leakage, and a different feature subset per split.
    n : number of trees (``n_estimators``).

    Returns
    -------
    tuple : (X_train reduced, X_test reduced, feature_importances_),
        both splits reduced by the SAME fitted selector.
    """
    # Local import: SelectFromModel replaces the classifier.fit_transform
    # API that was removed from scikit-learn.
    from sklearn.feature_selection import SelectFromModel

    modeltrain = ExtraTreesClassifier(n_estimators=n)
    modeltrain.fit(X_train, y_train)
    # One selector, fitted once on train, applied to both splits.
    selector = SelectFromModel(modeltrain, prefit=True)
    return (selector.transform(X_train),
            selector.transform(X_test),
            modeltrain.feature_importances_)
def getSelectedValues(self):
    """Scale the data, then reduce both train and test to the feature
    subset an ExtraTrees selector deems important.

    Returns
    -------
    tuple : (train, trainLabels, test) with train fit-transformed and
        test transformed by the same fitted selector.
    """
    (train, trainLabels, test) = self.getScaledValues()
    selector = ExtraTreesClassifier(compute_importances=True, random_state=0)
    train = selector.fit_transform(train, trainLabels)
    # Bug fix: this line originally appeared AFTER the return statement,
    # so it was dead code and the test set was never reduced to the same
    # feature subset as the training set.
    test = selector.transform(test)
    return (train, trainLabels, test)
def selecao_feature(X, y, resp1):
    """Reduce X to a selected feature subset using one of three strategies.

    Parameters
    ----------
    X, y : feature matrix and target vector.
    resp1 : strategy selector —
        1: ExtraTrees importances via SelectFromModel,
        2: L1-penalised LinearSVC via SelectFromModel,
        3: VarianceThreshold at .9 * (1 - .9).

    Returns
    -------
    tuple : (X_new, nomeFeature) — reduced matrix and strategy name.

    Raises
    ------
    ValueError : if resp1 is not 1, 2 or 3. (The original fell through
        and raised NameError on X_new for unknown values.)
    """
    print('\n********************************************************************')
    print('Shape Entrada: ', X.shape)
    if resp1 == 1:
        clf = ExtraTreesClassifier(n_estimators=100).fit(X, y)
        model = SelectFromModel(clf, prefit=True)
        X_new = model.transform(X)
        print('Extra Trees - New Shape: ', X_new.shape)
        nomeFeature = 'Extra Trees'
    elif resp1 == 2:
        clf = LinearSVC(C=0.01, penalty="l1", dual=False).fit(X, y)
        model = SelectFromModel(clf, prefit=True)
        X_new = model.transform(X)
        print('LinearSVC - New Shape: ', X_new.shape)
        nomeFeature = 'LinearSVC'
    elif resp1 == 3:
        clf = VarianceThreshold(threshold=(.9 * (1 - .9)))
        X_new = clf.fit_transform(X)
        print('Variance Threshold - New Shape: ', X_new.shape)
        nomeFeature = 'Variance Threshold'
    else:
        # Fail fast instead of falling through to a NameError on X_new.
        raise ValueError('resp1 must be 1, 2 or 3')
    return X_new, nomeFeature
def voting(peptide_predict_file, nucleotide_predict_file, effector_train, noneffector_train):
    """Classify sequences as Type 6 effectors with a 5-model majority vote
    (ANN, SVM, KNN, Naive Bayes, Random Forest).

    Parameters
    ----------
    peptide_predict_file : FASTA file of peptide sequences to classify
        ('>' headers are counted to report the sequence total).
    nucleotide_predict_file : matching nucleotide file, forwarded to
        featureextraction().
    effector_train, noneffector_train : CSV files of 1000-column feature
        rows for the positive (label 0) and negative (label 1) classes.

    Side effects: prints progress, per-sequence verdicts and elapsed time.
    Returns None.

    Fixes vs. the original: time.clock() (removed in Python 3.8) replaced
    by time.perf_counter(); the Random Forest vote now uses the forest's
    own predictions (it previously reused the Naive Bayes model `clf`);
    duplicate imports and dead cross-validation / discarded-prediction
    computations were removed.
    """
    # Count FASTA headers to report how many sequences will be classified.
    total = 0
    with open(peptide_predict_file) as f:
        for line in f:
            if line.find('>') == 0:
                total = total + 1
    print('Total number of sequences to be classified: ', total)

    import time
    start_time = time.perf_counter()  # time.clock() was removed in Python 3.8
    import random
    import pandas
    import numpy as np
    from sklearn import svm
    from sklearn.naive_bayes import MultinomialNB
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler
    np.random.seed(123)
    from keras.models import Sequential
    from keras.layers import Dense
    from imblearn.over_sampling import SMOTE
    import warnings
    warnings.filterwarnings("ignore")
    # random.seed() returns None, so f is None and the later
    # random_state=f means a nondeterministic split — as in the original.
    f = random.seed()

    # Feature vector of the sequences to be predicted.
    featurevector = featureextraction(peptide_predict_file, nucleotide_predict_file, total)
    print(len(featurevector))

    # Training data: effectors labelled 0, non-effectors labelled 1.
    dataframe = pandas.read_csv(effector_train, header=None, sep=',')
    eff = dataframe.values[:, 0:1000].astype(float)
    dataframe = pandas.read_csv(noneffector_train, header=None, sep=',')
    noneff = dataframe.values[:, 0:1000].astype(float)
    # Vectorized replacement for the original element-by-element copy loops.
    X = np.vstack((eff, noneff))
    Y = np.vstack((np.zeros((eff.shape[0], 1)), np.ones((noneff.shape[0], 1))))
    warnings.filterwarnings("ignore")

    print('Resampling the unbalanced data...')
    X_resampled, Y_resampled = SMOTE(kind='borderline1').fit_sample(X, Y)

    # NOTE(review): the scaled matrix is computed but never used below —
    # the classifiers train on the unscaled X_resampled, exactly as in
    # the original. Confirm whether scaling was intended.
    scaler = StandardScaler().fit(X_resampled)
    X = scaler.transform(X_resampled)

    # Tree-based feature selection; the same fitted model reduces both
    # the training matrix and the sequences to be predicted.
    model = ExtraTreesClassifier()
    X_resampled = model.fit_transform(X_resampled, Y_resampled)
    featurevector = model.transform(featurevector)
    newshape = X_resampled.shape

    print("Training Classifiers...")
    X_train, X_test, y_train, y_test = train_test_split(
        X_resampled, Y_resampled, test_size=0.15, random_state=f)

    # One-hot encode the labels for the two-output ANN:
    # class 0 -> [1, 0], class 1 -> [0, 1].
    y_t = y_train
    y_te = y_test
    y_train = np.ones((len(y_t), 2))
    y_test = np.ones((len(y_te), 2))
    for i in range(len(y_t)):
        if y_t[i] == 0:
            y_train[i][1] = 0
        if y_t[i] == 1:
            y_train[i][0] = 0
    for i in range(len(y_te)):
        if y_te[i] == 0:
            y_test[i][1] = 0
        if y_te[i] == 1:
            y_test[i][0] = 0

    # ANN
    print("Training Artificial Neural Network...")
    model = Sequential()
    model.add(Dense(newshape[1] + 1, activation='relu', input_shape=(newshape[1],)))
    model.add(Dense(500, activation='relu'))
    model.add(Dense(250, activation='relu'))
    model.add(Dense(90, activation='relu'))
    # Output layer: one unit per class.
    model.add(Dense(2, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam',
                  metrics=['binary_accuracy'])
    model.fit(X_train, y_train, epochs=1000, batch_size=25, verbose=0)
    # The original also predicted X_test here and immediately overwrote
    # the result; only the prediction on the unknown sequences is kept.
    ANN = model.predict(featurevector)

    # Restore the integer label vectors for the sklearn classifiers.
    y_train = y_t
    y_test = y_te

    # SVM
    print("Training Support Vector Machine...")
    clf1 = svm.SVC(decision_function_shape='ovr', kernel='linear', max_iter=1000)
    clf1.fit(X_train, y_train)
    SVM = clf1.predict(featurevector)

    # KNN
    print("Training k-Nearest Neighbor ...")
    neigh = KNeighborsClassifier(n_neighbors=10)
    neigh.fit(X_train, y_train)
    KNN = neigh.predict(featurevector)

    # Naive Bayes (stored under the misleading name DT in the original).
    print("Training Naive Bayes...")
    clf = MultinomialNB()
    clf.fit(X_train, y_train)
    NB = clf.predict(featurevector)

    # Random Forest
    print("Training Random Forest...")
    rf = RandomForestClassifier(random_state=0, min_samples_leaf=100)
    rf.fit(X_train, y_train)
    # Bug fix: the original called clf.predict (the Naive Bayes model),
    # so the forest's vote merely duplicated the Naive Bayes vote.
    RF = rf.predict(featurevector)

    # Majority vote: column 0 counts "effector" (class 0) votes,
    # column 1 counts "non-effector" (class 1) votes.
    vote_result = [[0, 0] for _ in range(len(SVM))]
    for i in range(len(ANN)):
        if round(ANN[i][0]) == 1.0:
            vote_result[i][0] = vote_result[i][0] + 1
        if round(ANN[i][1]) == 1.0:
            vote_result[i][1] = vote_result[i][1] + 1
        for pred in (SVM[i], KNN[i], NB[i], RF[i]):
            if pred == 0:
                vote_result[i][0] = vote_result[i][0] + 1
            if pred == 1:
                vote_result[i][1] = vote_result[i][1] + 1

    print('-----------------------Results-----------------------')
    for i in range(len(ANN)):
        if vote_result[i][0] >= vote_result[i][1]:
            print('Sequence ', i + 1, ' is a probable Type 6 Effector')
        else:
            print('Sequence ', i + 1, ' is not a Type 6 Effector')
    end_time = time.perf_counter()
    print('Execution time', (end_time - start_time))
import numpy as np
from sklearn import preprocessing as pp
# sklearn.cross_validation was removed in scikit-learn 0.20; use
# model_selection instead.
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import SVC

workDir = r'C:\users\Akshay\Downloads\kaggle\\'

# Read data
train = np.genfromtxt(open(workDir + 'train.csv', 'rb'), delimiter=',')
target = np.genfromtxt(open(workDir + 'trainLabels.csv', 'rb'), delimiter=',')
test = np.genfromtxt(open(workDir + 'test.csv', 'rb'), delimiter=',')

# Scale data
train = pp.scale(train)
test = pp.scale(test)

# Select features. compute_importances=True and classifier.fit_transform
# were removed from modern scikit-learn; fit the forest and select via
# SelectFromModel, applying the SAME fitted selector to both splits.
estimator = ExtraTreesClassifier(random_state=0).fit(train, target)
selector = SelectFromModel(estimator, prefit=True)
train = selector.transform(train)
test = selector.transform(test)

# Estimate score
classifier = SVC(C=8, gamma=0.17)
scores = cross_val_score(classifier, train, target, cv=30)
print('Estimated score: %0.5f (+/- %0.5f)' % (scores.mean(), scores.std() / 2))

# Predict and save
result = classifier.fit(train, target).predict(test)
np.savetxt(workDir + 'a.csv', result, fmt='%d')
# NOTE(review): this excerpt begins with the tail of cleanUpData(), whose
# definition lies above this chunk; ds and passengerIds come from there.
    return [ds, passengerIds]


if __name__ == '__main__':
    # Load and preprocess both Titanic splits via the helper above.
    [train, trainPassengerIds] = cleanUpData('Data/train.csv')
    [test, testPassengerIds] = cleanUpData('Data/test.csv')

    # Fit the training data to the Survived labels and create the decision trees
    target = train.filter(['Survived'])
    target = np.array(target.values).ravel()
    train = train.drop(['Survived'], axis=1)

    # Scale data
    train = pp.scale(train)
    test = pp.scale(test)

    # Select features
    # NOTE(review): compute_importances and classifier.fit_transform were
    # removed in later scikit-learn releases — this code targets an old API.
    selector = ExtraTreesClassifier(compute_importances=True, random_state=0)
    train = selector.fit_transform(train, target)
    test = selector.transform(test)

    # Estimate score
    classifier = SVC(C=8, gamma=0.17)
    scores = cv.cross_val_score(classifier, train, target, cv=30)
    print('Estimated score: %0.5f (+/- %0.5f)' % (scores.mean(), scores.std() / 2))

    # Predict and save
    result = classifier.fit(train, target).predict(test)
    submissionData = {'PassengerId': testPassengerIds, 'Survived': result}
    submissionDF = pd.DataFrame(submissionData)
    submissionDF.to_csv('Data/Titanic_Preprocess_XtraTrees_SVC.csv', index=False)
import numpy as np
import pandas as pd
from sklearn import preprocessing as pp
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel

print("Preparing the data")
train = pd.io.parsers.read_csv(r"D:\shared\datascience\phy_train_clean.csv", sep=',', header=0)
test = pd.io.parsers.read_csv(r"D:\shared\datascience\phy_test_clean.csv", sep=',', header=0)

# Keep the Id columns as output indices; the feature matrices start at
# column 2 (presumably the first two columns are Id and label — verify
# against the clean CSVs).
test_index = test.Id
test = test.iloc[:, 2:]
target = train.kind
train_index = train.Id
train = train.iloc[:, 2:]

print("Preparing an Feature classifier")
# compute_importances=True and classifier.fit_transform were removed from
# modern scikit-learn; fit the forest, then select via SelectFromModel so
# the SAME fitted selector transforms both datasets.
estimator = ExtraTreesClassifier(random_state=0).fit(train, target)
selector = SelectFromModel(estimator, prefit=True)

print("Transforming the original dataset")
train = pd.DataFrame(selector.transform(train), index=train_index)
test = pd.DataFrame(selector.transform(test), index=test_index)
# Re-attach the label column so the reduced training file is complete.
train['kind'] = target

print("Storing the data...")
train.to_csv(r"D:\shared\datascience\phy_train.csv", sep=',')
test.to_csv(r"D:\shared\datascience\phy_test.csv", sep=',')
print("Job finished")
def main():
    """Train an aesthetic-score classifier (Python 2, legacy scikit-learn).

    Loads feature records from features.db, binarises the raw score into
    two classes around the mean, scales, evaluates an ExtraTrees
    classifier under stratified 2-fold CV, and pickles (scaler, clf).
    """
    X = []
    Y = []
    # Collect per-record feature vectors and raw scores.
    # NOTE(review): f2, f8 and f9 are skipped — presumably deliberate
    # feature selection; confirm against the feature schema.
    featuresDB = Base(os.getcwd() + "\\Databases\\features.db")
    featuresDB.open()
    print "features open"
    for rec in featuresDB:
        vec = []
        vec.append(rec.f1)
        vec.append(rec.f3)
        vec.append(rec.f4)
        vec.append(rec.f5)
        vec.append(rec.f6)
        vec.append(rec.f7)
        vec.append(rec.f10)
        vec.append(rec.f11)
        vec.append(rec.f12)
        vec.append(rec.f13)
        vec.append(rec.f14)
        vec.append(rec.f15)
        vec.append(rec.f16)
        vec.append(rec.f17)
        vec.append(rec.f18)
        vec.append(rec.f19)
        vec.append(rec.f20)
        vec.append(rec.f21)
        vec.append(rec.f22)
        vec.append(rec.f23)
        X.append(vec)
        Y.append(rec.score)
    print "building classifier"
    # Binarise the scores: below the mean -> class 1, otherwise class 2.
    Y = np.array(Y)
    ybar = Y.mean()
    for i in range(len(Y)):
        if Y[i] < ybar:
            Y[i] = 1
        else:
            Y[i] = 2
    # Scaler is the pre-0.11 scikit-learn name for StandardScaler.
    scaler = Scaler().fit(X)
    X = scaler.transform(X)
    X = np.array(X)
    Y = np.array(Y)
    # Legacy API: StratifiedKFold(y, k=...); modern versions take n_splits.
    skf = cross_validation.StratifiedKFold(Y, k=2)
    for train, test in skf:
        X_train, X_test = X[train], X[test]
        y_train, y_test = Y[train], Y[test]
        # min_split and compute_importances are old ExtraTrees parameter
        # names (modern: min_samples_split; importances always available).
        clf = ExtraTreesClassifier(n_estimators=8, max_depth=None, min_split=1, random_state=0, compute_importances=True)
        scores = cross_validation.cross_val_score(clf, X_train, y_train, cv=5)
        clf.fit_transform(X_train, y_train)
        print "Accuracy: %0.4f (+/- %0.2f)" % (scores.mean(), scores.std() / 2)
        print clf.feature_importances_
        y_pred = clf.predict(X_test)
        print classification_report(y_test, y_pred)
    # Persist the scaler and the classifier fitted on the last fold.
    # NOTE(review): placement relative to the loop reconstructed from the
    # collapsed source — confirm against the original file.
    model = (scaler, clf)
    joblib.dump(model, 'AestheticModel\\aestheticModel.pkl')
    print "Done"
# NOTE(review): this excerpt starts mid-script (Python 2);
# train_data_array, test_data_array, train_result_array and file_label
# are defined above this chunk, and it is truncated below.
print test_data_array.shape
file_label.close()

# normalize the features in the train and test dataset
train_data_array_norm = preprocessing.scale(train_data_array)
test_data_array_norm = preprocessing.scale(test_data_array)

# run the module of PCA
#pca = PCA(n_components = 10)
#train_data_array_norm_pca = pca.fit_transform(train_data_array_norm, train_result_array)
#test_data_array_norm_pca = pca.transform(test_data_array_norm)
#print 'train data shape', train_data_array_norm_pca.shape

# tree-based feature selection
# NOTE(review): despite the *_pca names, these arrays hold tree-selected
# features — the PCA step above is commented out.
classifier = ExtraTreesClassifier()
train_data_array_norm_pca = classifier.fit_transform(train_data_array_norm, np.ravel(train_result_array))
test_data_array_norm_pca = classifier.transform(test_data_array_norm)
print 'train data shape', train_data_array_norm_pca.shape

## build SVM
# random shuffle
np.random.seed(0)
indices = np.random.permutation(len(train_result_array))
# NOTE(review): `classifer` (misspelled) holds the SVC, but the
# cross-validation below scores `classifier` — the ExtraTrees model —
# so the SVC is never evaluated in this chunk. Likely a typo; not fixed
# here because later, out-of-view lines may rely on either name.
classifer = svm.SVC(C=20, gamma = 0.05)

# cross validation
scores = cv.cross_val_score(classifier, train_data_array_norm_pca, np.ravel(train_result_array), cv = 30)
#print(model.feature_importances_) #(c) nmf = decomposition.NMF(n_components=10).fit(xTrain) def score(model, data, score=metrics.explained_variance_score): prediction = model.inverse_transform(model.transform(data)) return score(data, prediction) #print(score(nmf, xTrain)) #(d) xTrainnormal = (transformer.transform(xTrain)) xTrainpca = pca.fit_transform(xTrain) model = NMF(n_components=10, init='random', random_state=0) xTrainnmf = model.fit_transform(xTrain) xTestnormal = (transformer.transform(xTest)) xTestpca = pca.fit_transform(xTest) xTestnmf = model.fit_transform(xTest) clfnormal = LogisticRegression(penalty='none', random_state=0, solver='lbfgs', max_iter=1000, multi_class='multinomial').fit( xTrainnormal, yTrain) clf.predict(xTestnormal) ypredict1 = clf.predict_proba(xTestnormal) clfpca = LogisticRegression(penalty='none', random_state=0, solver='lbfgs', max_iter=1000,