skdnnBO = BayesianOptimization(skdnncv, { 'h1': (10, 100), 'h2': (10, 100), 'learning_rate_init': (-5, -1) }) skdnnBO.explore({ 'h1': [10, 100], 'h2': [10, 100], 'learning_rate_init': [-5, -1] }) skdnnBO.maximize(init_points=10, n_iter=20) print('SKDNN: %f' % skdnnBO.res['max']['max_val']) #---- set classifiers to be combined for voting ------------- RF = RFC(n_estimators=int(rfBO.res['max']['max_params']['n_estimators']), max_features=int(rfBO.res['max']['max_params']['max_features'])) SVM = SVC(C=10**svcBO.res['max']['max_params']['C'], gamma=10**svcBO.res['max']['max_params']['gamma'], random_state=None, probability=True) XGB = xgboost.XGBClassifier( learning_rate=10**xgbBO.res['max']['max_params']['learning_rate'], n_estimators=int(xgbBO.res['max']['max_params']['n_estimators'])) SKDNN = MLPClassifier( solver='adam', alpha=1e-5, batch_size='auto', hidden_layer_sizes=(int(skdnnBO.res['max']['max_params']['h1']),
for j in range(i + 1, len(data.keys())): if chronology[i] > chronology[j]: chronology[i], chronology[j] = chronology[j], chronology[i] date = [chronology[len(chronology) * i // 6-1] for i in range(1,7)] del data, label gc.collect() clf_option = [ Boosting(), LR(n_jobs = -1), NB(), LinearSVC(), Neighbors(), RFC() ] mre_pred = [] for iter in tqdm(range(5)): if settings.DEBUG_MODE: print("Memulai pengambilan data") mre_total = [] query = "Select * from berita WHERE Date <= "+str(date[iter]) c.execute(query) train_data = c.fetchall() query = "Select * from berita WHERE Date <= "+str(date[iter+1])+" AND "+str(date[iter]) c.execute(query)
# generate random features wx = rp(333,[0.2,2,20],1) wy = rp(333,[0.2,2,20],1) wz = rp(333,[0.2,2,20],2) # generate training data print 'generating training data...' (x,y) = pairset(10000) #(x_te,y_te,m_te) = tuebingen() # load test data (pairs,num_features,num_pairs) = boston_housing() # train the classifier and predict the test data print 'training the random forest classifier...' reg = RFC(n_estimators=100,random_state=0,n_jobs=4).fit(x,y); y_prob = reg.predict_proba(pairs) # save the predictive probability of pairs np.savetxt('housing_predict.txt',y_prob,fmt='%.5f') # visualize the directed graph node_labels = ["CRIM","ZN","INDUS","CHAS","NOX","RM","AGE","DIS","RAD","TAX","PTRATIO","B","LSTAT","MEDV"] desp = """ 1. CRIM per capita crime rate by town 2. ZN proportion of residential land zoned for lots over 25,000 sq.ft. 3. INDUS proportion of non-retail business acres per town 4. CHAS Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
scaler = RobustScaler() #second case using this scaler x_scaled = scaler.fit_transform(x_train) x_new = pd.DataFrame(x_scaled, columns=x.columns) x_new.head() skpca = PCA(n_components=55) x_pca = skpca.fit_transform(x_new) print('Variance sum : ', skpca.explained_variance_ratio_.cumsum()[-1]) from sklearn.ensemble import RandomForestClassifier as RFC from sklearn.metrics import classification_report, confusion_matrix model = RFC(n_estimators=100, random_state=0, oob_score=True, max_depth=30, max_features='sqrt') model.fit(x_pca, y_train) x_test_scaled = scaler.transform(x_test) x_test_new = pd.DataFrame(x_test_scaled, columns=x.columns) x_test_pca = skpca.transform(x_test_new) y_pred = model.predict(x_test_pca) print(classification_report(y_pred, y_test)) sns.heatmap(confusion_matrix(y_pred, y_test), annot=True, fmt="d", cmap=plt.cm.Accent,
def get_base_learners(self):
    """Create the random-forest base learner and fit it on the source data.

    A forest of 100 entropy-criterion trees (``random_state=3``) is stored
    on ``self.m_randomForest`` and trained on ``self.m_sourceDataFeature``
    with ``self.m_sourceLabel`` as the targets.
    """
    forest_params = {
        'n_estimators': 100,
        'criterion': 'entropy',
        'random_state': 3,
    }
    forest = RFC(**forest_params)
    # Publish the estimator on the instance before fitting it.
    self.m_randomForest = forest
    forest.fit(self.m_sourceDataFeature, self.m_sourceLabel)
def main():
    """Train a 10-tree random forest to predict ``match`` and export text files.

    Steps:
      1. Load the selected columns of ``raw/data.csv``, treat ``'?'`` as
         missing, drop incomplete rows and shuffle the rows.
      2. Split 80/20 (by position after the shuffle) into train and test.
      3. Fit the forest on the training part and print the test accuracy.
      4. Write the test indices, the training labels, the forest predictions
         and a pairwise table over the test rows to disk.

    The function returns nothing; its results are the printed accuracy and
    the four files written below.
    """
    # 'attractive_partner' used to be listed twice; one entry is enough.
    cols = [
        'd_age', 'samerace', 'attractive_partner', 'interests_correlate',
        'like', 'guess_prob_liked', 'match', 'attractive',
    ]
    df = pd.read_csv('raw/data.csv', usecols=cols)
    df = df.replace('?', np.nan)
    df = df.dropna()
    df = df.sample(frac=1)

    nrows = df.shape[0]
    label = df['match']
    df = df.drop(['match'], axis=1)

    # Positional 80/20 split of the shuffled frame.
    n = int(0.8 * nrows)
    trainX, testX = df.iloc[:n], df.iloc[n:]
    trainY, testY = label.iloc[:n], label.iloc[n:]

    rf = RFC(n_estimators=10)
    rf.fit(trainX.values, trainY.values)

    Ytest = testY.values
    predicted = rf.predict(testX.values)
    # Fraction of test rows whose prediction equals the true label.
    acc = float(np.mean(Ytest == predicted))
    print(acc)

    # write match test
    # NOTE(review): this file goes to 'psl/' while the others go to
    # 'preprocessed/' - confirm that is intended.
    pd.DataFrame(testY.index).to_csv('psl/match_test.txt',
                                     header=False,
                                     sep='\t',
                                     index=False)

    # write match obs
    pd.DataFrame(trainY).to_csv('preprocessed/match_train.txt',
                                header=False,
                                sep='\t')

    # write rf predictions
    outdf = pd.DataFrame(predicted, index=testX.index, columns=['predicted'])
    outdf.to_csv('preprocessed/rf.txt', header=False, sep='\t')

    # write similarities: for every ordered pair of test rows, the absolute
    # difference of their (attractive - attractive_partner) values.
    t = testX['attractive'].astype(
        np.float64) - testX['attractive_partner'].astype(np.float64)
    # Materialise the (index, value) pairs once instead of rebuilding the
    # iterator for every outer row.
    pairs = list(t.items())
    s = '\n'.join(f'{i}\t{j}\t{abs(r - rr)}'
                  for i, r in pairs
                  for j, rr in pairs)
    with open('preprocessed/sim.txt', 'w') as f:
        f.write(s)
""" ################################################################################ ### your code here! name your classifier object clf if you want the from sklearn.naive_bayes import GaussianNB clf = GaussianNB() clf.fit(features_train, labels_train) pred = clf.predict(features_test) acc = accuracy_score(pred, labels_test) print "Gaussian acc", acc from sklearn.ensemble import RandomForestClassifier as RFC from sklearn.metrics import accuracy_score clf = RFC() clf.fit(features_train, labels_train) pred = clf.predict(features_test) acc = accuracy_score(pred, labels_test) print "defaule acc", acc print "For different n_estimators:" print "----------------------------" max_acc, max_est = 0, 0 for i in [1, 2, 4, 8, 10, 16, 25, 50, 32, 64, 100, 200]: #For estimators in n_estimators clf = RFC(n_estimators=i, random_state=42) clf.fit(features_train, labels_train) pred = clf.predict(features_test) acc = accuracy_score(pred, labels_test) if (acc > max_acc):
lgr_parameters = {"penalty": ("l1", "l2"), "C": C_range} sgd_parameters = { "loss": ("hinge", "log", "modified_huber", "squared_hinge", "perceptron", "squared_loss", "huber", "epsilon_insensitive", "squared_epsilon_insensitive"), "penalty": ("none", "l2", "l1", "elasticnet") } rfc_parameters = {"n_estimators": np.arange(50, 201, 10)} efc_parameters = {} # abc_parameters = {} # gbc_parameters = {} classifiers = [[LDA(), "LDA", lda_parameters], [SVC(), "SVC", svc_parameters], [LGR(), "LogReg", lgr_parameters], [SGD(), "StochGradDesc", sgd_parameters], [RFC(), "Random Forest", rfc_parameters], [EFC(), "Extra Tree", efc_parameters]] # [KNN(), "KNearestNeighbor", knn_parameters], # , # [ABC(), "AdaBoost", abc_parameters], # [GBC(), "Gradient Boosting Classifier", gbc_parameters] count = 0 clf_count = len(classifiers) channels = data["X_train"].shape[1] # T = Normalizer() cv = ShuffleSplit(n_splits, test_size)
print() print(df_wine.tail()) print() X, y = df_wine.iloc[:, 1:].values, df_wine.iloc[:, 0].values X_train, X_test, y_train, y_test =\ T_T_S(X,y,test_size = .3, random_state = 0, stratify = y) # stratify ensures same class proportions of training and test data sets print('Training Data Size = ', len(X_train)) print('Test Data Size = ', len(X_test)) print() pause() feat_labels = df_wine.columns[1:] forest = RFC(n_estimators=500, random_state=1) forest.fit(X_train, y_train) tic_fwd = time() sfs_forward = SequentialFeatureSelector(forest, n_features_to_select=5, direction='forward').fit( X_train, y_train) toc_fwd = time() tic_bwd = time() sfs_backward = SequentialFeatureSelector(forest, n_features_to_select=5, direction='backward').fit( X_train, y_train) toc_bwd = time()
print('Saving finished dataset') finished_dataset = np.insert(instances,instances.shape[1],labels, axis=1) with open('finished_dataset.csv', 'w') as output: writer = csv.writer(output, lineterminator='\n') for line in finished_dataset: writer.writerow(line) """ """ The parameter class_weight can penalise mistakes to the minority class in order to mitigate the imbalance of the training dataset include the argument probability=True if it is useful to enable probability estimates for SVM algorithms.""" svc_classifier = SVC() naiveb_classifier = GaussianNB() randomfor_classifier = RFC(criterion='entropy', n_jobs=2, n_estimators=100) #solver parameter works better with lbfgs instead of adam (tested beforehand) ann_classifier = MLPClassifier(solver='lbfgs') """scikit-learn uses an optimised version of the CART algorithm; however, scikit-learn implementation does not support categorical variables for now. CART (Classification and Regression Trees) is very similar to C4.5, but it differs in that it supports numerical target variables (regression) and does not compute rule sets. CART constructs binary trees using the feature and threshold that yield the largest information gain at each node.""" tree_classifier = tree.DecisionTreeClassifier() lr_classifier = LogisticRegression() classify(svc_classifier, 'Support Vector Machines', smote=False) classify(naiveb_classifier, 'Naive Bayes', smote=False) classify(randomfor_classifier, 'Random Forest', smote=False) classify(ann_classifier, 'Multi-layer Perceptron', smote=False)
'AgeFill'] = median_ages[i,j] df['AgeIsNull'] = pd.isnull(df.Age).astype(int) df['FamilySize'] = df['SibSp'] + df['Parch'] df['Age*Class'] = df.AgeFill * df.Pclass df = df.drop(['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked', 'Age'], axis=1) df = df.dropna() train_data = df.values label = train_data[:, 1] df = df.drop(['Survived'], axis=1) train_data = df.values X_train, X_test, Y_train, Y_test = skc.train_test_split(train_data, label) clf = RFC(n_estimators=60) clf.fit(X_train, Y_train) print(clf.score(X_test, Y_test)) df1 = pd.read_csv('test.csv', header=0) df1['Gender'] = df1['Sex'].map({'female': 0, 'male': 1}).astype(int) # df1['AgeFill'] = df1['Age'] median_ages = np.zeros((2, 3)) for i in range(0, 2): for j in range(0, 3): df1.loc[ (df1.Age.isnull()) & (df1.Gender == i) & (df1.Pclass == j+1),\ 'AgeFill'] = median_ages[i,j] df1['AgeIsNull'] = pd.isnull(df1.Age).astype(int) df1['FamilySize'] = df1['SibSp'] + df1['Parch']
target = df[['survival']] from sklearn import cross_validation as cv splits = cv.train_test_split(feat, target, test_size=0.2) xtrain, xtest, ytrain, ytest = splits expected = [2, 2, 1, 1] from sklearn.metrics import r2_score from sklearn.metrics import mean_squared_error as mse from sklearn.linear_model import LinearRegression from sklearn.svm import SVC from sklearn.metrics import classification_report as CSR from sklearn.linear_model import LogisticRegression from sklearn.ensemble import RandomForestClassifier as RFC model_svc = SVC() model_rfc = RFC() model_LogRegr = LogisticRegression() model_LinRegr = LinearRegression() model_LinRegr.fit(xtrain, np.ravel(ytrain)) model_LogRegr.fit(xtrain, np.ravel(ytrain)) model_svc.fit(xtrain, np.ravel(ytrain)) model_rfc.fit(xtrain, np.ravel(ytrain)) pd1 = pd.read_csv('/home/khany1/allcode/datasets/t1.txt', header=None, names=['aa', 'bb', 'cc']) pr_linregr = model_LinRegr.predict(pd1) pr_logregr = model_LogRegr.predict(pd1) pr_svc = model_svc.predict(pd1) pr_rfc = model_rfc.predict(pd1)
def MultiTrAdap(Xs, Ys, Xa, Ya, Xt, Yt, nIters=200):
    """Run a TrAdaBoost-style reweighting loop with shallow random forests.

    Suffixes: ``s`` = source domain, ``a`` = auxiliary data, ``t`` = test data.

    In each of the ``nIters`` iterations a fresh forest is trained on the
    concatenated source + auxiliary samples using the normalised sample
    weights, the weighted error on the auxiliary part is measured, and the
    weights of misclassified samples are rescaled (source samples by
    ``Beta``, auxiliary samples by ``1 / Alpha``).

    Args:
        Xs, Ys: source-domain features and labels.
        Xa, Ya: auxiliary features and labels.
        Xt, Yt: test features and labels (used only for evaluation).
        nIters: number of boosting iterations.

    Returns:
        A tuple ``(Weights, Epss, TestPrd, TestAcc, AdaAcc)``:
          * ``Weights``: dict iteration -> unnormalised weight vector in use
            at the start of that iteration.
          * ``Epss``: list of ``1 - Eps`` per iteration, where ``Eps`` is the
            weighted error on the auxiliary samples *before* clamping (so
            despite the name this is an accuracy-like value).
          * ``TestPrd``: dict iteration -> predictions on ``Xt``.
          * ``TestAcc``: list of ``Metrics.Accuracy(Yt, prediction)``.
          * ``AdaAcc``: list of test accuracies after refitting the same
            classifier on the auxiliary data only, with its weights.
    """
    # s for source domain, a for auxiliary data, t for test data
    p = progressbar.ProgressBar()
    Xsa = np.concatenate((Xs, Xa))
    Ysa = np.concatenate((Ys, Ya))
    Ns = Ys.shape[0]
    Na = Ya.shape[0]
    Epss = []  # 1 - epsilon (aux-data weighted error) in each iteration
    TestAcc = []  # Accuracy on test set in each iteration
    TestPrd = {}  # Predictions made in each iteration
    Weights = {}  # Sample weights in each iteration
    AdaAcc = [
    ]  # accuracy when only the weighted aux data is used (AdaBoost only)
    Beta = 1 / (1 + np.sqrt(2 * np.log(Ns) / nIters)
                )  # for updating the source samples
    Wsa = np.ones(Ns + Na) / (Ns + Na)  # Init the weights evenly
    p.start(nIters)
    for ni in range(nIters):
        # Safe to store without copying: Wsa is rebound to a new array below.
        Weights[ni] = Wsa
        #---- update P, train and predict ----
        Psa = Wsa / np.sum(Wsa)
        clf = RFC(n_estimators=5, criterion='entropy', max_depth=2)
        # clf = LRC(solver = 'liblinear',multi_class='ovr')
        # clf = LinearSVC(multi_class='ovr')
        # clf = TreeC(splitter='best',max_depth=3)
        # update the W
        if 0:  # (disabled) Update the weights alternately: train on A, prd on S => update S; then train on S, prd on A, => update A
            clf.fit(Xa, Ya, sample_weight=Psa[-Na:])
            YsPrd = clf.predict(Xs)
            RorWs = 1 * (YsPrd != Ys)
            clf.fit(Xs, Ys, sample_weight=Psa[:Ns])
            YaPrd = clf.predict(Xa)
            RorWa = 1 * (YaPrd != Ya)
            RorW = np.concatenate((RorWs, RorWa))
        else:  # Normal TrAdaBoost, Train on S&A, prd on S&A
            clf.fit(Xsa, Ysa, sample_weight=Psa)
            YsaPrd = clf.predict(Xsa)
            # RorW is 1 where the prediction is wrong, 0 where it is right
            RorW = 1 * (YsaPrd != Ysa)
        Eps = np.sum((Wsa * RorW)[-Na:]) / np.sum(
            Wsa[-Na:])  # Eps is computed from the A data only
        Epss.append(1 - Eps)
        # adjust Eps: cap at 0.4, and replace non-positive values by 0.01
        if Eps >= 0.4:
            Eps = 0.4
        elif Eps <= 0:
            Eps = 0.01
        # Weight update
        if 1:
            Alpha = np.sqrt(Eps / (1 - Eps))
            # Alpha = Eps/(1-Eps) # This is the original update from Dai's
            Coef = np.concatenate(
                (Beta * np.ones(Ns), (1 / Alpha) * np.ones(Na)))
            # Coef ** 0 == 1 leaves correctly classified samples unchanged.
            wUpdate = np.power(Coef, RorW)
        else:  # (disabled) Update with momentum
            Alpha = np.sqrt((1 - Eps) / (1 + Eps))
            Ct = 2.5 * (1 - Eps)
            Coef = np.concatenate(
                (Ct * Beta * np.ones(Ns), Alpha * np.ones(Na)))
            wUpdate = np.power(Coef, -25 * RorW / nIters)
        # Now update
        Wsa = Wsa * wUpdate
        # result & summary
        Yprd = clf.predict(Xt)
        TestPrd[ni] = Yprd
        TestAcc.append(Metrics.Accuracy(Yt, Yprd))
        # Refit the same classifier on the aux data alone for comparison.
        clf.fit(Xa, Ya, sample_weight=Psa[-Na:])
        AdaAcc.append(Metrics.Accuracy(Yt, clf.predict(Xt)))
        p.update(ni + 1)  # progress bar
    # print(np.mean(Target))
    p.finish()
    return Weights, Epss, TestPrd, TestAcc, AdaAcc
p.update(ni + 1) # progress bar # print(np.mean(Target)) p.finish() return Weights, Epss, TestPrd, TestAcc, AdaAcc #============================================================ # 3 data set are tested. Synthetic, UCI heart desease, Amazon+Webcam idChanged = [] Xs, Ys, Xa, Ya, Xt, Yt, idChanged = Datasets.gen_noisy_classi_data() #Xs,Ys,Xa,Ya,Xt,Yt = Datasets.load_heart() #Xs,Ys,Xa,Ya,Xt,Yt = Datasets.load_pics() nIters = 50 # Baseline, from A to T clf0 = RFC(n_estimators=5, criterion='entropy', max_depth=2) clf0.fit(Xa, Ya) Acc0 = Metrics.Accuracy(Yt, clf0.predict(Xt)) SPweights, Acc_auxi, All_test_prd, Acc_test, Acc_AdaOnly = MultiTrAdap( Xs, Ys, Xa, Ya, Xt, Yt, nIters=nIters) PrdDf = pd.DataFrame.from_dict(All_test_prd) HalfDf = PrdDf.iloc[:, round(nIters / 2):] # use the last half only BoostPrd = HalfDf.mode(axis=1) # Boosting: simply vote AccB = Metrics.Accuracy(Yt, BoostPrd[0]) # ================ Plot=========================== import matplotlib.pyplot as plt fig1 = plt.figure() ax1 = fig1.add_subplot(111)
def __init__(self):
    """Set up ``self.model`` as a 10-tree random forest using 7 parallel jobs."""
    # Alternatives tried earlier: MLPClassifier([50, 10]), BernoulliNB().
    forest_kwargs = dict(n_estimators=10, n_jobs=7)
    self.model = RFC(**forest_kwargs)
orig_stdout = sys.stdout o = open('trainingtime.txt', 'w') sys.stdout = o data = pd.concat([data_neg, data_pos]) data.index = range(len(data.index)) vectors_FI = data[data.columns[4:]] labels_FI = data[data.columns[3]] rfc1 = RFC( n_estimators=50, max_features=10, max_depth=30, min_samples_split=3, criterion="entropy", n_jobs=-1 ) start = time.time() rfc1.fit(vectors_FI, labels_FI) end = time.time() print ('PROCESSING TIME = ', end - start) FI = rfc1.feature_importances_
estimator_result = [] for tree in self.trees: estimator_result.append(tree.predict(x.reshape(1, -1))[0]) results.append(np.mean(estimator_result)) return np.array(results) if __name__ == "__main__": X, y = make_classification(n_samples=200, n_features=8, n_informative=4, random_state=2) RF1 = RandomForestClassifier(n_estimators=10, max_depth=3) RF2 = RFC(n_estimators=10, max_depth=3) RF1.fit(X, y) res1 = RF1.predict(X) RF2.fit(X, y) res2 = RF2.predict(X) print('结果一样的比例', (np.abs(res1 - res2) < 1e-5).mean()) X, y = make_regression(n_samples=200, random_state=2) RF1 = RandomForestRegressor(n_estimators=10, max_depth=3) RF2 = RFR(n_estimators=10, max_depth=3) RF1.fit(X, y)
for cluster_id in np.unique(best_model.labels_): # print("Cluster", cluster_id) in_cluster = best_model.labels_ == cluster_id faces = X_train[in_cluster].reshape(-1, 64, 64) labels = y_train[in_cluster] # plot_faces(faces, labels) from sklearn.ensemble import RandomForestClassifier as RFC # rfc = RFC(n_estimators = 150, random_state = 42) # rfc.fit(X_train_pca, y_train) # print(rfc.score(X_valid_pca, y_valid)) # 0.9 X_train_reduced = best_model.transform(X_train_pca) X_valid_reduced = best_model.transform(X_valid_pca) X_test_reduced = best_model.transform(X_test_pca) rfc = RFC(n_estimators=150, random_state=42) rfc.fit(X_train_reduced, y_train) # print(rfc.score(X_valid_reduced, y_valid)) # 0.75 from sklearn.pipeline import Pipeline for n_clusters in k_range: pipeline = Pipeline([("kmeans", KMeans(n_clusters=n_clusters, random_state=n_clusters)), ("forest_clf", RFC(n_estimators=150, random_state=42))]) pipeline.fit(X_train_pca, y_train) # print(n_clusters, pipeline.score(X_valid_pca, y_valid)) X_train_extended = np.c_[X_train_pca, X_train_reduced] X_valid_extended = np.c_[X_valid_pca, X_valid_reduced]
################################################################################ """ ### your code here! name your classifier object clf if you want the ### visualization code (prettyPicture) to show you the decision boundary from sklearn.ensemble import RandomForestClassifier as RFC from sklearn.metrics import accuracy_score from sklearn.svm import SVC import sys from time import time import math myClassifier1 = RFC(n_estimators=1000, criterion="entropy", min_samples_split=80, max_features=None, n_jobs=-1) startTrainingTime = time() myClassifier1.fit(features_train, labels_train) print "The training time is: ", round(time() - startTrainingTime, 3), "seconds" startPredictionTime = time() myPredictions1 = myClassifier1.predict(features_test) print "The prediction time is: ", round(time() - startPredictionTime, 3), "seconds" accuracy1 = accuracy_score(labels_test, myPredictions1) print "Accuracy is: ", accuracy1
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.model_selection import cross_val_score

# Load the digit data: first column is the label, the rest are features.
train_data = pd.read_csv(
    r'G:\pycharm_pyproject\AiLearning\my_try\caicai_try\dataset\digit_data\digit_data_train.csv'
)
X_raw = train_data.iloc[:, 1:]
Y = train_data.iloc[:, 0]
X_raw = X_raw.values
Y = Y.values.reshape(-1, 1)
print(X_raw.shape)
print(Y.shape)
print('*' * 50)

# Recursive feature elimination driven by a small random forest.
rfc = RFC(n_estimators=10, random_state=0)
feature_to_select = 340
step = 50
# Pass the ``step`` variable instead of repeating the literal 50, so that
# changing the setting above actually changes the elimination step.
selector = RFE(rfc, n_features_to_select=feature_to_select,
               step=step).fit(X_raw, Y.flatten())
# support_: bool array indicating whether each feature was selected
# print(selector.support_)
# print('*'*50)
# print(selector.support_.sum())
# print('*'*50)
# ranking_: overall ranking of the features
# print(selector.ranking_)
# print('*'*50)
# X_wrapper = selector.transform(X_raw)
def RFCClassifier(cls):
    """Build a ``cls`` instance wrapping a random forest and its search grid.

    The wrapped estimator starts with 100 trees; the grid passed as
    ``par_grid_dict`` varies ``n_estimators`` over 10, 40, ..., 250.
    """
    search_space = {'n_estimators': np.arange(10, 260, 30)}
    base_forest = RFC(n_estimators=100)
    return cls(base_forest, par_grid_dict=search_space)
# Separate the target column from the feature frame.
Default = data['Default']
data.drop(['Default'], axis=1, inplace=True)
plt.scatter(data['PAY_0'], data['PAY_2'])
# Removing multicollinear variables: average neighbouring PAY_* columns,
# then drop the columns that were folded in.
data['PAY_5'] = (data['PAY_5'] + data['PAY_6']) / 2
data['PAY_2'] = (data['PAY_2'] + data['PAY_3']) / 2
multi = ['PAY_6', 'PAY_4', 'PAY_3']
data.drop(multi, axis=1, inplace=True)
# NOTE(review): PAY_2 was just overwritten with the PAY_2/PAY_3 average and
# is dropped here anyway - confirm the averaged column is not wanted.
data.drop(['PAY_2'], axis=1, inplace=True)
# Varying n_estimators in RFC and GBC to analyze the variance in accuracy
# NOTE(review): x_tr / y_tr / x_te / y_te are defined elsewhere; this block
# does not derive them from ``data``.
Accuracy = []  # random-forest test scores for n_estimators = 10, 20, 30, 40
for i in range(1, 5):
    # NOTE(review): the alias RFC is rebound to an *instance* on the next
    # line; the loop only works because the import re-runs every iteration.
    from sklearn.ensemble import RandomForestClassifier as RFC
    RFC = RFC(max_features=3, n_jobs=5, n_estimators=i * (10))
    RFC.fit(x_tr, y_tr)
    Accuracy.append(RFC.score(x_te, y_te))
accuracy = []  # gradient-boosting test scores for the same n_estimators
for i in range(1, 5):
    # NOTE(review): here RFC actually names GradientBoostingClassifier.
    from sklearn.ensemble import GradientBoostingClassifier as RFC
    RFC = RFC(max_leaf_nodes=5, n_estimators=i * (10))
    RFC.fit(x_tr, y_tr)
    accuracy.append(RFC.score(x_te, y_te))
# Plot both score curves; the x axis is the loop index 1..4, not n_estimators.
fig = plt.figure(figsize=(7, 7))
plt.plot(np.arange(1, 5), Accuracy, c='r')  # red: random forest
plt.plot(np.arange(1, 5), accuracy, c='g')  # green: gradient boosting
plt.xlabel('variable')
plt.ylabel('accuracy')
# print(type(allY_train[0])) # print("\n\nSAX train:", SAX_train) # print("\n\nSAX test:", SAX_train) # print("\n\nSAY train:", SAY_train) # print("\n\nSAY test:", SAY_train) allY_test = allY_test.astype(float) allY_train = allY_train.astype(float) SAY_test = SAY_test.astype(float) SAY_train = SAY_train.astype(float) all_kNN = KNeighborsClassifier() SA_kNN = KNeighborsClassifier() all_Tree = tree.DecisionTreeClassifier() SA_Tree = tree.DecisionTreeClassifier() all_RandFor = RFC(n_estimators=25) SA_RandFor = RFC(n_estimators=25) all_kNN.fit(allX_train, allY_train) SA_kNN.fit(SAX_train, SAY_train) all_Tree.fit(allX_train, allY_train) SA_Tree.fit(SAX_train, SAY_train) all_RandFor.fit(allX_train, allY_train) SA_RandFor.fit(SAX_train, SAY_train) model_tuples = [(all_Tree, 'All Tree Classifier'), (SA_Tree, 'SA Tree Classifier'), (all_kNN, "All kNN Classfier"), (SA_kNN, 'SA kNN Classifier'), (all_RandFor, "All Random Forest Classifier"), (SA_RandFor, 'SA Random Forest Classifier')]
test_target = effects[test_split:] i = 0 test_target = np.asarray(test_target) fp_train = [] for mol in train_sm: fp_train.append(mol2imageT(mol, N=2048)) fp_train = np.asarray(fp_train) fp_test = [] for mol in test_sm: fp_test.append(mol2imageT(mol, N=2048)) fp_test = np.asarray(fp_test) classifier = RFC(n_estimators=100, oob_score=True) ''' print fp_train print "------printing special fp-train-------" print fp_train[:,None] ''' classifier.fit(fp_train, train_target.ravel()) #using oob_for for data set pred_ans = classifier.predict(fp_test) #pred_ans = classifier.oob_decision_function_ #get rounded answers binary_pred_ans = [] for p in pred_ans:
features_train, features_test, labels_train, labels_test = model_selection.train_test_split( training_data[features], training_data['target'], test_size=0.3, random_state=0) # parameters parameters = { 'n_estimators': [20, 25], 'random_state': [0], 'max_features': [2], 'min_samples_leaf': [150, 200, 250] } # implementing my classifier model = RFC(n_jobs=-1) grid = GS(estimator=model, param_grid=parameters) grid.fit(features_train, labels_train) # Calculate the logloss of the model prob_predictions_class_test = grid.predict(features_test) prob_predictions_test = grid.predict_proba(features_test) logloss = log_loss(labels_test, prob_predictions_test) accuracy = accuracy_score(labels_test, prob_predictions_class_test, normalize=True, sample_weight=None) # predict class probabilities for the tourney set
def run_CV(self):
    """Run a seeded 10-fold cross-validation of a random forest.

    The instance indices ``0 .. len(self.label) - 1`` are shuffled with
    NumPy's global RNG seeded to 3 and cut into 10 folds; the last fold
    takes the remainder. For each fold a fresh ``RFC(random_state=3)`` is
    stored in ``self.m_clf``, trained on the other folds of ``self.fn`` /
    ``self.label`` and scored on the held-out fold.

    Side effects: reseeds ``np.random``, prints the per-fold positive
    ratios and accuracies with their mean and standard deviation, and
    writes one accuracy per line to ``modelVersion + ".txt"``.

    Raises:
        ValueError: if there are fewer instances than folds, which would
            leave some test folds empty.
    """
    foldNum = 10

    totalInstanceNum = len(self.label)
    print("totalInstanceNum\t", totalInstanceNum)
    print("featureNum", len(self.fn[0]))
    # print("non zero feature num", sum(self.fn[0]))

    if totalInstanceNum < foldNum:
        raise ValueError(
            "need at least %d instances for %d-fold CV, got %d"
            % (foldNum, foldNum, totalInstanceNum))

    # Shuffle a plain list of indices with a fixed seed for reproducibility.
    indexList = list(range(totalInstanceNum))
    np.random.seed(3)
    np.random.shuffle(indexList)

    # The first foldNum-1 folds are equal-sized; the last takes what is left.
    foldInstanceNum = totalInstanceNum // foldNum
    foldInstanceList = [
        indexList[k * foldInstanceNum:(k + 1) * foldInstanceNum]
        for k in range(foldNum - 1)
    ]
    foldInstanceList.append(indexList[foldInstanceNum * (foldNum - 1):])
    # kf = KFold(totalInstanceNum, n_folds=self.fold, shuffle=True)
    # random.seed(3)

    totalAccList = []
    posRatioList = []
    # self.PCAFeature(10)
    for foldIndex, test in enumerate(foldInstanceList):
        # self.m_clf = LinearSVC(random_state=3)
        # self.m_clf = SVC(random_state=3)
        # self.m_clf = LR(random_state=3)
        self.m_clf = RFC(random_state=3)

        # Training indices: every fold except the held-out one, in fold order.
        train = [
            idx
            for k, fold in enumerate(foldInstanceList) if k != foldIndex
            for idx in fold
        ]

        fn_test = self.fn[test]
        label_test = self.label[test]
        fn_train = self.fn[train]
        label_train = self.label[train]

        # Sum of the test labels divided by the number of test rows.
        posRatioList.append(np.sum(label_test) * 1.0 / len(fn_test))

        self.m_clf.fit(fn_train, label_train)
        label_preds = self.m_clf.predict(fn_test)
        totalAccList.append(accuracy_score(label_test, label_preds))

    print("posRatioList", posRatioList, np.mean(posRatioList),
          np.std(posRatioList))
    print("totalAccList", totalAccList, np.mean(totalAccList),
          np.std(totalAccList))

    # One accuracy per line; ``with`` closes the file even if a write fails.
    totalACCFile = modelVersion + ".txt"
    with open(totalACCFile, "w") as f:
        for acc in totalAccList:
            f.write(str(acc))
            f.write("\n")
def oneTrialWithCertainTrainSize(
        num_pos_sample=50,
        neg_pos_ratio=1,
        pos_training_dataset=None,
        pos_testing_dataset=None,
        neg_dataset=None,
        train_test_split=0,  # obsolete feature, keep default to bypass; achieved by "num_pos_sample"
        test_stratify=True,  # obsolete feature, keep default to bypass; achieved by "num_pos_sample"
        scoring="f1",
        plt_or_not=True):
    """Train one random forest on a sampled training set and sweep thresholds.

    ``num_pos_sample`` positive and ``int(num_pos_sample * neg_pos_ratio)``
    negative rows are drawn with ``takingSamples``; a 100-tree forest is
    fitted on them and evaluated on a test set chosen as follows:

    * ``train_test_split == 0`` (default): all unpicked rows, or
      ``pos_testing_dataset`` plus the unpicked negatives when it is given.
    * ``train_test_split != 0`` (legacy): a test set of derived size is
      sampled from the unpicked rows, stratified or not per
      ``test_stratify``.

    Args:
        num_pos_sample: number of positive training rows to draw.
        neg_pos_ratio: negatives per positive in the training set.
        pos_training_dataset: list of positive rows to sample from.
        pos_testing_dataset: optional list of positive test rows.
        neg_dataset: list of negative rows to sample from.
        train_test_split: legacy train fraction; 0 disables it.
        test_stratify: legacy flag, only used when ``train_test_split != 0``.
        scoring: unused; kept for interface compatibility.
        plt_or_not: when true, plot the precision-recall curve.

    Returns:
        A list of ``[threshold, precision, recall, f1]`` for thresholds
        0.10, 0.11, ..., 0.99, where a sample counts as positive when its
        predicted probability of class 1 is at least the threshold.

    Raises:
        AssertionError: if the training datasets are not lists.
    """
    assert (isinstance(pos_training_dataset, list)
            and isinstance(neg_dataset, list)), "input datasets should be lists"
    num_neg_sample = int(num_pos_sample * neg_pos_ratio)

    # take sample of num_pos_sample number of positive examples
    (posPicked, posNotPicked) = takingSamples(pos_training_dataset,
                                              num=num_pos_sample)
    (negPicked, negNotPicked) = takingSamples(neg_dataset, num=num_neg_sample)

    # create train_X, train_y
    train_X = pd.DataFrame(posPicked + negPicked)
    train_y = np.array([1] * len(posPicked) + [0] * len(negPicked))

    # create test_X and test_y
    if train_test_split != 0:
        testSize = int((num_pos_sample + num_neg_sample) / train_test_split *
                       (1 - train_test_split))  # size of test set
        if test_stratify:
            testPosSize = int(float(testSize) / (neg_pos_ratio + 1))
            testNegSize = testSize - testPosSize
            posTest = takingSamples(posNotPicked, num=testPosSize)[0]
            negTest = takingSamples(negNotPicked, num=testNegSize)[0]
            test_X = pd.DataFrame(posTest + negTest)
            # Previously test_y was never assigned on this path (NameError).
            # Labels follow the number of rows actually returned.
            test_y = np.array([1] * len(posTest) + [0] * len(negTest))
        else:
            # Tag *copies* of the rows with their label so that sampling
            # keeps feature/label pairs together without mutating the
            # caller's data (the old code appended to the rows in place and
            # then failed on ``np.array()``).
            labelled = ([list(row) + [1] for row in posNotPicked] +
                        [list(row) + [0] for row in negNotPicked])
            sampled = takingSamples(labelled, num=testSize)[0]
            test_X = pd.DataFrame([row[:-1] for row in sampled])
            test_y = np.array([row[-1] for row in sampled])
    else:
        if pos_testing_dataset is None:
            posTest = posNotPicked
        else:
            posTest = pos_testing_dataset
        test_X = pd.DataFrame(posTest + negNotPicked)
        test_y = np.array([1] * len(posTest) + [0] * len(negNotPicked))

    # train and test the model
    reg = RFC(n_estimators=100)
    # reg = RFC(n_estimators=200, max_features='log2')
    # reg = LogisticRegressionCV(scoring=scoring)
    LogModel = reg.fit(train_X, train_y)
    y_predlog = LogModel.predict_proba(test_X)
    y_predlog_1 = y_predlog[:, 1]  # probability of class 1
    prec, rec, _ = precision_recall_curve(test_y, y_predlog_1)
    if plt_or_not:
        plt.plot(rec, prec)
        plt.xlabel("Recall")
        plt.ylabel("Precision")
        # NOTE(review): the title still says "Logistic Regression" although
        # the model is a random forest; left unchanged as runtime text.
        plt.title("Rec-Prec Curve of Logistic Regression Trials")

    # (probability, truth) pairs sorted by ascending probability
    pred_combine = sorted(zip(y_predlog_1, test_y),
                          key=operator.itemgetter(0))

    # thresholds 0.10, 0.11, ..., 0.99
    thres_new = [k / 100.0 for k in range(10, 100)]

    # generate "threshold, prec, rec, f1" list
    # Start with everything predicted positive, then move samples whose
    # probability falls below the rising threshold to the negative side.
    result = []
    item_index = 0
    total = len(pred_combine)
    FN_accu = 0
    TN_accu = 0
    TP_accu = list(test_y).count(1)
    FP_accu = list(test_y).count(0)
    for thres in thres_new:
        while item_index < total and pred_combine[item_index][0] < thres:
            if pred_combine[item_index][1] == 1:
                # actually 1, now predicted as 0
                FN_accu += 1
                TP_accu -= 1
            else:
                # actually 0, now predicted as 0
                TN_accu += 1
                FP_accu -= 1
            item_index += 1

        # With no true positives left, precision, recall and F1 are all 0.
        if TP_accu == 0:
            preci = 0
            recal = 0
        else:
            preci = float(TP_accu) / (TP_accu + FP_accu)
            recal = float(TP_accu) / (FN_accu + TP_accu)
        if 2 * preci * recal == 0:
            fone = 0
        else:
            fone = 2 * preci * recal / (preci + recal)
        result.append([thres, preci, recal, fone])
    return result  # 90 rows
from sklearn.feature_extraction.text import TfidfVectorizer vectorizer = TfidfVectorizer(analyzer=text_process).fit(X) print(len(vectorizer.vocabulary_)) X = vectorizer.transform(X) from sklearn.model_selection import train_test_split X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=101) from sklearn.naive_bayes import MultinomialNB as MNB from sklearn.linear_model import LogisticRegression as LR from sklearn.ensemble import RandomForestClassifier as RFC from sklearn.ensemble import VotingClassifier as VC mnb = MNB(alpha=10) lr = LR(random_state=101) rfc = RFC(n_estimators=80, criterion="entropy", random_state=42, n_jobs=-1) clf = VC(estimators=[('mnb', mnb), ('lr', lr), ('rfc', rfc)], voting='hard') clf.fit(X_train,y_train) predict = clf.predict(X_test) from sklearn.metrics import confusion_matrix, classification_report print(confusion_matrix(y_test, predict)) print('\n') print(classification_report(y_test, predict)) def predictor(s): s = vectorizer.transform(s) pre = clf.predict(s)
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as lda
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.metrics import confusion_matrix, accuracy_score

# Load the Iris measurements (features) and species (target).
df = pd.read_csv("/home/shaury/Downloads/nptel/Iris.csv", delimiter=",")
x, y = df[["SepalLengthCm", "SepalWidthCm", "PetalLengthCm",
           "PetalWidthCm"]], df["Species"]

x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    random_state=1,
                                                    test_size=0.15)

# Project onto a single LDA component. The projection is learned from the
# training data only and then applied unchanged to the test data; refitting
# on the test set (as before) leaks test labels and puts the two sets in
# different projected spaces.
l1 = lda(n_components=1)
x_train = l1.fit_transform(x_train, y_train)
x_test = l1.transform(x_test)

# Shallow random forest on the 1-D LDA feature.
cl = RFC(max_depth=2, random_state=0)
cl.fit(x_train, y_train)
y_pred = cl.predict(x_test)

cm = confusion_matrix(y_test, y_pred)
print(cm)
print('Accuracy' + str(accuracy_score(y_test, y_pred)))
if word_count == 0: # handle the exceptional case where tweet is empty. tweet_mean_vec = np.zeros((K, 1)) else: tweet_mean_vec = sum_vec / word_count if file_no == 0: # assign positive labels tweet_label = 1 elif file_no == 1: # assign negative labels tweet_label = -1 else: raise "Out Of Range File Error" x_train[:, index] = tweet_mean_vec.flatten() y_train[index] = tweet_label with open('train_data.pkl', 'wb') as train_data_picklefile: pickle.dump((x_train, y_train), train_data_picklefile) else: with open('train_data.pkl', 'rb') as train_data_picklefile: x_train, y_train = pickle.load(train_data_picklefile) # Classification rfc = RFC() print(extract_features('I am happy', K)) # print(vector_dict['good']) # print(x_train.shape, y_train.shape) # rfc.fit(np.transpose(x_train), y_train.flatten()) # print(rfc.predict(np.array(vector_dict['good']).reshape(1, K)))