def run_random_forest(training_data, training_labels, validation_data, validation_labels,
                      best_max_depth=None, best_min_samples_leaf=None):
    """Sweep ``n_estimators`` over 1..50 and return the best validation result.

    For every forest size a classifier is built, scored through
    ``get_training_accuracy.run`` and both accuracies (as percentages) are
    plotted and saved to a PNG in the working directory.

    Args:
        training_data, training_labels: data handed to ``get_training_accuracy.run``.
        validation_data, validation_labels: held-out data handed to the same helper.
        best_max_depth: optional ``max_depth`` for every forest. ``None`` or the
            legacy ``[]`` sentinel means "not provided".
        best_min_samples_leaf: optional ``min_samples_leaf``; same sentinel rule.

    Returns:
        ``(best_n_estimator, best_accuracy)`` where the accuracy is the highest
        validation accuracy in percent.
    """
    def _provided(value):
        # ``[]`` used to be the default, so callers may still pass it explicitly.
        return value is not None and not (isinstance(value, list) and not value)

    # Only forward the tuned parameters that were actually supplied; previously a
    # lone ``best_max_depth`` also forwarded ``min_samples_leaf=[]``.
    tuned_kwargs = {}
    if _provided(best_max_depth):
        tuned_kwargs['max_depth'] = best_max_depth
    if _provided(best_min_samples_leaf):
        tuned_kwargs['min_samples_leaf'] = best_min_samples_leaf

    n_estimators_list = range(1, 51)
    total = len(n_estimators_list)
    training_accuracy_list = []
    validation_accuracy_list = []
    for this_n_estimator in n_estimators_list:
        print('Processing n estimator: ' + str(this_n_estimator) + '/' + str(total))
        clf = rfc(n_estimators=this_n_estimator, **tuned_kwargs)
        training_accuracy, validation_accuracy = get_training_accuracy.run(
            clf, training_data, training_labels, validation_data, validation_labels)
        # Stored as percentages so they can be plotted and returned directly.
        training_accuracy_list.append(training_accuracy * 100)
        validation_accuracy_list.append(validation_accuracy * 100)
        print(CURSOR_UP_ONE + ERASE_LINE + CURSOR_UP_ONE)

    # Plot data
    pylab.plot(n_estimators_list, training_accuracy_list)
    pylab.plot(n_estimators_list, validation_accuracy_list)
    pylab.xlabel('N Estimators')
    pylab.ylabel('Accuracy (% out of 100)')
    pylab.legend(['Training Accuracy', 'Validation Accuracy'], loc=2)
    pylab.grid(True)
    if not tuned_kwargs:
        pylab.title('Training and Validation Accuracy as function of N Estimators')
        pylab.savefig("Accuracy_vs_N_Estimators.png")
    else:
        pylab.title('Training and Validation Accuracy as function of N Estimators With'
                    + ' Best Max Depth and Best Min Sample Leaf')
        pylab.savefig("Accuracy_vs_N_Estimators_modified.png")
    # Clear first, then close: clearing after closing would open a new empty figure.
    pylab.clf()
    pylab.close()

    best_index, best_accuracy = max(enumerate(validation_accuracy_list), key=itemgetter(1))
    best_n_estimator = n_estimators_list[best_index]
    return (best_n_estimator, best_accuracy)
def regression(data, y, model="forest"):
    """Fit and return a scikit-learn regressor chosen by name.

    Args:
        data: feature matrix passed to ``fit``.
        y: regression targets passed to ``fit``.
        model: one of ``"forest"``, ``"tree"``, ``"extra"``, ``"linear"``,
            ``"svm"``, ``"boost"`` or ``"neural"``.

    Returns:
        The fitted estimator.

    Raises:
        ValueError: if ``model`` is not one of the supported names (previously
            this surfaced as an unbound-local error on ``est``).
    """
    # Imports stay local so only the selected estimator's module is loaded.
    if model == "forest":
        from sklearn.ensemble import RandomForestRegressor
        est = RandomForestRegressor(n_estimators=10, n_jobs=-1)
    elif model == "tree":
        from sklearn.tree import DecisionTreeRegressor
        est = DecisionTreeRegressor()
    elif model == "extra":
        from sklearn.ensemble import ExtraTreesRegressor
        est = ExtraTreesRegressor(n_estimators=10, n_jobs=-1)
    elif model == "linear":
        from sklearn.linear_model import LinearRegression
        est = LinearRegression(n_jobs=-1)
    elif model == "svm":
        from sklearn.svm import SVR
        est = SVR()
    elif model == "boost":
        from sklearn.ensemble import GradientBoostingRegressor
        est = GradientBoostingRegressor(n_estimators=10)
    elif model == "neural":
        from sklearn.neural_network import MLPRegressor
        est = MLPRegressor(max_iter=10, learning_rate_init=1)
    else:
        raise ValueError("Unknown regression model: {!r}".format(model))
    est.fit(data, y)
    return est
def classifier(data, y, model="forest"):
    """Fit and return a scikit-learn classifier chosen by name.

    Args:
        data: feature matrix passed to ``fit``.
        y: class labels passed to ``fit``.
        model: one of ``"forest"``, ``"tree"``, ``"extra"``, ``"logistic"``,
            ``"svm"``, ``"boost"`` or ``"neural"``.

    Returns:
        The fitted estimator.

    Raises:
        ValueError: if ``model`` is not one of the supported names (previously
            this surfaced as an unbound-local error on ``est``).
    """
    # Imports stay local so only the selected estimator's module is loaded.
    if model == "forest":
        from sklearn.ensemble import RandomForestClassifier
        est = RandomForestClassifier(n_estimators=10, n_jobs=-1)
    elif model == "tree":
        from sklearn.tree import DecisionTreeClassifier
        est = DecisionTreeClassifier()
    elif model == "extra":
        from sklearn.ensemble import ExtraTreesClassifier
        est = ExtraTreesClassifier(n_estimators=10, n_jobs=-1)
    elif model == "logistic":
        from sklearn.linear_model import LogisticRegression
        # Keep using ``nunique`` when the labels offer it; otherwise count
        # distinct labels directly so plain sequences/arrays work as well.
        cases = y.nunique() if hasattr(y, "nunique") else len(set(y))
        if cases > 2:
            est = LogisticRegression(solver="newton-cg", multi_class="multinomial")
        else:
            est = LogisticRegression(n_jobs=-1)
    elif model == "svm":
        from sklearn.svm import SVC
        est = SVC()
    elif model == "boost":
        from sklearn.ensemble import GradientBoostingClassifier
        est = GradientBoostingClassifier(n_estimators=10)
    elif model == "neural":
        from sklearn.neural_network import MLPClassifier
        est = MLPClassifier(max_iter=10, learning_rate_init=1)
    else:
        raise ValueError("Unknown classifier model: {!r}".format(model))
    est.fit(data, y)
    return est
def test_model():
    """Train a forest 100 times, relabelling confident training mismatches each round.

    Data comes from ``readData()``; the first ``m`` rows are the training split
    and the rest the test split (``m`` and ``filename`` are module-level names).
    After every fit the model is pickled to ``filename`` (overwriting the
    previous round) and the test accuracy is printed. Training samples whose
    predicted label disagrees with the stored label while the top class
    probability exceeds 0.8 are printed and overwritten in ``y_train`` with the
    prediction, so later rounds train on the modified labels.
    """
    dataX, dataY = readData()
    dataY = dataY.reshape(dataY.shape[0])
    x_train, x_test = dataX[:m, :], dataX[m:, :]
    y_train, y_test = dataY[:m], dataY[m:]
    n_estimators = 100
    for _ in range(100):
        model = rfc(n_estimators=n_estimators)
        model.fit(x_train, y_train)
        # save the model to disk; the context manager closes the handle promptly
        with open(filename, 'wb') as model_file:
            pickle.dump(model, model_file)
        score = model.score(x_test, y_test)
        print("acc = ", score)
        # Highest class probability per training sample.
        acc_pre = np.max(model.predict_proba(x_train), 1)
        label_pre = model.predict(x_train)
        for idx, (confidence, predicted) in enumerate(zip(acc_pre, label_pre)):
            if predicted != y_train[idx] and confidence > 0.8:
                print(
                    "label_predict = {}, label_Real = {}, ACC = {}".format(
                        predicted, y_train[idx], confidence))
                y_train[idx] = predicted
def rfc_learning_curve(features, labels, training_sizes, gini, score='accuracy',
                       perc=False, return_raw=False):
    """Compute a learning curve for a fixed-configuration random forest.

    Args:
        features, labels: data passed to ``learning_curve``.
        training_sizes: forwarded as ``train_sizes``.
        gini: forwarded to the forest as ``min_impurity_decrease``.
        score: scoring name forwarded as ``scoring``.
        perc: if true, the spread is the 95th percentile of the per-split test
            scores; otherwise their standard deviation.
        return_raw: if true, return the per-split test scores instead of
            their summary.

    Returns:
        ``(train_sizes, test_scores)`` when ``return_raw`` is set, else
        ``(train_sizes, mean_test_score, spread)``.
    """
    seed = 10  # shared by the forest, the splitter and learning_curve
    forest = rfc(n_estimators=20,
                 max_depth=8,
                 max_features=None,
                 min_impurity_decrease=gini,
                 random_state=seed)
    splitter = StratifiedShuffleSplit(n_splits=10, test_size=0.2, random_state=seed)
    sizes, _train_scores, fold_scores = learning_curve(
        forest,
        features,
        labels,
        cv=splitter,
        train_sizes=training_sizes,
        shuffle=True,
        scoring=score,
        random_state=seed)

    if return_raw:
        return sizes, fold_scores

    mean_scores = np.mean(fold_scores, axis=1)
    if perc:
        spread = np.percentile(fold_scores, 95, axis=1)
    else:
        spread = np.std(fold_scores, axis=1)
    return sizes, mean_scores, spread
def train_model(currHist, context):
    """Fit a classifier that maps recent relative price changes to a move label.

    Features for sample ``i`` are the ``context.historicalDays - 1`` consecutive
    relative changes starting at index ``i``. The label compares the value at
    ``i + historicalDays + predictionDays`` with the value ``predictionDays``
    earlier: 1 if it is above by more than ``context.percentChange``
    (relative), -1 if below by more than that, otherwise 0.

    Returns:
        The fitted classifier.
    """
    window = context.historicalDays
    horizon = context.predictionDays
    threshold = context.percentChange

    # Relative change between consecutive entries.
    deltas = [(currHist[k + 1] - currHist[k]) / currHist[k]
              for k in range(len(currHist) - 1)]

    samples = []
    targets = []
    for start in range(len(currHist) - (window + horizon)):
        day = start + window + horizon
        reference = currHist[day - horizon]
        if currHist[day] > reference * (1 + threshold):
            label = 1
        elif currHist[day] < reference * (1 - threshold):
            label = -1
        else:
            label = 0
        samples.append([deltas[start + offset] for offset in range(window - 1)])
        targets.append(label)

    model = rfc()
    model.fit(samples, targets)
    return model
def createMoodTestModel(train, test):
    """Fit a forest on ``train`` and print cross-validation scores on ``test``.

    The second-to-last column of each array is used as the target and every
    other column as a feature. Prints ``test.shape``, ``train.shape`` and the
    mean cross-validation score, in that order.
    """
    forest = rfc()
    forest.fit(np.delete(train, -2, 1), train[:, -2])
    # NOTE(review): cross_val_score is given only the test array, so the fit
    # above may not influence the reported score - confirm the intent.
    scores = cross_val_score(forest, np.delete(test, -2, 1), test[:, -2])
    # Single-argument print calls behave the same on Python 2 and Python 3.
    print(test.shape)
    print(train.shape)
    print(scores.mean())
def randomForest_new(trainData, trainTarget, testData, testTarget, Act):
    """Train a 120-tree forest on the given split and print evaluation metrics.

    Args:
        trainData, trainTarget: training features and labels (converted to float arrays).
        testData, testTarget: evaluation features and labels (converted to float arrays).
        Act: label used in the ``perf_measure`` title; the value ``'dummy'``
            skips that call.

    Returns:
        The ROC AUC score computed from the hard predictions on the test data.
    """
    print('==========Using Random forest classifier==========')
    # The ``np.float`` alias was removed from NumPy; builtin ``float`` is the same dtype.
    trainX = np.array(trainData).astype(float)
    trainy = np.array(trainTarget).astype(float)
    testX = np.array(testData).astype(float)
    testy = np.array(testTarget).astype(float)

    clf = rfc(n_estimators=120)
    clf.fit(trainX, trainy)
    print(clf.score(testX, testy))
    y_pred = clf.predict(testX)
    if Act != 'dummy':
        perf_measure(testy, y_pred, Act + ', Random Forest')
    precision, recall, fscore, support = score(testy, y_pred)
    x = roc_auc_score(testy, y_pred)
    # Single formatted strings reproduce the old comma-separated print output
    # and work on both Python 2 and Python 3.
    print('The roc auc score is:  {}'.format(x))
    print('The Avg precision score is: {}'.format(
        average_precision_score(testy, y_pred)))
    print('precision: {}'.format(precision))
    print('recall: {}'.format(recall))
    print('fscore: {}'.format(fscore))
    print('support: {}'.format(support))
    return x
def randomForest(X, y, Act):
    """Split the data 60/40, train a 120-tree forest and print evaluation metrics.

    Args:
        X, y: features and labels (converted to float arrays).
        Act: label used in the ``perf_measure`` title; the value ``'dummy'``
            skips that call.

    Returns:
        Whatever ``EERCalc`` returns for the test-set probabilities, labels and
        predictions, tagged ``"RF"``.
    """
    print('==========Using Random forest classifier==========')
    # The ``np.float`` alias was removed from NumPy; builtin ``float`` is the same dtype.
    X1 = np.array(X).astype(float)
    y1 = np.array(y).astype(float)
    # NOTE(review): ``cross_validation`` is the legacy scikit-learn module name;
    # left untouched because the file's imports are not visible here.
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        X1, y1, test_size=0.4, random_state=0)

    clf = rfc(n_estimators=120)
    clf.fit(X_train, y_train)
    print(clf.score(X_test, y_test))
    y_pred = clf.predict(X_test)
    if Act != 'dummy':
        perf_measure(y_test, y_pred, Act + ', Random Forest')
    precision, recall, fscore, support = score(y_test, y_pred)
    x = roc_auc_score(y_test, y_pred)
    # Single formatted strings reproduce the old comma-separated print output
    # and work on both Python 2 and Python 3.
    print('The roc auc score is:  {}'.format(x))
    print('The Avg precision score is: {}'.format(
        average_precision_score(y_test, y_pred)))
    print('precision: {}'.format(precision))
    print('recall: {}'.format(recall))
    print('fscore: {}'.format(fscore))
    print('support: {}'.format(support))
    return EERCalc(clf.predict_proba(X_test), y_test, y_pred, "RF")
def RandomForest(df, df_pred):
    """Train a forest on ``df`` and predict for ``df_pred``.

    ``df`` must contain a ``'goes_up'`` column, used as the target; all other
    columns are features. NaNs in ``df_pred`` are replaced via
    ``np.nan_to_num`` before predicting.

    Returns:
        ``(auc, predictions)``: the ROC AUC of the hard predictions on a 30%
        hold-out split, and the predicted labels for ``df_pred``.
    """
    target_column = 'goes_up'
    X = df.drop(target_column, axis=1).to_numpy()
    y = df[target_column].to_numpy()
    pred_arr = np.nan_to_num(df_pred.to_numpy())

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42)

    # Fixed hyper-parameters (no grid search is run here).
    model_rfc = rfc(criterion='entropy',
                    max_depth=5,
                    random_state=0,
                    min_samples_leaf=5)
    model_rfc.fit(X_train, y_train)

    df_RFC = model_rfc.predict(pred_arr)
    holdout_auc = roc_auc_score(y_test, model_rfc.predict(X_test))
    return holdout_auc, df_RFC
def train_model_rfc_calibrated(features, labels):
    """Grid-search a random forest on 70% of the data and calibrate it on the rest.

    Args:
        features, labels: arrays indexable by the index arrays the splitter yields.

    Returns:
        A ``CalibratedClassifierCV`` (sigmoid, ``cv='prefit'``) wrapping the
        fitted grid search.
    """
    # Stratified split so the class ratios are kept in both parts.
    splitter = StratifiedShuffleSplit(labels, n_iter=1, train_size=0.7, random_state=30)
    # A single split is requested, so this prints 1.
    print(len(splitter))
    for train_idx, calib_idx in splitter:
        features_train, features_calib = features[train_idx], features[calib_idx]
        labels_train, labels_calib = labels[train_idx], labels[calib_idx]

    for caption, part in (("features_train shape: ", features_train),
                          ("features_calib shape: ", features_calib),
                          ("labels_train shape: ", labels_train),
                          ("labels_calib shape: ", labels_calib)):
        print(caption, part.shape)

    print("Performing Grid Search ...")
    params_dict = {
        'criterion': ['entropy'],
        'n_estimators': [60, 70, 80, 90],
        'max_depth': [5, 6],
        'min_samples_leaf': [1, 2, 5],
        'min_samples_split': [2, 5, 10],
        'max_features': [6, 7, 8],
    }
    base_forest = rfc(random_state=30, n_jobs=4)
    clf = GridSearchCV(base_forest, params_dict, scoring='roc_auc', cv=5)
    clf.fit(features_train, labels_train)
    print("Best estimator: ", clf.best_estimator_)
    print("Best best scores: %.4f" % (clf.best_score_))

    # Sigmoid is used here; the original author's note says sklearn cautions
    # against 'isotonic' with fewer than 1000 calibration samples.
    print("Performing Calibration now ...")
    sigmoid = CalibratedClassifierCV(clf, cv='prefit', method='sigmoid')
    sigmoid.fit(features_calib, labels_calib)
    return sigmoid
def getResult(url):
    """Train a forest on ``dataset.csv`` and classify ``url``.

    The last CSV column is the label; the model is trained on a random 80%
    split on every call and its hold-out accuracy (in percent) is printed.

    Args:
        url: passed to ``feature_extraction.generate_data_set``.

    Returns:
        A human-readable verdict string. A prediction of -1 or any error while
        predicting is reported as phishing.
    """
    # The context manager closes the dataset file, which was previously left open.
    with open('dataset.csv', 'rt') as dataset_file:
        data = np.loadtxt(dataset_file, delimiter=",")

    # Separate features and labels
    X = data[:, :-1]
    y = data[:, -1]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    clf = rfc()
    clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test)
    print(score*100)  # accuracy score

    # Single-row feature matrix for the given url.
    X_new = np.array(feature_extraction.generate_data_set(url)).reshape(1, -1)

    try:
        prediction = clf.predict(X_new)
        if prediction == -1:
            return "Omg!!!.. its Phishing Url"
        else:
            return "hureeh!!....its a Genuine Url"
    except Exception:
        # Fail closed, but no longer swallow KeyboardInterrupt/SystemExit.
        return "Omg!!... its a Phishing Url"
def rf_classifier(X_train, y_train, X_test, y_test, method, estimators,
                  num_features, preprocessing_method):
    """Preprocess both splits, fit a random forest and predict on the test split.

    With ``method == 'pca'`` each split goes through ``dimensional_reduction``;
    otherwise through ``sklearn_preprocessing`` with ``preprocessing_method``.

    Returns:
        ``(classifier, X_train, y_train, y_test_predicted)`` where ``X_train``
        is the transformed training matrix.
    """
    print('Random Forest Classification using estimators', estimators,
          'and preprocessing via', method)
    classifier = rfc(n_estimators=estimators)

    if method == 'pca':
        print('Performing dimensional reduction with features', num_features)

        def transform(features, targets):
            return dimensional_reduction(features.astype(float), targets,
                                         num_features=num_features)
    else:

        def transform(features, targets):
            return sklearn_preprocessing(features.astype(float),
                                         targets.astype(float),
                                         preprocessing_method)

    # Train split first, then test split - each is transformed independently.
    X_train = transform(X_train, y_train)
    X_test = transform(X_test, y_test)

    classifier.fit(X_train, y_train)
    y_test_predicted = classifier.predict(X_test)
    return classifier, X_train, y_train, y_test_predicted
def initialize_models(X_train, y_train, X_test, y_test, accuracy, fscore):
    """Benchmark three learners on 1%, 10% and 100% of the training data.

    Each learner is run through ``train_predict`` for every sample size; the
    collected results are handed to ``vs.evaluate`` together with ``accuracy``
    and ``fscore``.

    Returns:
        The third learner (the ``abc`` instance).
    """
    seed = 13
    learners = [dtc(random_state=seed), rfc(random_state=seed), abc(random_state=seed)]

    # Integer sample counts: 1%, 10% and the whole training set.
    full_size = len(y_train)
    sample_sizes = [full_size // 100, full_size // 10, full_size]

    results = {}
    for learner in learners:
        per_size = {}
        results[learner.__class__.__name__] = per_size
        for slot, n_samples in enumerate(sample_sizes):
            per_size[slot] = train_predict(learner, n_samples, X_train, y_train,
                                           X_test, y_test)

    # Run metrics visualization for the three learners.
    vs.evaluate(results, accuracy, fscore)
    return learners[-1]
def resolve(url):
    """Train a forest on the bundled ``dataset.csv`` and classify ``url``.

    The dataset is read from the directory containing this module; its last
    column is the label. The model is trained on a random 80% split on every
    call.

    Args:
        url: passed to ``feature_extraction.generate_data_set``.

    Returns:
        ``"Legitimate Url"`` when the prediction is 1, ``"Suspicious Url"`` for
        any other prediction, and ``"Phishing Url"`` if predicting fails.
    """
    # os.path.join avoids resolving to "/dataset.csv" when dirname is empty.
    dataset_path = os.path.join(os.path.dirname(__file__), "dataset.csv")
    data = np.loadtxt(dataset_path, delimiter=",")

    # Separate features and labels
    X = data[:, :-1]
    y = data[:, -1]
    # The split is kept so the model still trains on 80% of the rows; the
    # hold-out part was only used for a score that was never reported.
    X_train, _X_test, y_train, _y_test = train_test_split(X, y, test_size=0.2)

    clf = rfc()
    clf.fit(X_train, y_train)

    # Single-row feature matrix for the given url.
    X_new = np.array(feature_extraction.generate_data_set(url)).reshape(1, -1)

    try:
        prediction = clf.predict(X_new)
        if prediction == 1:
            return "Legitimate Url"
        else:
            return "Suspicious Url"
    except Exception:
        # Fail closed, but no longer swallow KeyboardInterrupt/SystemExit.
        return "Phishing Url"
def RandomForestClassifer():
    """Train and evaluate a random forest on ``dataset.csv``, printing the metrics.

    The last CSV column is the label and the rest are features. A random 20%
    of the rows is held out for testing. Accuracy, f1-score, precision, recall
    and the combined training-plus-prediction time are printed; nothing is
    returned.
    """
    data = np.loadtxt("dataset.csv", delimiter = ",")
    x, y = data[:, :-1], data[:, -1]
    # 20% of the data is held out for testing.
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2)

    print("Training a random forest model on given dataset")
    start = time.time()  # the timed section covers creation, fit and predict
    classifier = rfc()
    print("Random Forest classifier created.")
    print("Beginning model training.")
    classifier.fit(x_train, y_train)
    print("Model training completed.")
    predictions = classifier.predict(x_test)
    print("Predictions on testing data computed.")
    end = time.time ()

    accuracy = 100.0 * accuracy_score(y_test, predictions)
    print("The accuracy of your random forest model on testing data is: " + str(accuracy) + " %")

    # Each remaining metric is computed and printed in turn.
    reports = (
        ("The f1-score of your random forest model on testing data is: ", f1_score),
        ("The precision of your random forest model on testing data is: ", precision_score),
        ("The recall of your random forest model on testing data is: ", recall_score),
    )
    for caption, metric in reports:
        print (caption + str (metric (y_test, predictions)))

    runtime = end - start
    print ("Total time taken for training and testing by random forest model is: "
           + str (runtime) + " s")
def getResult(url):
    """Train a forest on ``dataset.csv`` and classify ``url``.

    The last CSV column is the label; the model is trained on a random 80%
    split on every call and its hold-out accuracy (in percent) is printed.

    Args:
        url: passed to ``feature_extraction.generate_data_set``.

    Returns:
        ``"Phishing Url"`` when the prediction is -1 or predicting fails,
        otherwise ``"Legitimate Url"``.
    """
    data = np.loadtxt("dataset.csv", delimiter=",")

    # Separate features and labels
    X = data[:, :-1]
    y = data[:, -1]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    clf = rfc()
    clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test)
    print(score * 100)

    # Single-row feature matrix for the given url.
    X_new = np.array(feature_extraction.generate_data_set(url)).reshape(1, -1)

    try:
        prediction = clf.predict(X_new)
        if prediction == -1:
            return "Phishing Url"
        else:
            return "Legitimate Url"
    except Exception:
        # Fail closed, but no longer swallow KeyboardInterrupt/SystemExit.
        return "Phishing Url"
def HCC(header_in, train_in, test_in, tscore="SP", baseClassifier=None,
        type_prop="all_probabilities"):
    """Forward to ``hcc.HCC`` with a random forest as the default base classifier.

    Args:
        header_in, train_in, test_in: forwarded unchanged.
        tscore: forwarded unchanged.
        baseClassifier: estimator to use; when ``None`` a fresh ``rfc()`` is
            created for this call instead of sharing one instance built at
            definition time across all calls.
        type_prop: forwarded unchanged.

    Returns:
        Whatever ``hcc.HCC`` returns.
    """
    if baseClassifier is None:
        baseClassifier = rfc()
    # Note the argument order expected by hcc.HCC: type_prop precedes the classifier.
    return hcc.HCC(header_in, train_in, test_in, tscore, type_prop, baseClassifier)
def decision_tree(num_tree, depth, need_stretch):
    """Fit a forest on the preprocessed module-level images and predict both sets.

    Args:
        num_tree: number of trees (``n_estimators``).
        depth: ``max_depth`` of each tree.
        need_stretch: forwarded to ``preprocess`` for both image sets.

    Returns:
        ``(train_res, test_res)``: predictions for the training and test images.
    """
    train_features = preprocess(images, need_stretch)
    test_features = preprocess(test_img, need_stretch)

    forest = rfc(n_estimators=num_tree, max_depth=depth)
    forest.fit(train_features, train_y)

    train_res = forest.predict(train_features)
    test_res = forest.predict(test_features)
    return train_res, test_res
def random_forest_classifier(x_train, y_train, x_test, y_test, tree):
    """Fit a non-bootstrapped random forest and predict on the test features.

    Args:
        x_train, y_train: training features and labels.
        x_test: features to predict.
        y_test: returned unchanged as the expected labels.
        tree: number of trees (``n_estimators``).

    Returns:
        ``(expected, predicted)``.
    """
    model = rfc(
        n_estimators=tree,
        criterion='gini',
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        min_weight_fraction_leaf=0.0,
        # "auto" was removed from scikit-learn; for classifiers it meant "sqrt".
        max_features="sqrt",
        max_leaf_nodes=None,
        bootstrap=False,
        oob_score=False,
        n_jobs=1,
        random_state=None,
        verbose=0,
        warm_start=False,
        class_weight=None,
    )
    model.fit(x_train, y_train)
    predicted = model.predict(x_test)
    expected = y_test
    return expected, predicted
def get_feature_importances(features, labels):
    """Fit a random forest and print its feature importances, highest first.

    Importances are rounded to 5 decimals and paired with ``header[1:]``
    (``header`` is a module-level name).

    Returns:
        The fitted classifier.
    """
    clf = rfc(random_state=30,
              max_depth=6,
              n_estimators=100,
              min_samples_leaf=1,
              min_samples_split=2,
              n_jobs=4,
              criterion='entropy')
    clf.fit(features, labels)

    print("Header", header)
    ranked = sorted(
        ((round(weight, 5), column)
         for weight, column in zip(clf.feature_importances_, header[1:])),
        reverse=True)
    print("Feature_Importances: ", ranked)
    return clf
def random_forest_pred(input):
    """Fit a forest on the module-level ``iris`` frame and predict for ``input``.

    The first four columns of ``iris`` are the features and the fifth column
    is the target.

    Returns:
        The predictions for ``input``.
    """
    features = iris.iloc[:, :4]
    target = iris.iloc[:, 4]
    forest = rfc(n_jobs=2)
    forest.fit(features, target)
    return forest.predict(input)
def randomClassification(x, y, testX, testY):
    """Fit ``rfc`` on ``(x, y)`` and print its score on ``(testX, testY)``.

    Nothing is returned.
    """
    forest = rfc()
    forest.fit(x, y)
    print("Fitting Complete. Displaying Results... / 모델 피팅 성공. 결과 출력...")
    # NOTE(review): the label says R^2; if ``rfc`` is a classifier its score
    # would presumably be accuracy - confirm.
    test_score = forest.score(testX, testY)
    print("R^2 Score:", test_score)
def RFC(test_data, test_label, train_data, train_label, d):
    """Fit a forest with ``d`` trees and report predictions plus error counts.

    Returns:
        ``(y_predict, train_errors, test_errors)`` where each error value is
        ``(1 - score) * number_of_samples`` for the respective split.
    """
    forest = rfc(n_estimators=d)
    forest.fit(train_data, train_label)

    train_score = forest.score(train_data, train_label)
    test_score = forest.score(test_data, test_label)
    y_predict = forest.predict(test_data)

    train_errors = (1 - train_score) * len(train_data)
    test_errors = (1 - test_score) * len(test_data)
    return y_predict, train_errors, test_errors
def get_default_classifier(self):
    """Return a new default-configured estimator for ``self.classifier``.

    Raises:
        Exception: if ``self.classifier`` is not ``'rfc'``, ``'gbc'`` or ``'svc'``.
    """
    kind = self.classifier
    if kind == 'rfc':
        estimator = rfc()
    elif kind == 'gbc':
        estimator = gbc()
    elif kind == 'svc':
        estimator = svc()
    else:
        raise Exception("Only the classifiers 'rfc', 'svc', or 'gbc' are allowed")
    return estimator
def __init__(self, trees, depth, class_to_int, int_to_class, feat, classes):
    """Store the label mappings and fit a random forest immediately.

    Args:
        trees: number of trees (``n_estimators``).
        depth: ``max_depth`` of each tree.
        class_to_int, int_to_class: stored unchanged on the instance.
        feat, classes: training features and labels passed to ``fit``.
    """
    self.class_to_int = class_to_int
    self.int_to_class = int_to_class
    forest = rfc(n_estimators=trees, max_depth=depth, n_jobs=-1, verbose=1)
    self.model = forest
    forest.fit(feat, classes)
def random_forest():
    """Train a forest on a random 75% of ``iris`` and return the hold-out accuracy.

    The first four columns of the module-level ``iris`` frame are the features
    and the fifth column is the target.
    """
    features = iris.iloc[:, :4]
    target = iris.iloc[:, 4]
    x_train, x_test, y_train, y_test = train_test_split(
        features, target, test_size=0.25)

    forest = rfc(n_jobs=2)
    forest.fit(x_train, y_train)
    predictions = forest.predict(x_test)
    return accuracy_score(y_test, predictions)
def randomforest(filename):
    """Fit a balanced 250-tree forest and dump it to ``../models/RFCmodel.sav``.

    Training data comes from ``modelcv.modelinput(tempfile, 33)``.

    NOTE(review): ``filename`` is not used; the data source is the name
    ``tempfile`` from the enclosing scope - confirm this is intended.
    """
    model_path = '../models/RFCmodel.sav'
    trainX, trainY = modelcv.modelinput(tempfile, 33)

    forest = rfc(n_estimators=250,
                 min_samples_split=6,
                 n_jobs=-1,
                 class_weight='balanced')
    forest.fit(trainX, trainY)

    joblib.dump(forest, model_path, compress=9)
def randomforest(X_train, y_train):
    """Fit and return a 500-tree random forest with fixed hyper-parameters."""
    clf_rfc = rfc(
        n_estimators=500,
        max_depth=6,
        min_samples_split=4,
        min_samples_leaf=2,
        max_features=0.5,
        n_jobs=4,
    )
    clf_rfc.fit(X_train, y_train)
    return clf_rfc
def randomForest_new(X_train1, y_train1, X_test1, y_test1):
    """Train a 120-tree forest on the given split and report its performance.

    All four inputs are converted to float arrays. Prints the test score and
    calls ``perf_measure_new`` on the test predictions.

    Returns:
        Whatever ``EERCalc`` returns for the test-set probabilities, labels and
        predictions.
    """
    # The ``np.float`` alias was removed from NumPy; builtin ``float`` is the same dtype.
    X_train = np.array(X_train1).astype(float)
    y_train = np.array(y_train1).astype(float)
    X_test = np.array(X_test1).astype(float)
    y_test = np.array(y_test1).astype(float)

    clf = rfc(n_estimators=120)
    # Single-argument print calls behave the same on Python 2 and Python 3.
    print('==========Using Random forest classifier==========')
    clf.fit(X_train, y_train)
    print(clf.score(X_test, y_test))
    y_pred = clf.predict(X_test)
    perf_measure_new(y_test, y_pred)
    # NOTE(review): the tag passed to EERCalc is "knn" although the model is a
    # random forest - confirm whether this label is intentional.
    return EERCalc(clf.predict_proba(X_test), y_test, y_pred, "knn")
def train_model_adab_stacked_rfc(features, labels):
    """Grid-search an ``adab`` ensemble whose base estimator is a random forest.

    The search covers ``learning_rate`` and ``n_estimators``, scored by ROC AUC
    with 5-fold cross-validation. The best estimator and score are printed.

    Returns:
        The fitted ``GridSearchCV`` object.
    """
    base_model = rfc(n_estimators=80,
                     max_features=7,
                     max_depth=6,
                     random_state=30,
                     criterion='entropy')
    booster = adab(random_state=30, base_estimator=base_model)
    params_dict = {
        'learning_rate': [0.03, 0.05, 0.1],
        'n_estimators': [20, 50, 100],
    }
    clf = GridSearchCV(booster, params_dict, n_jobs=-1, scoring='roc_auc', cv=5)
    clf.fit(features, labels)

    print("Best estimator: ", clf.best_estimator_)
    print("Best best scores: %.4f" % (clf.best_score_))
    return clf
def fit(self, X, s):
    """Estimate ``self.c`` by stratified cross-validation and return ``self``.

    For each of ``self.n_folds`` shuffled stratified folds, ``self.trad_clf``
    is fitted on the training part, and the mean predicted probability (column
    1 of ``predict_proba``) over the held-out samples with ``s == 1`` is
    recorded. ``self.c`` is the mean of those per-fold values.

    A default 1500-tree balanced random forest is created when
    ``self.trad_clf`` is ``None``.
    """
    if self.trad_clf is None:
        self.trad_clf = rfc(n_estimators=1500, class_weight="balanced", n_jobs=4)

    folds = StratifiedKFold(n_splits=self.n_folds, shuffle=True)
    fold_estimates = np.zeros(self.n_folds)
    for fold_no, (train_idx, held_idx) in enumerate(folds.split(X, s)):
        self.trad_clf.fit(X[train_idx], s[train_idx])
        # Only held-out samples labelled 1 contribute to the estimate.
        held_positives = X[held_idx][s[held_idx] == 1]
        probabilities = self.trad_clf.predict_proba(held_positives)
        fold_estimates[fold_no] = probabilities[:, 1].mean()

    self.c = fold_estimates.mean()
    return self
def train_model_bagging(features, labels):
    """Grid-search a ``BaggingClassifier`` whose base estimator is a random forest.

    The search covers ``max_features``, ``max_samples`` and ``n_estimators``,
    scored by ROC AUC on 5 folds produced by ``skf``. The best estimator and
    score are printed.

    Returns:
        The fitted ``GridSearchCV`` object.
    """
    base_model = rfc(n_estimators=80,
                     max_features=20,
                     max_depth=6,
                     random_state=30,
                     criterion='entropy')
    bagger = BaggingClassifier(random_state=30, n_jobs=-1, base_estimator=base_model)
    params_dict = {
        'max_features': [0.5, 0.8],
        'max_samples': [0.5, 0.8, 1],
        'n_estimators': [25, 50, 75],
    }
    folds = skf(labels, n_folds=5, random_state=30)
    clf = GridSearchCV(bagger, params_dict, scoring='roc_auc', cv=folds)
    clf.fit(features, labels)

    print("Best estimator: ", clf.best_estimator_)
    print("Best best scores: %.4f" % (clf.best_score_))
    return clf
def getTrainedCLassifier(classifierType, train):
    """Train and return the classifier selected by ``classifierType``.

    Args:
        classifierType: ``"naiveBayes"``, ``"randomForest"`` or ``"knn5"``.
        train: training data handed to the classifier's ``train`` method.

    Returns:
        The trained classifier.

    Raises:
        ValueError: if ``classifierType`` is not one of the supported names
            (previously this surfaced as an unbound-local error).
    """
    if classifierType == "naiveBayes":
        from nltk.classify import NaiveBayesClassifier
        # NaiveBayesClassifier.train builds and returns the trained classifier.
        return NaiveBayesClassifier.train(train)

    if classifierType == "randomForest":
        from sklearn.ensemble import RandomForestClassifier as rfc
        trainedClassifier = SklearnClassifier(rfc(n_estimators=25, n_jobs=2))
    elif classifierType == "knn5":
        from sklearn.neighbors import KNeighborsClassifier as knn
        trainedClassifier = SklearnClassifier(knn(5))
    else:
        raise ValueError("Unknown classifier type: {!r}".format(classifierType))

    trainedClassifier.train(train)
    return trainedClassifier
def model_randomforest_classifier(X_train, X_test, y_train, y_test):
    """Fit a default random forest, print its test accuracy and dump it with joblib.

    The model name embeds the module-level ``count``; the module-level
    ``independentcols`` is attached to the model before it is saved to
    ``model/<model_name>.joblib``.
    """
    model_name = f'model_{count}_randomforest_classifier'

    forest = rfc()
    forest.fit(X_train, y_train)
    # Stored on the estimator so it is persisted together with the model.
    forest.independentcols = independentcols

    predictions = forest.predict(X_test)
    cm = confusion_matrix(y_test, predictions)
    accuracy = accuracy_score(y_test, predictions)
    print(f'{model_name} accuracy: {accuracy}')

    joblib.dump(forest, f'model/{model_name}.joblib')
def train_model_rfc_calibrated_cv(features, labels, hold_out=False, train_sz=0.9):
    """Fit an isotonic-calibrated random forest with 5-fold internal CV.

    Args:
        features, labels: arrays indexable by the index arrays the splitter yields.
        hold_out: when equal to ``True``, a stratified split keeps
            ``1 - train_sz`` of the data aside and the Brier score and ROC AUC
            are evaluated on it.
        train_sz: training fraction used for the hold-out split.

    Returns:
        The fitted ``CalibratedClassifierCV``.
    """
    use_holdout = hold_out == True  # noqa: E712 - keeps the original equality test

    features_train, features_test = [], []
    labels_train, labels_test = [], []
    if use_holdout:
        # Stratified split so the class ratios are kept in both parts.
        splitter = StratifiedShuffleSplit(labels, n_iter=1, train_size=train_sz,
                                          random_state=30)
        # A single split is requested, so this prints 1.
        print(len(splitter))
        for train_idx, test_idx in splitter:
            features_train, features_test = features[train_idx], features[test_idx]
            labels_train, labels_test = labels[train_idx], labels[test_idx]
    else:
        features_train = features
        labels_train = labels

    print("features_train shape: ", features_train.shape)
    print("labels_train shape: ", labels_train.shape)
    if use_holdout:
        print("features_test shape: ", features_test.shape)
        print("labels_test shape: ", labels_test.shape)

    print("Parameters selected based on prior grid Search ...")
    clf = rfc(random_state=30,
              n_jobs=4,
              criterion='entropy',
              class_weight='auto',
              max_depth=5,
              min_samples_leaf=5,
              min_samples_split=2,
              n_estimators=60)

    # Per the original author's notes, isotonic did better than sigmoid on both
    # Brier score and ROC AUC, and a 30-40% hold-out improved hold-out ROC AUC.
    print("Performing Calibration now ...")
    sigmoid = CalibratedClassifierCV(clf, cv=5, method='isotonic')
    sigmoid.fit(features_train, labels_train)

    if use_holdout:
        # Brier score loss on the held-out positive-class probabilities.
        y_probs = sigmoid.predict_proba(features_test)[:, 1]
        clf_score = brier_score_loss(labels_test, y_probs)
        print("Brier score: ", clf_score)
        auc_score = estimate_roc_auc(sigmoid, features_test, labels_test)

    return sigmoid
def train_model_rfc(features, labels):
    """Grid-search random-forest hyper-parameters by ROC AUC with 5-fold CV.

    The best estimator and score are printed.

    Returns:
        The fitted ``GridSearchCV`` object.
    """
    params_dict = {
        'class_weight': ['auto'],
        'criterion': ['entropy'],
        'n_estimators': [50, 60, 70],
        'max_depth': [4, 5, 6],
        'min_samples_leaf': [1, 2, 5],
        'min_samples_split': [2, 5, 10],
    }
    base_forest = rfc(random_state=30, n_jobs=4)
    clf = GridSearchCV(base_forest, params_dict, scoring='roc_auc', cv=5)
    clf.fit(features, labels)

    print("Best estimator: ", clf.best_estimator_)
    print("Best best scores: %.4f" % (clf.best_score_))
    return clf
def train_model_rfc_pipeline(features, labels):
    """Grid-search a StandardScaler + random-forest pipeline by ROC AUC.

    The elapsed search time, the best estimator and the best score are printed.

    Returns:
        The fitted ``GridSearchCV`` object.
    """
    scaler = StandardScaler()
    clf_rfc = rfc(random_state=30, n_jobs=4, criterion='entropy')
    # Transforms are applied exactly in the order specified
    estimators = [('sscaler', scaler), ('rfc', clf_rfc)]

    # time.clock() was removed in Python 3.8; perf_counter() is the
    # recommended replacement for measuring elapsed time.
    t0 = time.perf_counter()
    # The "rfc__" prefix routes each parameter to the pipeline step named 'rfc'.
    params_dict = {
        'rfc__n_estimators': [100, 300, 500, 700],
        'rfc__max_depth': [1, 2, 3],
        'rfc__min_samples_split': [10, 20, 50],
        'rfc__min_samples_leaf': [1, 2, 5],
    }
    clf = GridSearchCV(Pipeline(estimators), params_dict, cv=5, scoring='roc_auc')
    clf.fit(features, labels)

    print("Grid Search CV time: ", time.perf_counter() - t0)
    print("Best estimator: ", clf.best_estimator_)
    print("Best grid scores: %.4f" % (clf.best_score_))
    return clf
# Collect every row from the CSV reader into a NumPy array.
data=[]
for row in csv_file_object:
    data.append(row)
test_data = np.array(data)
# write the test data in to another file
# NOTE(review): the rows written to cleaned_test.csv come from
# cleaned_train_data, and cleaned_test_data is only computed afterwards -
# confirm which data set is meant to be written here.
# NOTE(review): the file is opened in "wb" mode and never explicitly closed.
open_file_object = csv.writer(open("cleaned_test.csv", "wb"))
for row in cleaned_train_data:
    open_file_object.writerow(row)
cleaned_test_data = clean_test_data(test_data)
################################################################################
# Create the random forest object which will include all the parameters
# for the fit
Forest = rfc(n_estimators = 100)
# fit the training input and output to create the
# decision trees
# Training rows: column 1 is the target, columns 2 onward are the features.
Forest = Forest.fit(cleaned_train_data[0::,2::],cleaned_train_data[0::,1])
# Test rows: columns 1 onward are the features.
output = Forest.predict(cleaned_test_data[0::,1::])
# generate and save the output csv file
fo = csv.writer(open("result.csv", "wb"))
# Two-column buffer with one row per prediction.
res_array = np.zeros(shape=(len(output), 2))
print ('output is')
print (output)
print ('data is')
# [1, 1, 0, ...]
# Single-argument print calls behave the same on Python 2 and Python 3.
print(len(all_team))
print(len(result))
print('Elapsed time: %.2fs' % (time.time() - st))
st = time.time()

# Hold out 20% of the samples for evaluation.
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    all_team, result, test_size=0.2, random_state=1)

# Try classifier
clf = SVC()
print('done')
clf.fit(X_train, y_train)
result1 = clf.predict(X_test)
print(classification_report(y_test, result1))
print(accuracy_score(y_test, result1))
print('Elapsed time: %.2fs' % (time.time() - st))
st = time.time()

# Same evaluation with a small random forest.
clf2 = rfc(n_estimators=5)
clf2.fit(X_train, y_train)
result2 = clf2.predict(X_test)
print(classification_report(y_test, result2))
print(accuracy_score(y_test, result2))
print('Elapsed time: %.2fs' % (time.time() - st))

cursor.close()
conn.commit()
conn.close()
#training = training.astype('float64') #validation = training.astype('float64') print 'partitioned data...' # Splitting Training Up y_train = training[:,1] # labels x_train = training[:,2:] # everything else # Splitting validation up y_valid = validation[:,1] # read y values x_valid = validation[:,2:] tune_grid = [{'n_estimators':[10,100,250,500], 'criterion':['gini','entropy'] }] best_model = GridSearchCV( rfc(), tune_grid, cv=10, verbose=2, n_jobs=5).fit(x_train,y_train) y_pred = best_model.predict(x_valid) p = []; v = []; for i in range(len(y_pred)): p.append(int(y_pred[i])) for j in range(len(y_valid)): v.append(int(y_valid[j])) cm = confusion_matrix(v,p) asm = accuracy_score(v,p) print cm print "Accuracy: %f" % (asm) print best_model.best_estimator_
''' # load data into pandas data frame trdata,testdata=mg.loadData() # get the id's for the test set testid = np.array(testdata.PassengerId) # determine if each passenger has a known surviving family member trdata,tesrdata=mg.addFamSurvivors(trdata,testdata) # munge the data to generate one-hot labels for gender, titles, ticket departments trdata=mg.mungeData(trdata) testdata=mg.mungeData(testdata) # initialize classifier model= rfc(n_estimators=1000,oob_score=True,compute_importances=True) model = model.fit(trdata.iloc[:,1:],trdata.iloc[:,0]) accur = model.oob_score_ print('Out of Bag accuracy: %f \n' %accur) # generate predictions preds = model.predict(testdata) # save out mg.writeout(preds,testid,'predictions/rfcmodel_test.csv')
train_data = np.array(train_data) test_data = np.array(test_data) valid_data = np.array(valid_data) train_class = np.ravel(np.array(train_class)) test_class = np.ravel(np.array(test_class)) valid_class = np.ravel(np.array(valid_class)) print train_data.shape print test_data.shape print valid_data.shape #train_data = train_data[0:100,:] #train_class = train_class[0:100] svc = rfc(n_estimators=500, min_samples_split = 9,criterion='gini',compute_importances=True) svm_parameters = {'n_estimators':[100,200,500,1000,2000], 'min_samples_split' : [3,5,7,9,11,13,15,17,19]} clf = gsc(svc, svm_parameters) #clf = svc clf.fit(train_data,train_class) print clf.grid_scores_ print clf.best_score_ print clf.best_estimator_ print clf.best_params_
# initialize classifier if __name__ == "__main__": num = 4 train_data_file, test_data_file, test_result_dir = cmd.TrainTestFileParser(sys.argv, num) test_result_file = open(test_result_dir + "rfc_test_result" + str(num) + ".txt", "w+") trdata = pd.read_csv(train_data_file, header=None, sep=" ") tedata = pd.read_csv(test_data_file, header=None, sep=" ") # depthlist = [5,10,15,20,50,100] model = rfc(n_estimators=5000, oob_score=True, max_features=None, max_depth=10) model = model.fit(trdata.iloc[:, 1:], trdata.iloc[:, 0]) accur = model.score(tedata.iloc[:, 1:], tedata.iloc[:, 0]) resultClass = model.predict(tedata.iloc[:, 1:]) # resultLogProba = model.predict_log_proba(tedata.iloc[:,1:]) resultProba = model.predict_proba(tedata.iloc[:, 1:]) for x, y in zip(resultClass, resultProba): test_result_file.write(str(x) + " ") for z in y: test_result_file.write(str(z) + " ") test_result_file.write("\n") print len(resultProba) print ("Test data accuracy: %f\n" % accur)
# Feature matrix and labels taken from the iris_data object.
dt = iris_data.data
lbls = iris_data.target
#train a KNN and see how does it perform. Keep 50000 for training and 10000 for validation and 10000 for final test.
# NOTE(review): the sample counts in the comment above do not look like they
# belong to this data set - presumably copied from another script; confirm.
num_fold = 10
gen_k_sets = StratifiedKFold(lbls,num_fold)
ab = []
# NOTE(review): the nesting below is reconstructed from a whitespace-collapsed
# source; confirm against the original file.
# Sweep the neighbour count from 1 to 135.
for nb in range(1,136,1):
    dst_mdl = nn(n_neighbors=nb)
    overall_mis = 0
    # The SVC assigned here is immediately replaced by the rfc model below.
    mdl = SVC(C=1.0,kernel='linear')
    mdl=rfc(n_estimators=500)
    for train_index, test_index in gen_k_sets:
        train_data, test_data = dt[train_index], dt[test_index]
        train_class, test_class = lbls[train_index], lbls[test_index]
        j = 0
        # Visit the test samples one by one together with their true class.
        for i,td in enumerate(test_data):
            td = np.array(td)
            tst_class_act=test_class[i]
lbls = np.array(dgts_lbl)
print lbls.shape
# Labels are passed first, so the first two results are the label splits and
# the last two the data splits (15% held out, fixed seed).
train_lbl,test_lbl,train_dt,test_dt = train_test_split(lbls,dt,test_size = 0.15,random_state=1299004)
# Partition the training samples into `clstrs` k-means clusters.
clstrs = 10
clst = KMeans(n_clusters = clstrs,n_init=30,tol=0.00001,max_iter=500)
clst.fit(train_dt)
# Cluster id of every training sample as a column vector.
clsts_lbl = np.reshape(np.array(clst.labels_),(train_dt.shape[0],1))
#td = np.hstack((train_dt,clsts_lbl))
# Fit one model per cluster on the training samples assigned to it.
# NOTE(review): loop nesting is reconstructed from a whitespace-collapsed
# source; confirm against the original file.
mdls = []
for i in range(clstrs):
    t_idx = np.where(clsts_lbl==i)[0]
    # Only the last assignment to mdl takes effect (the knn model); the two
    # estimators constructed before it are discarded.
    mdl = rfc(n_estimators=50,criterion='entropy',oob_score=True)
    mdl = etc(n_estimators=5000,criterion='entropy',oob_score=True,bootstrap=True,min_samples_split=30)
    #mdl = SVC(C=10000,gamma=0.00001,kernel='rbf')
    mdl = knn(n_neighbors=1)
    td = train_dt[t_idx]
    tc = train_lbl[t_idx]
    #print tc
    mdl.fit(td,tc)
    # Score on the same samples the model was fitted on.
    print mdl.score(td,tc)
    #print mdl.oob_score_
    mdls.append(mdl)
scrs= []
# Assign every test sample to its nearest cluster, again as a column vector.
clst_lbl_tst = np.reshape(np.array(clst.predict(test_dt)),(test_dt.shape[0],1))
for i in range(clstrs):
dgts_data = np.array(dgts_data)
print dgts_data.shape
print dgts_data
# Labels are read from abcd_l.csv, using its first column as the index.
dgts_lbl = pd.read_csv("abcd_l.csv",index_col=0)
print dgts_lbl.head()
print dgts_lbl.shape
dgts_lbl = np.array(dgts_lbl)
print dgts_lbl.shape
print dgts_lbl
#train a KNN and see how does it perform. Keep 50000 for training and 10000 for validation and 10000 for final test.
# A single shuffled split with 20% of the samples held out for testing.
gen_k_sets = StratifiedShuffleSplit(dgts_lbl, n_iter=1, test_size=0.20)
# The SVC assigned here is immediately replaced by the rfc model.
mdl = SVC()
mdl = rfc()
dst_mdl = nn(n_neighbors=100)
# NOTE(review): loop nesting is reconstructed from a whitespace-collapsed
# source; confirm against the original file.
for train_index, test_index in gen_k_sets:
    train_data, test_data = dgts_data[train_index], dgts_data[test_index]
    train_class, test_class = dgts_lbl[train_index], dgts_lbl[test_index]
    #test_data= test_data[:1000,]
    #test_class = test_class[:1000]
    #print g
    # Only the neighbour model is fitted in this section; mdl is left unfitted.
    dst_mdl.fit(train_data)
    #print mdl.score(train_data,train_class)
    print train_data.shape
    j = 0
    for i,td in enumerate(test_data):
        td = np.array(td)
cols = np.array(cols) cols = list(cols[:,0]) print cols train_data = train_data[:,cols] test_data = test_data[:,cols] valid_data = valid_data[:,cols] print train_data.shape print test_data.shape print valid_data.shape """
# NOTE(review): the text before the triple quote above and after the one below
# appears to be the tail/head of blocks disabled by wrapping them in string
# literals (their opening/closing quotes lie outside this section), so it is
# left untouched - confirm against the full file.
# train_class = train_class[0:100]
# NOTE: despite the svc/svm_* names, the estimator built here comes from rfc.
svc = rfc(n_estimators=500, min_samples_split=9, criterion="gini")
svm_parameters = {"n_estimators": [500], "min_samples_split": [9]}
clf = gsc(svc, svm_parameters)
# The grid-search wrapper is discarded right away: the bare estimator is what
# gets fitted and scored below.
clf = svc
clf.fit(train_data, train_class)
print
# Accuracy on the validation split, then on the test split.
print clf.score(valid_data, valid_class)
print clf.score(test_data, test_class)
# print svc.feature_importances_
""" print clf.grid_scores_ print clf.best_score_
# plt.subplot(2,5,i)
# plt.hist(X[:,i],bins=500)
# Train the model
# pca = PCA(n_components=40)
# pca.fit(nptrd[:,range(1,94)])
# X = pca.transform(nptrd[:,range(1,94)])
# PCAExplained = sum(pca.explained_variance_ratio_)
# Most of the features are highly skewed i.e. their 75% value ranges when we do td.describe() is 0 while their max is much higher.
# This indicates that only a few values are non-zero for most features.
# This could mean that these features are actually categorical variables that are encoded in the test data.. could .. not sure

# Fit on the already-transformed training features X, with the last column of
# nptrd as the target.
forest = rfc(
    n_estimators=100,
    n_jobs=-1,
    min_samples_split=20,
    min_samples_leaf=10,
)
# forest = forest.fit(nptrd[:,range(1,94)],nptrd[:,-1])
forest = forest.fit(X, nptrd[:, -1])

# temp = forest.predict(nptrd[:,range(1,94)])
temp = forest.predict(X)
# Fraction of training rows predicted correctly (a hit rate, despite the name).
train_hits = temp == nptrd[:, -1]
TrainError = sum(train_hits) / (len(nptrd) * 1.0)
# Need to spend some time checking for overfit - using some elbow techiques maybe

# Cross validate the model using the cross validation dataset
cv_features = npcvd[:, range(1, 94)]
XCv = pipeline.transform(cv_features)
# output = forest.predict(npted[:,range(1,94)])
outputCv = forest.predict(XCv)
# Wrap the white-wine correlation results in 11x11 data frames and show them.
white_corr_rho=pd.DataFrame(white_corr_rho,index=range(0,11),columns=range(0,11))
white_corr_pval=pd.DataFrame(white_corr_pval,index=range(0,11),columns=range(0,11))
print white_corr_rho
print white_corr_pval
#RANDOM FOREST MODELING: RED---------------------------------------------------
#set iterations
iterations=20
#create empty data frames for prediction results and feature importances
# (one column per iteration)
red_results=pd.DataFrame(index=dfr_exp.index, columns=range(0,iterations))
red_features=pd.DataFrame(index=range(0,11), columns=range(0,iterations))
#fit model using StratifiedKFold
rf=rfc(n_estimators=360, max_features=5, criterion='gini')
# NOTE(review): nesting is reconstructed from a whitespace-collapsed source -
# in particular whether the red_features assignment and `print j` sit inside
# the fold loop or after it; confirm against the original file.
for j in range(0,iterations):
    # A fresh shuffled 5-fold split for every iteration.
    folds = skf(dfr_res, 5, shuffle=True)
    for train, test in folds:
        # The same estimator object is refitted on every fold.
        model=rf.fit(dfr_exp.ix[train,], dfr_res[train])
        # Store this fold's out-of-fold predictions in column j.
        red_results.ix[test,j] = pd.Series(model.predict(dfr_exp.ix[test,]), index=test, name=[j])
    # Importances come from whichever fold was fitted last in this iteration.
    red_features[j]=pd.Series(model.feature_importances_)
    print j
#write results to file
red_results.to_csv('C:/Users/mmcgoldr/Dropbox/GA/DataScience/Project/red_results.txt', sep='\t', header=True)
red_features.to_csv('C:/Users/mmcgoldr/Dropbox/GA/DataScience/Project/red_features.txt', sep='\t', header=True)
#retrieve results as needed
#red_results=pd.read_csv('C:/Users/mmcgoldr/Dropbox/GA/DataScience/Project/red_results.txt', sep='\t', header=False, names=range(0,iterations))
#red_features=pd.read_csv('C:/Users/mmcgoldr/Dropbox/GA/DataScience/Project/red_features.txt', sep='\t', header=False, names=range(0,iterations))
# Summarise the best n_estimators found with default and with tuned settings.
print(''.join(['Optimal N Estimator with default settings was: ',
               str(best_n_estimator),
               ' with accuracy: ',
               str(best_n_estimator_accuracy)]))
print(''.join(['Optimal N Estimator with modified settings was: ',
               str(best_n_estimator_modified),
               ' with accuracy: ',
               str(best_n_estimator_modified_accuracy)]))

# Get Test error with best configuration of Decision Tree and Random Forest
training_data, training_labels, _, _ = preprocess_data.run_for_training_data(1)
test_data, test_labels = preprocess_data.run_for_test_data()

# Align data: every column that exists only in the training frame is added to
# the test frame, filled with False.
missing_headers = training_data.columns.diff(test_data.columns)
test_data[missing_headers] = training_data[missing_headers].applymap(lambda x: False)

# Depth / leaf-size settings shared by both final models.
best_tree_settings = dict(max_depth=best_max_depth,
                          min_samples_leaf=best_min_samples_leaf)

# Decision Tree
clf = dtc(criterion='entropy', **best_tree_settings)
_, test_accuracy_dt = get_training_accuracy.run(
    clf, training_data, training_labels, test_data, test_labels)

# Random Forest
clf = rfc(n_estimators=best_n_estimator_modified, **best_tree_settings)
_, test_accuracy_rf = get_training_accuracy.run(
    clf, training_data, training_labels, test_data, test_labels)

print(''.join(['Test accuracy for Decision Tree: ', str(test_accuracy_dt)]))
print(''.join(['Test accuracy for Random Forest: ', str(test_accuracy_rf)]))
print('\n=========================================================================================')
print('Script complete')
print f_data.shape
# NOTE(review): the triple-quoted spans in this section look like code that
# was disabled by wrapping it in string literals; they are left untouched.
''' pca = PCA(n_components=100) pca.fit(dgts_data) tr_dt_p = pca.transform(dgts_data) print pca.explained_variance_ratio_ print tr_dt_p.shape print sum(pca.explained_variance_ratio_) '''
# The knn model assigned here is immediately replaced by the rfc model.
mdl = knn(n_neighbors= 13)
mdl = rfc(n_estimators=500,min_samples_split=5,min_samples_leaf=3,criterion='entropy')
# A single shuffled split with 15% of the samples held out (fixed seed).
gen_k_sets = StratifiedShuffleSplit(dgts_lbl, n_iter=1, test_size=0.15,random_state=10987)
# NOTE(review): loop nesting is reconstructed from a whitespace-collapsed
# source; confirm against the original file.
for train_index, test_index in gen_k_sets:
    train_data, test_data = dgts_data[train_index], dgts_data[test_index]
    train_class, test_class = dgts_lbl[train_index], dgts_lbl[test_index]
    # Fit on the training part and print the score on the held-out part.
    mdl.fit(train_data,train_class)
    print mdl.score(test_data,test_class)
    #print mdl.feature_importances_
''' mdl = KMeans(n_clusters=10) mdl.fit(tr_dt_p) print mdl.labels_
# Split data into training and cross-validation dataset
nptrd, npcvd = tts(trd, test_size=0.33)

# Train the model
# Columns 1..93 hold the features that go through PCA; the last column is the
# target used for fitting.
feature_cols = range(1, 94)
pca = PCA(n_components=40)
pca.fit(nptrd[:, feature_cols])
X = pca.transform(nptrd[:, feature_cols])
PCAExplained = sum(pca.explained_variance_ratio_)

# Most of the features are highly skewed i.e. their 75% value ranges when we do td.describe() is 0 while their max is much higher.
# This indicates that only a few values are non-zero for most features.
# This could mean that these features are actually categorical variables that are encoded in the test data.. could .. not sure
forest = rfc(
    n_estimators=500,
    criterion='entropy',
    n_jobs=-1,
    min_samples_split=5,
    min_samples_leaf=5,
    max_depth=20,
)
#forest = forest.fit(nptrd[:,range(1,94)],nptrd[:,-1])
forest = forest.fit(X, nptrd[:, -1])

#temp = forest.predict(nptrd[:,range(1,94)])
temp = forest.predict(X)
# Fraction of training rows predicted correctly (a hit rate, despite the name).
train_hits = temp == nptrd[:, -1]
TrainError = sum(train_hits) / (len(nptrd) * 1.0)
# Need to spend some time checking for overfit - using some elbow techiques maybe

# Cross validate the model using the cross validation dataset
XCv = pca.transform(npcvd[:, feature_cols])
#output = forest.predict(npted[:,range(1,94)])
outputCv = forest.predict(XCv)
# Candidate ensemble sizes to compare by 5-fold cross-validation.
n_estimators = [100, 200, 300, 400, 500]
# Sentinels that the first evaluated candidate is expected to replace.
best_cv_score = -9999.9999
best_n_est = 10000
avg_scores = []
# NOTE(review): loop nesting is reconstructed from a whitespace-collapsed
# source; confirm against the original file.
for i in n_estimators:
    forest = rfc(n_estimators=i, oob_score=True)
    # Column 0 of trainData is used as the target, the remaining columns as features.
    scores = cross_val_score(forest, trainData[0::, 1::], trainData[0::, 0], scoring='log_loss', cv=5, n_jobs=-1)
    avg_scores.append(auxiliary.calc_avg(scores))
    # Keep the candidate with the largest average score.
    # NOTE(review): presumably the 'log_loss' scorer reports negated losses, so
    # "larger is better" holds - confirm for the installed scikit-learn version.
    if avg_scores[-1] > best_cv_score:
        best_cv_score = avg_scores[-1]
        best_n_est = i
# Plot the average score against the ensemble size.
plt.plot(n_estimators, avg_scores)
plt.show()
# NOTE(review): everything from the triple quote below onwards appears to be a
# block disabled by a string literal whose closing quote lies outside this
# section; it is left untouched.
''' forest_v = rfc(n_estimators=100, oob_score=True) forest_v = AdaBoostClassifier() forest = forest_v.fit(trainData[0::,1::], trainData[0::,0]) # Feature importances importances = forest.feature_importances_ print "Feature Importances: ", importances print 'Predicting...' output = forest.predict_proba(testData).astype(float) output = output.tolist() predictions_file = open("../submissionRF.csv", "wb") open_file_object = csv.writer(predictions_file) open_file_object.writerow(["Id",'ARSON','ASSAULT','BAD CHECKS','BRIBERY','BURGLARY','DISORDERLY CONDUCT', 'DRIVING UNDER THE INFLUENCE','DRUG/NARCOTIC','DRUNKENNESS','EMBEZZLEMENT','EXTORTION',
def main():
    """Load the training and query sets and write an ``ID,Y`` answers file.

    The prediction step is still a placeholder: the rows written out are the
    ``ID`` and ``y`` values of the (numerified) training data, not predictions
    for the queries.

    Input and output paths are hard-coded. The command-line arguments are only
    counted in order to print a usage hint; they are not used otherwise.
    """
    if len(sys.argv) != 3:
        print("Usage: python filename [training_set] [queries]")
    print("Hello World")

    headings = [
        "ID", "age", "job", "marital", "education", "default", "balance",
        "housing", "loan", "contact", "day", "month", "duration", "campaign",
        "pdays", "previous", "poutcome", "y"
    ]
    answerData = []

    # --- Read in data ---
    trainingdata = pandas.read_csv("Data/trainingset.txt", header=None, names=headings)
    queries = pandas.read_csv("Data/queries.txt", header=None, names=headings)

    # Column positions grouped by role (not consumed yet by the code below).
    idnum = [0]
    target = [17]
    cont = [1, 5, 6, 10, 12, 13, 14, 15]
    cat = [2, 3, 4, 7, 8, 9, 11, 16]

    # --- Learn from data ---
    ## Continuous Relevant Data: age, balance, previous
    ## Categorical Relevant Data: job, housing, loan, contact
    ##
    ## INSERT CODE THAT DOES THINGS HERE
    ## Implement Random Forest predictive algorithm
    ## Format data into numerical format
    relevantFeatures = ["age", "balance", "previous", "job", "housing", "loan", "contact"]
    model = rfc(n_estimators=1000)

    # made all data numeric
    length = len(trainingdata.index)
    print(length)
    trainingdata = numerify(trainingdata)

    # --- Answer Queries ---
    dataHeader = ["ID", "Y"]
    answerData.append(dataHeader)
    # the following is test code to get id and target and put it into the answers
    for x in range(0, length):
        row = trainingdata.iloc[x]  # look the row up once instead of per field
        answerData.append([row['ID'], row['y']])

    # --- Output Queries ---
    # Write all the data from the array into a text file.
    # Each iteration of queries should be written into the answerData list, as lists themselves.
    # NOTE(review): inputs are read from "Data/" but the output goes to
    # "./data/"; on a case-sensitive file system these are different
    # directories - confirm which spelling is intended.
    # The with-block closes (and flushes) the file even if a write fails.
    with open('./data/C12449618+C12474932.txt', 'w') as newfile:
        writerObject = csv.writer(newfile, lineterminator='\n')
        writerObject.writerows(answerData)
import sys

import numpy as np
from sklearn.cross_validation import KFold as kfold
from sklearn.ensemble import AdaBoostClassifier as rfc
from sklearn.externals import joblib

# Usage: python <script> <train_data.csv> <train_labels.csv>
# Fits one boosted ensemble per entry of trees_list and pickles each model.
# NOTE: `rfc` is an alias for AdaBoostClassifier in this script.
if __name__ == "__main__":
    data_path, label_path = sys.argv[1], sys.argv[2]
    train_data = np.loadtxt(data_path, delimiter=',')
    train_label = np.loadtxt(label_path, delimiter=',')

    trees_list = [500]
    for i in trees_list:
        est = rfc(n_estimators=i)
        est.fit(train_data, train_label)
        # Output name encodes the ensemble size.
        filename = "ada_boost" + str(i) + ".pkl"
        joblib.dump(est, filename)
testDf = auxiliary.initialise_test(False)
# Keep the ids aside before the Id column is dropped from the feature frame.
ids = testDf['Id'].values
# Id,Dates,DayOfWeek,PdDistrict,Address,X,Y,Year,Week,Hour
testDf = testDf.drop(['Id', 'Dates', 'Address'], axis=1)
# Random Forest Algorithm
print list(trainDf.columns.values)
print list(testDf.columns.values)
#print list(trainDf.X.values)
# back to numpy format
trainData = trainDf.values
testData = testDf.values
print 'Training...'
forest = rfc(n_estimators=25)
# Column 0 of trainData is used as the target, the remaining columns as features.
forest = forest.fit(trainData[0::,1::], trainData[0::,0])
print 'Predicting...'
# Per-class probabilities for every test row, converted to plain Python lists.
output = forest.predict_proba(testData).astype(float)
output = output.tolist()
predictions_file = open("../submissionRF.csv", "wb")
open_file_object = csv.writer(predictions_file)
# Header row: "Id" followed by one column per class name (the list continues
# past the end of this section).
open_file_object.writerow(["Id",'ARSON','ASSAULT','BAD CHECKS','BRIBERY','BURGLARY','DISORDERLY CONDUCT', 'DRIVING UNDER THE INFLUENCE','DRUG/NARCOTIC','DRUNKENNESS','EMBEZZLEMENT','EXTORTION', 'FAMILY OFFENSES','FORGERY/COUNTERFEITING','FRAUD','GAMBLING','KIDNAPPING','LARCENY/THEFT', 'LIQUOR LAWS','LOITERING','MISSING PERSON','NON-CRIMINAL','OTHER OFFENSES', 'PORNOGRAPHY/OBSCENE MAT','PROSTITUTION','RECOVERED VEHICLE','ROBBERY','RUNAWAY', 'SECONDARY CODES','SEX OFFENSES FORCIBLE','SEX OFFENSES NON FORCIBLE','STOLEN PROPERTY', 'SUICIDE','SUSPICIOUS OCC','TREA','TRESPASS','VANDALISM','VEHICLE THEFT','WARRANTS',
lbls = iris_data.target
print dt.shape
# Shuffled, stratified 10-fold split over the labels.
num_fold = 10
gen_k_sets = StratifiedKFold(lbls,num_fold,shuffle=True)
ab = []
overall_mis = 0
err=[]
c= 1.0
# Only the last assignment to mdl takes effect (the knn model); the two
# estimators constructed before it are discarded.
mdl = SVC(C=c,kernel='rbf',degree=1,tol=0.0001)
mdl = rfc(n_estimators=100,criterion='entropy',min_samples_leaf=5,min_samples_split=10,max_features=8)
mdl = knn(n_neighbors=1)
imgsize = 8
patchsize = 6
ab= []
# NOTE(review): loop nesting is reconstructed from a whitespace-collapsed
# source; confirm against the original file.
for train_index, test_index in gen_k_sets:
    train_data, test_data = dt[train_index], dt[test_index]
    train_class, test_class = lbls[train_index], lbls[test_index]
    dtsize= train_data.shape[0]
    # View every training sample as an imgsize x imgsize grid ...
    train_data = train_data.reshape(dtsize,imgsize,imgsize)
    # ... and take its top-left patchsize x patchsize corner.
    c1 = train_data[:,0:patchsize,0:patchsize]
    # NOTE(review): the triple quote below appears to open a disabled block
    # whose closing quote lies outside this section; it is left untouched.
    ''' a= c1[0,:,:] print a.shape
for line in f: line = line.strip() labels.append(line) f.close() #train_data = np.reshape(train_data, (17000,100)) #test_data = np.reshape(test_data,(len(test_data),100)) overall_s = 0 for i in range(0,len(data),len(data)/10): #labels = np.array(labels) train_data = data[0:i] + data[i+len(data)/10:] test_data = data[i:i+len(data)/10] train_label = labels[0:i] + labels[i+len(data)/10:] test_label = labels[i:i+len(data)/10] test_label = np.array(test_label) train_label = np.array(train_label) train_data = np.array(train_data) test_data = np.array(test_data) clf = rfc(n_estimators=300) y_pred = clf.fit(train_data,train_label).predict(test_data) #pickle.dump( y_pred, open( "out .p", "wb" ) ) print y_pred print test_label count = 0 for i in range(0,len(y_pred)): if y_pred[i] == test_label[i]: count+=1 print (float(count)/len(y_pred))*100 overall_s+=(float(count)/len(y_pred))*100 print overall_s/10