def rf_fit(): train_inp,valid_inp,train_target,valid_target = prepare_input() rf = RandomForestClassifier(random_state=31,n_jobs=-1,verbose=1,n_estimators=100,min_samples_split=5) start = time.time() rf.fit(train_inp,train_target) end = time.time() print "fitting took {:0.4} seconds".format(end-start) training_output = rf.predict_proba(train_inp) validation_output = rf.predict_proba(valid_inp) training_error = log_loss(train_target,training_output) validation_error = log_loss(valid_target,validation_output) print "Train error: {:02.4f}".format(training_error) print "Validation error: {:02.4f}".format(validation_error) joblib.dump(rf,rf_filename) return rf
def init_turns_module(values, trees, data, labels): # Fit regression model global turns_regr turns_regr = RandomForestClassifier(n_estimators=trees) turns_regr.fit(data[:, [0,1]], labels) print "init_turns, importances: ", turns_regr.feature_importances_ return
def fit_rf(path, index_filter=None, class_filter=None, feature_filter=None,
           folds=10, inverse=False, lc_filter=None):
    """Cross-validate a random forest on a CSV dataset and collect predictions.

    path: Location of the dataset used for training
    index_filter: Pandas index used to select the dataset rows to use
    class_filter: List of classes to use
    feature_filter: List of features to use
    folds: Number of stratified folds
    inverse: When true, each fold's train and test indexes are swapped
    lc_filter: Passed through to ``utils.filter_data``

    Returns the ``metrics.predict_table`` result of every fold, concatenated
    with ``pd.concat``.
    """
    data = pd.read_csv(path, index_col=0)
    data, y = utils.filter_data(data, index_filter, class_filter,
                                feature_filter, lc_filter)

    fold_tables = []
    for train_index, test_index in cross_validation.StratifiedKFold(y, n_folds=folds):
        if inverse:
            train_index, test_index = test_index, train_index

        train_X = data.iloc[train_index]
        test_X = data.iloc[test_index]
        train_y = y.iloc[train_index]
        test_y = y.iloc[test_index]

        # A fresh forest is trained for every fold.
        clf = RandomForestClassifier(n_estimators=100, criterion='entropy',
                                     max_depth=14, min_samples_split=5)
        clf.fit(train_X, train_y)
        fold_tables.append(metrics.predict_table(clf, test_X, test_y))

    return pd.concat(fold_tables)
def randomforest(df1,df2): newsT=df1.L L= ['L'] for x in L: del df1[x] news=df1 TRAINING=df1.as_matrix(columns=None) TEST=newsT.as_matrix(columns=None) newsT=df2['L'] L= ['L'] for x in L: del df2[x] X_test=df2.as_matrix(columns=None) y_test=newsT.as_matrix(columns=None) clf = RandomForestClassifier(n_estimators=200) clf.fit(TRAINING, TEST) y_pred1 = clf.predict_proba(X_test)[:, 1] y_pred = clf.predict(X_test) recall_score(y_test, y_pred) precision_score(y_test, y_pred) precision_score(y_test, y_pred,pos_label=0) recall_score(y_test, y_pred,pos_label=0) roc_auc_score(y_test, y_pred1) print 'roc: ',roc_auc_score(y_test, y_pred1) print 'precision: ',precision_score(y_test, y_pred) print 'recall:', recall_score(y_test, y_pred) print 'precision Negatives: ',precision_score(y_test, y_pred,pos_label=0) print 'recall Negatives: ', recall_score(y_test, y_pred,pos_label=0) return roc_auc_score(y_test, y_pred1),precision_score(y_test, y_pred),recall_score(y_test, y_pred),precision_score(y_test, y_pred,pos_label=0), recall_score(y_test, y_pred,pos_label=0)
def __init__(self, data, classes, tree_features, n_trees=100):
    """Fit ``n_trees`` single-tree forests, each on a random subset of columns.

    Columns that are NaN in every row and rows that are NaN in every column
    are dropped first. Each tree is then trained on ``tree_features`` randomly
    sampled columns, using only the rows with no NaN in those columns.

    Args:
        data: 2-D float array, rows are samples and columns are features.
        classes: Labels aligned with the rows of ``data``; must support
            boolean-mask indexing.
        tree_features: Number of columns per tree; capped at the number of
            usable columns.
        n_trees: Number of trees to fit.
    """
    n_rows = np.shape(data)[0]
    # Drop columns that are NaN in every row.
    data = data[:, np.sum(np.isnan(data), 0) < n_rows]
    self.n_features = np.shape(data)[1]

    # Drop rows that are NaN in every remaining column. The labels are
    # filtered with the same mask: the per-tree masks below are built from
    # the filtered data and would otherwise not line up with ``classes``.
    keep_rows = np.sum(np.isnan(data), 1) < self.n_features
    data = data[keep_rows, :]
    classes = classes[keep_rows]
    self.n_rows = np.shape(data)[0]

    if tree_features > self.n_features:
        tree_features = self.n_features
    self.col_list = np.zeros((n_trees, tree_features), dtype='int')
    self.n_trees = n_trees
    self.bags = []
    for i in range(n_trees):
        # Random column subset for this tree, stored sorted.
        cols = sample(range(self.n_features), tree_features)
        cols.sort()
        self.col_list[i, :] = cols

        # Keep only the rows that are complete for the chosen columns.
        data_temp = data[:, cols]
        complete = np.sum(np.isnan(data_temp), 1) == 0
        data_temp = data_temp[complete, :]
        classes_temp = classes[complete]

        bag = RandomForestClassifier(n_estimators=1, max_features=tree_features)
        bag.fit(data_temp, classes_temp)
        self.bags.append(bag)
        print(np.shape(data_temp))
def train_and_predict():
    """Fit one random forest and one XGBoost classifier per column of ``Y``.

    Reads ``config.X``, ``config.Y`` and ``config.X_test`` (replaced on
    ``config`` by NumPy arrays) and appends each model's test predictions to
    ``config.Y_pred_rf`` and ``config.Y_pred_xgb``.
    """
    print('Converting data...')
    config.X = np.array(config.X)
    config.Y = np.array(config.Y)
    config.X_test = np.array(config.X_test)

    print('Training...')
    print('Time Elapsed: ' + str((time.time() - config.start_time)/60))

    # One model pair per target column. Reading the shape avoids indexing
    # row 1, which raised IndexError when Y held a single row.
    num_classes = config.Y.shape[1]
    for i in range(num_classes):
        print('Creating Classifier: ', i)
        rf = RandomForestClassifier(n_estimators=500, max_depth=5, n_jobs=-1,
                                    oob_score=True, verbose=2,
                                    criterion="entropy")
        gbm = xgb.XGBClassifier(n_estimators=500, objective='binary:logistic')

        print('Fitting Random Forest Classifier: ', i)
        rf.fit(config.X, config.Y[:, i])
        print('Fitting With XGBoost Classifier: ', i)
        gbm.fit(config.X, config.Y[:, i])

        print('Getting Random Forest Predictions for attribute: ', i)
        y_pred_rf = rf.predict(config.X_test)
        config.Y_pred_rf.append(y_pred_rf)
        print(y_pred_rf)

        print('Getting XGBoost Predictions for attribute: ', i)
        y_pred_xgb = gbm.predict(config.X_test)
        config.Y_pred_xgb.append(y_pred_xgb)
        print(y_pred_xgb)
def TrainRandomForestVariance(p_subject, p_save): print "Welcome to TrainRandomForestVariance(" + p_subject + ", " + str(p_save) + ")" training_data_raw = pd.read_pickle(input_data_paths[p_subject]) training_data = training_data_raw[["variance" in x or "classification" in x for x in training_data_raw.index]] # Ictal vs interictal forest_seizure = RandomForestClassifier(n_estimators = 500, n_jobs = 1, max_features="sqrt", max_depth=None, min_samples_split=1) y_seizure = [1 * (x > 0) for x in training_data.T["classification"]] forest_seizure.fit(training_data[:-1].T, y_seizure) # IctalA vs IctalB forest_early = RandomForestClassifier(n_estimators = 500, n_jobs = 1, max_features="sqrt", max_depth=None, min_samples_split=1) y_early = [1 * (x == 2) for x in training_data.T["classification"]] forest_early.fit(training_data[:-1].T, y_early) # Save models if p_save: saved_files = joblib.dump(forest_seizure, "RFV_" + p_subject + "_seizure.pkl") for saved_file in saved_files: os.system("mv " + saved_file + " /Users/dryu/Documents/DataScience/Seizures/data/models") saved_files = joblib.dump(forest_early, "RFV_" + p_subject + "_early.pkl") for saved_file in saved_files: os.system("mv " + saved_file + " /Users/dryu/Documents/DataScience/Seizures/data/models") return {"seizure":forest_seizure, "early":forest_early}
def test_string_labels_refit_false():
    """Hard and soft voting with refit=False must work with string labels."""
    np.random.seed(123)
    estimators = [LogisticRegression(), RandomForestClassifier(), GaussianNB()]

    # Turn the module-level y into strings: 'a', 'b', 'c' in blocks of 50.
    y_str = y.copy().astype(str)
    for start, label in ((0, 'a'), (50, 'b'), (100, 'c')):
        y_str[start:start + 50] = label

    # The base estimators are fitted up front; the ensemble is built with
    # refit=False.
    for estimator in estimators:
        estimator.fit(X, y_str)

    for voting in ('hard', 'soft'):
        eclf = EnsembleVoteClassifier(clfs=list(estimators), voting=voting,
                                      refit=False)
        eclf.fit(X, y_str)
        assert round(eclf.score(X, y_str), 2) == 0.97
def random_forest_classify(train_data, train_label, test_data):
    """Fit a 100-tree random forest and predict labels for ``test_data``.

    The predictions are handed to ``save_result`` together with the file name
    'sklearn_random_forest_classify_Result.csv' and are also returned.
    """
    forest = RandomForestClassifier(n_estimators=100)
    # The labels are flattened with ravel() before fitting.
    flat_labels = ravel(train_label)
    forest.fit(train_data, flat_labels)

    test_label = forest.predict(test_data)
    save_result(test_label, 'sklearn_random_forest_classify_Result.csv')
    return test_label
def predict_rf(train_features, test_features, train_labels, test_labels): model = RandomForestClassifier(n_estimators=1000) model.fit(train_features, train_labels) predictions = model.predict(train_features) print get_accuracy(predictions, train_labels) predictions = model.predict(test_features) print get_accuracy(predictions, test_labels)
def buildTreeClassifier(predictorColumns, structurestable='structures.csv',
                        targetcolumn='pointGroup', md=None):
    """
    Build a random forest-classifier model to predict some structure feature
    from compositional data.

    Rows with missing values are dropped, and when a 'fracNobleGas' column
    exists only rows with a value <= 0 are kept. Predictors are standardised
    and the target is label-encoded.

    Args:
        predictorColumns: Names of the columns used as predictors.
        structurestable: Path of the CSV file to read.
        targetcolumn: Name of the column to predict.
        md: ``max_depth`` of the forest (None for unlimited).

    Returns:
        Tuple ``(rfc, cm, acc, le)``:
        * rfc: the model trained on all data
        * cm: confusion matrix (DataFrame labelled with the original class
          names) from a single train/test split
        * acc: mean cross-validation score, rounded to 2 decimals
        * le: the fitted label encoder

    NOTE(review): the fitted StandardScaler is not returned, so callers
    cannot apply the same scaling to new data.
    """
    df = pd.read_csv(structurestable).dropna()
    if 'fracNobleGas' in df.columns:
        df = df[df['fracNobleGas'] <= 0]

    s = StandardScaler()
    le = LabelEncoder()
    X = s.fit_transform(df[predictorColumns].astype('float64'))
    y = le.fit_transform(df[targetcolumn].values)

    rfc = RandomForestClassifier(max_depth=md)
    acc = mean(cross_val_score(rfc, X, y))

    X_train, X_test, y_train, y_test = train_test_split(X, y)
    rfc.fit(X_train, y_train)
    y_predict = rfc.predict(X_test)

    # List every encoded label explicitly so the matrix is always
    # n_classes x n_classes. Without it, a class missing from both y_test
    # and y_predict shrinks the matrix and the DataFrame construction below
    # fails with a shape mismatch.
    all_labels = list(range(len(le.classes_)))
    cm = confusion_matrix(y_test, y_predict, labels=all_labels)
    cm = pd.DataFrame(cm, columns=le.classes_, index=le.classes_)

    # Final model: refit on the complete data set.
    rfc.fit(X, y)
    return rfc, cm, round(acc, 2), le
def training_and_test(token, train_data, test_data, num_classes, result):
    """Train a random forest, evaluate it on the test data and record the result.

    Args:
        token (:obj:`str`): token representing this run
        train_data (:obj:`tuple` of :obj:`numpy.array`): Tuple of training feature and label
        test_data (:obj:`tuple` of :obj:`numpy.array`): Tuple of testing feature and label
        num_classes (:obj:`int`): Number of classes
        result (:obj:`pyActLearn.performance.record.LearningResult`): LearningResult object to hold learning result

    Returns:
        tuple: ``(predicted_y, predicted_proba)``. When the model saw fewer
        than ``num_classes`` classes, ``predicted_proba`` is expanded to
        ``num_classes`` columns with zeros for the missing classes.
    """
    train_x, train_y = train_data[0], train_data[1].flatten()
    model = RandomForestClassifier(n_estimators=20, criterion="entropy")
    model.fit(train_x, train_y)

    # Test
    test_x = test_data[0]
    predicted_y = model.predict(test_x)
    predicted_proba = model.predict_proba(test_x)

    # Evaluate the test and store the result
    test_y = test_data[1].flatten()
    cm = get_confusion_matrix(num_classes=num_classes, label=test_y,
                              predicted=predicted_y)
    result.add_record(model.get_params(), key=token, confusion_matrix=cm)

    # In case any label is missing, populate it: each probability column is
    # copied to the column whose index equals its class label.
    if predicted_proba.shape[1] != num_classes:
        padded = np.zeros((predicted_proba.shape[0], num_classes), np.float32)
        for column, class_label in enumerate(model.classes_):
            padded[:, class_label] = predicted_proba[:, column]
        predicted_proba = padded

    return predicted_y, predicted_proba
def cls_create(xs, ys):
    """Create and fit the classifier selected by ``algo``.

    ``algo`` and ``self`` are not parameters; they are read from the
    surrounding scope. After fitting, the probabilities predicted on the
    training data are passed to ``best_threshold_for_f1`` and its three
    results are stored on ``self.threshold``, ``self.positive`` and
    ``self.negative``.

    Args:
        xs: Training features.
        ys: Training labels.

    Returns:
        The fitted classifier.

    Raises:
        ValueError: if ``algo`` is neither "SVM" nor "RF". Previously this
            case failed later with an unbound ``classifier`` variable.
    """
    if algo == "SVM":
        classifier = svm.SVC(C = self.parm, probability=True)
    elif algo == "RF":
        classifier = RandomForestClassifier(n_estimators = int(self.parm), criterion='entropy', n_jobs = 1)
    else:
        raise ValueError("unsupported algo: {0!r}".format(algo))

    classifier.fit(xs, ys)
    # The threshold is tuned on the training-set probabilities.
    probs = classifier.predict_proba(xs)
    self.threshold, self.positive, self.negative = best_threshold_for_f1(probs, 20, ys)
    return classifier
def model_and_predict(self, X_train, y_train, X_test): district_idx = self.columns.index('PdDistrict') districts = set(X_train[:,district_idx]) district_ys = {} # Grow forest and predict separately for each district's records for d in districts: district_X_train = X_train[X_train[:, district_idx] == d] district_X_train = np.delete(district_X_train, district_idx, 1) district_y_train = y_train[X_train[:, district_idx] == d] district_X_test = X_test[X_test[:, district_idx] == d] district_X_test = np.delete(district_X_test, district_idx, 1) print "Growing forest for", d # Not saving output in Git so make this deterministic # with random_state rf = RandomForestClassifier(n_estimators=self.n_trees, n_jobs=-1, random_state=782629) rf.fit(district_X_train, district_y_train) district_ys[d] = list(rf.predict(district_X_test)) print "Finished", d print "All predictions made" y_hat = [] for row in X_test: d_ys = district_ys[row[district_idx]] y_hat.append(d_ys.pop(0)) return y_hat
def run():
    """Cross-validate a random forest and log mean accuracy and log loss.

    The folds, feature matrix and labels come from ``gen_cv()``.
    """
    skf, X_all, labels = gen_cv()
    acc_total = 0.0
    logloss_total = 0.0

    # NOTE(review): each pair is unpacked as (test_index, train_index). If
    # gen_cv() returns a scikit-learn StratifiedKFold, which yields the
    # training indexes first, the model is trained on the single held-out
    # fold and tested on the rest -- confirm this is intended.
    for fold, (test_index, train_index) in enumerate(skf, start=1):
        logger.info('at fold: {0}'.format(fold))
        logger.info('train samples: {0}, test samples: {1}'.format(len(train_index), len(test_index)))

        X_train, y_train = X_all[train_index], labels[train_index]
        X_test, y_test = X_all[test_index], labels[test_index]

        rfc = RandomForestClassifier(n_jobs=10, random_state=919)
        rfc.fit(X_train, y_train)
        y_test_predicted = rfc.predict(X_test)
        y_test_proba = rfc.predict_proba(X_test)

        acc = accuracy_score(y_test, y_test_predicted)
        logger.info('test data predicted accuracy: {0}'.format(acc))

        # log loss -log P(yt|yp) = -(yt log(yp) + (1 - yt) log(1 - yp))
        logloss = log_loss(y_test, y_test_proba)
        logger.info('log loss at test data: {0}'.format(logloss))

        acc_total += acc
        logloss_total += logloss

    n_folds = skf.n_folds
    logger.info('mean acc: {0}'.format(acc_total / n_folds))
    logger.info('mean log loss: {0}'.format(logloss_total / n_folds))
def main():
    """Plot random forest decision surfaces on two-moons data.

    Forests with 1, 2, 4, ..., 128 trees are fitted on the same 200 samples
    and each one is passed to ``plot_surface``.
    """
    X, y = datasets.make_moons(n_samples=200, shuffle=True, noise=0.1,
                               random_state=None)
    plt.scatter(X[:, 0], X[:, 1], c=y)

    for exponent in range(8):
        n_trees = 2 ** exponent
        forest = RandomForestClassifier(n_estimators=n_trees)
        forest.fit(X, y)
        plot_surface(forest, X, y)
def buildModel(df):
    """Train a 50-tree random forest that predicts the 'arr_del15' column.

    Uses the first ``train_len`` rows of ``df`` and the feature columns listed
    in the module-level ``cols``. Three categorical columns are replaced by
    integer codes, the result is transformed with the module-level encoder
    ``enc`` and the forest is fitted on the dense version of that output.

    Args:
        df: DataFrame containing ``cols`` and 'arr_del15'. It is not modified.

    Returns:
        The fitted RandomForestClassifier.
    """
    train_y = df['arr_del15'][:train_len]
    # Explicit copy: the columns below are overwritten, and assigning into a
    # slice of another frame is what pandas warns about with
    # SettingWithCopyWarning.
    train_x = df[cols][:train_len].copy()

    # transform categorical features
    for column in ('unique_carrier', 'dep_conditions', 'arr_conditions'):
        train_x[column] = pd.factorize(train_x[column])[0]

    pd.set_option('display.max_rows', 500)
    print(train_x)

    train_x = enc.fit_transform(train_x)
    print(train_x.shape)

    # Create Random Forest classifier with 50 trees
    clf_rf = RandomForestClassifier(n_estimators=50, n_jobs=-1)
    clf_rf.fit(train_x.toarray(), train_y)
    print("Model built")
    return clf_rf
def ranforest(n_estimators, min_samples_split): from sklearn.ensemble import RandomForestClassifier from sklearn.metrics import accuracy_score clf = RandomForestClassifier(n_estimators = n_estimators, min_samples_split = min_samples_split, bootstrap = True) clf.fit(features_train, labels_train) t_fit = time() clf.fit(features_train, labels_train) print "training time:", round(time()-t_fit, 3), "s" t_pred = time() pred = clf.predict(features_test) print "predict time:", round(time()-t_pred, 3), "s" print accuracy_score(pred, labels_test) try: prettyPicture(clf, features_test, labels_test) except NameError: pass
def test_save_prediction(self):
    """Run ``saving_predict_proba`` on iris and delete the CSV it should write."""
    model = RandomForestClassifier()
    model.id = get_model_id(model)
    model.fit(self.iris.data, self.iris.target)

    # Row indexes 0 .. n_samples-1 as int32.
    n_samples = self.iris.data.shape[0]
    indexes = np.arange(n_samples, dtype=np.int32)
    saving_predict_proba(model, self.iris.data, indexes)

    # os.remove raises when the file does not exist, so a missing output file
    # makes the test fail.
    expected_csv = 'RandomForestClassifier_r0_N__m5_0p0__m4_2__m1_auto__m0_N__m3_1__m2_N__n0_10__b0_1__c1_gini__c0_N_0_149.csv'
    os.remove(expected_csv)
def crossValIteration(dat,classes,cutoff,prop=0.9,reshuffle=False): if reshuffle: dat.samples = sampleReshuffle(dat) saved_samples = [i for i in dat.samples] dat.samples = ["{0}_$$_{1}".format(i,v) for i,v in enumerate(dat.samples)] train,test=dat.splitTraining(prop, classes) print test.samples selectedSampleIndicies = [int(i.split("_$$_")[0]) for i in test.samples] dat.samples = saved_samples print test.samples test.samples = [i.split("_$$_")[1] for i in test.samples] train.samples = [i.split("_$$_")[1] for i in train.samples] print "Training set has {0} samples from classes: {1}".format(len(train.samples),",".join(set(train.samples))) print "Test set has {0} samples from classes: {1}".format(len(test.samples),",".join(set(test.samples))) print "Selecting data..." # select features for each disease print "Number of selections made for each class:" print "Setting up SVM..." Xtrain = train.values.transpose() Ytrain = train.samples clf=RandomForestClassifier(n_estimators=1000) clf.fit(Xtrain,Ytrain) Xtest = test.values.transpose() Ytest = test.samples print "Predicting R-forest..." #classification results versus actual acc = zip(Ytest,clf.predict(Xtest)) # (actual,predicted)... for each sample print acc # this is the elemental form of the "result" lists processed below print sum([i[0] == i[1] for i in acc])*1.0/len(acc) return acc
def rforests(trainx, trainy, test, n_estimators=100, k=5):
    """Fit a random forest and return the ``k`` most probable countries per row.

    Args:
        trainx: Training features.
        trainy: Training labels; flattened with ``np.ravel`` before fitting.
        test: Test features.
        n_estimators: Number of trees.
        k: Number of top predictions to keep per row.

    Returns:
        Tuple ``(forest, pred_train, pred_test)``. Each prediction array has
        one row per sample and ``k`` columns, most probable country first.
    """
    trainy = np.ravel(trainy)
    forest = RandomForestClassifier(n_estimators)
    forest.fit(trainx, trainy)

    col_names = ["country_destination_" + str(rank) for rank in range(1, k + 1)]

    def _top_k_countries(features):
        # argsort is ascending, so the last k columns hold the k most
        # probable classes. The column index is used directly as the country
        # number -- this assumes the labels were encoded as 0..n-1.
        proba = forest.predict_proba(features)
        top = np.argsort(proba)[:, -k:]
        # A DataFrame is needed to map the numbers back to countries.
        frame = pd.DataFrame(top, columns=col_names)
        for name in col_names:
            frame[name] = frame[name].map(dicts.country)
        # Flip left-right so the most probable country comes first.
        return np.fliplr(frame)

    pred_train = _top_k_countries(trainx)
    pred_test = _top_k_countries(test)
    return forest, pred_train, pred_test
def get_preds(features, trees=3000, depth=19):
    """Train a classifier and a regressor on NMF features and score one day.

    Args:
        features: Number of latent features the NMF runs on (passed as ``k``
            to ``get_nmf``).
        trees: Number of trees in each forest.
        depth: Maximum depth of the trees in each forest.

    Returns:
        list: ``[features, classifier_accuracy, regressor_mse]``, so results
        can be compared across different numbers of features.
    """
    # Create dataframes
    train_nmf = get_nmf(k=features)
    df_full = add_yahoo_to_df(train_nmf)
    # NOTE(review): the dummy-encoded training frame is never used; both
    # models below are trained on df_full, while the test frame IS
    # dummy-encoded. The call is kept because add_dummies may have side
    # effects -- confirm which frame was meant.
    df_train = add_dummies(df_full)

    # 'data_wednesday' is the folder name where the json data is
    test_nmf = get_nmf('data_wednesday', k=features)
    df_test_full = add_dummies(add_yahoo_to_df(test_nmf))

    # Create models
    X_model_class, y_model_class = get_classifier_data(df_full)
    rf_class = RandomForestClassifier(n_estimators=trees, max_depth=depth)
    rf_class.fit(X_model_class, y_model_class)

    X_model_regress, y_model_regress = get_regressor_data(df_full)
    rf_regress = RandomForestRegressor(n_estimators=trees, max_depth=depth)
    rf_regress.fit(X_model_regress, y_model_regress)

    # Get X and y values for the evaluated day
    X_classify, y_classify = get_classifier_data(pd.DataFrame(df_test_full.ix['2016-04-11']))
    X_regress, y_regress = get_regressor_data(pd.DataFrame(df_test_full.ix['2016-04-11']))

    # Run models
    classifier_preds = rf_class.predict(X_classify)
    classifier_accuracy = accuracy_score(classifier_preds, y_classify)
    regressor_preds = rf_regress.predict(X_regress)
    regressor_mse = mean_squared_error(regressor_preds, y_regress)

    return [features, classifier_accuracy, regressor_mse]
def main(): S, col_names_S = load_data(config.paths.training_data, config.paths.cache_folder) Xs, Ys, col_names_S = extract_xy(S, col_names_S) a = RandomForestClassifier(n_estimators=1) a.fit(Xs.toarray(), Ys.toarray().ravel()) best_features = a.feature_importances_ max_ind, max_val = max(enumerate(best_features), key=operator.itemgetter(1)) print best_features print max_ind, max_val print Xs.shape print Ys.shape param_range = [1, 3, 5, 7, 10, 15, 20, 30, 60, 80] train_scores, test_scores = validation_curve(RandomForestClassifier(criterion='entropy'), Xs, Ys.toarray().ravel(), 'n_estimators', param_range) print train_scores print test_scores train_mean = np.mean(train_scores, axis=1) train_std = np.std(train_scores, axis=1) test_mean = np.mean(test_scores, axis=1) test_std = np.std(test_scores, axis=1) plt.title("Validation Curve for Random Forest") plt.xlabel("Number of Trees") plt.ylabel("Score") plt.plot(param_range, train_mean, label="Training Score", color='r') plt.fill_between(param_range, train_mean - train_std, train_mean + train_std, alpha=0.2, color='r') plt.plot(param_range, test_mean, label="Test Score", color='b') plt.fill_between(param_range, test_mean - test_std, test_mean + test_std, alpha=0.2, color='b') plt.legend(loc="best") plt.show()
def rand_forest(train_bow, train_labels, test_bow, test_labels, bow_indexes):
    """Train a default random forest and hand it to ``test`` under the tag "rf"."""
    print("Training rndForest")
    model = RandomForestClassifier()
    model.fit(train_bow, train_labels)

    print("Testing rndForest")
    test(model, "rf", test_bow, test_labels, bow_indexes)
def randomForest_eval_func(self, chromosome):
    """Return the mean AUC of a random forest for the decoded parameters.

    The chromosome is decoded into ``n_estimators``, ``max_features`` and
    ``window_size``. A logged result for the same parameters is returned
    directly. Otherwise the forest is evaluated over ``self.fold`` folds; when
    a fold holds more than 1000 samples in total, only that one fold is
    evaluated and no averaging is done. The result is written to the log.
    """
    n_estimators, max_features, window_size = self.decode_chromosome(chromosome)
    already_logged = self.check_log(n_estimators, max_features, window_size)
    if already_logged:
        return self.get_means_from_log(n_estimators, max_features, window_size)[0]

    folded_dataset = self.create_folded_dataset(window_size)
    auc_sum = 0
    decision_value_sum = 0
    mcc_sum = 0
    large_sample = False

    for test_fold in xrange(self.fold):
        test_labels, test_dataset, train_labels, train_dataset = folded_dataset.get_test_and_training_dataset(test_fold)
        if len(test_labels) + len(train_labels) > 1000:
            large_sample = True

        clf = RandomForestClassifier(n_estimators=n_estimators, max_features=max_features)
        clf.fit(train_dataset, train_labels)
        probas = clf.predict_proba(test_dataset)
        # Probability of being binding residue (second column)
        decision_values = [row[1] for row in probas]

        AUC, decision_value_and_max_mcc = validate_performance.calculate_AUC(decision_values, test_labels)
        auc_sum += AUC
        decision_value_sum += decision_value_and_max_mcc[0]
        mcc_sum += decision_value_and_max_mcc[1]

        # A single fold is enough for large data sets.
        if large_sample:
            break

    if large_sample:
        # Only one fold was evaluated, so the sums already are the results.
        mean_AUC, mean_decision_value, mean_mcc = auc_sum, decision_value_sum, mcc_sum
    else:
        mean_AUC = auc_sum / self.fold
        mean_decision_value = decision_value_sum / self.fold
        mean_mcc = mcc_sum / self.fold

    self.write_log(n_estimators, max_features, window_size, mean_AUC, mean_decision_value, mean_mcc)
    self.add_log(n_estimators, max_features, window_size, mean_AUC, mean_decision_value, mean_mcc)
    return mean_AUC
def fit(self, x, y):
    """Fit per-channel and per-band logistic models plus an integrating forest.

    The columns of ``x`` are read as ``n_channels`` consecutive groups of
    ``n_features`` columns. One logistic regression is trained per channel
    (a contiguous group) and one per band (every ``n_features``-th column).
    Their predictions on the training data are the inputs of a random forest.

    Stores the base models in ``self.models`` (channels first, then bands)
    and the forest in ``self.c``.
    """
    n_channels = self.n_channels
    n_features = self.n_features
    models = []
    preds = np.zeros((len(x), n_channels + n_features))

    def _fit_base(feats, column):
        # Train one base model and record its training-set predictions.
        model = LogisticRegression()
        model.fit(feats, y)
        models.append(model)
        preds[:, column] = model.predict(feats)

    # create channel based models
    for i in xrange(n_channels):
        print('training channel model {}'.format(i))
        first = i * n_features
        _fit_base(x[:, first:(first + n_features)], i)

    # create band based models
    for i in xrange(n_features):
        print('training band model {}'.format(i))
        _fit_base(x[:, i:(n_channels * n_features):n_features], n_channels + i)

    # create integrating forest
    top_classifier = RandomForestClassifier()
    top_classifier.fit(preds, y)

    self.models = models
    self.c = top_classifier
class Model:
    """Versioned wrapper around a random forest for the gibberish model.

    The forest is trained when the object is created (version 0). ``fit``
    trains a new forest from scratch and increases the version by one;
    ``predict`` classifies with the current forest.
    """

    def __init__(self, X, y, ntrees=500):
        """Store the tree count and train the initial model on X, y."""
        self.ntrees = ntrees
        self._train(X, y)
        self.version = 0

    def _train(self, X, y):
        """Replace ``self.clf`` with a new forest fitted on X, y."""
        self.clf = RandomForestClassifier(n_estimators=self.ntrees)
        self.clf = self.clf.fit(X, y)

    def fit(self, X, y):
        """Retrain the model on X, y, bump the version and return self."""
        self._train(X, y)
        next_version = self.version + 1
        print("updating model from " + str(self.version) + " to " + str(next_version) + ".")
        self.version += 1
        return self

    def predict(self, X):
        """Return the predicted classes for X, printing the version used."""
        prediction = self.clf.predict(X)
        print("using version " + str(self.version))
        return prediction

    def __repr__(self):
        return "<Model(version='%s')>" % (self.version)
def Random_Forest_classifier(train_input_data, train_output_data, test_input_data, test_output_data):
    """Plot accuracy against the number of trees (10 to 190, step 10).

    A prediction counts as correct when the expected university appears in
    the ``similar_univs`` rows whose 'univName' equals the predicted
    university (all values of those rows, flattened, except the first). The
    curve is saved to "rf1.png" and shown.

    Returns:
        The predictions (as a list) of the last forest trained.
    """
    tree_list = []
    accuracy_percent = []

    for trees in range(10, 200, 10):
        clf = RandomForestClassifier(trees)
        clf.fit(train_input_data, train_output_data)
        predicted_output = clf.predict(test_input_data)

        # Work on plain lists from here on.
        if not isinstance(predicted_output, list):
            predicted_output = predicted_output.tolist()
        if not isinstance(test_output_data, list):
            test_output_data = test_output_data.tolist()

        n_errors = 0
        for i, expected in enumerate(test_output_data):
            matches = similar_univs[similar_univs['univName'] == predicted_output[i]]
            flat_values = [value for row in matches.values.tolist() for value in row]
            if expected not in flat_values[1:]:
                n_errors += 1

        tree_list.append(trees)
        error_rate = n_errors / float(len(test_output_data))
        accuracy_percent.append(100 - (error_rate * 100))

    tree_list = np.array(tree_list)
    accuracy_percent = np.array(accuracy_percent)

    plt.plot(tree_list, accuracy_percent)
    plt.xlabel('Number of trees')
    plt.ylabel('Percent of accuracy')
    plt.title('Varation of accuracy with trees')
    plt.grid(True)
    plt.savefig("rf1.png")
    plt.show()
    return predicted_output
def onescore(X, Y, Xtest): clf = RandomForestClassifier(oob_score=True, n_jobs=-1, n_estimators=1000, max_features=300, random_state=0) clf.fit(X, Y) print "oob_score = ", clf.oob_score_ print clf.get_params() ytest = clf.predict(Xtest) output(ytest, "try_004.csv")
def cross_validate():
    """Run a 10-fold cross-validation of a random forest and print the mean log loss.

    NOTE(review): as written, the loop body cannot run. It reads ``probas``
    and ``target``, neither of which is assigned in this function (the
    prediction is stored in ``probs``), and the ``#...`` marker suggests that
    code was removed. The items flagged below need confirming before use.
    """
    print("Reading the data")
    data = cu.get_dataframe(train_file)
    print("Cross-Validating")
    rf = RandomForestClassifier(n_estimators=10, verbose=1, compute_importances=True, n_jobs=2)
    # With indices=False the folds are presumably boolean masks rather than
    # index arrays -- confirm against the scikit-learn version in use.
    cv = cross_validation.KFold(len(data), k=10, indices=False)
    results = []
    for traincv, testcv in cv:
        print "\t-- cv [%d]"%len(results)
        print "\t","extracting features"
        #...
        feacv = features.extract_features(feature_names, traincv)
        print "\t","learning"
        # NOTE(review): the target is the full "OpenStatus" column, not the
        # part selected by traincv.
        rf.fit(feacv, data["OpenStatus"])
        print "\t","predicting"
        # NOTE(review): predict_proba receives the fold selector itself, not
        # features extracted for the test rows.
        probs = rf.predict_proba(testcv)
        print "\t","evaluating"
        # NOTE(review): ``probas`` and ``target`` are undefined here.
        results.append( llfun(target[testcv], [x["OpenStatus"] for x in probas]) )
    print "LogLoss: " + str( np.array(results).mean() )
def train_rf(feature, label, params_dummy):
    """Return a 70-tree random forest (seed 10) fitted on ``feature``/``label``.

    ``params_dummy`` is accepted but not used.
    """
    model = RandomForestClassifier(n_estimators=70, random_state=10)
    model.fit(feature, label)
    return model
print("Training Accuracy is: ") print(log.score(X_train, Y_train) * 100) print(" Testing accuracy is: ") print(log.score(X_test, Y_test) * 100) #print("Precision is: ") #print(precision) #print("Recall is: ") #print(recall) print "-----------Random Forest Classifier----------------" X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.25,random_state=0) print X_train.shape print Y_test.shape rand_forest = RandomForestClassifier(random_state=0, n_estimators=250, min_samples_split=8, min_samples_leaf=4) rand_forest.fit(X_train, Y_train) Y_predict = rand_forest.predict(X_test) confusion_mat_random = confusion_matrix(Y_test, Y_predict) print ("Random forest accuracy: ") print accuracy_score(Y_test, Y_predict) * 100 print ('Confusion Matrix for Random forest: ') print (pd.crosstab(Y_test, Y_predict, rownames=['Predicted Values'], colnames=['True Values'])) print "-------------SVM---------------------------------" clf = svm.SVC() y_pred = clf.fit(X_train, Y_train).predict(X_test) accuracy = accuracy_score(Y_test,y_pred)
import re

import nltk
from sklearn.datasets import load_files
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# ``re`` and ``nltk`` are used below but were not imported in this section;
# importing them here is harmless if they are also imported earlier.
nltk.download('stopwords')
nltk.download('wordnet')

temporal_data = load_files(r"txt_sentoken")
X, y = temporal_data.data, temporal_data.target

documents = []
stemmer = WordNetLemmatizer()

# Pre-processing tasks
for sen in range(0, len(X)):
    # Remove all the special characters
    document = re.sub(r'\W', ' ', str(X[sen]))
    # remove all single characters
    document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)
    # Remove single characters from the start. The pattern used to be
    # r'\^[a-zA-Z]\s+', which looks for a literal '^' and therefore never
    # matched (all non-word characters are already spaces at this point).
    document = re.sub(r'^[a-zA-Z]\s+', ' ', document)
    # Substituting multiple spaces with single space
    document = re.sub(r'\s+', ' ', document)
    # Removing prefixed 'b' (left by str() of a bytes object); normally
    # already handled by the start-of-string rule above
    document = re.sub(r'^b\s+', '', document)
    # Converting to Lowercase
    document = document.lower()
    # Lemmatization
    document = document.split()
    document = [stemmer.lemmatize(word) for word in document]
    document = ' '.join(document)
    documents.append(document)

# Bag of Words model to convert text documents into numerical features
vectorizer = CountVectorizer(max_features=1500, min_df=5, max_df=0.7, stop_words=stopwords.words('english'))
X = vectorizer.fit_transform(documents).toarray()

# Training and testing splits
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
classifier = RandomForestClassifier(n_estimators=1000, random_state=0)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

# Evaluating the model
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(accuracy_score(y_test, y_pred))
# # Now its time to train the model! # # **Creating an instance of the RandomForestClassifier class and fit it to the training data from the previous step.** # In[29]: from sklearn.ensemble import RandomForestClassifier # In[30]: rfc = RandomForestClassifier(n_estimators=600) # In[31]: rfc.fit(X_train, y_train) # ## Predictions and Evaluation # # Let's predict off the y_test values and evaluate our model. # # ** Predict the class of not.fully.paid for the X_test data.** # In[32]: predictions = rfc.predict(X_test) # **creating a classification report from the results.** # In[33]:
#print(df.head())
#print(df.describe())

# Column 11 is the target, columns 0-10 are the features.
Y = df.iloc[:, 11]
X = df.iloc[:, :11]

# train_test_split already returns the four parts. Wrapping the result in
# np.asarray tried to pack objects of different shapes into one array.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.1)
print("X_train contain = ", X_train.shape, " and Y_train contain = ", Y_train.shape)
print("X_test contain = ", X_test.shape, " and Y_test contain = ", Y_test.shape)

# Each model is fitted on the same split and scored on the held-out part.
model1 = LogisticRegression()
model1.fit(X_train, Y_train)
print('Logistic Regression Test Score = ' , model1.score(X_test, Y_test))

model2 = DecisionTreeClassifier()
model2.fit(X_train, Y_train)
print('Decision Tree Classifier Test Score = ' , model2.score(X_test, Y_test))

model3 = AdaBoostClassifier()
model3.fit(X_train, Y_train)
print('Ada Boost Classifier Test Score = ' , model3.score(X_test, Y_test))

model4 = RandomForestClassifier()
model4.fit(X_train, Y_train)
print('Random Forest Classifier Test Score = ' , model4.score(X_test, Y_test))

model5 = MLPClassifier()
model5.fit(X_train, Y_train)
# The label used to say "Random Forest Classifier" for this MLP score.
print('MLP Classifier Test Score = ' , model5.score(X_test, Y_test))
data.drop(columns_one_hot, axis=1, inplace=True) # drop()函数:将指定的列按指定的方向删除,并返回 print u'处理后数据4:\n', data.head(10) columns = list(data.columns) columns.remove('churn') x = data[columns] # 得到数据 y = data['churn'] # 得到类标记label print u'分组与one-hot编码后:\n', x.head(10) # 数组或矩阵分割成随机训练和测试子集 x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.75, random_state=0) clf = RandomForestClassifier(n_estimators=100, criterion='gini', max_depth=12, min_samples_split=5, oob_score=True, class_weight={0: 1, 1: 1/y_train.mean()}) clf.fit(x_train, y_train) # Build a forest of trees from the training set (X, y). # 特征选择 clf.feature_importances_ : 返回重要的特征权值或者说重要性 important_features = pd.DataFrame(data={'features': x.columns, 'importance': clf.feature_importances_}) print 'important_features:\n', important_features # 按照importance进行排序 important_features.sort_values(by='importance', axis=0, ascending=False, inplace=True) important_features['cum_importance'] = important_features['importance'].cumsum() # 返回累加的和 print u'特征重要度:\n', important_features # 返回important_features['cum_importance']小于0.95的数据,取出‘features’这一列 selected_features = important_features.loc[important_features['cum_importance'] < 0.95, 'features'] # 重新组织数据 x_train = x_train[selected_features] x_test = x_test[selected_features]
max_features='auto', bootstrap=True)
# The line above is the tail of a constructor call that starts before this
# section (presumably the ``RF`` estimator used below -- confirm).

# The two other tree ensembles compared in this section.
GB = GradientBoostingClassifier(loss='deviance', learning_rate=0.1, n_estimators=200, max_features='auto')
ET = ExtraTreesClassifier(n_estimators=10, criterion='gini', max_features='auto', bootstrap=False)

# NOTE(review): y_train, x_train and x_test are prepared here, but the fits
# and scores below use train_data_X / train_data_Y / test_data_X /
# test_data_Y directly.
y_train = train_data_Y.loc[:].ravel()
x_train = train_data_X.values
x_test = test_data_X.values

# For each model: fit, keep the feature importances, score on the test data.
RF.fit(train_data_X, train_data_Y)
RF_feature = RF.feature_importances_
# Bare expression: no effect when run as a script (presumably left over from
# a notebook cell, where it displays the value).
RF_feature
rf_score = RF.score(test_data_X, test_data_Y)
print("RandomForestClassifier score is:", rf_score)

GB.fit(train_data_X, train_data_Y)
GB_feature = GB.feature_importances_
GB_feature
gb_score = GB.score(test_data_X, test_data_Y)
print("GradientBoostingClassifier score is:", gb_score)

ET.fit(train_data_X, train_data_Y)
ET_feature = ET.feature_importances_
ET_feature
et_score = ET.score(test_data_X, test_data_Y)
from sklearn.ensemble import RandomForestClassifier #model = RandomForestClassifier(criterion='gini', n_estimators=700, # min_samples_split=10,min_samples_leaf=1, # max_features='auto',oob_score=True, # random_state=1,n_jobs=-1) model = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini', max_depth=None, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=800, n_jobs=None, oob_score=False, random_state=None, verbose=0, warm_start=False) model.fit(all_features, Targeted_feature) #prediction_rm=model.predict(X_test) #print('--------------The Accuracy of the model----------------------------') #print('The accuracy of the Random Forest Classifier is', round(accuracy_score(prediction_rm,y_test)*100,2)) #kfold = KFold(n_splits=10, random_state=22) # k=10, split the data into 10 equal parts #result_rm=cross_val_score(model,all_features,Targeted_feature,cv=10,scoring='accuracy') #print('The cross validated score for Random Forest Classifier is:',round(result_rm.mean()*100,2)) #y_pred = cross_val_predict(model,all_features,Targeted_feature,cv=10) #sns.heatmap(confusion_matrix(Targeted_feature,y_pred),annot=True,fmt='3.0f',cmap="summer") #plt.title('Confusion_matrix', y=1.05, size=15) # Import data test_df=pd.read_csv("data/test.csv")
#4.1 types of models # Random Forest model = RandomForestClassifier(n_estimators=100) #SVM Support Vector Machine model = SVC() #Gradient Boosting Classifie model = GradientBoostingClassifier() #k-nearest neighbors model = KNeighborsClassifier(n_neighbors=3) # Gaussian Naive Bayes model = GaussianNB() #Logistic Regression #model = LogisticRegression() #Train the model model.fit(train_X, train_y) #5. Evaluation # Score the model print(model.score(train_X, train_y), model.score(valid_X, valid_y)) # It does not work with the regression model #plot_model_var_imp(model, train_X, train_y) rfecv = RFECV(estimator=model, step=1, cv=StratifiedKFold(train_y, 2), scoring='accuracy') rfecv.fit(train_X, train_y) print(rfecv.score(train_X, train_y), rfecv.score(valid_X, valid_y))
# Train a default random forest on `data` and persist it to model.pkl.
# Column 1 of `data` is the target; columns 2.. are the features.
X = data.iloc[:, 2:].values
y = data.iloc[:, 1].values

# Encode the target labels as integers
from sklearn.preprocessing import LabelEncoder
labelencoder_X_1 = LabelEncoder()
y = labelencoder_X_1.fit_transform(y)

# Split the dataset into the training set (80%) and test set (20%)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Feature scaling: fit on the training split only, then apply to both splits
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
X_train = pd.DataFrame(X_train)
X_test = pd.DataFrame(X_test)

from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

import pickle
# Use a context manager so the file is flushed and closed even if dump() fails
# (the previous bare open() left the handle to the garbage collector).
with open('model.pkl', 'wb') as model_file:
    pickle.dump(rf, model_file)
# In[619]: ##-----Multiclass - Random Forest classification - TFIDF, ----------## ##__________________________________________________________________## # In[620]: #Random Forest from sklearn.ensemble import RandomForestClassifier # In[621]: clf = RandomForestClassifier(max_features=2000) clf.fit(X_train_fit, y_train) # In[622]: X_train.shape # In[623]: X_test.shape # In[624]: y_train.shape
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Load the loan data and replace the textual 'purpose' column by its
# categorical integer codes.
df = pd.read_csv(
    "/Users/prateekb/Downloads/PythonPractice/Machine Learning/DecisionTree/loan_borowwer_data.csv"
)
df['purpose'] = df['purpose'].astype('category').cat.codes

# First 13 columns are the features, 'not.fully.paid' is the target.
x = df.iloc[:, :13]
y = df['not.fully.paid']
X_train, X_test, Y_train, Y_test = train_test_split(x,
                                                    y,
                                                    test_size=.20,
                                                    random_state=43)

# Fit and score the single decision tree first, then the random forest,
# reporting train and test accuracy for each.
dt = tree.DecisionTreeClassifier()
rm = RandomForestClassifier()
for fitted in (dt, rm):
    fitted.fit(X_train, Y_train)
    Y_pred = fitted.predict(X_test)
    Y_pred_train = fitted.predict(X_train)
    print("The train accuracy score is: ",
          accuracy_score(Y_pred_train, Y_train))
    print("The test accuracy score is: ", accuracy_score(Y_pred, Y_test))

print("Random Forest gives better test accuracy")
# Labels encoding le = preprocessing.LabelEncoder() le.fit(np.concatenate([train['y'].unique(), val["y"].unique()])) train['y'] = le.transform(train['y']) val["y"] = le.transform(val["y"]) # Características ya seleccionadas cols = [u'SD1', u'SD2', u'SD3', u'SD4', u'SD5', u'P4', u'P5', u'P10', u'P11', u'P13', u'P15', u'P18', u'P21', u'P30', u'P37', u'P45', u'P48', u'P49', u'P54', u'P57', u'P60', u'P64', u'P69', u'P71', u'P72', u'P73', u'P74', u'P76', u'P77', u'P78', u'P79', u'P80', u'P82', u'P83', u'P85', u'P90', u'P91', u'P92', u'P93'] # Entrenamiento rf = RandomForestClassifier(n_estimators=35, criterion="entropy", bootstrap=True, max_depth=20 , n_jobs=-1, random_state=17) rf.fit(train[cols], train["y"]) # Predicción ypred = rf.predict(val[cols]) del rf # Métricas acc = accuracy_score(val.y.values, ypred) ck = cohen_kappa_score(val.y.values, ypred) print('Resultados para RandomForest:') print('Precisión: {}'.format(acc)) print('Coeficiente kappa: {}'.format(ck))
n_samples, n_features = X.shape X = np.c_[X, random_state.randn(n_samples, 200 * n_features)] # Split into training and test X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.5, random_state=random_state) # Run classifier classifier = OneVsRestClassifier( svm.SVC(kernel='linear', probability=True, random_state=random_state)) y_score = classifier.fit(X_train, y_train).decision_function(X_test) clf = RandomForestClassifier(n_jobs=4) y_pred = clf.fit(X_train, y_train).predict(X_test) # Compute Precision-Recall and plot curve precision = dict() recall = dict() average_precision = dict() for i in range(n_classes): precision[i], recall[i], _ = precision_recall_curve( y_test[:, i], y_score[:, i]) average_precision[i] = average_precision_score(y_test[:, i], y_score[:, i]) # Compute micro-average ROC curve and ROC area precision["micro"], recall["micro"], _ = precision_recall_curve( y_test.ravel(), y_score.ravel()) average_precision["micro"] = average_precision_score(y_test, y_score,
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Load the breast-cancer data as (features, target) arrays and show both.
features, target = load_breast_cancer(return_X_y=True)
print(features)
print(target)

# Fit a seeded random forest; fit() returns the estimator itself.
seed = 888
rf_model = RandomForestClassifier(random_state=seed).fit(features, target)

# Predict on the same data the model was trained on, so the accuracy printed
# below is a training accuracy.
preds = rf_model.predict(features)
print(preds)

acc = accuracy_score(target, preds)
print(acc)
隨機森林參數: n_estimators: 樹的數量(default=10)。 min_samples_leaf: 最終葉節點最少樣本數(default=1); 當樣本不大時,可不設定使用預設,若樣本數量非常大時,則推薦增加此參數值。 min_samples_split:節點再劃分時所需的最小樣本數(default=2); 當樣本不大時,可不設定使用預設,若樣本數量非常大時,則推薦增加此參數值。 oob_score: 是否採用袋外樣本(out-of-bag samples)來評估模型的準確度(default=False)。 ''' from sklearn.ensemble import RandomForestClassifier rfc = RandomForestClassifier(n_estimators=1000, min_samples_split=20, min_samples_leaf=1, oob_score=True, random_state=1, n_jobs=-1) rfc.fit(x_train, y_train) ''' 交叉驗證Cross-Validation K-fold是拆解成 k 個子樣本來做交叉測試 ''' from sklearn.model_selection import cross_val_score # 邏輯斯回歸:LR acc is 0.8239700374531835 變數不同:7 columns acc_LR = np.mean(rfecv.grid_scores_) # 隨機森林:RFC acc is 0.8239700374531835 acc_RFC = rfc.oob_score_ # 支援向量機:SVM acc is 0.6329783950617284 from sklearn import svm svc = svm.SVC()
class RF:
    """Random Forest model wrapper (regressor or classifier).

    Holds the training matrices, the model settings, the fitted scikit-learn
    estimator (``clf``) and the quality indexes computed by ``validate``.
    """

    # Attributes written by saveModel and read back by loadModel.  The order
    # defines the on-disk layout and must not be changed.
    _PERSISTED_ATTRIBUTES = (
        'nobj', 'nvarx', 'quantitative', 'autoscale', 'estimators',
        'features', 'random', 'class_weight', 'learning_curve', 'cv',
        'n', 'p', 'mux', 'wgx',
        'TP', 'TN', 'FP', 'FN',
        'TPpred', 'TNpred', 'FPpred', 'FNpred',
        'SDEC', 'R2', 'scoringR', 'Q2', 'SDEP', 'scoringP', 'OOBe', 'vpath',
    )

    def __init__(self):
        self.X = None
        self.Y = None

        self.nobj = 0
        self.nvarx = 0

        self.quantitative = False
        self.autoscale = False
        self.estimators = 0
        self.features = ''
        self.random = False
        self.class_weight = False
        self.learning_curve = True
        self.cv = None
        self.n = 2
        self.p = 1
        self.scoring_function = None

        self.mux = None
        self.wgx = None

        # confusion matrix of the recalculated values
        self.TP = 0
        self.TN = 0
        self.FP = 0
        self.FN = 0

        # confusion matrix of the cross-validated predictions
        self.TPpred = 0
        self.TNpred = 0
        self.FPpred = 0
        self.FNpred = 0

        self.SDEC = 0.00  # SD error of the calculations
        self.R2 = 0.00  # determination coefficient
        self.scoringR = 0.00

        self.SDEP = 0.00
        self.Q2 = 0.00
        self.scoringP = 0.00

        self.OOBe = 0.00

        self.clf = None
        self.vpath = None

    @staticmethod
    def _classifier_path(filename):
        """Return the path of the pickled estimator stored next to *filename*."""
        # os.path.join keeps the file in the current directory when filename
        # has no directory part (plain concatenation pointed at '/').
        return os.path.join(os.path.dirname(filename), 'clasifier.pkl')

    @staticmethod
    def _confusion_counts(Y, Ypred):
        """Return (TP, TN, FP, FN) taking 1.0 as the positive class."""
        TP = TN = FP = FN = 0
        for observed, predicted in zip(Y, Ypred):
            if observed == 1.0:
                if predicted == 1.0:
                    TP += 1
                else:
                    FN += 1
            else:
                if predicted == 1.0:
                    FP += 1
                else:
                    TN += 1
        return TP, TN, FP, FN

    def _remove_png_files(self):
        """Delete every .png file found in the model folder (self.vpath)."""
        for png in glob.glob(self.vpath + '/*.png'):
            os.remove(png)

    def saveModel(self, filename):
        """Saves the model to a binary file in numpy format and the
        estimator to another file in pkl format, in the same directory
        """
        # open() works in Python 2 and 3 (file() is Python 2 only); the
        # context manager closes the handle even if a save fails.
        with open(filename, 'wb') as f:
            for name in self._PERSISTED_ATTRIBUTES:
                np.save(f, getattr(self, name))

        # the classifier cannot be saved with numpy
        joblib.dump(self.clf, self._classifier_path(filename))

    def loadModel(self, filename):
        """Loads the model from two files, one in numpy and another in pkl
        format, as written by saveModel

        Only load files written by this application: both files contain
        pickled objects.
        """
        with open(filename, 'rb') as f:
            for name in self._PERSISTED_ATTRIBUTES:
                # Entries such as None or the cross-validator are stored as
                # pickled object arrays; recent NumPy refuses to read them
                # unless allow_pickle is set explicitly.
                setattr(self, name, np.load(f, allow_pickle=True))

        # the classifier cannot be loaded with numpy
        self.clf = joblib.load(self._classifier_path(filename))

    def build(self, X, Y, quantitative=False, autoscale=False, nestimators=0,
              features='', random=False, tune=False, class_weight="balanced",
              cv='loo', n=2, p=1, lc=True, vpath=''):
        """Build a new RF model with the X and Y numpy matrices

        X is expected to be 2-D (objects x variables).  When *tune* is set,
        the number of estimators and max_features are obtained from
        optimize().  The model is fitted on the (optionally autoscaled)
        matrices and stored in self.clf; self.X and self.Y are left holding
        unscaled copies of the arguments.
        """
        nobj, nvarx = np.shape(X)

        self.nobj = nobj
        self.nvarx = nvarx

        self.quantitative = quantitative
        self.autoscale = autoscale
        self.estimators = nestimators
        self.features = features
        self.random = random
        self.class_weight = class_weight
        self.learning_curve = lc
        self.n = n
        self.p = p
        self.cv = cv

        self.X = X.copy()
        self.Y = Y.copy()

        self.vpath = vpath

        if autoscale:
            self.X, self.mux = center(self.X)
            self.X, self.wgx = scale(self.X, autoscale)

        if random:
            RANDOM_STATE = None
        else:
            RANDOM_STATE = 1226  # no reason to pick this number

        if self.cv:
            self.cv = getCrossVal(self.cv, RANDOM_STATE, self.n, self.p)

        if tune:
            self.estimators, self.features = self.optimize(self.X, self.Y)

        # 'none' is the textual placeholder for "use all the features"
        if self.features == 'none':
            self.features = None

        if self.quantitative:
            print("Building Quantitative RF model")
            self.clf = RandomForestRegressor(n_estimators=int(self.estimators),
                                             warm_start=False,
                                             max_features=self.features,
                                             oob_score=True,
                                             random_state=RANDOM_STATE)
        else:
            print("Building Qualitative RF_model")
            self.clf = RandomForestClassifier(n_estimators=int(self.estimators),
                                              warm_start=False,
                                              max_features=self.features,
                                              oob_score=True,
                                              random_state=RANDOM_STATE,
                                              class_weight=self.class_weight)

        self.clf.fit(self.X, self.Y)

        print('Building Learning Curves')
        if self.learning_curve:
            title = "Learning Curves (RF)"
            # 10 random 80/20 splits are used for the learning curve
            cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)
            estimator = self.clf
            plot = plot_learning_curve(estimator, title, self.X, self.Y,
                                       (0.0, 1.01), cv=cv)
            plot.savefig(self.vpath + "/RF-learning_curves.png", format='png')
            plot.savefig("./RF-learning_curves.png", format='png')

        # Regenerate the X and Y, since they might have been centered/scaled
        self.X = X.copy()
        self.Y = Y.copy()

    def validate(self):
        """ Validates the models and completes suitable scoring values

        Returns the recalculated Y values, or None when no model/data is
        available or no object could be classified.
        """
        # "is None": comparing a numpy matrix with == is elementwise and
        # cannot be used as a condition.
        if self.X is None or self.clf is None:
            return

        X = self.X.copy()
        Y = self.Y.copy()

        if self.autoscale:
            X = X - self.mux
            X = X * self.wgx

        Yp = self.clf.predict(X)
        Ym = np.mean(Y)

        ###################################################################
        ### quantitative
        if self.quantitative:
            # Recalculated predictions
            SSY0 = np.sum(np.square(Ym - Y))
            SSY = np.sum(np.square(Yp - Y))

            NMSErec = np.mean(mean_squared_error(Y, Yp))  # Mean Squared Error
            self.scoringR = NMSErec
            self.SDEC = np.sqrt(SSY / self.nobj)
            self.R2 = 1.00 - (SSY / SSY0)
            self.OOBe = 1.00 - self.clf.oob_score_

            print("Recalculated results")
            print('rec R2:%5.3f SDEC:%5.3f OOB_error:%5.3f neg_mean_squared_error:%5.3f' % \
                  (self.R2,self.SDEC,self.OOBe, self.scoringR))

            # Cross-validated predictions
            scoring = 'neg_mean_squared_error'
            y_pred = cross_val_predict(self.clf, X, Y, cv=self.cv)
            NMSEcv = np.mean(
                cross_val_score(self.clf, X, Y, cv=self.cv, scoring=scoring))

            SSY0_out = np.sum(np.square(Ym - Y))
            SSY_out = np.sum(np.square(Y - y_pred))
            self.scoringP = NMSEcv
            self.SDEP = np.sqrt(SSY_out / (self.nobj))
            self.Q2 = 1.00 - (SSY_out / SSY0_out)

            print(str(self.cv) + " cross-validation results")
            print('pred R2:%5.3f Q2:%5.3f SDEP:%5.3f neg_mean_squared_error:%5.3f' % \
                  (self.R2,self.Q2,self.SDEP, self.scoringP))

            # GRAPHS
            self._remove_png_files()

            # Graph generation is best-effort: a failure is reported but
            # does not abort the validation.
            try:
                fig1 = plt.figure()
                plt.xlabel('experimental y')
                plt.ylabel('recalculated\n', fontsize=14)
                plt.title('R2: %4.2f / SDEC: %4.2f \n' % (self.R2, self.SDEC),
                          fontsize=14)
                plt.plot(Y, Yp, "ro")
                fig1.savefig(self.vpath + "/RF-recalculated.png", format='png')
                fig1.savefig("./RF-recalculated.png", format='png')
            except Exception:
                print(
                    "Error creating Recalculated vs Experimental RF model graph"
                )

            try:
                fig1 = plt.figure()
                plt.xlabel('experimental y')
                plt.ylabel('predicted\n', fontsize=14)
                plt.title('Q2: %4.2f / SDEP: %4.2f \n' % (self.Q2, self.SDEP),
                          fontsize=14)
                plt.plot(Y, y_pred, "ro")
                fig1.savefig(self.vpath + "/RF-predicted.png", format='png')
                fig1.savefig("./RF-predicted.png", format='png')
            except Exception:
                print(
                    "Error creating Predicted vs Experimental RF model graph")

        ###################################################################
        ### qualitative
        else:
            # I think this is not needed.... by the characteristics of RF it
            # always shows perfect performance
            if len(Yp) != len(Y):
                return

            TP, TN, FP, FN = self._confusion_counts(Y, Yp)

            if TP + TN + FP + FN == 0:
                # no objects
                return

            self.TP = TP
            self.TN = TN
            self.FP = FP
            self.FN = FN

            sens = sensitivity(TP, FN)
            spec = specificity(TN, FP)
            mcc = MCC(TP, TN, FP, FN)
            f1 = f1_score(Y, Yp, pos_label=1, average='binary')
            self.OOBe = 1.00 - self.clf.oob_score_

            print("Recalculated results")
            print("rec TP:%d TN:%d FP:%d FN:%d spec:%5.3f sens:%5.3f MCC:%5.3f OOB_error:%5.3f f1_score:%5.3f" % \
                  (TP, TN, FP, FN, spec, sens, mcc, self.OOBe, f1 ))

            # Cross validation
            print('Cross validating RF....')
            y_pred = cross_val_predict(self.clf, X, Y, cv=self.cv)

            TPo, TNo, FPo, FNo = self._confusion_counts(Y, y_pred)

            if TPo + TNo + FPo + FNo == 0:
                return

            self.TPpred = TPo
            self.TNpred = TNo
            self.FPpred = FPo
            self.FNpred = FNo

            sens_cv = sensitivity(TPo, FNo)
            spec_cv = specificity(TNo, FPo)
            mcc_cv = MCC(TPo, TNo, FPo, FNo)
            f1_cv = f1_score(Y, y_pred, pos_label=1, average='binary')

            print(str(self.cv) + " cross-validation results")
            print("pred TP:%d TN:%d FP:%d FN:%d spec:%5.3f sens:%5.3f MCC:%5.3f f1_score:%5.3f" % \
                  (TPo, TNo, FPo, FNo, spec_cv, sens_cv, mcc_cv, f1_cv ))

            # Create Graphs
            self._remove_png_files()

            # Predicted confusion matrix graph (best-effort)
            try:
                FourfoldDisplay(TPo, TNo, FPo, FNo, 'RFC Predicted',
                                'RF_predicted_confusion_matrix.png', self.vpath)
            except Exception:
                print("Failed to generate RF predicted validation graph")

            # Recalculated confusion matrix graph (best-effort)
            try:
                FourfoldDisplay(TP, TN, FP, FN, 'RFC Recalculated',
                                'RF_recalculated_confusion_matrix.png',
                                self.vpath)
            except Exception:
                print("Failed to generate RF recalculated validation graph")

        return (Yp)

    def project(self, Xb):
        """ Uses the X matrix provided as argument to predict Y

        Returns None (after printing a message) when no estimator is loaded.
        """
        if self.clf is None:
            print('failed to load clasifier')
            return

        if self.autoscale:
            Xb = Xb - self.mux
            Xb = Xb * self.wgx

        Xb = Xb.reshape(1, -1)  # required by sklearn, to avoid deprecation warning
        Yp = self.clf.predict(Xb)

        return (Yp)

    def optimize(self, X, Y):
        """ Optimizes the number of trees (estimators) and max features used
        (features) and returns the best values, according to the OOB criteria

        The results are shown in a diagnostic plot

        To avoid including many trees to produce tiny improvements, increments
        of OOB error below 0.01 are considered irrelevant

        Returns the tuple (optimum estimators, optimum features), where
        features is one of 'sqrt', 'log2' or 'none'.
        """
        RANDOM_STATE = 1226
        errors = {}
        features = ['sqrt', 'log2', 'none']
        # label -> value passed to max_features ('none' means all features)
        max_features_for = {'sqrt': 'sqrt', 'log2': 'log2', 'none': None}

        tclf = {}
        for label in features:
            if self.quantitative:
                tclf[label] = RandomForestRegressor(
                    warm_start=False,
                    oob_score=True,
                    max_features=max_features_for[label],
                    random_state=RANDOM_STATE)
            else:
                tclf[label] = RandomForestClassifier(
                    warm_start=False,
                    oob_score=True,
                    max_features=max_features_for[label],
                    random_state=RANDOM_STATE,
                    class_weight=self.class_weight)

        # Range of `n_estimators` values to explore.
        min_estimators = 15
        max_estimators = 700
        stp_estimators = 100
        num_steps = int((max_estimators - min_estimators) / stp_estimators)

        print('optimizing RF....')
        updateProgress(0.0)

        optValue = 1.0e10
        j = 0
        for fi in features:
            errors[fi] = []
            count = 0
            for i in range(min_estimators, max_estimators + 1, stp_estimators):
                clf = tclf[fi]
                clf.set_params(n_estimators=i)
                clf.fit(X, Y)
                oob_error = 1 - clf.oob_score_
                errors[fi].append((i, oob_error))
                # accept only improvements larger than 0.01
                if oob_error < optValue and np.abs(oob_error - optValue) > 0.01:
                    optValue = oob_error
                    optEstimators = i
                    optFeatures = fi
                updateProgress(
                    float(count + (j * num_steps)) /
                    float(len(features) * num_steps))
                count = count + 1
            j = j + 1

        for ie in errors:
            xs, ys = list(zip(*errors[ie]))
            plt.plot(xs, ys, label=ie)

        plt.xlim(min_estimators, max_estimators)
        plt.xlabel("n_estimators (Trees)")
        plt.ylabel("OOB error rate")
        plt.legend(loc="upper right")
        # Save before show(): once an interactive window is closed the
        # current figure is gone and savefig would write an empty image.
        plt.savefig(self.vpath + "/rf-OOB-parameter-tuning.png")
        plt.savefig("./rf-OOB-parameter-tuning.png")
        plt.show()

        print('optimum features:', optFeatures, 'optimum estimators:',
              optEstimators, 'best OOB:', optValue)

        return (optEstimators, optFeatures)
# Naive Bayes is a prediction model based on applying Bayes' theorem with the
# "naive" assumption of conditional independence between every pair of
# features given the value of the class variable.
#
# Build a Naive Bayes model to predict whether a movie review is positive or
# negative. Test the model accuracy on the test data.
from sklearn.naive_bayes import MultinomialNB as MNB
bobbyBayes = MNB().fit(X, y)
b_predict = bobbyBayes.predict(test_tran)
# for review, classification in zip(docs_test, b_predict):
#     print(f"Prediction: {dataset.target_names[classification]}Review:\n{review}\n")
print(accuracy_score(y_test, b_predict))

# Task 4
# ---
# A random forest is an ensemble (collective) model that fits a number of
# decision tree classifiers on various sub-samples of the dataset and uses
# averaging to improve the predictive accuracy and control over-fitting.
#
# Build a random forest model to predict whether a movie review is positive or
# negative. Test the model accuracy on the test data. Try different values
# (20, 100, 500) for the hyper-parameter 'n_estimators', i.e., the number of
# decision trees in the ensemble, and print out the model accuracy for each
# parameter value. (The loop below additionally tries 1000 trees.)
from sklearn.ensemble import RandomForestClassifier
for num in (20, 100, 500, 1000):
    rfc = RandomForestClassifier(n_estimators=num)
    rfc = rfc.fit(X, y)
    rf_pred = rfc.predict(test_tran)
    print(accuracy_score(y_test, rf_pred))

# ### From the above tasks, you can observe that different models and different
# choices of hyper-parameter values can lead to quite different prediction
# performance. Which model (and hyper-parameter) among the above gives the
# best prediction? Which is the worst?

# It would appear that the normal SVC using the linear kernel with any C>10
# yields the most accurate model for the feature extraction method we used
# (TfidfVectorizer). Interestingly, Naive Bayes got very close without any
# testing or tuning and was near instantaneous to compute on a dataset this
# small. I tried RandomForestClassifier with n_estimators=100000 and there was
# no change at all from 1000 trees.
).rolling(5).std() wipro_data = wipro_data.dropna() arr = [] val = [] for value in wipro_data['Close Price'].iteritems(): arr.append(value[1]) for i in range(0, 483): if arr[i + 1] > arr[i]: val.append(1) else: val.append(-1) wipro_data['Action'] = pd.DataFrame(val) wipro_data = wipro_data.dropna() train_X = wipro_data[[ '%chg op_cl', '%chg lw_hg', '%chg 5dymean', '%chg 5dystd' ]] train_Y = wipro_data[['Action']] RF = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0) RF.fit(train_X, train_Y) RF.predict(train_X) print("Random Forests") round(RF.score(train_X, train_Y), 4) wipro_data['Net Cummulative Returns'] = ( ((wipro_data['Open Price'] - wipro_data['Close Price']) / (wipro_data['Open Price'])) * 100).cumsum() plt.figure(figsize=(20, 10)) plt.plot(wipro_data['Net Cummulative Returns'])
# Logistic regression with balanced class weights on the *_b split.
clf_b = LogisticRegression(class_weight='balanced')
clf_b.fit(X_train_b, y_train_b)
y_pred_b = clf_b.predict_proba(X_test_b)  # class probabilities, used for the AUC
# Predict the labels once and reuse them for every metric instead of calling
# predict() again for each one.
y_label_b = clf_b.predict(X_test_b)
print('Accuracy on test set: {0:.4f}'.format(accuracy_score(y_test_b, y_label_b)))
print('Percision score on test set: {0:.4f}'.format(precision_score(y_test_b, y_label_b)))
print('Recall score on test set: {0:.4f}'.format(recall_score(y_test_b, y_label_b)))
print('F1 score on test set: {0:.4f}'.format(f1_score(y_test_b, y_label_b)))
print("AUC score: ", roc_auc_score(y_test_b, y_pred_b[:,1]))

#Initialising Random Forest model
# NOTE(review): n_jobs=100 requests 100 parallel workers -- confirm this is
# intended for the target machine.
rf_clf=RandomForestClassifier(n_estimators=100,n_jobs=100,random_state=0, min_samples_leaf=100)
#Fitting on data
rf_clf.fit(X_train_im, y_train_im)
#Scoring the model on train data
score_rf=rf_clf.score(X_train_im, y_train_im)
print("Training score: %.2f " % score_rf)
#Scoring the model on test_data
score_rf=rf_clf.score(X_test_im, y_test_im)
print("Testing score: %.2f " % score_rf)

# Test-set predictions, computed once and shared by all the metrics below.
y_pred_rf = rf_clf.predict(X_test_im)
print('Accuracy on test set: {0:.4f}'.format(accuracy_score(y_test_im, y_pred_rf)))
print('Percision score on test set: {0:.4f}'.format(precision_score(y_test_im, y_pred_rf)))
print('Recall score on test set: {0:.4f}'.format(recall_score(y_test_im, y_pred_rf)))
print('F1 score on test set: {0:.4f}'.format(f1_score(y_test_im, y_pred_rf)))
# -*- coding: utf-8 -*- """ Created on Tue Mar 17 13:32:13 2020 @author: Damara """ from sklearn.ensemble import RandomForestClassifier from sklearn.datasets import make_classification X, y = make_classification(n_samples=1000, n_features=4, n_informative=2, n_redundant=0, random_state=0, shuffle=False) clf = RandomForestClassifier(max_depth=2, random_state=0) clf.fit(X, y) print(clf.feature_importances_) print(clf.predict([[0, 0, 0, 0]]))
def training():
    """Train the disease random forests and persist the reduced model.

    Reads ``datasets/Training.csv`` (last column is the target), trains a
    forest on all the features and a second one on a fixed subset of 50
    features, printing the hold-out accuracy of each.  Writes an all-zero
    one-row template with the reduced columns to
    ``datasets/dummyRowDisease.csv`` and pickles the reduced model to
    ``datasets/pickle_model_disease.pkl``.
    """
    from sklearn import metrics
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import train_test_split

    data = pd.read_csv("datasets/Training.csv")
    X, y = data.iloc[:,:-1], data.iloc[:,-1]

    # Split dataset into training set and test set: 70% training and 30% test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

    # Model on every feature.  The forest must be fitted before predicting;
    # predicting on an unfitted estimator raises NotFittedError.
    clf = RandomForestClassifier(n_estimators=100)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    # Model Accuracy, how often is the classifier correct?
    print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

    # Fixed subset of 50 features.
    # NOTE(review): presumably the top 50 of an earlier feature_importances_
    # ranking -- confirm before regenerating the list.
    X_reduced, y = data[[
        'receiving_blood_transfusion', 'red_sore_around_nose',
        'abnormal_menstruation', 'continuous_sneezing', 'breathlessness',
        'blackheads', 'shivering', 'dizziness', 'back_pain', 'unsteadiness',
        'yellow_crust_ooze', 'muscle_weakness', 'loss_of_balance', 'chills',
        'ulcers_on_tongue', 'stomach_bleeding', 'lack_of_concentration',
        'coma', 'neck_pain', 'weakness_of_one_body_side', 'diarrhoea',
        'receiving_unsterile_injections', 'headache', 'family_history',
        'fast_heart_rate', 'pain_behind_the_eyes', 'sweating',
        'mucoid_sputum', 'spotting_urination', 'sunken_eyes',
        'dischromic_patches', 'nausea', 'dehydration', 'loss_of_appetite',
        'abdominal_pain', 'stomach_pain', 'yellowish_skin',
        'altered_sensorium', 'chest_pain', 'muscle_wasting', 'vomiting',
        'mild_fever', 'high_fever', 'red_spots_over_body', 'dark_urine',
        'itching', 'yellowing_of_eyes', 'fatigue', 'joint_pain',
        'muscle_pain']], data.iloc[:,-1]

    # Split dataset into training set and test set: 70% training and 30% test
    X_train, X_test, y_train, y_test = train_test_split(X_reduced, y, test_size=0.3)

    # Model on the reduced features.  Only clf2 may predict on this split:
    # the first model was trained on the full feature set.
    clf2 = RandomForestClassifier(n_estimators=100)
    clf2.fit(X_train, y_train)
    y_pred = clf2.predict(X_test)

    # Model Accuracy, how often is the classifier correct?
    print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

    # One all-zero row with the reduced column names, saved as a template.
    dummyRow = pd.DataFrame(np.zeros(len(X_reduced.columns)).reshape(1, len(X_reduced.columns)),
                            columns=X_reduced.columns)
    dummyRow.to_csv('datasets/dummyRowDisease.csv', index=False)

    pkl_filename = "datasets/pickle_model_disease.pkl"
    with open(pkl_filename, 'wb') as model_file:
        pickle.dump(clf2, model_file)
# Build the undersampled training set: first the rows indexed by
# ess_sequences, then the rows indexed by x.
x_train_undersampled = []
y_train_undersampled = []
for index_group in (ess_sequences, x):
    for i in index_group:
        x_train_undersampled.append(X_train[i])
        y_train_undersampled.append(y_train[i])

print(len(x_train_undersampled))
print(len(y_train_undersampled))

modxtr = np.array(x_train_undersampled)
modytr = np.array(y_train_undersampled)

# Train a 100-tree forest (the "svm" names below are historical) and predict
# labels and class probabilities for the evaluation set X1.
clf = RFC(n_estimators=100)
svm_best_clf = clf.fit(modxtr, modytr)
test_predictions_svm = svm_best_clf.predict(X1)
test_predictions_svm_proba = svm_best_clf.predict_proba(X1)

# Evaluation against the true labels y1.
accuracy = accuracy_score(y1, test_predictions_svm)
true_n, false_p, false_n, true_p = confusion_matrix(
    y1, test_predictions_svm).ravel()
print(true_n, false_p, false_n, true_p)

prec = precision_score(y1, test_predictions_svm)
f1 = f1_score(y1, test_predictions_svm)

sensitivity, specificity = compute_measures(true_p, false_p, false_n, true_n)
print(sensitivity, specificity)
average = (sensitivity + specificity) / 2

# ROC curve and its area, from the probability of the second class.
fpr, tpr, thresholds = roc_curve(y1, test_predictions_svm_proba[:, 1])
roc_auc1 = auc(fpr, tpr)
# In[294]:

# Plot the KNN score for K = 1..14 and annotate every point with (K, score).
neighbor_counts = list(range(1, 15))
plt.plot(neighbor_counts, knn_scores, color='green')
for i in neighbor_counts:
    plt.text(i, knn_scores[i - 1], (i, knn_scores[i - 1]))
plt.xticks(neighbor_counts)
plt.xlabel('Number of Neighbors (K)')
plt.ylabel('Scores')

# In[299]:

# 10-fold cross-validation of KNN with K=12.
knn_classifier = KNeighborsClassifier(n_neighbors=12)
score = cross_val_score(knn_classifier, x, y, cv=10)

# In[300]:

score.mean()

# In[239]:

from sklearn.ensemble import RandomForestClassifier

# Default random forest fitted on the whole data set.
clf = RandomForestClassifier()
model = clf.fit(X, y)

# In[284]:

# 10-fold cross-validation of a 50-tree random forest.
randomforest_classifier = RandomForestClassifier(n_estimators=50)
score = cross_val_score(randomforest_classifier, X, y, cv=10)
score.mean()
# Test-set performance of the previously fitted `tree` model.
tree_predicted = tree.predict(x_test)
tree_acc = accuracy_score(y_test, tree_predicted)
print('The test accuracy is: {}.'.format(tree_acc))
print(confusion_matrix(y_test, tree_predicted))


#################
# RANDOM FOREST
#################
print('\nRANDOM FOREST')

from sklearn.ensemble import RandomForestClassifier

# Train a 100-tree forest on the training split.
rf = RandomForestClassifier(n_estimators=100)
rf.fit(x_train, y_train)

# Test-set performance: accuracy followed by the confusion matrix.
rf_predicted = rf.predict(x_test)
rf_acc = accuracy_score(y_test, rf_predicted)
print('The test accuracy is: {}.'.format(rf_acc))
print(confusion_matrix(y_test, rf_predicted))


#################
# Building KNN
#################
print('\nK NEAREST NEIGHBORS')

from sklearn.neighbors import KNeighborsClassifier

# Classifier with library-default settings; it is only constructed here.
knn = KNeighborsClassifier()
# NOTE(review): whitespace-collapsed excerpt, kept verbatim below.
# - The leading statements select the entries of results['rank_test_score'] equal to `i` and, for
#   each one, print its rank, mean/std test score and parameters. `results` and `i` are not defined
#   in the visible text -- presumably this is the body of the `report` helper that is called later
#   on this line; confirm the enclosing header and indentation before reflowing.
# - After report(grid_search.cv_results_), a RandomForestClassifier (50 trees, max_features=10,
#   min_samples_leaf=3, min_samples_split=2, entropy criterion, random_state=0) is fitted on
#   X_train / y_train and used to predict X_test.
# - It then prints the accuracy and metrics.classification_report (the inline "confusion matrix"
#   label in front of that print actually precedes a classification report), and draws the
#   transposed confusion matrix as a seaborn heatmap with x = 'true label', y = 'predicted label'.
candidates = np.flatnonzero(results['rank_test_score'] == i) for candidate in candidates: print("Model with rank: {0}".format(i)) print("Mean validation score: {0:.3f} (std: {1:.3f})".format( results['mean_test_score'][candidate], results['std_test_score'][candidate])) print("Parameters: {0}".format(results['params'][candidate])) print("") report(grid_search.cv_results_) # look at best classifier clf = RandomForestClassifier(n_estimators=50, max_depth=None, max_features= 10, min_samples_leaf= 3, min_samples_split= 2,bootstrap = True, criterion= 'entropy', random_state=0) clf.fit(X_train, y_train) y_pred=clf.predict(X_test) # Model Accuracy, how often is the classifier correct? print("Accuracy:",metrics.accuracy_score(y_test, y_pred)) # confusion matrix print(metrics.classification_report(y_test, y_pred)) # confusion matrix heatmap mat = confusion_matrix(y_test, y_pred) sns.heatmap(mat.T, square=True, annot=True, fmt='d', cbar=False) plt.xlabel('true label') plt.ylabel('predicted label'); # feature importance horizontal bar plot
}) tree.export_graphviz(model, out_file='decision_tree.dot') print(f'Training Time: {t} seconds.') print(f'Accuracy on test set: {accuracy}') print(f'R squared on test set: {r2_accuracy}') print(f'Confusion Matrix: {matrix[0]}, {matrix[1]}') print('-' * 30) # ---------- TRAIN RANDOM FOREST---------- print('Random Forest') model = RandomForestClassifier(n_jobs=-1, n_estimators=500, max_features=0.2, min_samples_leaf=1) t = time() model.fit(x_train, y_train) t = round(time() - t, 3) accuracy = model.score(x_test, y_test) r2_accuracy = r2_score(model.predict(x_test), y_test) matrix = confusion_matrix(y_test, model.predict(x_test)) filename = 'random_forest.sav' pickle.dump(model, open(filename, 'wb')) data['models'].append({ 'name': 'random_forest', 'train_time': t, 'accuracy': accuracy, 'r2_accuracy': r2_accuracy, 'confusion_matrix': matrix.tolist(), }) print(f'Training Time: {t} seconds.') print(f'Accuracy on test set: {accuracy}')
y_test, y_pred_decision_tree) print("Confusion Matrix (Decision Tree):\n", cm_decision_tree) # Printing the Accuracy, Precision and Recall print("Accuracy of Decision Tree:", decision_tree_accuracy) print("Precision of Decision Tree:", decision_tree_precision) print("Recall of Decision Tree:", decision_tree_recall) print("") ########################RANDOM FOREST CLASSIFIER############################### # Fitting Random Forest Classifier to the Training set from sklearn.ensemble import RandomForestClassifier random_forest_classifier = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0) random_forest_classifier.fit(x_train, y_train) # Predicting the Validation set results print("Validation Set Results:") y_pred_random_forest = random_forest_classifier.predict(x_validation) # Making the Confusion Matrix cm_random_forest, random_forest_accuracy, random_forest_precision, random_forest_recall = confusion_matrix( y_validation, y_pred_random_forest) print("Confusion Matrix (Random Forest Classifier):\n", cm_random_forest) # Printing the Accuracy, Precision and Recall print("Accuracy of Random Forest Classifier:", random_forest_accuracy) print("Precision of Random Forest Classifier:", random_forest_precision) print("Recall of Random Forest Classifier:", random_forest_recall) print("")
criterion='gini', max_depth=10, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=4, min_samples_split=5, min_weight_fraction_leaf=0.0, n_estimators=120, n_jobs=None, oob_score=False, random_state=0, verbose=0, warm_start=False) randomforest.fit(x_train, y_train) y_pred = randomforest.predict(x_test) random_accy = round(accuracy_score(y_pred, y_test), 3) print(random_accy) # In[39]: #ランダムフォレスト_グリッド # n_estimators = [50,75,100,120] # random_state = [0,15] # min_samples_split = [5,6,10,15,20,25] # max_depth = [5,10,15,20,25,30] # min_samples_leaf=[2,3,4,5,6] # parameters = {'n_estimators':n_estimators, # 'random_state':random_state, # 'min_samples_split':min_samples_split, # 'max_depth':max_depth,
X.info()
# Fill missing ages with the column mean. The result is assigned back to the
# column: calling fillna(inplace=True) on the X['age'] selection is chained
# assignment, which pandas warns about and which stops updating X once
# copy-on-write is enabled.
X['age'] = X['age'].fillna(X['age'].mean())

# %%
# Hold out 25% of the rows for testing (fixed seed).
X_tr, X_tst, y_tr, y_tst = train_test_split(X, y, test_size=0.25, random_state=33)

# Turn each row into a dict and vectorise it. The vectoriser is fitted on the
# training rows only and then applied unchanged to the test rows.
# orient must be spelled 'records'; the abbreviation 'record' is rejected by
# pandas >= 2.0.
vec = DictVectorizer(sparse=False)
X_tr = vec.fit_transform(X_tr.to_dict(orient='records'))
print(vec.feature_names_)
X_tst = vec.transform(X_tst.to_dict(orient='records'))

# %%
# Three tree-based classifiers, all with library-default settings.
dtc = DecisionTreeClassifier()
rfc = RandomForestClassifier()
gbc = GradientBoostingClassifier()

dtc.fit(X_tr, y_tr)
rfc.fit(X_tr, y_tr)
gbc.fit(X_tr, y_tr)

dtc_prd = dtc.predict(X_tst)
rfc_prd = rfc.predict(X_tst)
gbc_prd = gbc.predict(X_tst)

# Per-model test accuracy followed by a per-class report.
print("Accuracy of dtc is: ", dtc.score(X_tst, y_tst))
print(classification_report(y_tst, dtc_prd, target_names=['died', 'survived']))

print("Accuracy of rfc is: ", rfc.score(X_tst, y_tst))
print(classification_report(y_tst, rfc_prd, target_names=['died', 'survived']))

print("Accuracy of gbc is: ", gbc.score(X_tst, y_tst))
print(classification_report(y_tst, gbc_prd, target_names=['died', 'survived']))