def fit(self, X, Y, W):
    """Fit an imputer and a random forest, then wrap both in a classifier.

    A new ``Imputer`` is fitted on ``X`` and stored on ``self.imputer``;
    ``replace_nan`` is applied to ``X`` with that imputer before the forest,
    built from ``self.params``, is trained on the flattened ``Y``.

    ``W`` is not used by this method.

    Returns the ``RandomForestClassifier`` built from the fitted forest and
    the fitted imputer.
    """
    self.imputer = Imputer()
    self.imputer.fit(X)
    features = replace_nan(X, self.imputer)

    forest = RandomForest(**self.params)
    forest.fit(features, Y.ravel())
    return RandomForestClassifier(forest, self.imputer)
def calculateFeatureAuto(trainX, trainY, testX, testY):
    """Score forests of 1 to 150 trees built with ``max_features="auto"``.

    For every tree count a depth-7 ``RandomForest`` is fitted on the training
    split, used to predict the test split, and scored with ``calculateRSS``.

    Returns a NumPy array with one score per tree count, in increasing order
    of tree count.
    """
    scores = np.array([])
    for n_trees in range(1, 151):
        forest = RandomForest(n_estimators=n_trees, max_depth=7,
                              max_features="auto")
        forest.fit(trainX, trainY)
        predicted = forest.predict(testX)
        scores = np.append(scores, calculateRSS(testY, predicted))
    return scores
def calculateFeatureSqrt(trainX, trainY, testX, testY):
    """Score forests of 1 to 150 trees built with ``max_features="sqrt"``.

    For every tree count a depth-7 ``RandomForest`` is fitted on the training
    split, used to predict the test split, and scored with ``calculateRSS``.

    Returns a NumPy array with one score per tree count, in increasing order
    of tree count.
    """
    scores = np.array([])
    for n_trees in range(1, 151):
        forest = RandomForest(n_estimators=n_trees, max_depth=7,
                              max_features="sqrt")
        forest.fit(trainX, trainY)
        predicted = forest.predict(testX)
        scores = np.append(scores, calculateRSS(testY, predicted))
    return scores
def calculateFeature4(trainX, trainY, testX, testY):
    """Score forests of 1 to 150 trees built with ``max_features=4``.

    For every tree count a depth-7 ``RandomForest`` is fitted on the training
    split, used to predict the test split, and scored with ``calculateRSS``.

    Returns a NumPy array with one score per tree count, in increasing order
    of tree count.
    """
    scores = np.array([])
    for n_trees in range(1, 151):
        forest = RandomForest(n_estimators=n_trees, max_depth=7,
                              max_features=4)
        forest.fit(trainX, trainY)
        predicted = forest.predict(testX)
        scores = np.append(scores, calculateRSS(testY, predicted))
    return scores
def __init__(self, n=10, **kwargs):
    """Set up the "Random Forest" classifier with ``n`` trees.

    ``n`` and every extra keyword argument are handed to the parent
    initializer and to ``RandomForest`` (``n`` as ``n_estimators``).
    Presumably ``RandomForest`` is scikit-learn's implementation; see
    http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

    Relevant arguments:
        n (int): number of trees in the random forest (default 10)
        n_jobs (int): number of cores used in parallel (1 or more)
    """
    super(RandomForestClassifier, self).__init__("Random Forest", n=n, **kwargs)
    self._forest = RandomForest(n_estimators=n, **kwargs)
def getModel(self, _params):
    """Build a ``RandomForest`` from the hyper-parameter mapping ``_params``.

    Keys read from ``_params``: ``n_estimators`` (cast to ``int``),
    ``max_depth``, ``min_weight_fraction_leaf``, ``max_features`` and
    ``oob_score``. ``n_jobs`` comes from ``definitions.getNumberOfCore()``.
    """
    # Deliberately not forwarded (left at the estimator defaults): criterion,
    # min_samples_split, min_samples_leaf, max_leaf_nodes,
    # min_impurity_decrease, bootstrap, ccp_alpha.
    forest_kwargs = {
        'n_estimators': int(_params['n_estimators']),
        'max_depth': _params['max_depth'],
        'min_weight_fraction_leaf': _params['min_weight_fraction_leaf'],
        'max_features': _params['max_features'],
        'oob_score': _params['oob_score'],
        'n_jobs': definitions.getNumberOfCore(),
    }
    return RandomForest(**forest_kwargs)
def main(args):
    """Train and evaluate a classifier on a pickled feature dataset.

    The dataset is read from ``args.full_ds`` when ``args.all_feats`` is set,
    otherwise from ``args.pruned_ds``. The second element of every record is
    stacked into a matrix whose first column is the label and whose remaining
    columns are the features.

    With ``args.num_folds > 0`` the data goes through ``kfold_validation`` and
    the scores returned by ``kfold_scores`` are printed. Otherwise an 85/15
    train/test split (seeded with ``args.seed``) is made, the model selected
    by ``args.algorithm`` ('NB', 'RF' or 'XGB') is trained, and the result is
    passed to ``test_model``.

    Returns:
        0 on completion.

    Raises:
        ValueError: if ``args.algorithm`` is not a supported name in the
            single-split branch.
    """
    start = time.time()

    # NOTE: pickle can execute arbitrary code while loading; only point these
    # paths at trusted files.
    dataset_path = args.full_ds if args.all_feats else args.pruned_ds
    with open(dataset_path, 'rb') as infile:
        data = pickle.load(infile)

    data = np.array([feats[1] for feats in data])
    X = data[:, 1:]
    y = data[:, 0]

    if args.num_folds > 0:
        print(f'Performing {args.num_folds}-fold validation')
        f_scores = kfold_validation(X, y, algorithm=args.algorithm,
                                    num_folds=args.num_folds)
        accs = kfold_scores(f_scores)
        print(f_scores)
        print(f'Average accuracy of {args.num_folds}-folds: {100*accs[0]:.2f}%')
        print(f'Best accuracy of {args.num_folds}-folds: {100*accs[1]:.2f}%')
    else:
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.15, random_state=args.seed)
        print(f'Train data: {X_train.shape}, train labels: {y_train.shape}')
        # Report the test labels' own shape (previously printed y_train.shape).
        print(f'Test data: {X_test.shape}, test labels: {y_test.shape}')

        if args.algorithm == 'NB':
            model = BernoulliNB()
            model.fit(X_train, y_train)
        elif args.algorithm == 'RF':
            model = RandomForest(n_estimators=100, max_depth=10,
                                 n_jobs=os.cpu_count(), verbose=2)
            model.fit(X_train, y_train)
        elif args.algorithm == 'XGB':
            model = XGB(verbosity=1, n_estimators=1000, max_depth=8,
                        reg_lambda=1e-2, reg_alpha=4)
            model.fit(X_train, y_train, eval_set=[(X_test, y_test)],
                      eval_metric='logloss', verbose=True,
                      early_stopping_rounds=20)
        else:
            # Fail with a clear message instead of an unbound `model` below.
            raise ValueError(f'Unsupported algorithm: {args.algorithm!r}')

        # test model
        test_model(model, X_test, y_test)

    print(f'Script completed in {time.time()-start:.2f} secs')
    return 0
def get_classifier(cls, param):
    """Return an unfitted classifier chosen by the short name ``cls``.

    ``param`` is the single tunable value for the chosen model: ``C`` for
    "LR", ``n_neighbors`` for "KNN", ``max_depth`` for "RForest" and for the
    decision tree inside "BagTree", ``eta0`` for "Perceptron", ``alpha`` for
    "MLP". Any other name yields ``None``.
    """
    if cls == "LR":
        return LR(C=param, random_state=123)
    if cls == "KNN":
        return KNN(n_neighbors=param)
    if cls == "RForest":
        return RandomForest(n_estimators=75, max_depth=param, random_state=123)
    if cls == "BagTree":
        base_tree = DecisionTree(max_depth=param, random_state=123)
        return Bagging(base_estimator=base_tree, random_state=123)
    if cls == "Perceptron":
        return Perceptron(eta0=param, random_state=123)
    if cls == "MLP":
        # too slow
        return MLP(hidden_layer_sizes=(20, ), alpha=param, max_iter=40,
                   solver='lbfgs')
    return None
def kfold_validation(features, labels, algorithm='XGB', num_folds=2):
    """Run k-fold validation and collect per-fold train/validation scores.

    Args:
        features: indexable feature matrix; rows are selected with the index
            arrays produced by ``KFold.split``.
        labels: labels indexable with the same index arrays.
        algorithm: 'NB' (BernoulliNB), 'RF' (RandomForest) or 'XGB' (XGB).
        num_folds: number of folds passed to ``KFold``.

    Returns:
        dict with keys 'train' and 'val', each a list holding one
        ``model.score`` value per fold.

    Raises:
        ValueError: if ``algorithm`` is not one of the supported names.

    Side effects:
        Prints progress and pickles the score dict to
        'fold_accs_random_forest.npy' (pickle format despite the extension).
    """
    # Use the `algorithm` argument; the previous code read an out-of-scope
    # `args.algorithm` and silently ignored this parameter.
    if algorithm not in ('NB', 'RF', 'XGB'):
        raise ValueError(f'Unsupported algorithm: {algorithm!r}')

    kf = KFold(n_splits=num_folds)
    fold_scores = {'train': [], 'val': []}

    for fold_num, (train_idx, val_idx) in enumerate(kf.split(features), start=1):
        print(f'Training on fold {fold_num}')
        X_train, y_train = features[train_idx], labels[train_idx]
        X_val, y_val = features[val_idx], labels[val_idx]

        if algorithm == 'NB':
            model = BernoulliNB()
            model.fit(X_train, y_train)
        elif algorithm == 'RF':
            model = RandomForest(n_estimators=100, max_depth=10,
                                 n_jobs=os.cpu_count(), verbose=2)
            model.fit(X_train, y_train)
        else:  # 'XGB'
            model = XGB(verbosity=1, n_estimators=1000, max_depth=3,
                        reg_lambda=1, reg_alpha=1e-4)
            model.fit(X_train, y_train, eval_set=[(X_val, y_val)],
                      eval_metric='logloss', verbose=True,
                      early_stopping_rounds=20)

        train_score = model.score(X_train, y_train)
        fold_scores['train'].append(train_score)
        val_score = model.score(X_val, y_val)
        fold_scores['val'].append(val_score)
        print(f'Fold {fold_num}: training score = {train_score}, validation score = {val_score}')

    with open('fold_accs_random_forest.npy', 'wb') as outfile:
        pickle.dump(fold_scores, outfile)
    return fold_scores
from sklearn.ensemble import RandomForestClassifier as RandomForest
from sklearn import cross_validation
from utils import *

# Load the training features and labels
X, y = get_train_data("../data/train.csv")

# Candidate settings; each entry holds a single n_estimators value
parameter_space = [[320], [340], [360], [380], [400]]

# Mean 4-fold cross-validated score for every candidate
parameter_scores = []
for candidate in parameter_space:
    n_trees = candidate[0]
    clf = RandomForest(n_estimators=n_trees, n_jobs=2)  # criterion='entropy'
    scores = cross_validation.cross_val_score(
        clf, X, y, cv=4, scoring='log_loss', verbose=3)
    # The scores are negated before averaging
    parameter_scores.append(np.mean(-1 * scores))

# Report the averaged scores
print("Logloss: " + str(parameter_scores))
def reproducing_sakar():
    """Leave-one-subject-out evaluation of seven classifiers on parkinsons.csv.

    For each subject id in ``range(252)`` the rows of that subject form the
    test set and all other rows the training set. Features are standardized,
    projected with ``LDA`` fitted on the training rows, and every classifier
    is fitted and scored (accuracy, F1, MCC, each rounded to 2 decimals).
    The predictions of all classifiers are pooled and ``mode`` picks the
    voted label for the subject.

    The per-subject tables are written to results/scores.csv, results/f1s.csv,
    results/mccs.csv and results/voting.csv, and printed.
    """
    classifiers = {
        "Naive Bayes": NaiveBayes(),
        "Logistic Regression": LogisticRegression(),
        "k-NN": KNN(p=1, n_neighbors=1),
        "Multilayer Perceptron": MLP(),
        "Random Forest": RandomForest(n_estimators=100),
        "SVM (Linear)": SVM(kernel="linear", gamma="auto"),
        "SVM (RBF)": SVM(kernel="rbf", gamma="auto")
    }

    def _empty_table():
        # One column for the subject id followed by one column per classifier,
        # in the same order as `classifiers`.
        table = {"subject": []}
        for clf_name in classifiers:
            table[clf_name] = []
        return table

    scores = _empty_table()
    f1s = _empty_table()
    mccs = _empty_table()
    voting = {"subject": [], "voted": [], "true": []}

    df = pd.read_csv("parkinsons.csv")
    df = df.drop(["gender"], axis=1)

    for i in range(252):
        print("SUBJECT {}".format(i))
        for table in (scores, f1s, mccs):
            table["subject"].append(i)

        # Hold out every row of subject i; train on all the others.
        train_set = df.loc[df["id"] != i].drop(["id"], axis=1)
        test_set = df.loc[df["id"] == i].drop(["id"], axis=1)
        X_train = train_set.drop(["class"], axis=1)
        y_train = train_set["class"]
        X_test = test_set.drop(["class"], axis=1)
        y_test = test_set["class"]

        scaler = StandardScaler().fit(X_train)
        X_train = scaler.transform(X_train)
        X_test = scaler.transform(X_test)

        # pca = PCA(n_components=50)
        # The projection is LDA, despite the historical `pca` naming.
        pca = LDA()
        pca.fit(X_train, y_train)
        X_train_pca = pca.transform(X_train)
        X_test_pca = pca.transform(X_test)
        print(X_train_pca.shape)

        predictions = []
        for name, classifier in classifiers.items():
            classifier.fit(X_train_pca, y_train)
            pred = classifier.predict(X_test_pca)

            score = round(accuracy_score(y_test, pred), 2)
            f1 = round(f1_score(y_test, pred), 2)
            mcc = round(matthews_corrcoef(y_test, pred), 2)

            scores[name].append(score)
            f1s[name].append(f1)
            mccs[name].append(mcc)
            predictions.extend(list(pred))
            # NOTE(review): the separator inside this format string is
            # reproduced as it appears in the source; confirm the exact
            # whitespace against the original file.
            print("{:<25}{} \n{} {}".format(name, score, f1, mcc))

        voted_label = mode(predictions)
        true_label = list(y_test)[0]
        voting["subject"].append(i)
        voting["voted"].append(voted_label)
        voting["true"].append(true_label)
        print("Voted/True: {}/{}".format(voted_label, true_label))
        print()

    scores = pd.DataFrame(scores)
    scores.to_csv("results/scores.csv", index=None)
    f1s = pd.DataFrame(f1s)
    f1s.to_csv("results/f1s.csv", index=None)
    mccs = pd.DataFrame(mccs)
    mccs.to_csv("results/mccs.csv", index=None)
    voting = pd.DataFrame(voting)
    voting.to_csv("results/voting.csv", index=None)

    for frame in (scores, f1s, mccs, voting):
        print(frame)
X, y = get_train_data('../features_all.csv', '../trainLabels.csv')

# Hyper-parameter candidates; each entry holds a single n_estimators value
params_space = [[200]]

# Grid search: mean quadratic-weighted kappa over 8 stratified folds
grid_errors = []
for params in params_space:
    skf = StratifiedKFold(y, n_folds=8)
    errors = []
    for train, test in skf:
        clf = RandomForest(n_estimators=params[0], n_jobs=2)
        clf.fit(X[train], y[train])
        predictions = clf.predict(X[test])

        kappa_score = kappa(y[test], predictions, weights='quadratic')
        print("Kappa: %f" % kappa_score)
        print("Confusion matrix:")
        print(confusion_matrix(y[test], predictions))
        print("Classification report:")
        print(classification_report(y[test], predictions))

        errors.append(kappa_score)
    grid_errors.append(np.mean(errors))

# Show results
the predictor is picked based on if they can contribute more to the model's accuracy - basically picking the predictor that reduces error in prediction. Now, a Random Forest is a bunch of different decision trees, and we take the weighted average of all the decision trees to make our prediction. However, randomness is introduced at each split of the tree, such that only a few predictors out of all the predictors are even considered to be asked a question. Below we fit the model and use it to predict our response in train and test set.
'''

# Create a random forest of 1000 trees (n_estimators=1000) with a fixed
# random_state; n_jobs=-1 is passed for parallelism.
# NOTE(review): criterion='mse' was renamed 'squared_error' in newer
# scikit-learn releases - confirm against the installed version.
forest = RandomForest(n_estimators=1000, criterion='mse', random_state=1, n_jobs=-1)

# Fit training data to model
forest.fit(x_train2, y_train)

# Predict the response on both the training and the test features
rfy_train_pred = forest.predict(x_train2)
rfy_test_pred = forest.predict(x_test2)

# Look at the performance measures in terms of RMSE and R-Squared
# NOTE(review): **(1/2) is a square root only with true division (Python 3);
# under Python 2 integer division 1/2 evaluates to 0 - confirm the interpreter.
print('RF RMSE train: %.3f, test: %.3f' % (
    MSE(y_train, rfy_train_pred)**(1/2),
    MSE(y_test, rfy_test_pred)**(1/2)))
print('RF R^2 train: %.3f, test: %.3f' % (
    R2(y_train, rfy_train_pred),
    R2(y_test, rfy_test_pred)))
# Hold out `vali_size` of the samples for testing, seeded for reproducibility
trainData, testData, trainLabels, testLabels = train_test_split(
    np.array(features),
    np.array(labels),
    test_size=vali_size,
    random_state=seed,
)

print("[INFO] splitted train and test data...")
print("[INFO] train data : {}".format(trainData.shape))
print("[INFO] test data : {}".format(testData.shape))
print("[INFO] train labels: {}".format(trainLabels.shape))
print("[INFO] test labels : {}".format(testLabels.shape))

# Three candidate models are created, but only logistic regression is
# trained and scored; the other evaluations are currently disabled.
print("[INFO] creating model...")
model_LR = LogisticRegression(random_state=seed)
model_RF = RandomForest(n_estimators=100, random_state=seed)
model_SVC = SVC(probability=True, random_state=seed)

model_LR.fit(trainData, trainLabels)
#model_RF.fit(trainData, trainLabels)
#model_SVC.fit(trainData, trainLabels)

testLabels_pre = model_LR.predict(testData)
score_LR = accuracy_score(testLabels, testLabels_pre)

#testLabels_pre = model_RF.predict(testData)
#score_RF = accuracy_score(testLabels, testLabels_pre)

#testLabels_pre = model_SVC.predict(testData)
#score_SVC = accuracy_score(testLabels, testLabels_pre)

#testLabels_pre = model_vote.predict(testData)
#score_vote = accuracy_score(testLabels, testLabels_pre)
#print('Vote accuracy:',score_vote)
open('svm_rbf_grid_results.p', 'wb'))

### Single Decision Tree
# One-vs-rest wrapper around a gini decision tree; grid-search the tree depth
# over 1..9 using hamming_scorer.
DecisionTree_Model = OneVsRestClassifier(
    tree.DecisionTreeClassifier(criterion='gini'))
grid = GridSearchCV(DecisionTree_Model,
                    param_grid={'estimator__max_depth': range(1, 10)},
                    scoring=hamming_scorer)
grid.fit(X_data_std, y_data)
# Bare expressions: values are only displayed in an interactive session.
grid.cv_results_['mean_test_score']
grid.best_score_

### Random Forest: Classical Random Forest
# Tune: max_depth, min_samples_leaf
# NOTE(review): only max_depth is searched here, and no `scoring` argument is
# passed (unlike the decision-tree search above) - confirm this is intended.
RandomForest_Model = OneVsRestClassifier(RandomForest())
rf_grid = GridSearchCV(RandomForest_Model,
                       param_grid={'estimator__max_depth': [10, 20, 30]})
rf_grid.fit(X_data_std, y_data)
rf_grid.cv_results_['mean_test_score']
rf_grid.best_score_

### Random Forest, Ada boosted: 100 trees
# NOTE: Need to tune learning rate
# AdaBoostClassifier is built with n_estimators=100 and no explicit base
# estimator; it is fitted on the full data and then cross-validated with
# hamming_scorer.
Ada_Model = OneVsRestClassifier(AdaBoostClassifier(n_estimators=100))
Ada_Model.fit(X_data_std, y_data)
scores = cross_val_score(Ada_Model, X_data_std, y_data, scoring=hamming_scorer)

### Random Forest, Gradient boosting
# One-vs-rest wrapper around a gradient-boosting classifier with 100 estimators
GBRT_Model = OneVsRestClassifier(GradientBoostingClassifier(n_estimators=100))
from sklearn.ensemble import RandomForestClassifier as RandomForest
from sklearn.preprocessing import LabelEncoder
from sklearn.base import clone
from sklearn.model_selection import cross_val_score
from text_tokenizer import TextTokenizer
from utils_files import read_lines
from utils_files import load_object

# Registry of unfitted estimator instances keyed by algorithm name.
# NOTE(review): `clone` is imported above, presumably so each user gets a
# fresh copy of these shared instances - confirm against the callers.
algorithms = {
    "logistic-regression": LogisticRegression(),
    "naive-bayes": BernoulliNB(),
    "linear-svc": LinearSVC(),
    "random-forest": RandomForest(n_estimators = 100, n_jobs = -1)
}


class NodeClassif(object):
    """ Represents a hierarchical node classifier

    Attributes:
    ----------
    model:
        type: sklearn.estimator
        info: trained classifier model

    selector:
        type: SelectPercentile
#print(y_train)

# Disabled experiment, kept as an inert string: logistic regression on
# binary counts over a range of C values.
'''
for c in [0.01, 0.05, 0.25, 0.5, 0.6, 0.75, 1]:
    pclf = Pipeline([
        ('vect', CountVectorizer(binary=True)),
        # ('tfidf', TfidfTransformer()),
        ('norm', Normalizer()),
        ('clf', LogisticRegression(C=c)),
    ])
    pclf.fit(X_train, y_train)
    y_pred = pclf.predict(X_test)
    print("C = %s"%(c))
    print(metrics.classification_report(y_test, y_pred))
'''

# Active experiment: raw token counts fed into a 100-tree random forest.
# The loop runs once; `value` is only echoed in the report header.
for value in [1]:
    steps = [
        ('vect', CountVectorizer()),
        # ('tfidf', TfidfTransformer()),
        # ('norm', Normalizer()),
        ('clf', RandomForest(n_estimators=100, max_depth=500)),
    ]
    pclf = Pipeline(steps)
    pclf.fit(X_train, y_train)
    y_pred = pclf.predict(X_test)
    print("C = %s" % (value))
    print(metrics.classification_report(y_test, y_pred))
# Format conversion: encode sex as an integer (female -> 0, male -> 1)
sex_codes = {'female': 0, 'male': 1}
data['Sex'] = data['Sex'].map(sex_codes)

# Missing-value imputation: fill missing ages with the median age
data['Age'] = data['Age'].fillna(data['Age'].median())

# Drop the columns that are not used
unused_columns = ['Name', 'Ticket', 'Cabin', 'Embarked']
data = data.drop(unused_columns, axis=1)

# Split into the data without passenger ID and survival label, and the
# survival labels themselves
X = data.drop(['Survived', 'PassengerId'], axis=1).values
y = data['Survived'].values

# Training
X_train, X_test, y_train, y_test = train_test_split(X, y)
forest = RandomForest(n_estimators=100)
model = forest.fit(X_train, y_train)

# Test: apply the same preprocessing to the submission data
target = pd.read_csv('../input/titanic/test.csv')
target['Sex'] = target['Sex'].map(sex_codes)
for column in ('Age', 'Fare'):
    target[column] = target[column].fillna(target[column].median())
target = target.drop(unused_columns, axis=1)

# Feature matrix for the submission data (passenger ID removed)
X_target = target.drop(['PassengerId'], axis=1).values
result = model.predict(X_target)
target['Survived'] = result
target[["PassengerId","Survived"]].to_csv("submission2.csv",index=False)
def calculateFeatureSqrt(trainX, trainY, testX, testY):
    """Score forests of 1 to 150 trees built with ``max_features="sqrt"``.

    For every tree count a depth-7 ``RandomForest`` is fitted on the training
    split, used to predict the test split, and scored with ``calculateRSS``.

    Returns a NumPy array with one score per tree count, in increasing order
    of tree count.
    """
    second_rs2 = np.array([])
    for i in range(1, 151, 1):
        reg2 = RandomForest(n_estimators=i, max_depth=7, max_features="sqrt")
        reg2.fit(trainX, trainY)
        predict = reg2.predict(testX)
        rss = calculateRSS(testY, predict)
        second_rs2 = np.append(second_rs2, rss)
    return second_rs2


# Score curves for the three max_features settings (one value per tree count)
auto = calculateFeatureAuto(train_x, train_y, test_X, test_Y)
sqrt = calculateFeatureSqrt(train_x, train_y, test_X, test_Y)
forth = calculateFeature4(train_x, train_y, test_X, test_Y)

# Two reference forests of 150 trees: depth 7 and depth 1
y_reg1 = RandomForest(n_estimators=150, max_depth=7, max_features=4)
y_reg1.fit(train_x, train_y)
pred = y_reg1.predict(test_X)

y_reg2 = RandomForest(n_estimators=150, max_depth=1, max_features=4)
y_reg2.fit(train_x, train_y)
pred2 = y_reg2.predict(test_X)

# NOTE(review): the axis is labelled R^2 but the values come from
# calculateRSS - confirm which metric that helper returns.
plt.ylabel('R^2 Score')
plt.xlabel('Number of Estimators(decision trees)')
# The curves hold scores for 1..150 estimators, so the x-axis must start at 1
# (it previously ran 0..149 and shifted every point by one).
arrArange = np.arange(1, 151, 1)
plt.plot(arrArange, auto, "r", label='Auto')
plt.plot(arrArange, sqrt, "b", label='Sqrt')
plt.plot(arrArange, forth, "g", label='Four')
"logistic_ucb", "logistic_egreedy", ]: kwargs["epsilon"] = 0.01 policy = counterfactual_policy_dict[counterfactual_policy](**kwargs) # compared OPE estimators ope_estimators = [ DirectMethod(), InverseProbabilityWeighting(), SelfNormalizedInverseProbabilityWeighting(), DoublyRobust(), SelfNormalizedDoublyRobust(), SwitchDoublyRobust(), ] # a base ML model for regression model used in Direct Method and Doubly Robust base_model = CalibratedClassifierCV(RandomForest(**hyperparams)) evaluation_of_ope_results = { est.estimator_name: np.zeros(n_runs) for est in ope_estimators } for i in np.arange(n_runs): # sample a new set of logged bandit feedback bandit_feedback = dataset.obtain_batch_bandit_feedback( n_rounds=n_rounds) # run a counterfactual bandit algorithm on logged bandit feedback data selected_actions = run_bandit_simulation( bandit_feedback=bandit_feedback, policy=policy) # estimate the ground-truth policy values of the counterfactual policy # using the full expected reward contained in the bandit feedback dictionary ground_truth_policy_value = bandit_feedback["expected_reward"][
def f(id, filtered_df, pos_stem, name):
    """Grid-search random-forest settings for each n-gram size and PCA option.

    For every ``n`` in 1..3 and every ``pca_n`` in (0, 2) the text in
    ``pos_stem`` is vectorized with ``vectorize(pos_stem, n)``; when ``pca_n``
    is non-zero the result is projected with ``make_pca``. The data is split
    60/40 and a 4-fold cross validation is run on the training part for every
    combination of leaf-node limit, tree count and depth, scoring each fold
    with ``simulate``.

    ``id`` is only used in the progress message.

    Returns a list with one tuple per (n, pca_n) pair:
    ``(name, n, pca_n, best_trees, best_depth, best_nodes, best_avg)``.
    """
    result = []
    for n in range(1, 4):
        for pca_n in [0, 2]:
            progress = ("Worker " + str(id) + " : " + str(n) + "/3 and " +
                        str(pca_n + 1) + "/2")
            print(progress)

            vectorized_1gram, vectorized_1gram_names = vectorize(pos_stem, n)
            # pca_n == 0 means "no projection"; otherwise project onto
            # pca_n axes
            bag_to_use = (make_pca(vectorized_1gram, pca_n)
                          if pca_n != 0 else vectorized_1gram)

            y = filtered_df['y'].values
            x_train, x_test, y_train, y_test = train_test_split(
                bag_to_use, y, test_size=0.4, random_state=42)

            # best configuration seen so far for this (n, pca_n) pair
            best_avg = 0
            best_trees_avg = None
            best_depth_avg = None
            best_nodes_avg = None

            ### RF CV
            # tuning grid
            n_trees = np.arange(10, 200, 20)
            depths = np.arange(2, 10)
            leaf_nodes = np.arange(2, 10)
            num_folds = 4

            for nodes in leaf_nodes:
                for trees in n_trees:
                    for depth in depths:
                        # a fresh shuffled split for every experiment
                        k_folds = KFold(x_train.shape[0], n_folds=num_folds,
                                        shuffle=True)
                        scores = []
                        for train_indices, validation_indices in k_folds:
                            # training part of the fold
                            x_train_cv = x_train[train_indices]
                            y_train_cv = y_train[train_indices]
                            # validation part of the fold
                            x_validate = x_train[validation_indices]
                            y_validate = y_train[validation_indices]

                            rf = RandomForest(n_estimators=trees,
                                              max_depth=depth,
                                              max_leaf_nodes=nodes,
                                              class_weight='balanced')
                            rf.fit(x_train_cv, y_train_cv)
                            print("HITTING")
                            # score on the validation part
                            scores.append(
                                simulate(x_validate, rf, filtered_df))

                        average_score = np.mean(scores, axis=0)
                        overall = np.mean(average_score)
                        # ties go to the most recent configuration (>=)
                        if overall >= best_avg:
                            best_avg = overall
                            best_trees_avg = trees
                            best_depth_avg = depth
                            best_nodes_avg = nodes

            result.append((name, n, pca_n, best_trees_avg, best_depth_avg,
                           best_nodes_avg, best_avg))
    return result
from sklearn.ensemble import RandomForestClassifier as RandomForest
from utils import *

# Load the labelled training set and the test set
X_train, y_train = get_train_data("../data/train.csv")
X_test = get_test_data("../data/test.csv")

# Train a 360-tree forest and compute class probabilities for the test set
forest_options = dict(n_estimators=360, n_jobs=2, verbose=1)
clf = RandomForest(**forest_options)
clf.fit(X_train, y_train)
predictions = clf.predict_proba(X_test)

# Write the probabilities out as the submission file
save_submission(predictions, 'rf.csv')
bc_dataset = datasets.load_breast_cancer() # Initialize label encoder label_encoder = preprocessing.LabelEncoder() # define target (what we want to predict) and train (data used to predict) target = bc_dataset.target train = bc_dataset.data # Split: train 60% test 40% from sklearn.cross_validation import train_test_split split = train_test_split(train, target, test_size=0.4, random_state=42) data_train, data_test, target_train, target_test = split ## train with RandomForest model = RandomForest(n_estimators=10) model.fit(data_train[:None], target_train) print 'perf: ' print model.score(data_test[:None], target_test) # variable with the more impact ? features = [ 'mean radius', 'mean texture', 'mean perimeter', 'mean area', 'mean smoothness', 'mean compactness', 'mean concavity', 'mean concave points', 'mean symmetry', 'mean fractal dimension', 'radius error', 'texture error', 'perimeter error', 'area error', 'smoothness error', 'compactness error', 'concavity error', 'concave points error', 'symmetry error', 'fractal dimension error', 'worst radius', 'worst texture', 'worst perimeter', 'worst area', 'worst smoothness', 'worst compactness', 'worst concavity', 'worst concave points', 'worst symmetry', 'worst fractal dimension'
def instanciate(self, params):
    """Create ``self.model`` from a one-element parameter sequence.

    ``params`` must unpack to exactly one value, which is used as the
    forest's ``n_estimators``; ``n_jobs`` is taken from ``CORES``.
    """
    [tree_count] = params
    self.model = RandomForest(n_estimators=tree_count, n_jobs=CORES)
def select_feature(data, y, tol=0, min_bin=0.05, regress=True):
    """
    Use random forest to select features from data set with both numerical
    and categorical features.

    A forest is first fitted on the numerical columns and the columns whose
    importance exceeds ``tol`` are kept. Then, one categorical column at a
    time, dummy columns are added for every class whose share exceeds
    ``min_bin``; a forest is refitted on the kept numerical columns plus all
    current dummies, and low-importance columns are pruned again.

    input:
        data     DataFrame of the independent variables; columns with dtype
                 'object' are treated as categorical, all others as numerical
        y        1D array of the response variable
        tol      the smallest feature importance to keep, default 0
        min_bin  the minimum percentage of one class in categorical feature,
                 default 0.05
        regress  indicate whether the problem is regression or classification

    returns:
        list with one ``[score, feature names, feature importances]`` entry
        per fitted forest (the numerical fit first, then one per categorical
        column); the score is computed on the data the forest was fitted on
    """
    logger = logging.getLogger(__name__)
    logger.info(
        "------ select_feature(data, y, tol=0, min_bin=0.05, regress=True) ------"
    )
    from time import time
    begin = time()

    cat_cols = data.columns[data.dtypes == 'object']
    num_cols = data.columns[data.dtypes != 'object']
    logger.info(
        "{} numerical and {} categorical features out of {} features".format(
            len(num_cols), len(cat_cols), data.shape[1]))

    results = []  # score, feature name, feature importance

    # for numerical variables
    beg = time()
    if regress:
        from sklearn.ensemble import RandomForestRegressor as RandomForest
    else:
        from sklearn.ensemble import RandomForestClassifier as RandomForest

    # depth grows with log2 of the number of features
    depth = int(np.log(len(num_cols)) / np.log(2)) + 1
    rf = RandomForest(n_estimators=500, max_depth=depth, max_features=0.33)
    # Lazy %-style arguments need placeholders; without them the logging
    # module reports a formatting error instead of the message.
    logger.info("max_depth %d", depth)
    rf = rf.fit(data[num_cols], y)
    print("Model with numerical variables fitted, time used {:.2f} min".format(
        (time() - beg) / 60))

    beg = time()
    score = rf.score(data[num_cols], y)
    results.append([score, num_cols, rf.feature_importances_])
    selected = num_cols[rf.feature_importances_ > tol]
    print("Model score {:.2f}, time used {:.2f} min".format(
        score, (time() - beg) / 60))

    # for categorical variables
    dummies = pd.DataFrame()
    print("-" * len(cat_cols), end='\r', flush=True)
    for i, col in enumerate(cat_cols):
        beg = time()
        c, d = np.unique(data[col], return_counts=True)
        d = d / sum(d)
        # one indicator column per sufficiently frequent class
        for cls in c[d > min_bin]:
            dummies[str(col) + '.' + str(cls)] = (data[col] == cls).astype(int)

        depth = int(np.log(len(selected) + dummies.shape[1]) / np.log(2)) + 1
        rf = RandomForest(n_estimators=500, max_depth=depth, max_features=0.33)
        temp = pd.concat([data[selected], dummies], axis=1)
        logger.info("max_depth %d, n_features %d", depth, temp.shape[1])
        rf = rf.fit(temp, y)
        score = rf.score(temp, y)
        results.append([score, temp.columns, rf.feature_importances_])

        # The importances are ordered [selected..., dummies...]. Remember
        # where the dummies start BEFORE shrinking `selected`; otherwise the
        # dummy slice is misaligned as soon as a numerical column is dropped.
        n_selected = len(selected)
        importances = rf.feature_importances_
        selected = selected[importances[:n_selected] > tol]
        dummies.drop(
            dummies.columns[importances[n_selected:] < tol],
            axis=1,
            inplace=True)
        print("*" * (i + 1) + "-" * (len(cat_cols) - i - 1) +
              " score {:.2f}, time {:.2f} min".format(score,
                                                      (time() - beg) / 60),
              end='\n',
              flush=True)

    print("\ntotal time used: {:.2f} min".format((time() - begin) / 60))
    return results