def features_ETC(self, sort=False, N=0, n_estimators=1000, max_features='auto', min_samples_split=2, usedata=1):
    """Score every feature with the importances of a fitted ``ETC`` model.

    A random subset of the rows in ``self.data`` is used to fit ``ETC``
    (presumably sklearn's ExtraTreesClassifier -- the alias is defined outside
    this block), and its ``feature_importances_`` are paired with the feature
    indices.

    Args:
        sort: when true, return only the ``N`` largest pairs, best first.
        N: number of pairs to keep when ``sort`` is true; a falsy value means
            "all features". Ignored when ``sort`` is false.
        n_estimators: forwarded to ``ETC``.
        max_features: forwarded to ``ETC``. Numeric values are clamped to
            ``[1, number of columns of self.data["X"]]``; strings and ``None``
            are passed through untouched.
        min_samples_split: forwarded to ``ETC``.
        usedata: fraction of the ``self.numData`` rows used for fitting.

    Returns:
        A list of ``(importance, feature_index)`` tuples.
    """
    total_rows = self.numData
    order = list(range(total_rows))
    np.random.shuffle(order)

    passthrough = isinstance(max_features, six.string_types) or max_features is None
    if not passthrough:
        column_count = int(self.data["X"].shape[1])
        max_features = max(1, min(max_features, column_count))

    # The first `usedata` share of the shuffled indices is the training subset.
    chosen = order[:int(usedata * total_rows)]
    model = ETC(
        n_estimators=n_estimators,
        max_features=max_features,
        min_samples_split=min_samples_split,
    )
    fitted = model.fit(self.data["X"][chosen], self.data["Y"][chosen])

    scores = fitted.feature_importances_
    pairs = [(score, index) for index, score in enumerate(scores)]
    if not sort:
        return pairs

    keep = N if N else len(scores)
    return nlargest(keep, pairs)
def iterative_fit(self, X, y, sample_weight=None, n_iter=1, refit=False):
    """Fit the wrapped ExtraTrees forest, growing it by ``n_iter`` trees.

    On the first call (or when ``refit`` is true) a new warm-start
    ``ExtraTreesClassifier`` with ``n_iter`` trees is created from the
    hyper-parameters stored on ``self``. On later calls the existing forest's
    ``n_estimators`` is increased by ``n_iter`` but never beyond
    ``self.n_estimators``. The estimator is then fitted on ``X``/``y``.

    Args:
        X: training data; ``X.shape[1]`` is used as the number of features.
        y: training targets.
        sample_weight: forwarded to the estimator's ``fit``.
        n_iter: number of trees to add in this call.
        refit: discard any existing estimator and start over.

    Returns:
        ``self``.
    """
    from sklearn.ensemble import ExtraTreesClassifier as ETC

    if refit:
        self.estimator = None

    if self.estimator is not None:
        # Grow the existing forest, capped at the configured total.
        grown = self.estimator.n_estimators + n_iter
        self.estimator.n_estimators = min(grown, self.n_estimators)
    else:
        # self.max_features is an exponent applied to the feature count.
        feature_budget = int(X.shape[1]**float(self.max_features))
        settings = dict(
            n_estimators=n_iter,
            criterion=self.criterion,
            max_depth=self.max_depth,
            min_samples_split=self.min_samples_split,
            min_samples_leaf=self.min_samples_leaf,
            bootstrap=self.bootstrap,
            max_features=feature_budget,
            max_leaf_nodes=self.max_leaf_nodes,
            min_weight_fraction_leaf=self.min_weight_fraction_leaf,
            min_impurity_decrease=self.min_impurity_decrease,
            oob_score=self.oob_score,
            n_jobs=self.n_jobs,
            verbose=self.verbose,
            random_state=self.random_state,
            class_weight=self.class_weight,
            warm_start=True,
        )
        self.estimator = ETC(**settings)

    self.estimator.fit(X, y, sample_weight=sample_weight)
    return self
def main():
    """Load the data, build the candidate models and cross-validate one of them.

    All models are constructed, but only ``etc1`` is passed to
    ``crossValidate`` (5 folds, 1 run).
    """

    def _xgb_params(eta, max_depth):
        # The two XGBoost configurations differ only in eta and max_depth.
        return {
            "objective": "binary:logistic",
            "eval_metric": "auc",
            "booster": "gbtree",
            "eta": eta,
            "max_depth": max_depth,
            "min_child_weight": 10,
            "subsample": 0.66,
            # "colsample_bytree": 0.7,
            "colsample_bylevel": 0.3,
            "thread": 1,
            "silent": 1,
            "seed": 221,
        }

    # Get the clean datasets
    x, y, xt, feats, sample = readData()

    # Try out different models
    xg_class_params = _xgb_params(eta=0.01, max_depth=14)
    xg_class_params2 = _xgb_params(eta=0.02, max_depth=5)

    rf1 = RF(
        n_estimators=1000,
        max_features=50,
        criterion='entropy',
        min_samples_split=40,
        max_depth=30,
        min_samples_leaf=2,
        n_jobs=10,
        verbose=0,
        random_state=42,
    )
    etc1 = ETC(
        n_estimators=500,
        max_features=90,
        criterion='entropy',
        min_samples_split=20,
        max_depth=25,
        min_samples_leaf=10,
        n_jobs=10,
        verbose=0,
        random_state=42,
    )
    xgb1 = XGC(xg_class_params, num_rounds=550)
    xgb2 = XGC(xg_class_params2, num_rounds=600)
    xgb_bag = bagger(xgb2, num_bags=3, bag_fraction=0.75)

    # EVALUATE a model
    score = crossValidate(etc1, x, y, folds=5, runs=1)
def iterative_fit(self, X, y, sample_weight=None, n_iter=1, refit=False):
    """Add ``n_iter`` trees to the wrapped ExtraTrees forest and fit it.

    When no estimator exists yet (or ``refit`` is true) an empty warm-start
    ``ExtraTreesClassifier`` is created. Its ``max_features`` is
    ``self.max_features * (ln(n_features) + 1)``, truncated to an int and
    clamped to the range ``[1, n_features / 2]``. Every call then raises
    ``n_estimators`` by ``n_iter`` and fits on ``X``/``y``.

    Args:
        X: training data; ``X.shape[1]`` is the number of features.
        y: training targets.
        sample_weight: forwarded to the estimator's ``fit``.
        n_iter: number of trees to add.
        refit: discard any existing estimator first.

    Returns:
        ``self``.
    """
    from sklearn.ensemble import ExtraTreesClassifier as ETC

    if refit:
        self.estimator = None

    if self.estimator is None:
        feature_count = X.shape[1]
        scaled = int(float(self.max_features) * (np.log(feature_count) + 1))
        # Use at most half of the features
        half = int(feature_count / 2)
        feature_budget = max(1, min(half, scaled))

        self.estimator = ETC(
            n_estimators=0,
            criterion=self.criterion,
            max_depth=self.max_depth,
            min_samples_split=self.min_samples_split,
            min_samples_leaf=self.min_samples_leaf,
            bootstrap=self.bootstrap,
            max_features=feature_budget,
            max_leaf_nodes=self.max_leaf_nodes,
            oob_score=self.oob_score,
            n_jobs=self.n_jobs,
            verbose=self.verbose,
            random_state=self.random_state,
            class_weight=self.class_weight,
            warm_start=True,
        )

    forest = self.estimator  # TODO copy ?
    forest.n_estimators = forest.n_estimators + n_iter
    forest.fit(X, y, sample_weight=sample_weight)
    self.estimator = forest
    return self
def fit_predict(self, data_fit, data_predict):
    """Fit an ``ETC`` model on ``data_fit`` and score ``data_predict``.

    Hyper-parameters are read from ``self.p`` (keys ``nfeatures``, ``depth``,
    ``ntrees`` and ``seed``).

    Returns:
        A tuple ``(data_predict.ids, yhat)`` where ``yhat`` is column 1 of
        ``predict_proba`` on ``data_predict.x``.
    """
    settings = self.p
    model = ETC(
        criterion='gini',
        max_features=settings['nfeatures'],
        max_depth=settings['depth'],
        n_estimators=settings['ntrees'],
        random_state=settings['seed'],
    )
    model.fit(data_fit.x, data_fit.y)

    probabilities = model.predict_proba(data_predict.x)
    positive_column = probabilities[:, 1]
    return data_predict.ids, positive_column
def etccv(n_estimators, min_samples_split):
    """Return the mean 5-fold ROC-AUC of an AdaBoost-over-ExtraTrees model.

    Objective function for hyper-parameter search: both arguments may arrive
    as floats and are truncated to int before use. The model is evaluated on
    the module-level ``train`` / ``train_labels``.

    Args:
        n_estimators: number of boosting rounds for ``AdaBoostClassifier``.
        min_samples_split: ``min_samples_split`` of the ``ETC`` base estimator.

    Returns:
        The mean of the cross-validation scores.
    """
    base_forest = ETC(
        min_samples_split=int(min_samples_split),
        random_state=2,
        n_jobs=-1,
    )
    booster = AdaBoostClassifier(
        base_forest,
        algorithm="SAMME",
        n_estimators=int(n_estimators),
    )
    # `scoring` must be passed by keyword: the 4th positional slot of
    # sklearn.model_selection.cross_val_score is `groups`, not `scoring`.
    scores = cross_val_score(
        booster, train, train_labels, scoring='roc_auc', cv=5)
    return scores.mean()
def iterative_fit(self, X, y, sample_weight=None, n_iter=1, refit=False):
    """Fit the wrapped ExtraTrees forest, growing it by ``n_iter`` trees.

    On the first call (or when ``refit`` is true) the hyper-parameters stored
    on ``self`` are validated and coerced to their expected types, written
    back to ``self``, and a warm-start ``ExtraTreesClassifier`` with
    ``n_iter`` trees is created. On later calls the forest's
    ``n_estimators`` grows by ``n_iter`` but is capped at
    ``self.n_estimators``. The estimator is then fitted on ``X``/``y``.

    Args:
        X: training data; ``X.shape[1]`` is the number of features.
        y: training targets.
        sample_weight: forwarded to the estimator's ``fit``.
        n_iter: number of trees to add in this call.
        refit: discard any existing estimator and start over.

    Returns:
        ``self``.

    Raises:
        ValueError: if ``self.criterion`` is neither "gini" nor "entropy".
    """
    from sklearn.ensemble import ExtraTreesClassifier as ETC

    if refit:
        self.estimator = None

    if self.estimator is not None:
        grown = self.estimator.n_estimators + n_iter
        self.estimator.n_estimators = min(grown, self.n_estimators)
    else:
        # Computed before self.max_features itself is normalised below.
        feature_budget = int(X.shape[1]**float(self.max_features))

        if self.criterion not in ("gini", "entropy"):
            raise ValueError("'criterion' is not in ('gini', 'entropy'): "
                             "%s" % self.criterion)

        def _int_or_none(value):
            return None if check_none(value) else int(value)

        # Applied strictly in this order, one attribute at a time.
        coercions = (
            ("max_depth", _int_or_none),
            ("max_leaf_nodes", _int_or_none),
            ("min_samples_leaf", int),
            ("min_samples_split", int),
            ("max_features", float),
            ("min_impurity_decrease", float),
            ("min_weight_fraction_leaf", float),
            ("oob_score", check_for_bool),
            ("bootstrap", check_for_bool),
            ("n_jobs", int),
            ("verbose", int),
        )
        for attribute, convert in coercions:
            setattr(self, attribute, convert(getattr(self, attribute)))

        self.estimator = ETC(
            n_estimators=n_iter,
            criterion=self.criterion,
            max_depth=self.max_depth,
            min_samples_split=self.min_samples_split,
            min_samples_leaf=self.min_samples_leaf,
            bootstrap=self.bootstrap,
            max_features=feature_budget,
            max_leaf_nodes=self.max_leaf_nodes,
            min_weight_fraction_leaf=self.min_weight_fraction_leaf,
            min_impurity_decrease=self.min_impurity_decrease,
            oob_score=self.oob_score,
            n_jobs=self.n_jobs,
            verbose=self.verbose,
            random_state=self.random_state,
            class_weight=self.class_weight,
            warm_start=True,
        )

    self.estimator.fit(X, y, sample_weight=sample_weight)
    return self
def etc_ccv(self, n_estimators, top_args):
    """Objective function: CV score of an ``ETC`` on importance-selected features.

    An ``ETC`` forest is fitted on ``self.X``/``self.y`` and used with
    ``SelectFromModel`` (``threshold=top_args``) to reduce ``self.X``. A fresh
    forest is then cross-validated (2 folds) on the reduced matrix, scored
    with ``self.loss_function_2``. Each evaluation is appended to
    ``self.cumulative_objective_function``.

    Args:
        n_estimators: number of trees; truncated to int.
        top_args: importance threshold handed to ``SelectFromModel``.

    Returns:
        The mean test score, or ``0`` when the threshold removes every
        feature (nothing is recorded in that case).
    """
    tree_count = int(n_estimators)

    ranking_forest = ETC(n_estimators=tree_count, random_state=0)
    ranking_forest.fit(self.X, self.y)
    selector = SelectFromModel(ranking_forest, prefit=True, threshold=top_args)
    reduced_x = selector.transform(self.X)

    kept_features = reduced_x.shape[1]
    # In this case there are no features to select, therefore, return 0 as accuracy.
    if kept_features == 0:
        return 0

    scorer = make_scorer(self.loss_function_2, greater_is_better=True)
    cv_result = cross_validate(
        ETC(n_estimators=tree_count, random_state=0),
        X=reduced_x,
        y=self.y,
        scoring=scorer,
        cv=2,
    )
    mean_score = cv_result['test_score'].mean()

    record = {
        "score": mean_score,
        "n_estimators": n_estimators,
        "top_args": kept_features,
        "importance_cutoff": top_args,
    }
    self.cumulative_objective_function.append(record)
    return mean_score
def fit_predict(self, dfit, dpre, tournament):
    """Fit an ``ETC`` model on one tournament's targets and score ``dpre``.

    Hyper-parameters are read from ``self.p`` (keys ``nfeatures``, ``depth``,
    ``ntrees`` and ``seed``); the model is built with ``n_jobs=-1``. The
    training target is ``dfit.y[tournament]``.

    Returns:
        A tuple ``(dpre.ids, yhat)`` where ``yhat`` is column 1 of
        ``predict_proba`` on ``dpre.x``.
    """
    settings = self.p
    model = ETC(
        criterion='gini',
        max_features=settings['nfeatures'],
        max_depth=settings['depth'],
        n_estimators=settings['ntrees'],
        random_state=settings['seed'],
        n_jobs=-1,
    )
    targets = dfit.y[tournament]
    model.fit(dfit.x, targets)

    probabilities = model.predict_proba(dpre.x)
    positive_column = probabilities[:, 1]
    return dpre.ids, positive_column
def trainClassifier(xTrain, yTrain):
    """Fit a default-configured ``ETC`` model on the training data and return it.

    Alternatives that were tried and are currently disabled:
    ``LR(penalty='l2')``, ``SVM()``, ``DT()``, ``RF()``,
    ``ADA(n_estimators=200)``, ``G(n_estimators=100)``.
    """
    model = ETC()
    model.fit(xTrain, yTrain)
    return model
def optimize(self):
    """Search the importance cutoff, then refit and expose the selected features.

    Runs ``BayesianOptimization`` over ``self.etc_ccv`` (``n_estimators`` is
    pinned to 1000, ``top_args`` ranges over
    ``[self.min_importance, self.max_importance]``) for ``self.epochs``
    iterations. The result with the highest ``target`` is used to refit a
    forest on ``self.X``/``self.y`` and to build the feature selector.

    Sets on ``self``: ``gp_params``, ``etc_0``, ``forest``,
    ``_selected_features_model``, ``parameters`` (history of evaluations),
    ``x_t_selected``, ``x_selected`` and ``importances``.
    """
    self.gp_params = {"alpha": 1e-5}

    search_space = {
        'n_estimators': (1000, 1000),
        'top_args': (self.min_importance, self.max_importance),
    }
    self.etc_0 = BayesianOptimization(self.etc_ccv, search_space)
    self.etc_0.maximize(n_iter=self.epochs, **self.gp_params)

    print('selecting best performance parameters ...')
    ranked = sorted(self.etc_0.res, key=lambda i: i['target'])
    best = ranked[-1]['params']

    self.forest = ETC(n_estimators=int(best['n_estimators']), random_state=0)
    self.forest.fit(self.X, self.y)
    self._selected_features_model = SelectFromModel(
        self.forest, prefit=True, threshold=best['top_args'])

    # One column per field recorded by the objective function.
    history = self.cumulative_objective_function
    fields = ("score", "n_estimators", "top_args", "importance_cutoff")
    self.parameters = pd.DataFrame({
        field: [entry[field] for entry in history] for field in fields
    })

    selector = self._selected_features_model
    self.x_t_selected = selector.transform(self.X)

    kept = selector.get_support()
    self.x_selected = pd.DataFrame(
        data=self.x_t_selected,
        index=self.X.index,
        columns=self.X.columns[kept],
    )
    self.importances = pd.DataFrame({
        'Gene': self.x_selected.columns,
        'importance': self.forest.feature_importances_[kept],
    })
def Model_rec(X, y):
    """Train an ``ETC`` forest on a min-max scaled 70/30 split of ``X``/``y``.

    The scaler is fitted on the training split only and then applied to the
    test split, so both splits share one scaling and the returned scaler
    matches what the forest was trained on.

    Args:
        X: feature DataFrame (its ``columns`` are reused for the scaled frames).
        y: targets aligned with ``X``.

    Returns:
        A tuple ``(forest, scaler, X_test, y_test)`` where ``X_test`` is the
        scaled test split as a DataFrame.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.30, random_state=42)

    scaler = MinMaxScaler()
    column_names = X_train.columns.values
    X_train = pd.DataFrame(
        data=scaler.fit_transform(X_train), columns=column_names)
    # transform (not fit_transform): re-fitting on the test split would scale
    # it differently from the training data and leak test statistics.
    X_test = pd.DataFrame(
        data=scaler.transform(X_test), columns=column_names)

    forest = ETC(n_estimators=250, max_depth=10, random_state=np.random)
    forest.fit(X_train, y_train)
    return (forest, scaler, X_test, y_test)
train_df['interest_level'].apply(lambda x: target_num_map[x])) KF = StratifiedKFold(train_y, 5, shuffle=True, random_state=2333) cv_scores = [] i = 0 for dev_index, val_index in KF: result_dict = {} dev_set, val_set = train_df.iloc[dev_index, :], train_df.iloc[val_index, :] #filter the features dev_X, val_X = dev_set[features].as_matrix(), val_set[features].as_matrix() dev_y, val_y = train_y[dev_index], train_y[val_index] et = ETC(2000, random_state=0) et.fit(dev_X, dev_y) preds = et.predict_proba(val_X) #save the pickles for futures use pickl_file = store + 'et2000-5fold-out-' + str(i) + '.pickle' fileObject = open(pickl_file, 'wb') pickle.dump(preds, fileObject) fileObject.close() loss = log_loss(val_y, preds) cv_scores.append(loss) i += 1 print 'loss for the turn ' + str(i) + ' is ' + str(loss)
print("logloss", logloss(Yval, y_pred_prob2)) #Estimation on testing data y_pred_prob = gmean_pred(clfs2, Xtest) test_data['XGB_pred'] = y_pred_prob #######################ExtraTrees################################ print('\n ################Extra Trees########################') y_pred = Y * 0 y_pred_prob = Y * 0 clfs3 = [] for train, test in kf: X_train, X_test, y_train, y_test = X[train, :], X[ test, :], Y[train], Y[test] clf = ETC(n_estimators=100, n_jobs=7) clf.fit(X_train, y_train) clfs3.append(clf) y_pred_test = clf.predict(X_test) y_pred_prob_test = clf.predict_proba(X_test)[:, 1] print("Iteration - logloss", logloss(Y[test], y_pred_prob_test)) clf3 = combine_classifier(clfs3) #Training performance evaluation y_pred = clf3.predict(X) y_pred_prob = clf3.predict_proba(X)[:, 1] print('Training Results:') print(confusion_matrix(Y, y_pred)) print("logloss", logloss(Y, y_pred_prob)) train_data['ETC_pred'] = clf3.predict_proba(X)[:, 1]
# Label / statement columns of the validation and test frames.
df_validation_label = df_validation.loc[:, "label"]
df_validation_label

df_test_statement = df_test.loc[:, "statement"]
df_test_statement

df_test_label = df_test.loc[:, "label"]
df_test_label

# Training the model
from sklearn.ensemble import BaggingClassifier as BRC
from sklearn.ensemble import ExtraTreesClassifier as ETC

# Unigram TF-IDF features fed into a bagging ensemble of ExtraTrees forests.
pipeline = Pipeline([
    ('ngrams', TfidfVectorizer(ngram_range=(1, 1))),
    ('clf', BRC(base_estimator=ETC(n_estimators=30),
                n_estimators=100,
                bootstrap_features=True,
                oob_score=True,
                max_features=7)),
])
pipeline.fit(df_training_statement, df_training_label)

predicted_labels = pipeline.predict(df_validation_statement)
predicted_labels

# Pipeline.score(X, y) takes the raw inputs and the true labels; it predicts
# internally. Passing (labels, predictions) would vectorize the labels.
accuracy = pipeline.score(df_validation_statement, df_validation_label)
accuracy

predicted_labels_test = pipeline.predict(df_test_statement)
predicted_labels_test

accuracy_test = pipeline.score(df_test_statement, df_test_label)
accuracy_test
inpFile = open("data/training_data.txt", "r") # Extract the rest of the data so that we can parse it training_data = np.genfromtxt("data/training_data.txt", delimiter="|", skip_header=1) test_data = np.genfromtxt("data/testing_data.txt", delimiter="|", skip_header=1) X = training_data[:, :1000] Y = training_data[:, 1000] # Various Classifiers dtc_min_samples_leaf = DTC(min_samples_leaf=15) etc = ETC() gbc = GBC() rfc = RFC() dtc_max_depth = DTC(max_depth=8) nb = BernoulliNB() svc = SVC() lr = LR() abc = ABC() bc = BC() ''' inv_doc_freq = np.zeros(1000) for i in range(len(inv_doc_freq)): total = sum(X[:, i]) if total == 0: inv_doc_freq[i] = 0 else:
# ExtraTrees: search n_estimators / min_samples_split with the etccv objective.
etcBO = BayesianOptimization(etccv, {
    'n_estimators': (200, 800),
    'min_samples_split': (2, 8)
})

print('-' * 53)
etcBO.maximize()

print('-' * 53)
print('Final Results')
print('ETC: %f' % etcBO.res['max']['max_val'])

# # MAKING SUBMISSION
# Build the estimator itself from the best parameters. Previously `rf` held
# the float mean of cross_val_score, so rf.fit / rf.predict_proba could not
# work. The best CV score is already printed above.
best_params = etcBO.res['max']['max_params']
rf = ETC(
    n_estimators=int(best_params['n_estimators']),
    min_samples_split=int(best_params['min_samples_split']),
    random_state=2,
    n_jobs=-1,
)
rf.fit(train, train_labels)

# Column 1 of predict_proba is written out as the 'target' column.
preds = rf.predict_proba(test)[:, 1]
print('Prediction Complete')
submission = pd.DataFrame(preds, index=test_labels, columns=['target'])
submission.to_csv('../output/extratrees_autotune.csv')
import utils
import pickle
from os.path import isfile
from sklearn.ensemble import ExtraTreesClassifier as ETC
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# Train-once script: if the pickle already exists nothing is retrained.
# NOTE(review): the path says "BernoulliNB" but the model stored is an
# ExtraTreesClassifier -- confirm the file name is intentional.
filename = '/usr/src/app/sentiment/models/pickles/BernoulliNB.pickle'

if isfile(filename):
    print('Already Trained!')
else:
    # 80/20 split of the data returned by utils.read_data().
    train, test = train_test_split(utils.read_data(), test_size=0.2)

    train_texts = train['text'].tolist()
    train_embeddings = utils.combined_embeddings(train_texts)
    test_texts = test['text'].tolist()
    test_embeddings = utils.combined_embeddings(test_texts)

    clf = ETC(n_estimators=100)
    clf.fit(train_embeddings, train['sentiment'])

    # Report held-out performance before persisting the model.
    prediction = clf.predict(test_embeddings)
    report = classification_report(test['sentiment'], prediction)
    print(report)

    with open(filename, 'wb') as f:
        pickle.dump(clf, f)
# KNeighbors Classifier kn_cls = KNNc(n_neighbors=41, weights='uniform', algorithm='brute', metric='chebyshev') # Ridge Classifier rd_cls = RdC(fit_intercept=False, class_weight=None, solver='lsqr', random_state=5) # Random Forest Classifier rf_cls = RFC(n_estimators=200, max_depth=10, min_samples_split=2, min_samples_leaf=3, max_features=None, class_weight=None, criterion='entropy', random_state=5) # Extra Trees Classifier et_cls = ETC(criterion='entropy', min_impurity_decrease=0.0, bootstrap=True, max_features=None, n_estimators=100, max_depth=None, min_samples_split=3, min_samples_leaf=2, max_leaf_nodes=20, class_weight=None, random_state=5) # Gradient Boosting Classifier gb_cls = GBC(loss='deviance', max_features=None, learning_rate=0.125, n_estimators=150, min_samples_split=2, min_samples_leaf=20, max_depth=5, min_impurity_decrease=0.20, max_leaf_nodes=10, random_state=5) # Isolation Forest if_cls = IFc(random_state=5) if_param = {'n_estimators': [100, 200, 300], 'contamination': [0.05, 0.1, 0.2], 'max_features': [0.5, 0.75, 1.0], 'bootstrap': [True, False], 'behaviour': ['new']}
words = inpFile.readline().rstrip().split("|") # Extract the rest of the data so that we can parse it training_data = np.genfromtxt("data/training_data.txt", delimiter="|", skip_header=1) test_data = np.genfromtxt("data/testing_data.txt", delimiter="|", skip_header=1) X = training_data[:, :1000] Y = training_data[:, 1000] # Various Classifiers dtc_min_samples_leaf = DTC(min_samples_leaf=15) etc = ETC() gbc = GBC() rfc = RFC() dtc_max_depth = DTC(max_depth=8) nb = BernoulliNB() svc = SVC() # Split Training Data X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3333) # Compare individual classifiers dtc_min_samples_leaf.fit(X_train, Y_train) print "dtc_min_samples_leaf" prediction1_train = dtc_min_samples_leaf.predict(X_train) prediction1 = dtc_min_samples_leaf.predict(X_test) print accuracy_score(prediction1, Y_test)
def nn_vs_out_of_the_box_model(all_data=None, test_box_id=1000, prop_test=0.1, predictor_params=None, box_file=None, min_test_cases=5, max_test_cases=100):
    """
    Compare four prediction strategies for a single box.

    An ``ImputationNearestNeighbor`` predictor (with an ``ETC`` forest as its
    out-of-box model) is built from the training split and evaluated with
    ``compute_metrics_power_state`` under four model types: 'out', 'nn',
    'major' and 'rand', in that order.

    :param all_data: frame with at least ``box_id``, ``lat`` and ``lon``
        columns; required despite the ``None`` default.
    :param test_box_id: id of the box to evaluate.
    :param prop_test: forwarded to ``prepare_data_for_training_testing``.
    :param predictor_params: mapping with keys 'target', 'neighbors', 'how',
        'time-window' and 'direction'; required despite the ``None`` default.
    :param box_file: forwarded to ``generate_box_metadata``.
    :param min_test_cases: forwarded to ``prepare_data_for_training_testing``.
    :param max_test_cases: forwarded to ``prepare_data_for_training_testing``.
    :return: tuple ``(results_nearest, results_out, results_majority,
        results_random)``.
    """
    # ------------PRELIMINARY SET UP-----------------------------------
    # Rows of the box under test; used to look up its coordinates.
    data_test_bx = all_data[all_data.box_id == test_box_id]
    train_df, test_df = prepare_data_for_training_testing(
        data=all_data,
        box_id=test_box_id,
        min_test_cases=min_test_cases,
        prop_test=prop_test,
        max_test_cases=max_test_cases)

    # ----------------CREATE MODEL OBJECT-----------------------------
    etc = ETC(n_estimators=100)
    clf = pred.ImputationNearestNeighbor(
        data=train_df,
        target=predictor_params['target'],
        neighbors=predictor_params['neighbors'],
        how=predictor_params['how'],
        time_window=predictor_params['time-window'],
        direction=predictor_params['direction'],
        out_of_box_model=etc,
        pred_features=PREDICTION_FEATURES)
    clf.generate_box_metadata(box_file=box_file)

    # data_test_bx is already restricted to test_box_id, so the first row
    # gives the box coordinates directly.
    box_lat_lon = [data_test_bx.lat.values[0], data_test_bx.lon.values[0]]

    # --------------TEST MODEL PERFORMANCE-----------------------------
    # Same evaluation order as before: out-of-box, nearest neighbor,
    # majority classifier, random.
    results = {
        model_type: compute_metrics_power_state(
            model_object=clf,
            test_data=test_df,
            box_id=test_box_id,
            xy=box_lat_lon,
            model_type=model_type)
        for model_type in ('out', 'nn', 'major', 'rand')
    }

    return results['nn'], results['out'], results['major'], results['rand']
scores = cross_val_score(LR, X, y, cv=3, scoring='roc_auc') print scores print np.mean(scores) #random forest from sklearn.ensemble import ExtraTreesClassifier as ETC RF = RF(n_estimators=100, random_state=1) RF.fit(X_train, y_train) predicted_probs = RF.predict_proba(X_test) predicted_probs = ["%f" % x[1] for x in predicted_probs] print RF.score(X_test, y_test) forest = ETC( n_estimators=100, random_state=1, compute_importances=True, ) forest.fit(X_train, y_train) importances = forest.feature_importances_ std = np.std([tree.feature_importances_ for tree in forest.estimators_], axis=0) indices = np.argsort(importances)[::-1] print "feature importances:" print importances pl.figure() pl.title("feature importances") pl.bar(xrange(9), importances[indices], color="b",
#label of training dataset DEFAULTER = np.array(df['DEFAULTER']) #stacking features together in a matrix X = np.column_stack((AMOUNT, VAR_1, VAR_2, DUE_MORTGAGE, VALUE, DCL, REASON, OCC, TJOB, CL_COUNT, CL_COUNT, RATIO)) #setting Y as the label Y = DEFAULTER #using the Imputer() function to fill in the missing values using strategy='mean' imputer = Imputer(copy=False) transformed_X = imputer.fit_transform(X) #fitting the model with training dataset. Model is a BaggingClassifier, with ExtraTreesClassifier as it's estimator model = BRC(base_estimator=ETC(n_estimators=30), n_estimators=100, bootstrap_features=True, oob_score=True, max_features=7) model.fit(transformed_X, Y) ''' #crossvalidating the model using RepeatedStratifiedKFold model = BRC(base_estimator=ETC(n_estimators=30), n_estimators=100,bootstrap_features=True,oob_score=True,max_features = 7) kfold = KFold() result = cross_val_score(model, transformed_X, Y, cv=kfold, scoring = 'roc_auc') print(result.mean()) ''' f = r'F:/Analyticity2018/test.csv' #reading address of file df = pd.read_csv(f) #creating pandas dataframe