def fit(self, X, y):
    self.w = np.ones(X.shape[0]) / X.shape[0]  # Weights on data
    self.a = np.zeros(self.n)  # Weights on decision trees
    k = 0
    self.features = []
    for tree in self.decision_trees:
        # Bootstrap rows and subsample self.m feature columns for this tree
        mask = np.random.randint(0, high=len(X), size=len(X))
        features = np.random.choice(X.shape[1], size=self.m)
        Xsampling = X[mask, :]
        Xsampling = Xsampling[:, features]
        ysampling = y[mask]
        tree.fit(Xsampling, ysampling)
        self.features.append(features)
        # Weighted training error of this tree
        ej = 0
        for j in range(len(Xsampling)):
            ej = checkXY(Xsampling[j, :], ysampling[j], tree) * self.w[j] + ej
        ej = ej / float(sum(self.w))
        # AdaBoost-style tree weight
        self.a[k] = 0.5 * np.log((1 - ej) / float(ej))
        # Re-weight samples: up-weight misclassified, down-weight correct
        for i in range(len(Xsampling)):
            if checkXY(Xsampling[i, :], ysampling[i], tree) > 0.5:
                self.w[i] = self.w[i] * np.exp(self.a[k])
            else:
                self.w[i] = self.w[i] * np.exp(-self.a[k])
        k = k + 1
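# The fit above assumes a surrounding class and a helper checkXY that are not
# shown. Below is a minimal scaffold consistent with how they are used, offered
# as an assumption (the names BoostedForest, n, m, and max_depth are
# hypothetical, not from the source):
import numpy as np
from sklearn.tree import DecisionTreeClassifier

def checkXY(x, y, tree):
    # Assumed contract: return 1 if the tree misclassifies (x, y), else 0.
    return int(tree.predict(x.reshape(1, -1))[0] != y)

class BoostedForest:
    def __init__(self, n=10, m=5, max_depth=3):
        self.n = n  # number of trees
        self.m = m  # feature columns sampled per tree
        self.decision_trees = [
            DecisionTreeClassifier(max_depth=max_depth) for _ in range(n)
        ]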
def drawDecisionTree(data, name):
    data['change next day class'] = data['change next day'].apply(classify)
    X_train, X_test, y_train, y_test = train_test_split(
        data[[
            'rate of increase', 'increase length', 'rate of decrease',
            'decrease length'
        ]],
        data['change next day class'],
        test_size=0.2,
        random_state=42)
    tree = DecisionTreeClassifier(max_depth=6, random_state=0)
    tree.fit(X_train, y_train)
    print('Train score: {:.3f}'.format(tree.score(X_train, y_train)))
    print('Test score: {:.3f}'.format(tree.score(X_test, y_test)))
    # Generate the visualization
    export_graphviz(tree,
                    out_file="tree.dot",
                    feature_names=[
                        'rate of increase', 'increase length',
                        'rate of decrease', 'decrease length'
                    ],
                    impurity=False,
                    filled=True)
    # Render the visualization
    graph = pydotplus.graph_from_dot_file('tree.dot')
    graph.write_pdf(name + '.pdf')
def LandmarkDecisionTree(X, y, categorical):
    if not sps.issparse(X):
        import sklearn.tree
        if len(y.shape) == 1 or y.shape[1] == 1:
            kf = sklearn.model_selection.StratifiedKFold(n_splits=10)
        else:
            kf = sklearn.model_selection.KFold(n_splits=10)
        accuracy = 0.
        for train, test in kf.split(X, y):
            random_state = sklearn.utils.check_random_state(42)
            tree = sklearn.tree.DecisionTreeClassifier(
                random_state=random_state)
            if len(y.shape) == 1 or y.shape[1] == 1:
                tree.fit(X[train], y[train])
            else:
                tree = OneVsRestClassifier(tree)
                tree.fit(X[train], y[train])
            predictions = tree.predict(X[test])
            accuracy += sklearn.metrics.accuracy_score(predictions, y[test])
        return accuracy / 10
    else:
        return np.NaN
def testRunWithIrisData(self):
    # Load data and store it into pandas DataFrame objects
    iris = load_iris()
    X = pd.DataFrame(iris.data[:, :], columns=iris.feature_names[:])
    y = pd.DataFrame(iris.target, columns=["Species"])
    # Define and fit a DecisionTreeClassifier instance
    tree = DecisionTreeClassifier(max_depth=2)
    tree.fit(X, y)
    # Create a dot file named tree.dot
    export_graphviz(tree,
                    out_file="../output/IrisOutput_DT.dot",
                    feature_names=list(X.columns),
                    class_names=iris.target_names,
                    filled=True,
                    rounded=True)
    sample_one_pred = int(tree.predict([[5, 5, 1, 3]]))
    sample_two_pred = int(tree.predict([[5, 5, 2.6, 1.5]]))
    print(
        f"The first sample most likely belongs to a {iris.target_names[sample_one_pred]} flower."
    )
    print(
        f"The second sample most likely belongs to a {iris.target_names[sample_two_pred]} flower."
    )
def _calculate(self, X, y, logger, categorical):
    import sklearn.tree
    # Stratify only for single-output classification targets
    if len(y.shape) == 1 or y.shape[1] == 1:
        kf = sklearn.model_selection.StratifiedKFold(n_splits=5)
    else:
        kf = sklearn.model_selection.KFold(n_splits=5)
    accuracy = 0.
    for train, test in kf.split(X, y):
        random_state = sklearn.utils.check_random_state(42)
        tree = sklearn.tree.DecisionTreeClassifier(
            random_state=random_state)
        if len(y.shape) == 1 or y.shape[1] == 1:
            tree.fit(
                X.iloc[train] if hasattr(X, 'iloc') else X[train],
                y.iloc[train] if hasattr(y, 'iloc') else y[train],
            )
        else:
            tree = OneVsRestClassifier(tree)
            tree.fit(
                X.iloc[train] if hasattr(X, 'iloc') else X[train],
                y.iloc[train] if hasattr(y, 'iloc') else y[train],
            )
        predictions = tree.predict(
            X.iloc[test] if hasattr(X, 'iloc') else X[test],
        )
        accuracy += sklearn.metrics.accuracy_score(
            predictions,
            y.iloc[test] if hasattr(y, 'iloc') else y[test],
        )
    return accuracy / 5
def fit(self, X_train, Y_train):
    # Each tree gets its own random data set via get_bootstrap_data
    sub_sets = self.get_bootstrap_data(X_train, Y_train)
    n_features = X_train.shape[1]
    if self.max_features is None:
        self.max_features = int(np.sqrt(n_features))
    for i in range(self.n_estimators):
        # Choose a random feature subset for each tree
        tree = DecisionTreeClassifier(
            min_samples_split=self.min_samples_split,
            min_impurity_decrease=self.min_gain,
            max_depth=self.max_depth)
        sub_X, sub_Y = sub_sets[i]
        features = np.random.choice(n_features, self.max_features,
                                    replace=True)
        sub_X = sub_X[:, features]
        tree.fit(sub_X, sub_Y)
        self.trees.append(tree)
        self.trees_feature.append(features)
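# A possible companion predict for the forest above, sketched as an assumption
# rather than taken from the source: each tree votes on its own stored feature
# subset and the majority label wins. Assumes non-negative integer class labels.
import numpy as np

def predict(self, X_test):
    votes = np.array([
        tree.predict(X_test[:, features])
        for tree, features in zip(self.trees, self.trees_feature)
    ])
    # Majority vote per sample across the ensemble
    return np.array([
        np.bincount(votes[:, j].astype(int)).argmax()
        for j in range(votes.shape[1])
    ])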
def TreeTest():
    spamDat = spamData()
    k = 10
    all_folds = hw3.partition_folds(spamDat, k)
    num_in_fold = []
    err_in_fold = []
    # Train and evaluate on the first k-1 folds
    for i in range(len(all_folds) - 1):
        spam = all_folds[i]
        num_in_fold.append(len(spam))
        truth, f_data = decTree.split_truth_from_data(spam)
        tree = decTree.TreeOptimal(max_depth=2)
        #tree = decTree.TreeRandom()
        tree.fit(f_data, truth)
        print('Prediction...\n')
        predict = tree.predict(f_data)
        print(predict)
        print(truth)
        error = 1. - hw3.get_accuracy(predict, truth)
        err_in_fold.append(error)
        print('Tree error is: {}'.format(error))
    # Hold out the last fold for testing
    spam = all_folds[k - 1]
    truth, f_data = decTree.split_truth_from_data(spam)
    tree = decTree.TreeOptimal(max_depth=2)
    #tree = decTree.TreeRandom()
    tree.fit(f_data, truth)
    predict = tree.predict(f_data)
    error = 1. - hw3.get_accuracy(predict, truth)
    sum_training_err = 0
    for i in range(len(num_in_fold)):
        sum_training_err += err_in_fold[i]
    average_training_error = float(sum_training_err) / len(num_in_fold)
    print('Average training error: {}\nAverage testing error: {}'.format(
        average_training_error, error))
def learn(self, learn):
    self._model.clear()
    self._model.config['version'] = sklearn.__version__
    self._tree = None
    input = []
    classes = []
    for data in learn:
        try:
            input += [self._get_input(data[0])]
            classes += [data[1]]
        except github.GithubError:
            continue
    # Check for empty data set
    if len(input) == 0 or len(classes) == 0:
        logging.error(
            'Trying to learn MetadataClassifier with an empty data set. This is not possible.\n'
            'Possible errors:\n'
            ' * Your learning folder is not set up correctly\n'
            ' * Your rate limit is exhausted\n'
            ' * There is an error with your internet connection\n'
            ' * There is an error while connecting to GitHub\n')
        self._model.clear()
        self._model.save()
        return
    tree = sklearn.tree.DecisionTreeClassifier(min_samples_leaf=3)
    tree.fit(input, classes)
    self._tree = tree
    self._model.config['tree'] = base64.b64encode(
        pickle.dumps(tree)).decode()
    self._model.save()
def landmark_decision_tree(X, y):  # pylint: disable=C0103
    """Compute statistic."""
    try:
        if scipy.sparse.issparse(X):
            return np.NaN
        import sklearn.tree  # pylint: disable=C0103
        if len(y.shape) == 1 or y.shape[1] == 1:
            kf = sklearn.model_selection.StratifiedKFold(n_splits=10)
        else:
            kf = sklearn.model_selection.KFold(n_splits=10)
        accuracy = 0.
        for train, test in kf.split(X, y):
            random_state = sklearn.utils.check_random_state(42)
            tree = sklearn.tree.DecisionTreeClassifier(
                random_state=random_state)
            if len(y.shape) == 1 or y.shape[1] == 1:
                tree.fit(X[train], y[train])
            else:
                tree = OneVsRestClassifier(tree)
                tree.fit(X[train], y[train])
            predictions = tree.predict(X[test])
            accuracy += sklearn.metrics.accuracy_score(
                predictions, y[test])
        return accuracy / 10
    except Exception as ex:  # pylint: disable=W0703
        automl_log(
            "Landmark Decision Tree could not be computed. Returning 0 \
instead. Originally failed with exception '{ex}'".format(ex=ex),
            'WARNING')
        return 0.
def fit(self, X, y):
    # Bagging: fit each tree on a bootstrap sample (rows drawn with replacement)
    n = X.shape[0]
    for tree in self.decision_trees:
        indices = np.random.randint(0, n, n)
        tree.fit(X[indices], y[indices])
    return self
def fit(self, X, y):
    '''
    Trains the model
    Arguments:
        X is a n-by-d numpy array
        y is an n-dimensional numpy array
    '''
    n = len(X)
    self.k = len(np.unique(y))

    # Build a map of indices to classes
    a = 0
    convert_map = {}
    for i in np.unique(y):
        self.class_map[a] = i
        convert_map[i] = a
        a += 1

    # Relabel y as 0..(k-1) instead of 1..k
    for i in range(len(y)):
        y[i] = convert_map[y[i]]

    w = np.ones(n) / n
    for t in range(self.numBoostingIters):
        # Fit a weak learner using the current sample weights
        tree = DecisionTreeClassifier(max_depth=self.maxTreeDepth)
        tree.fit(X, y, sample_weight=w)
        self.tree_array.append(tree)

        # Calculate the weighted training error
        epsilon = 0
        y_preds = tree.predict(X)
        wrong_preds = np.nonzero(y_preds - y)
        for index in wrong_preds[0]:
            epsilon += w[index]

        # Early stopping
        if epsilon == 0:
            break

        # Calculate beta (the log(k-1) term is the SAMME multiclass correction)
        beta = 0.5 * np.log((1 - epsilon) / epsilon) + np.log(self.k - 1)
        self.beta_array.append(beta)

        # Update all instance weights
        for i in range(len(w)):
            if i not in wrong_preds[0]:
                w[i] = w[i] * np.exp(-1 * beta)
            else:
                w[i] = w[i] * np.exp(beta)

        # Normalize the weight vector
        w = w / np.sum(w)

    # Restore the original labels in y
    for i in range(len(y)):
        y[i] = self.class_map[y[i]]
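# A possible companion predict for the SAMME-style booster above, sketched as
# an assumption rather than taken from the source: each tree casts a vote
# weighted by its beta, and the winning 0..(k-1) index is mapped back through
# class_map.
import numpy as np

def predict(self, X):
    votes = np.zeros((len(X), self.k))
    for tree, beta in zip(self.tree_array, self.beta_array):
        preds = tree.predict(X).astype(int)
        votes[np.arange(len(X)), preds] += beta
    return np.array([self.class_map[i] for i in votes.argmax(axis=1)])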
def fit(self, X, y):
    self.mask = []
    for tree in self.decision_trees:
        mask = np.random.randint(0, high=len(X), size=len(X))
        Xsampling = X[mask, :]
        ysampling = y[mask]
        tree.fit(Xsampling, ysampling)
        self.mask.append(mask)
def _fit_stage(self, i, X, y, qids, y_pred, sample_weight, sample_mask,
               query_groups, random_state):
    """Fit another tree to the boosting model."""
    assert sample_mask.dtype == np.bool

    n_samples = X.shape[0]
    all_lambdas = np.zeros(n_samples)
    all_deltas = np.zeros(n_samples)

    mat = []
    for qidx, (qid, a, b, _) in enumerate(query_groups):
        score = self.metric.evaluate_preds(qid, y[a:b], y_pred[a:b])
        score_b1 = self.metric.evaluate_preds(
            qid, y[a:b], X[:, int(self.features_risk[0])][a:b])
        score_b2 = self.metric.evaluate_preds(
            qid, y[a:b], X[:, int(self.features_risk[1])][a:b])
        mat.append([score, score_b1, score_b2])
    mat = np.array(mat)
    grisk_normal = getGeoRiskDefault(mat, 5)[0]

    posix = 0
    for qid, a, b, _ in query_groups:
        lambdas, deltas = self._calc_lambdas_deltas(
            qid, y[a:b], y_pred[a:b], mat, posix, grisk_normal)
        all_lambdas[a:b] = lambdas
        all_deltas[a:b] = deltas
        posix += 1

    tree = sklearn.tree.DecisionTreeRegressor(
        criterion='friedman_mse',
        splitter='best',
        presort=True,
        max_depth=self.max_depth,
        min_samples_split=self.min_samples_split,
        min_samples_leaf=self.min_samples_leaf,
        min_weight_fraction_leaf=0.0,
        max_features=self.max_features,
        max_leaf_nodes=self.max_leaf_nodes,
        random_state=random_state)

    if self.subsample < 1.0 or self.query_subsample < 1.0:
        sample_weight = sample_weight * sample_mask.astype(np.float64)

    tree.fit(X, all_lambdas, sample_weight=sample_weight,
             check_input=False)
    self._update_terminal_regions(tree.tree_, X, y, all_lambdas,
                                  all_deltas, y_pred, sample_mask)
    self.estimators_[i, 0] = tree
    self.estimators_fitted_ = i + 1
    return y_pred
def mapper(self, _, line):
    ratings = pd.read_csv(self.train_set)
    ratings = ratings.sample(round(len(ratings.index) / 10))
    tree = sklearn.tree.DecisionTreeRegressor()
    labels = ratings.iloc[:, 3]
    ratings = ratings.drop(ratings.columns[[3]], axis=1)
    samples = ratings
    tree.fit(samples, labels)
    yield jsonpickle.encode(tree), 1
def err(x, y):
    from sklearn import tree
    clf = tree.DecisionTreeClassifier(random_state=0)
    clf.fit(x, y)
    error = 0
    for i, v in enumerate(clf.predict(D_data)):
        if v != D_target[i]:
            error += 1
    erate = error / float(len(D_target))
    return erate
def train(self, dataset, targets, hyper_params):
    numQueries = len(dataset.docsPerQuery)
    validDocs = numpy.minimum(dataset.docsPerQuery, self.rankingSize)
    queryDocPosTriplets = numpy.dot(dataset.docsPerQuery, validDocs)
    designMatrix = numpy.zeros((queryDocPosTriplets, self.numFeatures),
                               dtype=numpy.float32, order='F')
    regressionTargets = numpy.zeros(queryDocPosTriplets,
                                    dtype=numpy.float64, order='C')
    sampleWeights = numpy.zeros(queryDocPosTriplets, dtype=numpy.float32)

    currID = -1
    for i in range(numQueries):
        numAllowedDocs = dataset.docsPerQuery[i]
        currValidDocs = validDocs[i]
        allFeatures = dataset.features[i].toarray()
        for doc in range(numAllowedDocs):
            docID = doc
            if dataset.mask is not None:
                docID = dataset.mask[i][doc]
            for j in range(currValidDocs):
                currID += 1
                designMatrix[currID, :] = self.createFeature(
                    allFeatures[docID, :], j)
                regressionTargets[currID] = targets[i][j, doc]
                sampleWeights[currID] = 1.0 / (numAllowedDocs * currValidDocs)

    for i in targets:
        del i
    del targets
    print("L2RPolicy:train [LOG] Finished creating features and targets ",
          numpy.amin(regressionTargets), numpy.amax(regressionTargets),
          numpy.median(regressionTargets), flush=True)
    print("L2RPolicy:train [LOG] Histogram of targets ",
          numpy.histogram(regressionTargets), flush=True)

    if self.modelType == 'gbrt':
        tree = sklearn.ensemble.GradientBoostingRegressor(
            learning_rate=hyper_params['lr'],
            n_estimators=hyper_params['ensemble'],
            subsample=hyper_params['subsample'],
            max_leaf_nodes=hyper_params['leaves'],
            max_features=1.0,
            presort=False)
        tree.fit(designMatrix, regressionTargets,
                 sample_weight=sampleWeights)
        self.tree = tree
        print("L2RPolicy:train [INFO] %s" % self.modelType, flush=True)
    elif self.modelType == 'ridge':
        ridgeCV = sklearn.linear_model.RidgeCV(alphas=self.hyperParams,
                                               fit_intercept=False,
                                               normalize=False, cv=3)
        ridgeCV.fit(designMatrix, regressionTargets,
                    sample_weight=sampleWeights)
        self.policyParams = ridgeCV.coef_
        print("L2RPolicy:train [INFO] Done. ", flush=True)
    else:
        print("L2RPolicy:train [ERR] %s not supported." % self.modelType,
              flush=True)
        sys.exit(0)

    print("L2R:train [INFO] Created %s predictor using dataset %s." %
          (self.modelType, dataset.name), flush=True)
def _calculate(self, X, y, categorical):
    import sklearn.tree
    kf = sklearn.cross_validation.StratifiedKFold(y, n_folds=10)
    accuracy = 0.
    for train, test in kf:
        random_state = sklearn.utils.check_random_state(42)
        tree = sklearn.tree.DecisionTreeClassifier(
            random_state=random_state)
        tree.fit(X[train], y[train])
        predictions = tree.predict(X[test])
        accuracy += sklearn.metrics.accuracy_score(predictions, y[test])
    return accuracy / 10
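# The snippet above targets the old sklearn.cross_validation module, which was
# removed in scikit-learn 0.20. A behavior-equivalent sketch against the modern
# sklearn.model_selection API, offered as an assumption about intent rather
# than the original code (the function name here is hypothetical):
import sklearn.metrics
import sklearn.tree
import sklearn.utils
from sklearn.model_selection import StratifiedKFold

def landmark_decision_tree_accuracy(X, y):
    kf = StratifiedKFold(n_splits=10)
    accuracy = 0.
    for train, test in kf.split(X, y):
        random_state = sklearn.utils.check_random_state(42)
        tree = sklearn.tree.DecisionTreeClassifier(random_state=random_state)
        tree.fit(X[train], y[train])
        accuracy += sklearn.metrics.accuracy_score(tree.predict(X[test]),
                                                   y[test])
    return accuracy / 10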
def decision_tree(train_set, test_set, features):
    start_time = time.time()
    # Instantiate the classifier
    tree = skl.tree.DecisionTreeClassifier(criterion="entropy")
    # Train the classifier
    tree.fit(train_set[features].values, train_set['target'])
    # Predict
    y_pred = tree.predict(test_set[features])
    # Report results
    report_results("Decision Tree", time.time() - start_time,
                   test_set["target"], y_pred)
def predict_with_dtree(col_names, data):
    N = len(data)
    training_data = data[:int(0.80 * N)]
    tree = DecisionTree()
    tree.fit(col_names, training_data)
    testing_data = data[int(0.80 * N):]
    for row in testing_data:
        print('Actual label is %s and predicted %s' %
              (row[-1], tree.predict(row)))
def fit(self, X, y):
    self.mask = []
    self.features = []
    for tree in self.decision_trees:
        mask = np.random.randint(0, high=len(X), size=len(X))
        features = np.random.choice(X.shape[1], size=self.m)
        Xsampling = X[mask, :]
        Xsampling = Xsampling[:, features]
        ysampling = y[mask]
        tree.fit(Xsampling, ysampling)
        self.mask.append(mask)
        self.features.append(features)
def addBoostIteration(self):
    rv = self.regressionValues()
    trees = []
    mask = numpy.array([True] * self.nF)
    for i in range(0, self.nF):
        mask[:] = True
        mask[i] = False
        tree = DecisionTreeRegressor(max_depth=self.max_depth)
        tree.fit(self.data[:, mask], rv[:, i])
        # newpsis[:, i] = tree.predict(self.data[:, mask])
        trees.append(tree)
    self.trees.append(trees)
def build_tree_classifier(X_train, X_test, y_train, y_test, criterion='gini'):
    '''
    Build classifier, return results, predictions, and tree
    Inputs: X_train, X_test, y_train, y_test, criterion
    Returns: y_test, y_pred, tree
    '''
    tree = DecisionTreeClassifier(criterion=criterion)
    tree.fit(X_train, y_train)
    y_pred = tree.predict(X_test)
    return y_test, y_pred, tree
def learn(self, learn):
    self._model.clear()
    self._model.config['version'] = sklearn.__version__
    self._tree = None
    known_languages = set()
    for data in learn:
        try:
            languages = data[0].get_languages()
        except github.GithubError:
            continue
        for language in languages:
            known_languages.add(language)
    known_languages = list(known_languages)
    dataset = []
    labels = []
    for data in learn:
        try:
            languages = data[0].get_languages()
        except github.GithubError:
            continue
        entry = self._get_entry(languages, known_languages)
        dataset += [entry]
        labels += [data[1]]
    # Check for empty data set
    if len(dataset) == 0 or len(labels) == 0:
        logging.error(
            'Trying to learn LanguageDetailsClassifier with an empty data set. This is not possible.\n'
            'Possible errors:\n'
            ' * Your learning folder is not set up correctly\n'
            ' * Your rate limit is exhausted\n'
            ' * There is an error with your internet connection\n'
            ' * There is an error while connecting to GitHub\n')
        self._model.clear()
        self._model.save()
        return
    tree = sklearn.tree.DecisionTreeClassifier(min_samples_leaf=3)
    tree.fit(dataset, labels)
    self._tree = tree
    self._model.config['tree'] = base64.b64encode(
        pickle.dumps(tree)).decode()
    self._model.config['known_languages'] = known_languages
    self._model.save()
def save_features(self, X, y):
    feats = dict()
    print("univariate feature selectors")
    selector_clf = SelectKBest(score_func=f_classif, k='all')
    selector_clf.fit(X, y)
    pvalues_clf = selector_clf.pvalues_
    pvalues_clf[np.isnan(pvalues_clf)] = 1
    # Put feature vectors into the dictionary
    feats['univ_sub01'] = (pvalues_clf < 0.1)
    feats['univ_sub005'] = (pvalues_clf < 0.05)
    feats['univ_clf_sub005'] = (pvalues_clf < 0.05)

    print("randomized logistic regression feature selector")
    sel_log = linear_model.RandomizedLogisticRegression(random_state=42,
                                                        n_jobs=4).fit(X, y)
    # Put rand_logreg feats into the feature dict
    feats['rand_logreg'] = sel_log.get_support()

    print("l1-based feature selectors")
    X_sp = sparse.coo_matrix(X)
    sel_svc = svm.LinearSVC(C=0.1, penalty="l1", dual=False,
                            random_state=42).fit(X, y)
    feats['LinearSVC'] = np.ravel(sel_svc.coef_ > 0)
    sel_log = linear_model.LogisticRegression(C=0.01, random_state=42).fit(
        X_sp, y)
    feats['LogReg'] = np.ravel(sel_log.coef_ > 0)

    tree_max_features = 20
    print("ExtraTrees feature selectors (%s)" % tree_max_features)
    feats['tree'] = np.zeros(len(feats['LogReg']))
    tree = ExtraTreesClassifier(n_estimators=250,
                                max_features=tree_max_features)
    tree.fit(X, y)
    feature_importance = tree.feature_importances_
    feature_importance = 100.0 * (feature_importance /
                                  feature_importance.max())
    sorted_idx = np.argsort(feature_importance)[::-1]
    for i in range(tree_max_features):
        feats['tree'][sorted_idx[i]] = 1

    feat_sums = np.zeros(len(feats['LogReg']))
    for key in feats:
        feat_sums += feats[key].astype(int)
    # Take features which get 4 or more votes
    feats['ensemble'] = feat_sums >= 4
    joblib.dump(feats, 'features/feats.pkl', compress=3)
    return feats
def build_policies(self, n):
    c2 = DatasetBandit.DATASETS[self.dataset](L=1, loop=True)
    Policies = []
    for p in range(n):
        X = np.zeros((100, c2.d))
        r = np.zeros((100,))
        # Collect 100 random (context, reward) pairs; use t as the inner
        # index so the argument n is not shadowed
        for t in range(100):
            (curr_x, curr_r) = c2.next()
            a = np.random.choice(curr_x.get_K())
            X[t, :] = curr_x.get_ld_features()[a, :]
            r[t] = curr_r[a]
        tree = sklearn.tree.DecisionTreeRegressor(max_depth=3)
        tree.fit(X, r)
        Policies.append(RegressionPolicy(tree))
    return Policies
def select_model(trainx, trainy, validx, validy):
    trees = {}
    criterion = ["gini", "entropy"]
    # Try depths 1 through 7 for each split criterion
    for i in range(1, 8):
        for j in criterion:
            tree = sklearn.tree.DecisionTreeClassifier(criterion=j,
                                                       max_depth=i)
            tree.fit(trainx, trainy)
            accuracy = check_accuracy(tree, validx, validy)
            print("Model(depth=" + str(i) + ", criteria=" + str(j) +
                  ") with accuracy = " + str(accuracy))
            trees[accuracy] = tree
    keys = trees.keys()
    max_key = max(keys)
    solution = trees[max_key]
    return solution
def fit(self, X, y):
    assert self.sample_size <= len(
        y), "Sample size cannot be greater than input size"
    full = np.concatenate((X, y.reshape(-1, 1)), axis=1)
    for tree in self.decision_trees:
        bagged_samples = np.random.choice(list(range(len(full))),
                                          size=self.sample_size,
                                          replace=True)
        train_data = full[bagged_samples, :]
        train_data_x = train_data[:, :-1]
        train_data_y = train_data[:, -1:]
        tree.fit(train_data_x, train_data_y)
def kfold_cross_validation(df, tree, folds_num, features, target):
    kf = KFold(n_splits=folds_num, shuffle=True)
    attributes = df[features]
    labels = df[target]
    accuracy = []
    precision = []
    recall = []
    f1score = []
    #RMSE = []
    # Iterate over the fold generator directly so every fold is used exactly
    # once; calling next() on a fresh kf.split() each pass would re-shuffle
    # and only ever yield a first fold
    for train_index, test_index in kf.split(attributes):
        x_train = attributes.iloc[train_index]
        x_test = attributes.iloc[test_index]
        y_train = labels.iloc[train_index]
        y_test = labels.iloc[test_index]
        model = tree.fit(x_train, y_train)
        y_pred = tree.predict(x_test)
        accuracy.append(accuracy_score(y_test, y_pred))
        # labels=np.unique(y_pred) can be added to calculate the measure only
        # for the labels that have predicted samples
        precision.append(precision_score(y_test, y_pred, average="weighted"))
        recall.append(recall_score(y_test, y_pred, average="weighted"))
        f1score.append(f1_score(y_test, y_pred, average="weighted"))
        #RMSE.append(root_mean_squared_error(y_test, y_pred))
    scores = [np.mean(accuracy), np.mean(precision), np.mean(recall),
              np.mean(f1score)]
    #scores = [np.mean(accuracy), np.mean(precision), np.mean(recall),
    #          np.mean(f1score), np.mean(RMSE)]
    return scores
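# Example invocation of the helper above; the DataFrame and column names are
# hypothetical, and the sklearn imports from the snippet (KFold, the metric
# functions) are assumed to be in scope:
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier(max_depth=4)
mean_scores = kfold_cross_validation(df, clf, 5, ['f1', 'f2', 'f3'], 'label')
# mean_scores == [accuracy, precision, recall, F1], each averaged over 5 folds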
def train_tree(X, Y, k=2, max_depths=default_max_depths,
               criterions=default_criterions):
    depths = []
    scores = []
    for depth in max_depths:
        for criteria in criterions:
            tree = sklearn.tree.DecisionTreeClassifier(criterion=criteria,
                                                       max_depth=depth)
            tree.fit(X, Y)
            train_score = validate(tree, X, Y)
            depths.append(depth)
            scores.append(train_score)
    return depths, scores
def fit(self, feature_list, label_list):
    # Build the random forest from decision trees: the forest holds self.num
    # trees, each trained on its own random split of the data
    train_list = []
    train_label_list = []
    vec = DictVectorizer()
    for i in range(self.num):
        x_train, x_test, y_train, y_test = train_test_split(feature_list,
                                                            label_list,
                                                            test_size=0.33,
                                                            random_state=i)
        x_train_vec = vec.fit_transform(x_train).toarray()
        train_list.append(x_train_vec)
        train_label_list.append(y_train)
        tree = DecisionTree()
        tree.fit(x_train_vec, y_train)
        self.tree_list.append(tree)
def dt(X_train, y_train):
    from sklearn import tree
    param_grid = {'max_depth': np.arange(3, 25)}
    clf = tree.DecisionTreeClassifier(random_state=0)
    grid_search = GridSearchCV(clf, param_grid)
    clf = grid_search.fit(X_train, y_train)
    return clf
def calculate(self):
    import sklearn.tree
    if len(self.y.shape) == 1 or self.y.shape[1] == 1:
        kf = sklearn.model_selection.StratifiedKFold(n_splits=10)
    else:
        kf = sklearn.model_selection.KFold(n_splits=10)
    accuracy = 0.
    for train, test in kf.split(self.X, self.y):
        random_state = sklearn.utils.check_random_state(42)
        tree = sklearn.tree.DecisionTreeClassifier(
            random_state=random_state)
        if len(self.y.shape) == 1 or self.y.shape[1] == 1:
            tree.fit(self.X.iloc[train],
                     np.ravel(self.y.iloc[train], order='C'))
        else:
            tree = OneVsRestClassifier(tree)
            tree.fit(self.X.iloc[train],
                     np.ravel(self.y.iloc[train], order='C'))
        predictions = tree.predict(self.X.iloc[test])
        accuracy += sklearn.metrics.accuracy_score(predictions,
                                                   self.y.iloc[test])
    return accuracy / 10
def fit(self, X, y):
    # Bagging: build a bootstrap sample row by row for each tree
    for tree in self.decision_trees:
        X_tree = np.zeros_like(X)
        y_tree = np.zeros_like(y)
        for i in range(X.shape[0]):
            rand_i = np.random.choice(X.shape[0])
            X_tree[i, :] = X[rand_i, :]
            y_tree[i] = y[rand_i]
        tree = tree.fit(X_tree, y_tree)
    return self
def findMisClf(df, X, y, y_pred, name):
    '''
    Takes a dataframe (df), column names of predictors (X) and a dependent
    variable (y). Loops over generic classifiers to find predictions. Creates
    a decision tree using prediction misclassification as the dependent
    variable.
    '''
    var_name = name + '_predict'
    try:
        df[var_name] = y_pred
    except:
        import pdb
        pdb.set_trace()
    correct = name + '_correct'
    # Determine "correctness" based on a 0.5 threshold
    df[correct] = (df[var_name] > 0.5).astype(int)
    # Determine which observations are being misclassified
    tree = DecisionTreeClassifier(max_depth=3)
    tree.fit(X, df[correct])
    feature_names = df.columns
    left, right = tree.tree_.children_left, tree.tree_.children_right
    threshold = tree.tree_.threshold
    features = [feature_names[i] for i in tree.tree_.feature]
    value = tree.tree_.value

    def recurse(left, right, threshold, features, node):
        if threshold[node] != -2:
            print("if ( " + features[node] + " <= " +
                  str(threshold[node]) + " ) {")
            if left[node] != -1:
                recurse(left, right, threshold, features, left[node])
            print(" } else {")
            if right[node] != -1:
                recurse(left, right, threshold, features, right[node])
            print("}")
        else:
            print("return " + str(value[node]))

    recurse(left, right, threshold, features, 0)
def _fit_stage(self, i, X, y, qids, y_pred, sample_weight, sample_mask,
               query_groups, random_state):
    """Fit another tree to the boosting model."""
    assert sample_mask.dtype == np.bool

    n_samples = X.shape[0]
    all_lambdas = np.zeros(n_samples)
    all_deltas = np.zeros(n_samples)
    for qid, a, b, _ in query_groups:
        for coef, metric in zip(self.metric_coefs, self.metrics):
            lambdas, deltas = metric.calc_lambdas_deltas(
                qid, y[a:b], y_pred[a:b])
            all_lambdas[a:b] += coef * lambdas
            all_deltas[a:b] += coef * deltas

    tree = sklearn.tree.DecisionTreeRegressor(
        criterion='friedman_mse',
        splitter='best',
        presort=True,
        max_depth=self.max_depth,
        min_samples_split=self.min_samples_split,
        min_samples_leaf=self.min_samples_leaf,
        min_weight_fraction_leaf=0.0,
        max_features=self.max_features,
        max_leaf_nodes=self.max_leaf_nodes,
        random_state=random_state)

    if self.subsample < 1.0 or self.query_subsample < 1.0:
        sample_weight = sample_weight * sample_mask.astype(np.float64)

    tree.fit(X, all_lambdas, sample_weight=sample_weight,
             check_input=False)
    self._update_terminal_regions(tree.tree_, X, y, all_lambdas,
                                  all_deltas, y_pred, sample_mask)
    self.estimators_[i, 0] = tree
    self.estimators_fitted_ = i + 1
    return y_pred
def _calculate(self, X, y, categorical):
    import sklearn.tree
    if len(y.shape) == 1 or y.shape[1] == 1:
        kf = sklearn.cross_validation.StratifiedKFold(y, n_folds=10)
    else:
        kf = sklearn.cross_validation.KFold(y.shape[0], n_folds=10)
    accuracy = 0.
    for train, test in kf:
        random_state = sklearn.utils.check_random_state(42)
        tree = sklearn.tree.DecisionTreeClassifier(
            random_state=random_state)
        if len(y.shape) == 1 or y.shape[1] == 1:
            tree.fit(X[train], y[train])
        else:
            tree = OneVsRestClassifier(tree)
            tree.fit(X[train], y[train])
        predictions = tree.predict(X[test])
        accuracy += sklearn.metrics.accuracy_score(predictions, y[test])
    return accuracy / 10
def cluster_then_forest(xs, ys, in_sample_size):
    isi, in_sample, osi, out_sample = create_in_out_samples(xs,
                                                            in_sample_size)
    clf = cluster.KMeans(n_clusters=4)
    clf.fit(in_sample)
    oos_clusterid = clf.predict(out_sample)
    ins_clusterid = clf.predict(in_sample)
    # Fit one random forest per cluster
    for id in numpy.unique(oos_clusterid):
        print("Now working on Cluster " + str(id))
        oos_ind = oos_clusterid == id
        ins_ind = ins_clusterid == id
        tree = ensemble.RandomForestRegressor(50)
        tree.fit(in_sample[ins_ind], ys[isi][ins_ind])
        print("Score for in-sample")
        print(str(tree.score(in_sample[ins_ind], ys[isi][ins_ind])))
        print("Score for out-of sample")
        tree.predict(out_sample[oos_ind])
        print(str(tree.score(out_sample[oos_ind], ys[osi][oos_ind])))
    return None
def _fit_stage(self, i, X, y, qids, y_pred, sample_weight, sample_mask,
               query_groups, random_state):
    """Fit another tree to the boosting model."""
    assert sample_mask.dtype == np.bool

    n_samples = X.shape[0]
    all_lambdas = np.zeros(n_samples)
    all_deltas = np.zeros(n_samples)
    for qid, a, b, _ in query_groups:
        lambdas, deltas = self._calc_lambdas_deltas(qid, y[a:b],
                                                    y_pred[a:b])
        all_lambdas[a:b] = lambdas
        all_deltas[a:b] = deltas

    tree = sklearn.ensemble.RandomForestRegressor(
        criterion='friedman_mse',
        max_depth=self.max_depth,
        min_samples_split=self.min_samples_split,
        min_samples_leaf=self.min_samples_leaf,
        min_weight_fraction_leaf=0.0,
        max_features=self.max_features,
        max_leaf_nodes=self.max_leaf_nodes,
        random_state=random_state,
        n_jobs=-1)

    if self.subsample < 1.0 or self.query_subsample < 1.0:
        sample_weight = sample_weight * sample_mask.astype(np.float64)

    tree.fit(X, all_lambdas, sample_weight=sample_weight)
    # Caution: RandomForestRegressor exposes estimators_, not a single tree_,
    # so the tree.tree_ access below raises AttributeError as written
    self._update_terminal_regions(tree.tree_, X, y, all_lambdas,
                                  all_deltas, y_pred, sample_mask)
    self.estimators_[i, 0] = tree
    self.estimators_fitted_ = i + 1
    return y_pred
target = 'safe_loans'
loans = loans[features + [target]]

#train_idx=ps.read_json('module-5-assignment-1-train-idx.json')
with open(r'C:\Users\Isaac\Course 3/module-5-assignment-1-train-idx.json',
          'r') as f:
    train_idx = json.load(f)
#test_idx=ps.read_json('module-5-assignment-1-test-idx.json')
with open(r'C:\Users\Isaac\Course 3/module-5-assignment-1-validation-idx.json',
          'r') as f:
    validation_idx = json.load(f)

train_data = loans.iloc[train_idx]
validation_data = loans.iloc[validation_idx]
train_matrix, train_output = get_numpy_data(train_data, features, target)
validation_matrix, validation_output = get_numpy_data(validation_data,
                                                      features, target)

safe_loans_raw = loans[loans[target] == +1]
risky_loans_raw = loans[loans[target] == -1]
print("Number of safe loans  : %s" % len(safe_loans_raw))
print("Number of risky loans : %s" % len(risky_loans_raw))

# Undersample the safe loans so the two classes are balanced
percentage = len(risky_loans_raw) / float(len(safe_loans_raw))
risky_loans = risky_loans_raw
safe_loans = safe_loans_raw.sample(frac=percentage, random_state=1)

tree = sklearn.tree.DecisionTreeClassifier(max_depth=6)
decision_tree_model = tree.fit(train_matrix, train_output)
'''
Decision Tree
'''
binary_data = pd.get_dummies(all_census_prep)
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    binary_data[binary_data.columns.difference(["earning_class"])],
    binary_data["earning_class"],
    train_size=0.80)

scaler = preprocessing.StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train.astype("float64")),
                       columns=X_train.columns)
X_test = scaler.transform(X_test.astype("float64"))

from sklearn.tree import DecisionTreeClassifier, export_graphviz

tree = DecisionTreeClassifier(criterion='entropy', max_depth=20)
tree.fit(X_train, y_train)
y_pred = tree.predict(X_test)
cm = metrics.confusion_matrix(y_test, y_pred)
(cm[0][0] + cm[1][1]).astype('float64') / sum(sum(cm))  # overall accuracy

feature_names = list(X_train.columns)
export_graphviz(tree, out_file="tree.dot", feature_names=feature_names)

import pydotplus
import pyparsing
from io import StringIO

dotfile = StringIO()
export_graphviz(tree, out_file=dotfile, feature_names=feature_names)
graph = pydotplus.graph_from_dot_data(dotfile.getvalue())
graph.write_png("dtree2.png")
def fit(self, X_train, Y_train):
    for i in range(self.n_tree):
        # rswr: random sample with replacement of 500 rows
        Xb_train, Yb_train = rswr(X_train, Y_train, 500)
        tree = DecisionTreeClassifier(max_depth=self.md,
                                      max_features='auto',
                                      min_samples_split=2)
        # Fit on the bootstrap sample so each tree sees different data
        tree.fit(Xb_train, Yb_train)
        self.tree_bags.append(tree)
fpr_master.append(fpr)
tpr_master.append(tpr)
model.append('NB')
aucs.append(roc_auc)

# How about multi-fold cross-validation with 5 folds
cv_results_gnb = cross_val_score(gnb, x_train, y_train, cv=5)
# Cross-validation average accuracy
print('CV Results:', round(cv_results_gnb.mean(), 3))

# -------------------------------------------------------------------
# Create the Decision Tree Model
# -------------------------------------------------------------------
# Fit the decision tree model
tree_clf = tree.DecisionTreeClassifier(random_state=9999,
                                       criterion='entropy',
                                       max_depth=6,
                                       min_samples_leaf=5)
tree_model_fit = tree_clf.fit(x_train, y_train)

# Predicted class on the test data
y_pred = tree_model_fit.predict(x_test)
print('\nDecision Tree Results:')
print('Confusion Matrix:\n', confusion_matrix(y_test, y_pred))
print('Accuracy Score:', round(accuracy_score(y_test, y_pred), 3))

# Calculate the TPR/FPR for the decision tree
fpr, tpr, thresholds = roc_curve(y_test, y_pred)
roc_auc = auc(fpr, tpr)
print("AUC : %f" % roc_auc)

fpr_master.append(fpr)
tpr_master.append(tpr)
model.append('DT')
aucs.append(roc_auc)
def train(self, feature_list, name):
    self.featureList = feature_list
    self.name = name + '-' + self.modelType
    modelFile = Settings.DATA_DIR + self.dataset.name + '_' + self.name
    if 'alpha' not in self.hyperParams:
        # Expecting hyper-params for GBRT; add those hyper-params to the
        # model file name
        modelFile = modelFile + 'ensemble-' + \
            str(self.hyperParams['ensemble']) + \
            '_lr-' + str(self.hyperParams['lr']) + \
            '_subsample-' + str(self.hyperParams['subsample']) + \
            '_leaves-' + str(self.hyperParams['leaves'])
    if self.modelType == 'tree' or self.modelType == 'gbrt':
        modelFile += '.z'
    else:
        modelFile += '.npz'

    self.savedRankingsSize = None
    self.savedRankings = None

    if os.path.exists(modelFile):
        if self.modelType == 'tree' or self.modelType == 'gbrt':
            self.tree = joblib.load(modelFile)
            print("DeterministicPolicy:train [INFO] Using precomputed policy",
                  modelFile, flush=True)
        else:
            with numpy.load(modelFile) as npFile:
                self.policyParams = npFile['policyParams']
            print("DeterministicPolicy:train [INFO] Using precomputed policy",
                  modelFile, flush=True)
            print("DeterministicPolicy:train [INFO] PolicyParams",
                  self.policyParams, flush=True)
    else:
        numQueries = len(self.dataset.features)
        allFeatures = None
        allTargets = None
        print("DeterministicPolicy:train [INFO] Constructing features and targets",
              flush=True)
        if self.dataset.mask is None:
            allFeatures = scipy.sparse.vstack(self.dataset.features,
                                              format='csc')
            allTargets = numpy.hstack(self.dataset.relevances)
        else:
            temporaryFeatures = []
            temporaryTargets = []
            for currentQuery in range(numQueries):
                temporaryFeatures.append(
                    self.dataset.features[currentQuery][
                        self.dataset.mask[currentQuery], :])
                temporaryTargets.append(
                    self.dataset.relevances[currentQuery][
                        self.dataset.mask[currentQuery]])
            allFeatures = scipy.sparse.vstack(temporaryFeatures,
                                              format='csc')
            allTargets = numpy.hstack(temporaryTargets)

        if self.regressGains:
            allTargets = numpy.exp2(allTargets) - 1.0

        allSampleWeights = None
        fitParams = None
        if self.weighted:
            allSampleWeights = numpy.array(self.dataset.docsPerQuery,
                                           dtype=numpy.float64)
            allSampleWeights = numpy.reciprocal(allSampleWeights)
            allSampleWeights = numpy.repeat(allSampleWeights,
                                            self.dataset.docsPerQuery)
            fitParams = {'sample_weight': allSampleWeights}

        # Restrict features to only the unmasked features
        if self.featureList is not None:
            print("DeterministicPolicy:train [INFO] Masking unused features. Remaining feature size",
                  len(feature_list), flush=True)
            allFeatures = allFeatures[:, self.featureList]

        print("DeterministicPolicy:train [INFO] Beginning training",
              self.modelType, flush=True)
        if self.modelType == 'tree':
            treeCV = sklearn.model_selection.GridSearchCV(
                sklearn.tree.DecisionTreeRegressor(criterion="mse",
                                                   splitter="random",
                                                   min_samples_split=4,
                                                   min_samples_leaf=4,
                                                   presort=False),
                param_grid=self.treeDepths,
                scoring=None,
                fit_params=fitParams,
                n_jobs=-2,
                iid=True,
                cv=5,
                refit=True,
                verbose=0,
                pre_dispatch="1*n_jobs",
                error_score='raise',
                return_train_score=False)
            treeCV.fit(allFeatures, allTargets)
            self.tree = treeCV.best_estimator_
            print("DeterministicPolicy:train [INFO] Done. Best depth",
                  treeCV.best_params_['max_depth'], flush=True)
            joblib.dump(self.tree, modelFile, compress=9, protocol=-1)
        elif self.modelType == 'lasso':
            lassoCV = sklearn.model_selection.GridSearchCV(
                sklearn.linear_model.Lasso(fit_intercept=False,
                                           normalize=False,
                                           precompute=False,
                                           copy_X=False,
                                           max_iter=3000,
                                           tol=1e-4,
                                           warm_start=False,
                                           positive=False,
                                           random_state=None,
                                           selection='random'),
                param_grid=self.hyperParams,
                scoring=None,
                fit_params=fitParams,
                n_jobs=-2,
                iid=True,
                cv=5,
                refit=True,
                verbose=0,
                pre_dispatch="1*n_jobs",
                error_score='raise',
                return_train_score=False)
            lassoCV.fit(allFeatures, allTargets)
            self.policyParams = lassoCV.best_estimator_.coef_
            print("DeterministicPolicy:train [INFO] Done. CVAlpha",
                  lassoCV.best_params_['alpha'], flush=True)
            print("DeterministicPolicy:train [INFO] PolicyParams",
                  self.policyParams, flush=True)
            numpy.savez_compressed(modelFile,
                                   policyParams=self.policyParams)
        elif self.modelType == 'ridge':
            ridgeCV = sklearn.model_selection.GridSearchCV(
                sklearn.linear_model.Ridge(fit_intercept=False,
                                           normalize=False,
                                           copy_X=False,
                                           max_iter=3000,
                                           tol=1e-4,
                                           random_state=None),
                param_grid=self.hyperParams,
                n_jobs=-2,
                fit_params=fitParams,
                iid=True,
                cv=3,
                refit=True,
                verbose=0,
                pre_dispatch='1*n_jobs')
            ridgeCV.fit(allFeatures, allTargets)
            self.policyParams = ridgeCV.best_estimator_.coef_
            print("DeterministicPolicy:train [INFO] Done. CVAlpha",
                  ridgeCV.best_params_['alpha'], flush=True)
        elif self.modelType == 'gbrt':
            tree = sklearn.ensemble.GradientBoostingRegressor(
                learning_rate=self.hyperParams['lr'],
                n_estimators=self.hyperParams['ensemble'],
                subsample=self.hyperParams['subsample'],
                max_leaf_nodes=self.hyperParams['leaves'],
                max_features=1.0,
                presort=False)
            tree.fit(allFeatures, allTargets,
                     sample_weight=allSampleWeights)
            self.tree = tree
            print("DeterministicPolicy:train [INFO] Done.", flush=True)
            joblib.dump(self.tree, modelFile, compress=9, protocol=-1)
        else:
            print("DeterministicPolicy:train [ERR] %s not supported." %
                  self.modelType, flush=True)
            sys.exit(0)