def MarginBoostClf(features, labels, max_depth, n_steps, margin):
    """Train a margin-shifted boosted ensemble of decision trees.

    Every round fits a ``DecisionTreeClassifier`` on the current sample
    weights, measures its weighted training error and derives a step size
    from the error log-odds shifted by ``margin``. Misclassified samples are
    then up-weighted and correctly classified ones down-weighted.

    Args:
        features: Training matrix; ``features.shape[0]`` is the sample count,
            everything else is handed to the tree unchanged.
        labels: Training labels, compared element-wise with the predictions.
        max_depth: ``max_depth`` passed to every tree.
        n_steps: Maximum number of boosting rounds.
        margin: Margin term; enters the step size as
            ``log(1 - margin) - log(1 + margin)``.

    Returns:
        A list of ``[classifier, step_size]`` pairs, one per accepted round.
        Boosting stops early when a round's weighted error reaches 0.5
        (that tree is discarded) or drops to 0 (that tree is kept with
        weight ``1.0``).
    """
    sample_size = features.shape[0]
    weights = np.ones(sample_size) / sample_size
    clf_list = []
    for _ in range(n_steps):
        clf = DecisionTreeClassifier(max_depth=max_depth)
        clf = clf.fit(features, labels, sample_weight=weights)
        incorrect = np.asarray(clf.predict(features) != labels)

        # Weighted fraction of misclassified samples.
        estimator_error = np.average(incorrect, weights=weights, axis=0)
        if estimator_error >= 0.5:
            break
        if estimator_error <= 0:
            # A perfect tree makes the log-odds infinite and the normaliser
            # zero, which would turn every weight into NaN. Keep the tree
            # with unit weight and stop, as scikit-learn's AdaBoost does.
            clf_list.append([clf, 1.0])
            break

        step_size = 0.5 * (
            np.log((1 - estimator_error) / estimator_error)
            + np.log(1 - margin) - np.log(1 + margin))
        norm_factor = 2 * pow(estimator_error * (1 - estimator_error), 0.5)

        # exp(+step) for mistakes, exp(-step) for hits, in one vector op.
        weights *= np.where(incorrect, np.exp(step_size),
                            np.exp(-step_size)) / norm_factor
        clf_list.append([clf, step_size])
    return clf_list
class DecisionTreeClassifierImpl:
    """Thin wrapper that stores hyperparameters and builds ``SKLModel`` on fit."""

    def __init__(self, criterion='gini', splitter='best', max_depth=None,
                 min_samples_split=2, min_samples_leaf=1,
                 min_weight_fraction_leaf=0.0, max_features=None,
                 random_state=None, max_leaf_nodes=None,
                 min_impurity_decrease=0.0, min_impurity_split=None,
                 class_weight='balanced', presort=False):
        # Keyword arguments forwarded verbatim to SKLModel in fit().
        self._hyperparams = dict(
            criterion=criterion,
            splitter=splitter,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            min_weight_fraction_leaf=min_weight_fraction_leaf,
            max_features=max_features,
            random_state=random_state,
            max_leaf_nodes=max_leaf_nodes,
            min_impurity_decrease=min_impurity_decrease,
            min_impurity_split=min_impurity_split,
            class_weight=class_weight,
            presort=presort,
        )

    def fit(self, X, y=None):
        """Create a fresh ``SKLModel`` and fit it; ``y`` is passed only if given."""
        self._sklearn_model = SKLModel(**self._hyperparams)
        fit_args = (X,) if y is None else (X, y)
        self._sklearn_model.fit(*fit_args)
        return self

    def predict(self, X):
        """Delegate to the fitted model's ``predict``."""
        return self._sklearn_model.predict(X)

    def predict_proba(self, X):
        """Delegate to the fitted model's ``predict_proba``."""
        return self._sklearn_model.predict_proba(X)
def wrapper_for_decision_tree_in_sklearn(X, y, current_state_to_predict):
    """Fit a default decision tree on ``(X, y)`` and predict one sample.

    ``current_state_to_predict`` is converted to an array and reshaped to a
    single row before prediction; the array returned by ``predict`` is
    passed back unchanged.
    """
    tree_model = DecisionTreeClassifier()
    tree_model.fit(X, y)
    single_row = np.array(current_state_to_predict).reshape(1, -1)
    return tree_model.predict(single_row)
def BoostByMaj(features, labels, max_depth, gamma):
    """Boost-by-majority over depth-limited decision trees.

    The number of rounds ``k`` comes from ``get_k_from_gamma``. Each round
    fits a tree on the current weights (retrying up to 11 times while the
    weighted error is at least 0.5), counts per-sample correct votes and
    recomputes the weights from the binomial boost-by-majority formula.

    Args:
        features: Training matrix; ``features.shape[0]`` is the sample count.
        labels: Training labels, compared element-wise with the predictions.
        max_depth: ``max_depth`` passed to every tree.
        gamma: Edge parameter used for ``k`` and in the weight formula.

    Returns:
        ``(clf_list, weights)`` where ``clf_list`` holds ``[classifier, 1]``
        pairs and ``weights`` are the last computed sample weights. They are
        normalised to unit L1 norm unless boosting stopped because the norm
        was zero or non-finite, in which case they are returned as computed.
    """
    sample_size = features.shape[0]
    weights = np.ones(sample_size) / sample_size
    counts = np.zeros(sample_size)
    k = get_k_from_gamma(gamma, sample_size)
    print('k ', k)
    clf_list = []
    for i in range(k):
        estimator_error = 0.6
        countdown = 10
        # Refit until the tree beats chance or the retry budget runs out.
        while estimator_error >= 0.5 and countdown >= 0:
            clf = DecisionTreeClassifier(max_depth=max_depth)
            clf = clf.fit(features, labels, sample_weight=weights)
            y_predict = clf.predict(features)
            correct_ones = y_predict == labels
            incorrect_ones = y_predict != labels
            estimator_error = np.average(incorrect_ones, weights=weights,
                                         axis=0)
            unweighted_estimator_error = np.average(incorrect_ones, axis=0)
            countdown -= 1

        counts += correct_ones
        coeff_1 = int(np.floor(k / 2)) - counts
        coeff_2 = int(np.ceil(k / 2)) - i - 1 + counts
        weights = comb(k - i - 1, coeff_1) * pow(0.5 + gamma, coeff_1) * pow(
            0.5 - gamma, coeff_2)
        weight_norm = np.linalg.norm(weights, ord=1)
        print('i', i, 'error', estimator_error, 'unweighted_error',
              unweighted_estimator_error, 'wnorm', weight_norm)
        clf_list.append([clf, 1])

        # Once no sample carries usable weight, normalising would produce
        # NaN weights and poison the next fit; stop instead.
        if not (np.isfinite(weight_norm) and weight_norm > 0):
            break
        weights = weights / weight_norm
    return clf_list, weights
def dtree(X, y, model_path):
    """Fit a default decision tree, print training-set metrics and save it.

    The classification report and confusion matrix are computed on the same
    data the tree was fitted on; the fitted model is dumped to ``model_path``
    with ``joblib``.
    """
    classifier = DecisionTreeClassifier()
    classifier.fit(X, y)
    fitted_labels = classifier.predict(X)
    for build_report in (metrics.classification_report,
                         metrics.confusion_matrix):
        print(build_report(y, fitted_labels))
    joblib.dump(classifier, model_path)
def DeepBBM2(features, labels, max_depth, gamma, max_depth_range):
    """Boost-by-majority with a per-round, complexity-penalised depth search.

    Each of the ``k`` rounds (``k`` from ``get_k_from_gamma``) fits one tree
    per candidate depth and keeps the one minimising
    ``weighted_error + PARAM_lambda_2 * rademacher(depth)``. Sample weights
    are then recomputed from the boost-by-majority binomial formula.

    Args:
        features: Training matrix; shape gives sample and feature counts.
        labels: Training labels.
        max_depth: Unused; kept so existing callers keep working.
        gamma: Edge parameter used for ``k`` and in the weight formula.
        max_depth_range: Iterable of candidate tree depths.

    Returns:
        ``(clf_list, weights)``: ``[classifier, 1]`` pairs and the last
        computed sample weights.

    Note:
        ``PARAM_lambda_2`` is read as a module-level name; it is not a
        parameter of this function.
    """
    num_features = features.shape[1]
    sample_size = features.shape[0]
    weights = np.ones(sample_size) / sample_size
    counts = np.zeros(sample_size)
    k = get_k_from_gamma(gamma, sample_size)
    normalizer = np.exp(1) * sample_size
    print('k ', k)
    clf_list = []
    # One complexity term per candidate depth, aligned by position so the
    # lookup stays correct for depth ranges that do not start at 1.
    rademacher_list = [
        calc_rademacher(depth, sample_size, num_features, normalizer)
        for depth in max_depth_range
    ]
    for t in range(k):
        best_loss = 10000
        best_error = 1
        best_clf = DecisionTreeClassifier(max_depth=0)
        for depth, complexity in zip(max_depth_range, rademacher_list):
            # The earlier per-depth DeepBoost(...) call was dropped: its
            # result was never used.
            new_clf = DecisionTreeClassifier(max_depth=depth)
            new_clf = new_clf.fit(features, labels, sample_weight=weights)
            new_error = eval_clf(new_clf, features, labels, weights)
            new_loss = new_error + PARAM_lambda_2 * complexity
            print('depth', depth, 'new_error', new_error, 'new_grad',
                  new_loss)
            if new_loss < best_loss:
                best_clf = new_clf
                best_loss = new_loss
                best_error = new_error

        correct_ones = best_clf.predict(features) == labels
        counts += correct_ones
        coeff_1 = int(np.floor(k / 2)) - counts
        coeff_2 = int(np.ceil(k / 2)) - t - 1 + counts
        weights = comb(k - t - 1, coeff_1) * pow(0.5 + gamma, coeff_1) * pow(
            0.5 - gamma, coeff_2)
        weight_norm = np.linalg.norm(weights, ord=1)
        print('i', t, 'error', best_error, 'wnorm', weight_norm)
        clf_list.append([best_clf, 1])

        # Normalising by a zero or non-finite norm would hand NaN weights
        # to the next round; stop boosting instead.
        if not (np.isfinite(weight_norm) and weight_norm > 0):
            break
        weights = weights / weight_norm
    return clf_list, weights
def wrapper_for_decision_tree_accuracy(X, y, relative_test_size):
    """Return the hold-out accuracy of a default decision tree.

    The data is split with ``train_test_split`` (``random_state=42``), using
    ``relative_test_size`` as the test fraction; the tree is fitted on the
    training part and scored on the test part.
    """
    split = train_test_split(X, y, test_size=relative_test_size,
                             random_state=42)
    fit_X, holdout_X, fit_y, holdout_y = split
    model = DecisionTreeClassifier()
    model.fit(fit_X, fit_y)
    return accuracy_score(model.predict(holdout_X), holdout_y)
def wrapper_for_decision_tree_accuracy(X, y, relative_test_size):
    """Split the data, fit a default decision tree and return test accuracy.

    ``relative_test_size`` is forwarded as ``test_size``; the split is made
    reproducible with ``random_state=42``.
    """
    parts = train_test_split(
        X, y, test_size=relative_test_size, random_state=42)
    train_features, test_features, train_targets, test_targets = parts

    tree_clf = DecisionTreeClassifier()
    tree_clf.fit(train_features, train_targets)
    test_predictions = tree_clf.predict(test_features)
    return accuracy_score(test_predictions, test_targets)
def DeepBBM(features, labels, gamma, max_depth_range, PARAM_lambda_2):
    """Boost-by-majority with a per-round, complexity-penalised depth search.

    Each of the ``k`` rounds (``k`` from ``get_k_from_gamma``) fits one tree
    per candidate depth and keeps the one minimising
    ``weighted_error + PARAM_lambda_2 * rademacher(depth)``. Sample weights
    are then recomputed from the boost-by-majority binomial formula.

    Args:
        features: Training matrix; shape gives sample and feature counts.
        labels: Training labels.
        gamma: Edge parameter used for ``k`` and in the weight formula.
        max_depth_range: Candidate tree depths.
        PARAM_lambda_2: Weight of the complexity penalty.

    Returns:
        ``(clf_list, weights)``: ``[classifier, 1, depth]`` triples and the
        last computed sample weights. The weights are L1-normalised unless
        boosting stopped early, in which case they are returned as computed.
    """
    num_features = features.shape[1]
    sample_size = features.shape[0]
    weights = np.ones(sample_size) / sample_size
    counts = np.zeros(sample_size)
    k = get_k_from_gamma(gamma, sample_size)
    normalizer = np.exp(1) * sample_size
    clf_list = []
    # Complexity term per candidate depth, aligned with max_depth_range.
    rademacher_list = [
        calc_rademacher(depth, sample_size, num_features, normalizer)
        for depth in max_depth_range
    ]
    for t in range(k):
        best_loss = 10000
        best_depth = -1
        best_clf = DecisionTreeClassifier(max_depth=0)
        for depth, complexity in zip(max_depth_range, rademacher_list):
            new_clf = DecisionTreeClassifier(max_depth=depth)
            new_clf = new_clf.fit(features, labels, sample_weight=weights)
            new_error = eval_clf(new_clf, features, labels, weights)
            new_loss = new_error + PARAM_lambda_2 * complexity
            if new_loss < best_loss:
                best_clf = new_clf
                best_loss = new_loss
                best_depth = depth

        correct_ones = best_clf.predict(features) == labels
        counts += correct_ones
        coeff_1 = int(np.floor(k / 2)) - counts
        coeff_2 = int(np.ceil(k / 2)) - t - 1 + counts
        weights = comb(k - t - 1, coeff_1) * pow(0.5 + gamma, coeff_1) * pow(
            0.5 - gamma, coeff_2)
        clf_list.append([best_clf, 1, best_depth])

        if np.max(coeff_1) < 0:
            break
        # The check above does not catch every degenerate case, so also
        # refuse to divide by a zero or non-finite norm: NaN weights would
        # break the next round.
        weight_norm = np.linalg.norm(weights, ord=1)
        if not (np.isfinite(weight_norm) and weight_norm > 0):
            break
        weights = weights / weight_norm
    return clf_list, weights
def create_decision_tree(self):
    '''
    Fit the decision tree configuration that the experiments found best,
    predict the test set, print the statistics and export the labelled
    test frame.
    '''
    best_params = {
        'max_depth': 65,
        'min_samples_split': 0.03,
        'min_samples_leaf': 3,
        'max_features': 8,
    }
    tree = DecisionTreeClassifier(**best_params)
    tree.fit(self.X_train, self.Y_train)

    predicted_y = tree.predict(self.X_test)
    print(predicted_y)
    self.print_stats(predicted_y, "")

    # Attach the predictions to the test frame and write it out.
    self.test_df['learning_label'] = predicted_y
    self.test_df.to_csv('output/feature_extraction.csv', encoding="latin-1")
def use():
    """Smoke-test the train/save/load helpers on the iris dataset.

    Trains a default decision tree with ``train``, writes it to
    ``model.pkl`` with ``save``, reloads it with ``load`` and prints the
    reloaded model's predictions for the iris samples.
    """
    # Import from the public package: ``sklearn.tree.tree`` was a private
    # module and no longer exists in newer scikit-learn releases.
    from sklearn.tree import DecisionTreeClassifier
    import sklearn.datasets

    path = 'model.pkl'
    iris = sklearn.datasets.load_iris()
    model = DecisionTreeClassifier()
    train(model, iris.data, iris.target)
    save(model, path)
    model = load(path)
    print(model.predict(iris.data))
def BrownBoost(features, labels, max_depth, total_time, max_iter=200):
    """Run BrownBoost with depth-limited decision trees as weak learners.

    While remaining time ``s`` is positive, samples are weighted by
    ``exp(-(r + s)**2 / total_time)``, a tree is fitted, and ``SolveODE``
    supplies the elapsed time ``t`` and the tree weight ``alpha``.

    Args:
        features: Training matrix; ``features.shape[0]`` is the sample count.
        labels: Training labels, compared element-wise with the predictions.
        max_depth: ``max_depth`` passed to every tree.
        total_time: Initial time budget; also the weight-function scale.
        max_iter: Upper bound on the number of rounds (previously a
            hard-coded 200).

    Returns:
        A list of ``[classifier, alpha]`` pairs. Boosting stops early when
        time runs out, a tree's weighted error reaches 0.5, or the sample
        weights can no longer be normalised.
    """
    sample_size = features.shape[0]
    clf_list = []
    r = np.zeros(sample_size)
    s = total_time  # remaining time, starts at total_time
    T = total_time
    for _ in range(max_iter):
        if not s > 0:
            break
        weights = np.exp(-(r + s)**2 / total_time)
        total_weight = np.sum(weights)
        # If every weight underflowed, normalising would yield NaN.
        if not (np.isfinite(total_weight) and total_weight > 0):
            break
        weights = weights / total_weight

        clf = DecisionTreeClassifier(max_depth=max_depth)
        clf = clf.fit(features, labels, sample_weight=weights)
        y_predict = clf.predict(features)
        incorrect = y_predict != labels

        # Weighted fraction of misclassified samples.
        estimator_error = np.average(incorrect, weights=weights, axis=0)
        print('estimator_error is', estimator_error)
        if estimator_error >= 0.5:
            break

        # +1 where the tree is right, -1 where it is wrong.
        b = np.where(np.asarray(y_predict == labels), 1.0, -1.0)
        a = r + s
        (t, alpha) = SolveODE(a, b, s, sample_size, T)
        r += alpha * b
        s = s - t
        print(s)
        clf_list.append([clf, alpha])
    return clf_list
# Binary target for the cattle model: 0 for "CATTLE", 1 for anything else.
target_c = [0 if species == "CATTLE" else 1 for species in target]

# One 70/30 split per binary target, all with the same seed.
X_train_deer, X_test_deer, y_train_deer, y_test_deer = train_test_split(
    train, target_a, random_state=0, test_size=0.3)
X_train_elk, X_test_elk, y_train_elk, y_test_elk = train_test_split(
    train, target_b, random_state=0, test_size=0.3)
X_train_cattle, X_test_cattle, y_train_cattle, y_test_cattle = train_test_split(
    train, target_c, random_state=0, test_size=0.3)

print("-----Question 1-----")
##Question 1##
###Decision Tree
print("-----DECISION TREE-----")
print("DEER confusion Matrix and accuracy score")
clf = DecisionTreeClassifier()
clf.fit(X_train_deer, y_train_deer)
y_pred = clf.predict(X_test_deer)  ##predict my y's based on x's
print(confusion_matrix(y_test_deer, y_pred))
print("Testing Score")
print(accuracy_score(y_test_deer, y_pred))
# The training score must be computed from training-set predictions;
# scoring y_train_deer against the test predictions would be wrong.
y_pred = clf.predict(X_train_deer)
print("Training Score")
print(accuracy_score(y_train_deer, y_pred))

print("ELK confusion matrix and accuracy score")
clf = DecisionTreeClassifier()
clf.fit(X_train_elk, y_train_elk)
y_pred = clf.predict(X_test_elk)
print(confusion_matrix(y_test_elk, y_pred))
print("Testing Score")
print(accuracy_score(y_test_elk, y_pred))
y_pred = clf.predict(X_train_elk)
def main():
    """Compare a decision tree and a 1-D CNN on sequence data from ``data``.

    Loads samples, cuts them into sequences of length 100, one-hot encodes
    the labels, then trains and reports on (1) a decision tree over the
    flattened sequences and (2) a small Conv1D network. Waits for Enter at
    the end.
    """
    print("Loading samples and labels")
    samples, labels, _ = load_files("data")
    print("Loaded {} samples".format(samples.shape[0]))

    sequence_dim = 100
    print("Converting to sequences of length {}".format(sequence_dim))
    samples, labels = make_sequences(samples, labels, sequence_dim)
    print("Number of samples from sequences: {}".format(samples.shape[0]))

    lb = LabelBinarizer()
    labels = lb.fit_transform(labels)

    # ---- Decision tree on flattened samples ----
    flatSamples = samples.reshape(samples.shape[0], -1)
    trainSamples, testSamples, trainLabels, testLabels = train_test_split(
        flatSamples, labels, test_size=0.25, random_state=42)

    print("=" * 20)
    print("Building DecisionTree model")
    tree_model = DecisionTreeClassifier()
    tree_model.fit(trainSamples, trainLabels)
    treeResults = tree_model.predict(testSamples)

    # Both labels and predictions are one-hot; compare class indices.
    true_classes = testLabels.argmax(axis=1)
    tree_classes = treeResults.argmax(axis=1)
    print(confusion_matrix(true_classes, tree_classes))
    print(classification_report(true_classes, tree_classes))
    treeAcc = accuracy_score(true_classes, tree_classes)
    print("Accuracy Tree: {:.2f}".format(treeAcc))
    tree_kappa = cohen_kappa_score(true_classes, tree_classes)
    print("Cohen's Kappa {:.2f}".format(tree_kappa))

    # ---- CNN on the unflattened sequences ----
    print("=" * 20)
    print("Building CNN model")
    trainSamples, testSamples, trainLabels, testLabels = train_test_split(
        samples, labels, test_size=0.25, random_state=42)
    inputShape = (samples.shape[1], samples.shape[2])

    layers = [
        Conv1D(32, 10, padding="same", input_shape=inputShape),
        Activation("relu"),
        BatchNormalization(),
        Dropout(0.2),
        Conv1D(64, 10, padding="same"),
        Activation("relu"),
        BatchNormalization(),
        Dropout(0.2),
        Conv1D(128, 10, padding="same"),
        Activation("relu"),
        Dropout(0.2),
        Flatten(input_shape=inputShape),
        Dense(128, activation='sigmoid'),
        Dense(64, activation='sigmoid'),
        Dense(labels.shape[1], activation='softmax'),
    ]
    network = Sequential()
    for layer in layers:
        network.add(layer)
    network.compile(loss='categorical_crossentropy',
                    optimizer="adam",
                    metrics=['accuracy'])

    EPOCHS = 10
    BATCH = 128
    network.fit(trainSamples,
                trainLabels,
                batch_size=BATCH,
                epochs=EPOCHS,
                validation_data=(testSamples, testLabels))
    cnnResults = network.predict(testSamples)

    true_classes = testLabels.argmax(axis=1)
    cnn_classes = cnnResults.argmax(axis=1)
    print(confusion_matrix(true_classes, cnn_classes))
    print(classification_report(true_classes, cnn_classes,
                                target_names=lb.classes_))
    cnn_accuracy = accuracy_score(true_classes, cnn_classes)
    print("CNN Accuracy: {:.2f}".format(cnn_accuracy))
    cnn_kappa = cohen_kappa_score(true_classes, cnn_classes)
    print("Cohen's Kappa {:.2f}".format(cnn_kappa))
    input("")
print "Loaded data; testing classifier..." features_train, labels_train = ClassBalancingClassifierWrapper.rebalance( features_train, labels_train, ratio=2) results = [] for i in range(15): print 'Round', i classifier = DecisionTreeClassifier() classifier = SKLPipeline([('feature_selection', SelectPercentile(f_classif, 1)), ('classification', classifier)]) classifier.fit(features_train, labels_train) labels_test_predicted = classifier.predict(features_test) results.append(diff_binary_vectors(labels_test_predicted, labels_test_gold)) # support = classifier.steps[0][1].get_support(True) # print 'Selected', len(support), 'features:' # for index in support: # print ' ', feature_names[index] print 'Results:' print ClassificationMetrics.average(results, False) # Visualize last round ''' fig = plt.figure() fig.canvas.set_window_title('All training features')
class Model(object):
    """
    The machine learning component of the tester. This component stores four
    different models:

    1) A model to decide between different types of events (drags and
       touches).
    2) A model to decide on the starting position for drags.
    3) A model to decide on the ending position for drags.
    4) A model to decide on the position of the touch.

    The input data are all the different known UI elements on the screen from
    the training data and whether or not they are visible on the screen. To
    acquire this, we first get the stored XML model and record the
    resource-id and class. We concatenate them into an array and mark as (1)
    for visible and (0) for not visible.

    Screen positions are stored as a single number ``y * width + x`` so that
    they can be recovered with ``divmod(value, width)``.
    """

    def __init__(self):
        self.symbols = {}

        self.action_data = None
        self.action_labels = None
        self.action_classifier = None

        self.drag_data = None
        self.drag_end_labels = None
        self.drag_end_classifier = None
        self.drag_start_labels = None
        self.drag_start_classifier = None

        self.touch_data = None
        self.touch_labels = None
        self.touch_classifier = None

        # Read by parse_events()/predict() for "displayWidth".
        self.device_info = device.info

    @staticmethod
    def _encode_position(point, width):
        """Fold an (x, y) point into one row-major index."""
        return point.y * width + point.x

    @staticmethod
    def _decode_position(value, width):
        """Inverse of ``_encode_position``; returns ``(x, y)``."""
        y, x = divmod(int(round(value)), width)
        return x, y

    @staticmethod
    def _as_matrix(rows, n_columns):
        """Stack variable-length rows into a zero-padded 2-D array."""
        matrix = np.zeros((len(rows), n_columns))
        for i, item in enumerate(rows):
            matrix[i, :len(item)] = item[:]
        return matrix

    def parse_events(self, queue):
        """Drain ``queue`` and build the training matrices and label arrays.

        Every event's start state is converted with ``as_list(symbols)``;
        its first slot is overwritten with a random number. A feature row
        is recorded together with each label so rows and labels stay
        aligned even for events that are none of drag/touch/back.
        """
        symbols = {"randomizer": 0}
        width = self.device_info["displayWidth"]
        all_data = []
        all_results = []
        drag_data = []
        drag_start_results = []
        drag_end_results = []
        touch_data = []
        touch_results = []

        while not queue.empty():
            event = queue.get()
            lst = event.state.start.as_list(symbols)
            lst[0] = random()

            if event.action.is_drag():
                all_data.append(lst)
                all_results.append(DRAG)
                drag_data.append(lst)
                start = event.changes.start()
                end = event.changes.end()
                drag_start_results.append(self._encode_position(start, width))
                drag_end_results.append(self._encode_position(end, width))

            if event.action.is_touch():
                all_data.append(lst)
                all_results.append(TOUCH)
                touch_data.append(lst)
                start = event.changes.start()
                touch_results.append(self._encode_position(start, width))

            if event.action.is_back():
                all_data.append(lst)
                all_results.append(BACK)

        n_columns = len(symbols)
        self.symbols = symbols
        self.action_data = self._as_matrix(all_data, n_columns)
        self.action_labels = np.array(all_results)
        self.drag_data = self._as_matrix(drag_data, n_columns)
        self.drag_start_labels = np.array(drag_start_results)
        self.drag_end_labels = np.array(drag_end_results)
        self.touch_data = self._as_matrix(touch_data, n_columns)
        self.touch_labels = np.array(touch_results)

    def train(self):
        """Fit the action classifier and the position regressors.

        A position regressor is only fitted when events of its kind were
        recorded; fitting on zero samples would raise.
        """
        self.action_classifier = DecisionTreeClassifier()
        self.action_classifier.fit(self.action_data, self.action_labels)

        if len(self.drag_data):
            self.drag_start_classifier = DecisionTreeRegressor()
            self.drag_start_classifier.fit(self.drag_data,
                                           self.drag_start_labels)
            self.drag_end_classifier = DecisionTreeRegressor()
            self.drag_end_classifier.fit(self.drag_data, self.drag_end_labels)

        if len(self.touch_data):
            self.touch_classifier = DecisionTreeRegressor()
            self.touch_classifier.fit(self.touch_data, self.touch_labels)

    def predict(self, state):
        """Predict the next ``Action`` for ``state``.

        The state is converted with ``as_list(self.symbols, False)``, its
        first slot is randomised, and the action classifier picks the event
        type; the matching regressor(s) supply the encoded position(s).
        """
        features = state.as_list(self.symbols, False)
        features[0] = random()
        # scikit-learn estimators expect a 2-D (n_samples, n_features) input.
        sample = np.asarray(features).reshape(1, -1)

        action = Action()
        action_type = self.action_classifier.predict(sample)[0]
        width = self.device_info["displayWidth"]

        if action_type == DRAG:
            start_code = self.drag_start_classifier.predict(sample)[0]
            end_code = self.drag_end_classifier.predict(sample)[0]
            start = Point(*self._decode_position(start_code, width))
            end = Point(*self._decode_position(end_code, width))
            action.init(ACTION_DRAG, start, end, 0.5)
        elif action_type == TOUCH:
            touch_code = self.touch_classifier.predict(sample)[0]
            point = Point(*self._decode_position(touch_code, width))
            action.init(ACTION_TOUCH, point.x, point.y)
        elif action_type == BACK:
            action.init(ACTION_BACK)
        return action

    def save(self):
        """Placeholder; persistence is not implemented."""
        pass
Created on 2019年1月4日 决策树 ''' import numpy as np from sklearn.model_selection._split import train_test_split from sklearn.metrics.classification import classification_report from sklearn.tree.tree import DecisionTreeClassifier def iris_type(s): it = {'Iris-setosa': 0, 'Iris-versicolor': 1, 'Iris-virginica': 2} return it[str(s, encoding="utf8")] path = 'demo1_Iris.txt' # data = np.loadtxt(path, dtype=float, delimiter=',', converters={4: iris_type}) x, y = np.split(data, (4, ), axis=1) x = x[:, :4] x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1, train_size=0.6) clf = DecisionTreeClassifier(criterion='entropy', random_state=0) clf.fit(x, y.ravel()) print('feature_importances_', clf.feature_importances_) y_pred = clf.predict(x_test) print(classification_report(y_test, y_pred))
def train(dataset):
    """Train every boosting variant on ``dataset`` and print their errors.

    ``'spambase'`` is loaded with ``fetch_data_from_raw``; any other name
    goes through ``fetch_npy_data``. The models are trained in the order
    DeepBoost, Deep BBM, plain decision tree, Boost-by-Majority, AdaBoost,
    MarginBoost, BrownBoost; train and test figures are printed at the end
    in that same order.
    """
    if dataset == 'spambase':
        loaded = fetch_data_from_raw('spambase')
    else:
        loaded = fetch_npy_data(dataset)
    features, labels, testing_features, true_labels = loaded

    PARAM_lambda = 0.001
    PARAM_lambda_2 = 0.01
    PARAM_beta = 0.001
    gamma = 0.06
    tree_depth = 15
    depth_range = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]

    def ensemble_errors(clf_list):
        # (train error, test error) of an ensemble, evaluated in that order.
        on_train = testEnsemble(clf_list, features, labels)
        on_test = testEnsemble(clf_list, testing_features, true_labels)
        return on_train, on_test

    # (train label, train value, test label, test value), in print order.
    summary = []

    ## DeepBoost
    T = 200
    clf_list_db, _ = DeepBoost(features, labels, T, depth_range,
                               PARAM_lambda, PARAM_beta)
    db_train, db_test = ensemble_errors(clf_list_db)
    summary.append(('DeepBoost: train_error', db_train,
                    'DeepBoost: test_error', db_test))
    print('db done')

    ## Deep BBM
    # Grids for the Deep BBM parameter search. The search itself (a triple
    # loop over these lists that stored the errors with np.save) is
    # currently disabled; only the single run below is executed.
    gamma_list = [0.15, 0.1, 0.08, 0.06]
    lambda_2_list = [1, 0.1, 0.01, 0.001, 0.0001]
    max_depth_list = [1, 2, 3, 5, 10, 15]

    clf_list_dbbm, _ = DeepBBM(features, labels, gamma, depth_range,
                               PARAM_lambda_2)
    dbbm_train, dbbm_test = ensemble_errors(clf_list_dbbm)
    summary.append(('DeepBBM: train_error', dbbm_train,
                    'DeepBBM: test_error', dbbm_test))
    print('dbbm done')

    ## DecisionTreeClassifier
    dtc = DecisionTreeClassifier(max_depth=tree_depth)
    dtc = dtc.fit(features, labels)
    train_pred = dtc.predict(features)
    dtc_train_mse = ((train_pred - labels)**2).mean(axis=0)
    test_pred = dtc.predict(testing_features)
    dtc_test_mse = ((test_pred - true_labels)**2).mean(axis=0)
    summary.append(('decision tree: train_mse', dtc_train_mse,
                    'decision tree: test_mse', dtc_test_mse))

    ## Boost by Majority
    clf_list_bbm, _ = BoostByMaj(features, labels, tree_depth, gamma)
    bbm_train, bbm_test = ensemble_errors(clf_list_bbm)
    summary.append(('BBM: train_error', bbm_train,
                    'BBM: test_error', bbm_test))
    print('bbm done')

    ## AdaBoost
    clf_list_adb = AdaBoostClf(features, labels, tree_depth, T)
    adb_train, adb_test = ensemble_errors(clf_list_adb)
    summary.append(('AdaBoost: train_error', adb_train,
                    'AdaBoost: test_error', adb_test))
    print('adb done')

    ## MarginBoost (from our homework)
    margin = pow(2, -6)
    clf_list_mb = MarginBoostClf(features, labels, tree_depth, T, margin)
    mb_train, mb_test = ensemble_errors(clf_list_mb)
    summary.append(('MarginBoost: train_error', mb_train,
                    'MarginBoost: test_error', mb_test))
    print('mb done')

    ## BrownBoost
    total_time = 100
    clf_list_brown = BrownBoost(features, labels, tree_depth, total_time)
    brown_train, brown_test = ensemble_errors(clf_list_brown)
    summary.append(('BrownBoost: train_error', brown_train,
                    'BrownBoost: test_error', brown_test))
    print('bb done')

    for train_tag, train_value, test_tag, test_value in summary:
        print(train_tag, train_value)
        print(test_tag, test_value)
### a classic way to overfit is to use a small number
### of data points and a large number of features;
### train on only 150 events to put ourselves in this regime
features_train = features_train[:150].toarray()
labels_train = labels_train[:150]

### your code goes here
# Public import paths: ``sklearn.tree.tree`` was a private module and is
# gone from newer scikit-learn releases.
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

# Newer scikit-learn exposes get_feature_names_out(); fall back to the
# older get_feature_names() when it is not available.
if hasattr(vectorizer, "get_feature_names_out"):
    vocab_list = vectorizer.get_feature_names_out()
else:
    vocab_list = vectorizer.get_feature_names()

dtc = DecisionTreeClassifier()
dtc.fit(features_train, labels_train)
pred = dtc.predict(features_test)

accuracy = accuracy_score(labels_test, pred)
print(accuracy)

# Report every feature whose importance exceeds 0.2, with its word.
feature_importances = dtc.feature_importances_
for i, importance in enumerate(feature_importances):
    if importance > 0.2:
        print("Importance = ", importance, " number is ", i, " word is ",
              vocab_list[i])
# Random forest
print('Run random forest....')
rr_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    random_state=1,
)
rr_model.fit(rel_train_X.relation_matrix, rel_train_Y.values)

rf_pred_train = rr_model.predict(rel_train_X.relation_matrix)
rf_train_score = evaluateByF1(rf_pred_train, rel_train_Y.values)
train_result.append(('RF', rf_train_score))

rf_pred_test = rr_model.predict(test_X)
rf_test_score = evaluateByF1(rf_pred_test, test_Y)
test_result.append(('RF', rf_test_score))

# Single decision tree, same depth limit and seed as the forest.
print('Run decision tree....')
id3_model = DecisionTreeClassifier(
    max_depth=10,
    random_state=1,
)
id3_model.fit(rel_train_X.relation_matrix, rel_train_Y.values)

id3_pred_train = id3_model.predict(rel_train_X.relation_matrix)
id3_train_score = evaluateByF1(id3_pred_train, rel_train_Y.values)
train_result.append(('ID3', id3_train_score))

id3_pred_test = id3_model.predict(test_X)
id3_test_score = evaluateByF1(id3_pred_test, test_Y)
test_result.append(('ID3', id3_test_score))

print('Performance of CBA and CMAR with different measures:')
for result_list in (train_result, test_result):
    printList(result_list)
# TODO: verify this age prediction; it seems to work OK
# data['AgePredicted'] = np.where(pd.isnull(data.Age), regresor.predict(data[['Title', 'SibSp', 'Parch']]), None)
age_missing = pd.isnull(data.Age)
predicted_age = regresor.predict(data[['Title', 'SibSp', 'Parch']])
data['Age'] = np.where(age_missing, predicted_age, data['Age'])

## floor prediction
classifier = DecisionTreeClassifier(max_depth=3, min_samples_leaf=2)
# Train on the rows whose floor is known.
X_train_floor = data[pd.notnull(data.Floor)][['Embarked', 'Pclass']]
y_train_floor = data[pd.notnull(data.Floor)]['Floor'].values.astype('int')
classifier.fit(X_train_floor, y_train_floor)
floor_missing = pd.isnull(data.Floor)
predicted_floor = classifier.predict(data[['Embarked', 'Pclass']])
data['Floor'] = np.where(floor_missing, predicted_floor, data['Floor'])

## adjust the ticket price: split the fare across passengers sharing a ticket
data['TicketCounts'] = data.groupby('Ticket')['Ticket'].transform('count')
data['Fare'] = data['Fare'] / data['TicketCounts']

## drop unused columns
unused_columns = ['Ticket', 'Cabin', 'Name', 'SibSp', 'Parch', 'TicketCounts']
data = data.drop(unused_columns, axis=1)

## set the index on the PassengerId column
data = data.set_index('PassengerId')

## split into training and test sets
print("rmse: " + str(rmse))
rmse = rmseEval(data["tw"]['target'], combinedPrediction2)[1]
print("rmse: " + str(rmse))

print("identification test:")
# Every column except bookkeeping/target fields is used as a feature.
non_feature_columns = ('target', 'prediction', 'timestamp', 'location')
identificationColumns = []
for c in columns["all"]:
    if c in non_feature_columns:
        continue
    identificationColumns.append(c)

clf = DecisionTreeClassifier()
traintestX = generateTrainingData(data["all"], identificationColumns)
clf = clf.fit(traintestX, label)
# Predictions are made on the same data the tree was fitted on.
prediction = clf.predict(traintestX)

a = accuracy_score(label, prediction)
print(a)
a = accuracy_score(label, prediction, normalize=False)
print(a)
a = confusion_matrix(label, prediction)
print(a)

# with open(OUTPUT_DIRECTORY + "dt.dot", 'w') as f:
#     f = tree.export_graphviz(clf, out_file=f, feature_names=identificationColumns)#, max_depth=10)
def wrapper_for_decision_tree_in_sklearn(X, y, current_state_to_predict):
    """Train a default decision tree on ``(X, y)`` and classify one state.

    The state is turned into an array with a single row before it is passed
    to ``predict``; the returned value is ``predict``'s array, unchanged.
    """
    classifier = DecisionTreeClassifier()
    classifier.fit(X, y)

    state_row = np.array(current_state_to_predict)
    state_row = state_row.reshape(1, -1)
    return classifier.predict(state_row)
labelYStat[ly] = labelYStat[ly] + 1 for i in range(0, 16): print("\tindex " + str(i) + ": " + str(labelYStat[i])) model_to_show = DecisionTreeClassifier(random_state=42, max_depth=5) model = DecisionTreeClassifier(random_state=42, max_depth=30) # model = RandomForestClassifier(n_estimators=25, random_state=42) model.fit(X, labelY) model_to_show.fit(X, labelY) tree.export_graphviz(model_to_show, out_file=OUTPUT_TREE_FILE, feature_names=columns, label='none') predY = model.predict(X) print("Pred Y stat:") predYStat = defaultdict(lambda: 0) for py in predY: predYStat[py] = predYStat[py] + 1 for i in range(0, 16): print("\tindex " + str(i) + ": " + str(predYStat[i])) prediction = [] for i in range(0, len(Y)): p = data["pred_" + top16tags[predY[i]]][i] prediction.append(p) rmse = rmseEval(Y, data["pred_TW"])[1] print("TW Rmse: " + str(rmse))