def calculate(A, X, Y):
    # Symmetrize the adjacency matrix, then apply D^{-1/2} A D^{-1/2} normalization.
    A = sp.coo_matrix(A)
    A = A + A.T.multiply(A.T > A) - A.multiply(A.T > A)
    rowsum = np.array(A.sum(1)).clip(min=1)
    r_inv_sqrt = np.power(rowsum, -0.5).flatten()
    r_mat_inv_sqrt = sp.diags(r_inv_sqrt)
    A = A.dot(r_mat_inv_sqrt).transpose().dot(r_mat_inv_sqrt)
    # Low-pass and high-pass graph filters.
    low = 0.5 * sp.eye(A.shape[0]) + A
    high = 0.5 * sp.eye(A.shape[0]) - A
    # toarray() (not todense()) so downstream sklearn calls get an ndarray, not np.matrix.
    low = low.toarray()
    high = high.toarray()
    low_signal = np.dot(np.dot(low, low), X)
    high_signal = np.dot(np.dot(high, high), X)
    # Train one MLP per filtered signal on the first 100 nodes, test on the rest.
    low_MLP = MLPClassifier(hidden_layer_sizes=(16,), activation='relu', max_iter=2000)
    low_MLP.fit(low_signal[:100, :], Y[:100])
    low_pred = low_MLP.predict(low_signal[100:, :])
    high_MLP = MLPClassifier(hidden_layer_sizes=(16,), activation='relu', max_iter=2000)
    high_MLP.fit(high_signal[:100, :], Y[:100])
    high_pred = high_MLP.predict(high_signal[100:, :])
    return acc(Y[100:], low_pred), acc(Y[100:], high_pred)
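# A minimal usage sketch for calculate() on hypothetical data, assuming the
# imports the function relies on (scipy.sparse as sp, numpy as np, sklearn's
# MLPClassifier, and accuracy_score aliased as acc) are in scope:
import numpy as np
import scipy.sparse as sp

rng = np.random.default_rng(0)
A_demo = sp.random(200, 200, density=0.05, random_state=0)  # random sparse adjacency
X_demo = rng.normal(size=(200, 8))                          # node features
Y_demo = rng.integers(0, 2, size=200)                       # binary node labels
low_acc, high_acc = calculate(A_demo, X_demo, Y_demo)       # first 100 nodes train, rest test
print(low_acc, high_acc)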
def on_epoch_end(self, epoch, logs=None):
    x_1 = self.validation_data[0]
    x_2 = self.validation_data[1]
    y_test = self.validation_data[2]
    print('Dims validation data: %s %s %s' % (x_1.shape, x_2.shape, y_test.shape))
    # Predict outputs for the validation data.
    y_pred = self.model.predict([x_1, x_2])
    # Reduce one-hot targets and predicted probabilities to class indices.
    y_test = np.argmax(y_test, axis=-1)
    y_pred = np.argmax(y_pred, axis=-1)
    self.acc_val.append(acc(y_test, y_pred))
    print('Acc: ', acc(y_test, y_pred))
def decision_tree_accuracy(X, y, random, depth, test, crit):
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test, random_state=random)
    # A DecisionTreeClassifier, so name it accordingly (it was called "regressor").
    classifier = tree.DecisionTreeClassifier(
        criterion=crit, max_depth=depth, random_state=random)
    classifier.fit(X_train, y_train)
    ytr_pred = classifier.predict(X_train)
    yts_pred = classifier.predict(X_test)
    acc_ytr = acc(y_train, ytr_pred)
    acc_yts = acc(y_test, yts_pred)
    print(f'Accuracy test = {acc_yts}')
    print(f'Accuracy train = {acc_ytr}')
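# Usage sketch for decision_tree_accuracy() on a stock sklearn dataset
# (hypothetical choice; any feature matrix and label vector work):
from sklearn.datasets import load_wine

X_demo, y_demo = load_wine(return_X_y=True)
decision_tree_accuracy(X_demo, y_demo, random=42, depth=3, test=0.25, crit='gini')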
def main():
    x_train, y_train, x_test, y_test = get_data()
    for n in [2, 3, 5, 10, 16]:
        # forward=False with floating=True is sequential floating *backward* selection.
        sfs = SFS(KNeighborsClassifier(n_neighbors=7),
                  k_features=n,
                  forward=False,
                  floating=True,
                  scoring='accuracy',
                  cv=0)
        sfs = sfs.fit(x_train, y_train)
        print('\nSequential Floating Backward Selection:', n)
        feat_cols = list(sfs.k_feature_idx_)
        print(feat_cols)
        knn = KNeighborsClassifier(n_neighbors=3)
        knn.fit(x_train[:, feat_cols], y_train)
        y_train_pred = knn.predict(x_train[:, feat_cols])
        print('Training accuracy on selected features: %.3f' % acc(y_train, y_train_pred))
        y_test_pred = knn.predict(x_test[:, feat_cols])
        print('Testing accuracy on selected features: %.3f' % acc(y_test, y_test_pred))
        print(confusion_matrix(y_test, y_test_pred))
        print(classification_report(y_test, y_test_pred))
        if n == 2:
            fig, axs = plt.subplots(2)
            fig.suptitle("SFS(KNN) Scatter Plot", fontsize='small')
            axs[0].scatter(x_train[:, feat_cols[0]], x_train[:, feat_cols[1]],
                           marker='o', c=y_train, s=25, edgecolor='k')
            axs[1].scatter(x_test[:, feat_cols[0]], x_test[:, feat_cols[1]],
                           marker='o', c=y_test, s=25, edgecolor='k')
            plt.show()
def evaluate(model, iterator_function, _batch_count, cuda_device, output_buffer=sys.stderr):
    if output_buffer is not None:
        print(_batch_count, file=output_buffer)
    model.eval()
    with torch.no_grad():
        predictions = []
        expectations = []
        batch_generator = range(_batch_count)
        if output_buffer is not None:
            batch_generator = tqdm(batch_generator)
        for _ in batch_generator:
            features, targets = iterator_function()
            if cuda_device != -1:
                features = features.cuda(device=cuda_device)
            probs, _, _ = model(example_batch=features)
            batch_pred = np.argmax(probs.detach().cpu().numpy(), axis=-1).tolist()
            batch_tgt = targets.detach().cpu().numpy().tolist()
            predictions.extend(batch_pred)
            expectations.extend(batch_tgt)
    model.train()
    return acc(expectations, predictions) * 100, \
        pr(expectations, predictions) * 100, \
        rc(expectations, predictions) * 100, \
        f1(expectations, predictions) * 100
def combinationPredict(predict, samples_test):
    # fusoesDiego returns the sample labels plus min/max/sum/product fusion scores.
    labels_samples, merger_min, merger_max, merger_sum, merger_pro = add.fusoesDiego(
        predict, samples_test)
    classSeg = np_utils.categorical_probas_to_classes(predict)
    classMin = np_utils.categorical_probas_to_classes(merger_min)
    classMax = np_utils.categorical_probas_to_classes(merger_max)
    classSom = np_utils.categorical_probas_to_classes(merger_sum)
    classPro = np_utils.categorical_probas_to_classes(merger_pro)
    print()
    print("Min: " + str(acc(labels_samples, classMin)))
    print("Max: " + str(acc(labels_samples, classMax)))
    print("Sum: " + str(acc(labels_samples, classSom)))
    print("Product: " + str(acc(labels_samples, classPro)))
    print()
def test_converter(self):
    interpreter = tf.lite.Interpreter(
        os.path.join(self.model_path, 'model.tflite'))
    interpreter.allocate_tensors()
    input_details = interpreter.get_input_details()
    output_details = interpreter.get_output_details()
    # Load and normalize the test data.
    [test_data, test_labels] = read_database(self.data_path)
    test_data = test_data / 255.0
    test_data = test_data[..., tf.newaxis].astype("float32")
    predData = np.ndarray(shape=(test_data.shape[0],), dtype='uint8')
    # The TFLite interpreter scores one sample at a time.
    for i in range(test_data.shape[0]):
        test_data_temp = np.array(test_data[[i], :, :, :], dtype='float32')
        interpreter.set_tensor(input_details[0]['index'], test_data_temp)
        interpreter.invoke()
        output_data = interpreter.get_tensor(output_details[0]['index'])
        predData[i] = np.argmax(output_data)
    kappa = metrics.cohen_kappa_score(test_labels, predData)
    print('Kappa:', kappa)
    accuracy = acc(test_labels, predData)
    print('Accuracy:', accuracy)
    print('Confusion matrix:')
    confusion_mat = metrics.confusion_matrix(test_labels, predData)
    print(confusion_mat)
    return accuracy
def actual_prediction_accuracy(self):
    if self.predicted:
        print("Prediction accuracy:", acc(self.y_test, self.y_pred))
    else:
        raise TypeError(
            "Predictions for model are not available, use 'predict' method first!")
def predict(X_spam, X_ham, X_test, y_test):
    pred = []
    for X in X_test:
        # Compare the naive-Bayes scores under the spam and ham models.
        spam = nb(X_spam, X)
        ham = nb(X_ham, X)
        pred.append(1 if spam > ham else 0)
    print(pred, y_test)
    print('Accuracy:', acc(y_test, pred))
def cluster_acc(Y, clusterLabels):
    assert (Y.shape == clusterLabels.shape)
    pred = np.empty_like(Y)
    for label in set(clusterLabels):
        mask = clusterLabels == label
        sub = Y[mask]
        # Assign every point in the cluster the cluster's majority true label.
        target = Counter(sub).most_common(1)[0][0]
        pred[mask] = target
    return acc(Y, pred)
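# Usage sketch for cluster_acc() (hypothetical data; assumes numpy as np,
# collections.Counter, and accuracy_score aliased as acc, as above). Mapping
# each cluster to its majority class makes the score invariant to the
# arbitrary numbering of cluster labels.
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

X_demo, y_demo = make_blobs(n_samples=300, centers=3, random_state=0)
labels_demo = KMeans(n_clusters=3, n_init=10, random_state=0).fit_predict(X_demo)
print(cluster_acc(y_demo, labels_demo))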
def build_classifier_and_test(train_X, train_y, test_X, test_y, clf,
                              print_train_result=True):
    clf.fit(train_X, train_y)
    if print_train_result:
        p_tr = clf.predict(train_X)
        print("Train Accuracy:\t", acc(train_y, p_tr))
        print("Train Precision:\t", pr(train_y, p_tr))
        print("Train Recall_score:\t", rc(train_y, p_tr))
        print("Train F-score:\t", f1(train_y, p_tr))
    predicted = clf.predict(test_X)
    print("Accuracy:\t", acc(test_y, predicted))
    print("Precision:\t", pr(test_y, predicted))
    print("Recall_score:\t", rc(test_y, predicted))
    print("F-score:\t", f1(test_y, predicted))
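# Usage sketch for build_classifier_and_test() (hypothetical data; assumes the
# acc/pr/rc/f1 sklearn metric aliases used above are imported at module level):
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

X, y = load_breast_cancer(return_X_y=True)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=42)
build_classifier_and_test(X_tr, y_tr, X_te, y_te,
                          LogisticRegression(max_iter=5000),
                          print_train_result=False)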
def clone_analysis(data_paths):
    code = []
    labels = []
    positives = 0
    for file_name in data_paths:
        with open(file_name) as f:
            data = json.load(f)
        for example in data:
            code.append(example['tokenized'])
            # The label key is misspelled in several of the source files.
            l = 0
            if 'label' in example.keys():
                l = int(example['label'])
            elif 'lebel' in example.keys():
                l = int(example['lebel'])
            elif 'leble' in example.keys():
                l = int(example['leble'])
            elif 'lable' in example.keys():
                l = int(example['lable'])
            if l > 1:
                l = 1
            positives += l
            labels.append(l)
    print(len(code), len(labels), positives, len(labels) - positives)
    # Note: the corpus is passed to fit_transform below; passing it as the
    # `input=` parameter (as the original did) is invalid.
    vectorizer = TfidfVectorizer(lowercase=False, ngram_range=(1, 3))
    X = vectorizer.fit_transform(code)
    model = KMeans(n_clusters=10, max_iter=100)
    model.fit(X)
    y = model.predict(X)
    cluster_to_positive = [0] * 10
    cluster_to_negative = [0] * 10
    for pred, label in zip(y, labels):
        if label == 1:
            cluster_to_positive[pred] += 1
        else:
            cluster_to_negative[pred] += 1
    print(cluster_to_positive)
    print(cluster_to_negative)
    percentages = [
        float(p) / (p + n)
        for p, n in zip(cluster_to_positive, cluster_to_negative)
    ]
    for p in percentages:
        print(p)
    for _ in range(5):
        XTrain, XTest, YTrain, YTest = train_test_split(X, labels, test_size=0.2)
        model = RandomForestClassifier()
        model.fit(XTrain, YTrain)
        predicted = model.predict(XTest)
        print('%.3f\t%.3f\t%.3f\t%.3f' % (
            acc(YTest, predicted) * 100, pr(YTest, predicted) * 100,
            rc(YTest, predicted) * 100, f1(YTest, predicted) * 100))
def cluster_acc(Y, clusterLabels):  # used in clustering.py
    assert (Y.shape == clusterLabels.shape)
    pred = np.empty_like(Y)
    for label in set(clusterLabels):
        mask = clusterLabels == label
        sub = Y[mask]
        target = Counter(sub).most_common(1)[0][0]
        pred[mask] = target
    # assert max(pred) == max(Y)
    # assert min(pred) == min(Y)
    return acc(Y, pred)
def cluster_acc(y, cluster_labels):
    assert (y.shape == cluster_labels.shape)
    pred = np.empty_like(y)
    for label in set(cluster_labels):
        mask = cluster_labels == label
        sub = y[mask]
        target = Counter(sub).most_common(1)[0][0]
        pred[mask] = target
    # assert max(pred) == max(y)
    # assert min(pred) == min(y)
    return acc(y, pred)
def train(self):
    x, y = np.load('images/64px_image_x.npy'), np.load('images/64px_image_y.npy')
    x = np.reshape(x, (40000, 64, 64, 1))
    # Initialize the clustering layer with k-means centroids in the latent space.
    kmeans = KMeans(n_clusters=2, n_init=20)
    y_pred = kmeans.fit_predict(self.encoder.predict(x))
    y_pred_last = np.copy(y_pred)
    self.model.get_layer(name='clustering').set_weights([kmeans.cluster_centers_])
    loss = 0
    ae_loss = 0
    index = 0
    maxiter = 80000
    update_interval = 100
    index_array = np.arange(x.shape[0])
    batch_size = 16
    tol = 0.001
    # model.load_weights('DEC_model_final.h5')
    for ite in range(int(maxiter)):
        if ite % update_interval == 0:
            q = self.model.predict(x, verbose=0)
            # Update the auxiliary target distribution p.
            p = self.target_distribution(q)
            # Evaluate the clustering performance.
            y_pred = q.argmax(1)
            if y is not None:
                acc = np.round(metrics.acc(y, y_pred), 5)
                nmi = np.round(metrics.nmi(y, y_pred), 5)
                ari = np.round(metrics.ari(y, y_pred), 5)
                loss = np.round(loss, 5)
                print('Iter %d: acc = %.5f, nmi = %.5f, ari = %.5f, loss=%.5f'
                      % (ite, acc, nmi, ari, loss))
            # Check the stop criterion: model convergence.
            delta_label = np.sum(y_pred != y_pred_last).astype(np.float32) / y_pred.shape[0]
            y_pred_last = np.copy(y_pred)
            if ite > 0 and delta_label < tol:
                print('delta_label ', delta_label, '< tol ', tol)
                print('Reached tolerance threshold. Stopping training.')
                break
        idx = np.random.randint(low=0, high=x.shape[0], size=batch_size)
        # ae_loss = ae.train_on_batch(x=x[idx], y=x[idx])
        loss = self.model.train_on_batch(x=x[idx], y=p[idx])
        index = index + 1 if (index + 1) * batch_size <= x.shape[0] else 0
    self.model.save_weights('DEC_model_final_64px.h5')
    self.test_model()
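# self.target_distribution above is not shown here; in DEC (Xie et al., 2016)
# it is conventionally the following sharpening of the soft assignments q,
# offered as a sketch of what the call likely computes:
import numpy as np

def target_distribution(q):
    # Square the soft assignments and renormalize, emphasizing
    # high-confidence assignments while correcting for cluster frequency.
    weight = q ** 2 / q.sum(0)
    return (weight.T / weight.sum(1)).T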
def on_epoch_end(self, epoch, logs=None):
    # Only evaluate roughly ten times over the whole run.
    if int(epochs / 10) != 0 and epoch % int(epochs / 10) != 0:
        return
    feature_model = Model(
        self.model.input,
        self.model.get_layer('encoder_%d' % (int(len(self.model.layers) / 2) - 1)).output)
    features = feature_model.predict(self.x)
    km = KMeans(n_clusters=len(np.unique(self.y)), n_init=20, n_jobs=4)
    y_pred = km.fit_predict(features)
    print(' ' * 8 + '|==> acc: %.4f, nmi: %.4f <==|'
          % (metrics.acc(self.y, y_pred), metrics.nmi(self.y, y_pred)))
def main():
    x_train, y_train, x_test, y_test = get_data()
    for n in [2, 3, 5, 10, 15]:
        pca = decomposition.PCA(n_components=n)
        pca.fit(x_train)
        pca_x_train = pca.transform(x_train)
        knn = KNeighborsClassifier(n_neighbors=3)
        knn.fit(pca_x_train, y_train)
        print('\nPCA: ', n)
        y_train_pred = knn.predict(pca_x_train)
        print('Training accuracy on selected features: %.3f' % acc(y_train, y_train_pred))
        pca_x_test = pca.transform(x_test)
        y_test_pred = knn.predict(pca_x_test)
        print('Testing accuracy on selected features: %.3f' % acc(y_test, y_test_pred))
        if n == 2:
            fig, axs = plt.subplots(2)
            fig.suptitle("PCA Scatter Plot", fontsize='small')
            axs[0].scatter(pca_x_train[:, 0], pca_x_train[:, 1],
                           marker='o', c=y_train, s=25, edgecolor='k')
            axs[1].scatter(pca_x_test[:, 0], pca_x_test[:, 1],
                           marker='o', c=y_test, s=25, edgecolor='k')
            plt.show()
def cal_cost_tree(x, trn, trg):
    x = [int(a) for a in np.round(x)]
    if sum(x) == 0:
        # Match the other cal_cost_* helpers: (cost, error, feature ratio).
        return np.inf, np.inf, 1
    x_index = [i for i in range(len(x)) if x[i] == 1]
    trn = trn.reshape(trn.shape[1], -1)
    trn = trn[x_index, :]
    trn = np.transpose(trn)
    clf = tree.DecisionTreeClassifier()
    clf.fit(trn, trg)
    pre = clf.predict(trn)
    score = acc(trg, pre)
    error = 1 - score
    return (1 - alpha) * error + alpha * (sum(x) * 1.0 / len(x)), error, sum(x) * 1.0 / len(x)
def updatePointsAtk(xc, clf, Xtr, ytr=None, Xtt=None, ytt=None, err_type='loss'):
    d = Xtr.shape[1]
    m = xc.size // d
    xc = xc.reshape(m, d)
    X0 = np.concatenate([Xtr, xc], axis=0)
    # TODO: update the SVC instead of retraining with xc
    clf.fit(X0)  # <---- this is just a lazy update
    # Maximize the objective value (generalization error).
    if err_type == 'fval':
        # objective value on the untainted dataset Dtr
        return 1 * (clf.fval - clf.C * (clf.kdist2(xc) - clf.r).sum())
    elif err_type == 'r':
        # the squared radius
        return 1 * clf.r
    elif err_type == 'xi':
        return 1 * (clf.fval - clf.C * (clf.kdist2(xc) - clf.r).sum() - clf.r)
    elif err_type == 'f1':
        if ytr is not None:
            return f1_score(ytr, clf.y[:ytr.size])
        print('You need to give the true labels!')
        return None
    elif err_type == 'acc':
        if Xtt is not None and ytt is not None:
            y_clf = clf.predict_y(Xtt)
            return acc(ytt, y_clf)
        print('You need to give the test dataset!')
        return None
    elif err_type == 'loss':
        # min(sum_xi - R^2): fewer samples lie outside while the ball is maximized
        # note: xc are excluded
        sum_xi_c = (clf.kdist2(xc) - clf.r).sum()
        return (clf.fval - clf.r) / clf.C - sum_xi_c - clf.r
    elif err_type == 'fn':
        if ytr is not None:
            pid = np.where(ytr == 1)[0]
            return 0.5 * ((ytr[pid] - clf.y[pid]).sum()) / pid.size
        print('You need to give the true labels!')
        return None
    elif err_type == 'fp':
        if ytr is not None:
            nid = np.where(ytr == -1)[0]
            return 0.5 * ((clf.y[nid] - ytr[nid]).sum()) / nid.size
        print('You need to give the true labels!')
        return None
def cal_cost_knn(x, trn, trg):
    x = list(map(int, np.round(x)))
    if sum(x) == 0:
        return np.inf, np.inf, 1
    x_index = [i for i in range(len(x)) if x[i] == 1]
    trn = trn.reshape(trn.shape[1], -1)
    trn = trn[x_index, :]
    trn = np.transpose(trn)
    clf = knn(n_neighbors=nn)
    clf.fit(trn, trg)
    pre = clf.predict(trn)
    score = acc(trg, pre)
    error = 1 - score
    return (1 - alpha) * error + alpha * (sum(x) * 1.0 / len(x)), error, sum(x) * 1.0 / len(x)
def cal_cost_svm(x, trn, trg):
    x = [int(a) for a in np.round(x)]
    if sum(x) == 0:
        return np.inf, np.inf, 1
    x_index = [i for i in range(len(x)) if x[i] == 1]
    trn = trn.reshape(trn.shape[1], -1)
    trn = trn[x_index, :]
    trn = np.transpose(trn)
    clf = SVC(gamma="auto", kernel=svm_kernel)
    clf.fit(trn, trg)
    pre = clf.predict(trn)
    score = acc(trg, pre)
    error = 1 - score
    return (1 - alpha) * error + alpha * (sum(x) * 1.0 / len(x)), error, sum(x) * 1.0 / len(x)
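# The three cal_cost_* objectives above all score a candidate feature mask as
# cost = (1 - alpha) * training_error + alpha * (selected / total features).
# A self-contained sketch of that fitness on a plain (samples, features)
# layout (the functions above expect a reshaped layout from their caller, and
# read the alpha / nn / svm_kernel globals):
import numpy as np
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC

def subset_cost(mask, X, y, alpha=0.1):
    mask = np.round(mask).astype(int)
    if mask.sum() == 0:
        return np.inf, np.inf, 1
    X_sub = X[:, mask == 1]                    # keep only the selected features
    clf = SVC(gamma="auto", kernel="rbf").fit(X_sub, y)
    error = 1 - accuracy_score(y, clf.predict(X_sub))
    ratio = mask.sum() / mask.size
    return (1 - alpha) * error + alpha * ratio, error, ratio

X, y = load_iris(return_X_y=True)
print(subset_cost(np.array([1, 0, 1, 0]), X, y))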
def cluster_acc(Y, clusterLabels):
    import numpy as np
    from collections import Counter
    from sklearn.metrics import accuracy_score as acc
    assert (Y.shape == clusterLabels.shape)
    pred = np.empty_like(Y)
    for label in set(clusterLabels):
        mask = clusterLabels == label
        sub = Y[mask]
        target = Counter(sub).most_common(1)[0][0]
        pred[mask] = target
    # assert max(pred) == max(Y)
    # assert min(pred) == min(Y)
    return acc(Y, pred)
def dae_svm():
    dae_train = np.load('data/train_dae.npy')[:10000]
    dae_test = np.load('data/test_dae.npy')[:5000]
    svm_dae = svm.SVC(C=2.0, gamma=0.05, cache_size=2000)
    # svm_dae = GridSearchCV(svr, parameters)
    svm_dae.fit(dae_train, label_train)
    predicted_dae = svm_dae.predict(dae_test)
    daeacc = acc(label_test, predicted_dae)
    # model_params = str(svm_dae.best_estimator_)
    print('DAE accuracy - ' + str(daeacc))
    with open(PATHS + 'dae_svm', 'wb') as f:
        pickle.dump(daeacc, f)
def reportStats(weight, current_iteration, X_train, y_train, X_test, y_test):
    # Map {-1, 1} labels to {0, 1}.
    y_train[y_train < 0] = 0
    y_test[y_test < 0] = 0
    ypred_is = predict_all(X_train, weight)
    ypred_oos = predict_all(X_test, weight)
    np_err_handling = np.seterr(invalid='ignore')
    # In-sample (IS) and out-of-sample (OOS) metrics.
    is_acc = acc(y_train, ypred_is)
    is_mcc = mcc(y_train, ypred_is)
    is_f1 = f1(y_train, ypred_is)
    is_mse = mse(y_train, ypred_is)
    oos_acc = acc(y_test, ypred_oos)
    oos_mcc = mcc(y_test, ypred_oos)
    oos_f1 = f1(y_test, ypred_oos)
    oos_mse = mse(y_test, ypred_oos)
    is_tn, is_fp, is_fn, is_tp = confusion_matrix(y_train, ypred_is).ravel()
    oos_tn, oos_fp, oos_fn, oos_tp = confusion_matrix(y_test, ypred_oos).ravel()
    is_auprc = auprc(y_train, ypred_is)
    oos_auprc = auprc(y_test, ypred_oos)
    np.seterr(**np_err_handling)
    print(f"Consensus {current_iteration}: IS acc {is_acc:0.5f}. IS MCC {is_mcc:0.5f}. "
          f"IS F1 {is_f1:0.5f}. IS MSE {is_mse:0.5f}. OOS acc {oos_acc:0.5f}. "
          f"OOS MCC {oos_mcc:0.5f}. OOS F1 {oos_f1:0.5f}. OOS MSE {oos_mse:0.5f}.")
    print(f"Confusion {current_iteration}: IS TP: {is_tp}, IS FP: {is_fp}, IS TN: {is_tn}, "
          f"IS FN: {is_fn}, IS AUPRC: {is_auprc:0.5f}. OOS TP: {oos_tp}, OOS FP: {oos_fp}, "
          f"OOS TN: {oos_tn}, OOS FN: {oos_fn}, OOS AUPRC: {oos_auprc:0.5f}.")
    return is_acc, is_mcc, is_f1, is_mse, is_auprc, oos_acc, oos_mcc, oos_f1, oos_mse, oos_auprc
def test_acc_knn(x, tst, tst_trg, trn, trn_trg):
    x = [int(a) for a in np.round(x)]
    if sum(x) == 0:
        return 0
    x = [i for i in range(len(x)) if x[i] == 1]
    tst = tst.reshape(tst.shape[1], -1)
    tst = tst[x, :]
    tst = np.transpose(tst)
    trn = trn.reshape(trn.shape[1], -1)
    trn = trn[x, :]
    trn = np.transpose(trn)
    clf = knn(n_neighbors=nn)
    clf.fit(trn, trn_trg)
    tst_pred = clf.predict(tst)
    return acc(tst_trg, tst_pred)
def test_acc_svm(x, tst, tst_trg, trn, trn_trg):
    x = [int(a) for a in np.round(x)]
    if sum(x) == 0:
        return 0
    x = [i for i in range(len(x)) if x[i] == 1]
    tst = tst.reshape(tst.shape[1], -1)
    tst = tst[x, :]
    tst = np.transpose(tst)
    trn = trn.reshape(trn.shape[1], -1)
    trn = trn[x, :]
    trn = np.transpose(trn)
    clf = SVC(gamma="auto", kernel=svm_kernel)
    clf.fit(trn, trn_trg)
    tst_pred = clf.predict(tst)
    return acc(tst_trg, tst_pred)
def test_acc_tree(x, tst, tst_trg, trn, trn_trg):
    x = [int(a) for a in np.round(x)]
    if sum(x) == 0:
        # Match the other test_acc_* helpers instead of returning None.
        return 0
    x = [i for i in range(len(x)) if x[i] == 1]
    tst = tst.reshape(tst.shape[1], -1)
    tst = tst[x, :]
    tst = np.transpose(tst)
    trn = trn.reshape(trn.shape[1], -1)
    trn = trn[x, :]
    trn = np.transpose(trn)
    clf = tree.DecisionTreeClassifier()
    clf.fit(trn, trn_trg)
    tst_pred = clf.predict(tst)
    return acc(tst_trg, tst_pred)
def cluster_acc(y, cluster_labels):
    assert (y.shape == cluster_labels.shape)
    # What the clustering algorithm would predict as the y-label if it
    # predicted the majority y-label for each cluster.
    pred = np.empty_like(y)
    for label in set(cluster_labels):
        # Indices where the training data fall in this specific cluster.
        mask = cluster_labels == label
        # Only the training labels that are in this cluster.
        sub = y[mask]
        # The majority y-label among the instances assigned to this cluster (e.g. 0).
        target = Counter(sub).most_common(1)[0][0]
        # Assign the training data in this cluster the majority y-label.
        pred[mask] = target
    # assert max(pred) == max(y)
    # assert min(pred) == min(y)
    return acc(y, pred)
def get_accuracy(predictions, y, std_price=0, mean_price=0):
    unnorm_predictions = []
    for pred in predictions:
        if math.isnan(unnormalize(pred, std_price, mean_price)):
            print("NAN FOUND")
            exit()
        unnorm_predictions.append(unnormalize(pred, std_price, mean_price))
    unnorm_y = []
    for y_pt in y:
        if math.isnan(unnormalize(y_pt, std_price, mean_price)):
            print("NAN FOUND")
            exit()
        unnorm_y.append(unnormalize(y_pt, std_price, mean_price))
    # Record whether the opening price increased (1) or decreased (0).
    direction_pred = []
    for pred in unnorm_predictions:
        direction_pred.append(1 if pred >= 0 else 0)
    direction_test = []
    for value in unnorm_y:
        direction_test.append(1 if value >= 0 else 0)
    from sklearn.metrics import confusion_matrix
    # How often the predicted direction matched the actual direction, as a percentage.
    direction = acc(direction_test, direction_pred)
    direction = round(direction, 4) * 100
    _mae = mae(unnorm_y, unnorm_predictions)  # median absolute error
    _rmse = np.sqrt(mse(y, predictions))  # root mean squared error (on normalized values)
    _r2 = r2(unnorm_y, unnorm_predictions)  # R-squared
    print("CONFUSION MATRIX")
    print(confusion_matrix(direction_test, direction_pred).ravel())
    return (direction, _mae, _rmse, _r2)
def test(self, x_test, y_test, params, n_centers, width):
    y_true = []
    y_pred = []
    p, _ = x_test.shape
    for i in range(p):
        d = y_test[i]
        y = self.predict(x_test[i], params, n_centers, width)
        # Collect targets and predictions for the confusion-matrix metrics.
        y_true.append(list(d))
        y_pred.append(list(y))
    a = util.inverse_transform(y_true, self.n_classes)
    b = util.inverse_transform(y_pred, self.n_classes)
    return acc(a, b), tpr(a, b, average='macro'), 0, ppv(a, b, average='weighted')
#########################################################
### your code goes here ###
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score as acc

for i in range(1, 20):
    C = pow(10, i)
    clf = SVC(kernel="rbf", C=C)
    clf.fit(features_train, labels_train)
    pred = clf.predict(features_test)
    print("C:", C, "Accuracy:", acc(pred, labels_test))

C = 10000
clf = SVC(kernel="rbf", C=C)
clf.fit(features_train, labels_train)
pred = clf.predict(features_test)
print("C:", C, "Accuracy:", acc(pred, labels_test))

from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn.metrics import accuracy_score as accu

clf = DTC(min_samples_split=2)
clf.fit(features_train, labels_train)
pred = clf.predict(features_test)
# sklearn's cross_validation module was removed; use model_selection instead.
from sklearn.model_selection import train_test_split

features_train, features_test, labels_train, labels_test = train_test_split(
    word_data, authors, test_size=0.1, random_state=42)

# Remove the signature words that leak the author's identity.
for outcast in ["sshacklensf", "cgermannsf"]:
    features_train = [x.replace(outcast, "") for x in features_train]

from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')
features_train = vectorizer.fit_transform(features_train)
features_test = vectorizer.transform(features_test).toarray()

### a classic way to overfit is to use a small number
### of data points and a large number of features;
### train on only 150 events to put ourselves in this regime
features_train = features_train[:150].toarray()
labels_train = labels_train[:150]

### your code goes here
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score as acc

clf = DecisionTreeClassifier()
clf.fit(features_train, labels_train)
pred = clf.predict(features_test)
print(acc(pred, labels_test))
print([(i, x) for i, x in enumerate(clf.feature_importances_) if x > 0.2])
print(vectorizer.get_feature_names()[21323])
#### initial visualization
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.0)
plt.scatter(bumpy_fast, grade_fast, color="b", label="fast")
plt.scatter(grade_slow, bumpy_slow, color="r", label="slow")
plt.legend()
plt.xlabel("bumpiness")
plt.ylabel("grade")
plt.show()

################################################################################
### your code here!  name your classifier object clf if you want the
### visualization code (prettyPicture) to show you the decision boundary
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score as acc

clf = RandomForestClassifier(n_jobs=-1, criterion="gini", n_estimators=100,
                             min_samples_leaf=5, max_features=1)
clf.fit(features_train, labels_train)
pred = clf.predict(features_test)
print("Accuracy", acc(pred, labels_test))

try:
    prettyPicture(clf, features_train, labels_train)
except NameError:
    pass