def test_ovr_always_present():
    """Check that OvR works with labels that are always present or absent.

    Note: this exercises the case where ``_ConstantPredictor`` is utilised.
    """
    X = np.ones((10, 2))
    X[:5, :] = 0

    # Indicator matrix in which only column 0 varies; columns 1 and 2 are
    # switched on for every sample. (The original evaluated and discarded a
    # list-of-lists illustration of this matrix; it had no effect.)
    y = np.zeros((10, 3))
    y[5:, 0] = 1
    y[:, 1] = 1
    y[:, 2] = 1

    ovr = OneVsRestClassifier(LogisticRegression())
    assert_warns(UserWarning, ovr.fit, X, y)
    y_pred = ovr.predict(X)
    assert_array_equal(np.array(y_pred), np.array(y))
    y_pred = ovr.decision_function(X)
    assert_equal(np.unique(y_pred[:, -2:]), 1)
    y_pred = ovr.predict_proba(X)
    assert_array_equal(y_pred[:, -1], np.ones(X.shape[0]))

    # y has a constantly absent label
    y = np.zeros((10, 2))
    y[5:, 0] = 1  # variable label
    ovr = OneVsRestClassifier(LogisticRegression())
    assert_warns(UserWarning, ovr.fit, X, y)
    y_pred = ovr.predict_proba(X)
    assert_array_equal(y_pred[:, -1], np.zeros(X.shape[0]))
def OneVsRest_multilabel(train_feature, train_label, test_feature,
                         BinaryClassifier, **kwargs):
    """Fit a one-vs-rest multi-label model and return its probabilities.

    ``BinaryClassifier`` is called with ``**kwargs`` to build the base
    estimator. Returns ``(train_pred, test_pred)``: the ``predict_proba``
    output for the training features and for the test features.
    """
    from sklearn.multiclass import OneVsRestClassifier

    base_estimator = BinaryClassifier(**kwargs)
    wrapper = OneVsRestClassifier(base_estimator)
    fitted = wrapper.fit(train_feature, train_label)

    train_pred = fitted.predict_proba(train_feature)
    test_pred = fitted.predict_proba(test_feature)
    return train_pred, test_pred
def process_fold(X_train, X_val, y_train, y_val, X_test):
    """Blend XGBoost and Keras outputs for one fold.

    Returns a pair: the element-wise mean of both models' outputs on the
    validation data, and the same mean on the test data.
    """
    # XGBoost, wrapped one-vs-rest
    booster = xgb.XGBClassifier(learning_rate=0.005, n_estimators=500)
    xgb_model = OneVsRestClassifier(booster)
    xgb_model.fit(X_train, y_train)
    val_xgb = xgb_model.predict_proba(X_val)
    test_xgb = xgb_model.predict_proba(X_test)

    # Keras (helper returns the validation and test outputs as a pair)
    val_keras, test_keras = KerasClassifier(X_train, y_train, X_val, y_val, X_test)

    val_blend = (val_xgb + val_keras) / 2.0
    test_blend = (test_xgb + test_keras) / 2.0
    return val_blend, test_blend
def test_ovr_fit_predict_sparse():
    """Fitting OvR on sparse label matrices must match fitting on dense ones.

    Repeated for each scipy sparse container type listed below.
    """
    sparse_containers = [sp.csr_matrix, sp.csc_matrix, sp.coo_matrix,
                         sp.dok_matrix, sp.lil_matrix]
    for to_sparse in sparse_containers:
        nb = MultinomialNB(alpha=1)

        X, Y = datasets.make_multilabel_classification(
            n_samples=100,
            n_features=20,
            n_classes=5,
            n_labels=3,
            length=50,
            allow_unlabeled=True,
            random_state=0,
        )
        X_train, Y_train = X[:80], Y[:80]
        X_test = X[80:]

        dense_model = OneVsRestClassifier(nb).fit(X_train, Y_train)
        dense_pred = dense_model.predict(X_test)

        sparse_model = OneVsRestClassifier(nb).fit(X_train, to_sparse(Y_train))
        sparse_pred = sparse_model.predict(X_test)

        assert_true(dense_model.multilabel_)
        assert_true(sp.issparse(sparse_pred))
        assert_array_equal(sparse_pred.toarray(), dense_pred)

        # predict_proba: a label is assigned when its probability for the
        # sample exceeds 0.5, so thresholding must reproduce predict().
        proba = sparse_model.predict_proba(X_test)
        assert_array_equal(proba > 0.5, sparse_pred.toarray())

        # decision_function: a positive score must coincide with predict().
        sparse_model = OneVsRestClassifier(svm.SVC()).fit(X_train, to_sparse(Y_train))
        thresholded = (sparse_model.decision_function(X_test) > 0).astype(int)
        assert_array_equal(thresholded, sparse_model.predict(X_test).toarray())
def train_linear(X, Y, splits, model_config, results_dir, best_k=10,
                 validation_score='f1', threshold_score='f1',
                 threshold_criterion='zack', fn_prefix='', label_idx=None):
    """Grid-search ``C`` for a one-vs-rest logistic regression and save the best.

    For each of 20 log-spaced values of ``C`` in [1e-3, 1e3], a model is fit
    on ``X[splits[0]]`` / ``Y[splits[0]]`` and scored on ``splits[1]`` with
    ``compute_micro_evaluations``. The model with the highest
    ``perf[validation_score]`` is kept.

    Parameters
    ----------
    X, Y : indexable by ``splits[0]`` / ``splits[1]``; ``Y`` must expose
        ``shape[1]`` when ``label_idx`` is None.
    splits : sequence whose first two items index the train and validation rows.
    model_config : dict; its ``'C'`` entry is overwritten with the best ``C``.
    results_dir, fn_prefix : the best model is pickled to
        ``<results_dir>/<fn_prefix>-model.pkl``.
    best_k, threshold_score, threshold_criterion : forwarded to
        ``compute_micro_evaluations``.
    label_idx : label columns to evaluate on; defaults to all columns of ``Y``.

    Returns
    -------
    (best_model, model_config)

    Notes
    -----
    A ``KeyboardInterrupt`` during fitting stops the search early; whatever
    was best so far (possibly ``None``) is still pickled and returned.
    """
    label_idx = np.arange(Y.shape[1]) if label_idx is None else label_idx
    best_perf = None
    best_C = None
    best_model = None

    for C in np.logspace(-3, 3, num=20):
        # The progress text says "Ridge Regression" although the estimator is
        # LogisticRegression; the message is left unchanged.
        sys.stdout.write('Training Ridge Regression with C={0}...'.format(C))
        sys.stdout.flush()
        model = OneVsRestClassifier(LogisticRegression(C=C))
        try:
            model.fit(X[splits[0]], Y[splits[0]])
        except KeyboardInterrupt:
            # Let the user cut the search short; any other exception
            # propagates unchanged.
            sys.stdout.write('training interrupted...')
            break

        Yp = model.predict_proba(X[splits[1]])
        perf = compute_micro_evaluations(Y[splits[1]][:, label_idx],
                                         Yp[:, label_idx],
                                         k=best_k,
                                         threshold_score=threshold_score,
                                         criterion=threshold_criterion)
        sys.stdout.write(' {0}={1:.4f}'.format(validation_score, perf[validation_score]))
        sys.stdout.flush()
        if best_perf is None or perf[validation_score] > best_perf[validation_score]:
            best_perf = perf
            best_model = model
            best_C = C
            sys.stdout.write(' *BEST')
        sys.stdout.write('\n')

    model_config['C'] = best_C
    # Context manager so the file is flushed and closed deterministically.
    model_path = os.path.join(results_dir, fn_prefix + '-model.pkl')
    with open(model_path, 'wb') as model_file:
        cPickle.dump(best_model, model_file)
    return best_model, model_config
def make_classifier():
    """Train a one-vs-rest linear SVC on ``make_X_Y()`` data and pickle it.

    The data is split with ``test_size=0``, cast to int, and used to fit the
    classifier. For each test row the two highest-probability entries of
    ``chords_Y`` are printed next to the true one, followed by the top-1
    accuracy. The fitted classifier is written to ``classifier.bin``.
    """
    test_size = 0
    X, y = make_X_Y()
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
    X_train = X_train.astype(int)
    X_test = X_test.astype(int)
    y_train = y_train.astype(int)
    y_test = y_test.astype(int)

    clf = OneVsRestClassifier(SVC(kernel='linear', class_weight='auto', probability=True))
    clf.fit(X_train, y_train)

    try:
        y_suggest = clf.predict_proba(X_test)
        nn = 0  # rows whose top suggestion matches the true label
        n = 0   # rows evaluated
        for y_s, y_t in zip(y_suggest, y_test):
            s1 = chords_Y[np.argmax(y_s)]
            # Zero the best score so the next argmax yields the runner-up.
            y_s[np.argmax(y_s)] = 0
            s2 = chords_Y[np.argmax(y_s)]
            t = chords_Y[np.argmax(y_t)]
            print('Suggest: ' + s1 + ' or ' + s2 + ' Real: ' + t)
            n = n + 1
            if s1 == t:
                nn = nn + 1
        if n > 0:
            print('Accuracy is ' + str(float(nn) / n))
    except ValueError:
        # Deliberate best-effort from the original: the evaluation report is
        # optional and must not prevent the model from being saved.
        # NOTE(review): presumably triggered by the empty test split — confirm.
        pass
    # print classification_report(clf.predict(X_test), y_test)

    # Context manager so the file handle is closed (the original leaked it).
    with open("classifier.bin", "wb") as model_file:
        pickle.dump(clf, model_file)
def benchmark(clf_current):
    """Fit ``clf_current`` one-vs-rest, score it with F1 and time both phases.

    Uses ``decision_function`` when the base estimator provides it, otherwise
    ``predict_proba``. Returns ``(clf_descr, score, train_time, test_time)``.
    """
    rule = '_' * 80
    print(rule)
    print("Test performance for: ")
    clf_descr = str(clf_current).split('(')[0]
    print(clf_descr)

    started = time()
    classif = OneVsRestClassifier(clf_current)
    classif.fit(X_train, Y_train.toarray())
    train_time = time() - started
    print("train time: %0.3fs" % train_time)

    started = time()
    if hasattr(clf_current, "decision_function"):
        raw_scores = classif.decision_function(X_test)
        to_predictions = df_to_preds
    else:
        raw_scores = classif.predict_proba(X_test)
        to_predictions = probs_to_preds
    score = metrics.f1_score(Y_test.toarray(), to_predictions(raw_scores, k=5))
    test_time = time() - started

    print("f1-score: %0.7f" % score)
    print("test time: %0.3fs" % test_time)
    print(rule)
    return clf_descr, score, train_time, test_time
def setUp(self):
    """Build the iris fixture: a ModelFrame plus reference sklearn outputs."""
    import sklearn.svm as svm
    import sklearn.preprocessing as pp
    from sklearn.multiclass import OneVsRestClassifier

    def new_estimator():
        # Fresh one-vs-rest SVC with probability estimates enabled.
        base = svm.SVC(probability=True, random_state=self.random_state)
        return OneVsRestClassifier(base)

    # iris with its target passed through LabelBinarizer
    iris = datasets.load_iris()
    self.data = iris.data
    self.target = pp.LabelBinarizer().fit_transform(iris.target)
    self.df = pdml.ModelFrame(self.data, target=self.target)
    self.assertEqual(self.df.shape, (150, 7))

    # Fit and predict through the ModelFrame interface.
    estimator1 = new_estimator()
    self.df.fit(estimator1)
    self.df.predict(estimator1)
    self.assertTrue(isinstance(self.df.predicted, pdml.ModelFrame))

    # Reference results obtained directly from scikit-learn.
    estimator2 = new_estimator()
    estimator2.fit(self.data, self.target)
    self.pred = estimator2.predict(self.data)
    self.proba = estimator2.predict_proba(self.data)
    self.decision = estimator2.decision_function(self.data)

    # argument for classification reports
    self.labels = np.array([2, 1, 0])
def ml_train(datasetFilePath, falsePredictionsFilePath, unknownPredictionsFilePath,
             confusionMatricesDir, classifierFilePath):
    """Train a one-vs-rest linear SVC, evaluate it, and export the model.

    Loads the data set from ``datasetFilePath``, fits on the training split,
    predicts the test split, runs validation, plots three confusion matrices
    into ``confusionMatricesDir``, writes the prediction reports, and dumps
    the classifier to ``classifierFilePath`` with joblib.
    """
    logger.info("start of training and testing phase")

    base_svc = SVC(kernel='linear', probability=True)
    classifier = OneVsRestClassifier(base_svc, n_jobs=NUMBER_OF_CPUS_TO_USE)

    logger.info("loading data set")
    dataset, features_names = load_dataset(datasetFilePath)
    # limited_dataset = limit_dataset(dataset)
    limited_dataset = dataset
    ml_dataset = split_dataset(limited_dataset, len(features_names))

    train_shapes = (ml_dataset.X_train.shape, ml_dataset.y_train.shape)
    logger.info("fitting training set X_train - %s, y_train - %s" % train_shapes)
    classifier.fit(ml_dataset.X_train, ml_dataset.y_train)

    test_shapes = (ml_dataset.X_test.shape, ml_dataset.y_test.shape)
    logger.info("predicting test set X_test - %s, y_test - %s" % test_shapes)
    y_pred = classifier.predict(ml_dataset.X_test)
    y_pred_probabilities = classifier.predict_proba(ml_dataset.X_test)

    processed = process_prediction_vector(ml_dataset.y_test, y_pred, y_pred_probabilities)
    y_pred_with_unknown_cls, y_pred_fictive, max_y_pred_probs = processed

    validation(ml_dataset.y_test, y_pred, y_pred_with_unknown_cls, y_pred_fictive,
               list(classifier.classes_) + ["unknown"])

    # One confusion matrix per prediction variant, tagged "1".."3".
    variants = (
        ("1", y_pred),
        ("2", y_pred_with_unknown_cls),
        ("3", y_pred_fictive),
    )
    for tag, predictions in variants:
        plot_confusion_matrices(ml_dataset.y_test, predictions,
                                list(classifier.classes_) + ["unknown"],
                                confusionMatricesDir, tag)

    produce_output(ml_dataset.y_test, y_pred, max_y_pred_probs,
                   ml_dataset.test_terms_name,
                   falsePredictionsFilePath, unknownPredictionsFilePath)

    logger.info("exporting classifier model")
    joblib.dump(classifier, classifierFilePath)
    logger.info("end of training and testing phase")
def trainAndPredictLR(trainX, trainY, testX):
    """Predict the ten most probable labels for every test sample.

    A one-vs-rest logistic regression (C=1.0) is fit on the training data and
    the per-label probabilities of each test sample are computed. For each
    sample the label with the highest probability is taken ten times in a
    row, masking each pick before the next one.

    Input:
      1. trainX: training data features
      2. trainY: training data labels
      3. testX: test data features
    Output:
      numpy array with one row per test sample holding the ten selected
      entries of ``classOrder``, most probable first.
    """
    clf = OneVsRestClassifier(LogisticRegression(C=1.0))
    clf.fit(trainX, trainY)
    probabilities = clf.predict_proba(testX)

    top_labels = []
    for row in probabilities:
        chosen = []
        for _ in range(10):
            best = np.argmax(row, axis=0)
            chosen.append(classOrder[best])
            # Mask the pick (in place) so the next argmax finds the runner-up.
            row[best] = -1
        top_labels.append(chosen)
    return np.array(top_labels)
def test_ovr_multilabel_predict_proba():
    """Multilabel OvR: predict_proba thresholded at 0.5 must equal predict."""
    nb = MultinomialNB(alpha=1)
    for allow_unlabeled in (False, True):
        X, Y = datasets.make_multilabel_classification(
            n_samples=100,
            n_features=20,
            n_classes=5,
            n_labels=3,
            length=50,
            allow_unlabeled=allow_unlabeled,
            return_indicator=True,
            random_state=0,
        )
        X_train, Y_train = X[:80], Y[:80]
        X_test, Y_test = X[80:], Y[80:]
        clf = OneVsRestClassifier(nb).fit(X_train, Y_train)

        # An estimator offering only a decision function: asking for
        # predict_proba is expected to raise AttributeError.
        svr_ovr = OneVsRestClassifier(svm.SVR()).fit(X_train, Y_train)
        assert_raises(AttributeError, svr_ovr.predict_proba, X_test)

        # An estimator whose predict_proba is switched off by a parameter.
        svc_ovr = OneVsRestClassifier(svm.SVC(probability=False))
        svc_ovr.fit(X_train, Y_train)
        assert_raises(AttributeError, svc_ovr.predict_proba, X_test)

        Y_pred = clf.predict(X_test)
        Y_proba = clf.predict_proba(X_test)

        # A label is predicted exactly when its probability exceeds 0.5.
        above_half = Y_proba > .5
        assert_array_equal(above_half, Y_pred)
def objective(args):
    """Return the validation log-loss for an RBF SVC with the given (C, gamma).

    ``args`` unpacks to ``(c, gamma)``. The classifier is handed to
    ``utils.hold_out_evaluation`` before its probabilities on ``valid`` are
    scored, and a one-line summary is printed.
    """
    c, gamma = args
    svc = svm.SVC(C=c, kernel='rbf', tol=.001, gamma=gamma,
                  probability=True, random_state=23)
    clf = OneVsRestClassifier(svc)

    score1 = 0  # placeholder; only reported in the summary line
    score2 = utils.hold_out_evaluation(clf, train, labels, calibrate=False)
    score = log_loss(valid_labels, clf.predict_proba(valid))

    summary = 'C=%f, gamma=%f, score1=%f, score2=%f, score=%f' % (c, gamma, score1, score2, score)
    print(summary)
    return score
class Classifier(object):
    '''Classifier base class. Uses OneVsRest for multiclass problems'''

    def __init__(self, clf, x_train, y_train):
        """Wrap ``clf`` (one-vs-rest when there are >2 labels) and fit it."""
        distinct_labels = set(y_train)
        # With at most two distinct labels the estimator is used as given.
        self.clf = OneVsRestClassifier(clf) if len(distinct_labels) > 2 else clf
        self.clf.fit(x_train, y_train)

    def __call__(self, x_val):
        """Return the fitted model's ``predict_proba`` output for ``x_val``."""
        probabilities = self.clf.predict_proba(x_val)
        return probabilities
def fit_models_mc(imps, X, Y, all_props, props=None, labels=None, n_splits=5, clf_args={'n_estimators':25, 'max_features':'auto', 'random_state':0}):
    """Fit one-vs-rest random forests per imputation method over shuffle splits.

    For every key in ``imps`` and every train/test split, a
    ``OneVsRestClassifier(RandomForestClassifier(**clf_args_))`` is fit on
    ``X[imp]`` / ``Y[imp]`` and evaluated against ``Y['missing']``.

    Returns ``(rs, feature_importances, ys, ps)`` where, per imputation key:
    ``rs`` holds the correlation between predicted and observed columns for
    each property and split, ``ys`` the observed test values, ``ps`` the
    predicted probabilities. ``feature_importances`` is always ``None``.

    NOTE(review): ``clf_args`` is a mutable default argument; it is only read
    here, never mutated.
    NOTE(review): the splitters are built with ``n_iter=`` and iterated
    directly, but also called via ``.split(...)`` — these look like two
    different scikit-learn API generations; confirm which is intended.
    """
    if props is None:
        props = all_props
    n_obs = X['missing'].shape[0] # Number of observations.
    n_features = X['missing'].shape[1] # Number of features (unused below).
    n_props = len(props) # Number of properties to predict.
    test_size = 0.2
    if labels is None:
        shuffle_split = ShuffleSplit(n_iter=n_splits, test_size=test_size,random_state=0)
    else:
        shuffle_split = LabelShuffleSplit(n_iter=n_splits, test_size=test_size,random_state=0)
    # Largest test-set size over all splits; sizes the masked result arrays.
    n_test_samples = np.max([len(list(shuffle_split)[i][1]) \
                             for i in range(n_splits)])
    rs = {imp:np.ma.zeros((n_props,n_splits)) for imp in imps}
    ps = {imp:np.ma.masked_all((n_props,n_splits,n_test_samples)) for imp in imps}
    ys = {imp:np.ma.masked_all((n_props,n_splits,n_test_samples)) for imp in imps}
    feature_importances = None#{imp:np.ma.zeros((n_props,n_features,n_splits)) for imp in imps}
    # Column indices of the requested props (only used by the commented-out
    # slicing below).
    cols = np.array([i for i in range(len(all_props)) if all_props[i] in props])
    for imp in imps:
        for k,(train,test) in enumerate(shuffle_split.split(range(n_obs),groups=labels)):
            #X_train,X_test = X[imp][train][:,cols],X[imp][test][:,cols]
            #Y_train,Y_test = Y[imp][train][:,cols],Y['missing'][test][:,cols]
            X_train,X_test = X[imp][train,:],X[imp][test,:]
            # Train on the imputed targets, test against Y['missing'].
            Y_train,Y_test = Y[imp][train,:],Y['missing'][test,:]
            # NOTE(review): `prop` is not defined anywhere in this function, so
            # a dict-valued entry in clf_args would raise NameError — confirm
            # the intended per-property lookup.
            clf_args_ = {key:(value if type(value) is not dict \
                         else value[prop])\
                         for key,value in clf_args.items()}
            # Cap a numeric max_features at the number of available columns.
            if clf_args_['max_features'] not in [None, 'auto']:
                clf_args_['max_features'] = min(X_train.shape[1], clf_args_['max_features'])
            rfc = RandomForestClassifier(**clf_args_)
            onevsrest = OneVsRestClassifier(rfc)
            onevsrest.fit(X_train,Y_train)
            Y_predict = onevsrest.predict(X_test)#.reshape(-1,n_props)
            probs = onevsrest.predict_proba(X_test)
            # n_test_samples is rebound here to this split's test size.
            if probs.shape[1]<2 and probs.mean()==1.0:
                n_test_samples = len(probs)
                ps[imp][:,k,:n_test_samples] = 0.0
            else:
                n_test_samples = len(probs[:,1])
                ps[imp][:,k,:n_test_samples] = probs.T
            ys[imp][:,k,:n_test_samples] = Y_test.T
            for i in range(n_props):
                rs[imp][i,k] = np.ma.corrcoef(Y_predict[:,i],Y_test[:,i])[0,1]
            #feature_importances[imp][n_prop,:,k] = onevsrest.feature_importances_
    return rs,feature_importances,ys,ps
def go():
    """Train an RBF one-vs-rest SVC on factory data and print its results.

    Builds inputs and targets (limit 10000), holds out 10% for testing, then
    prints the formatted predicted probabilities, the formatted dense test
    targets, the log-loss, and a trailing blank line.
    """
    features = TrainingFactory.build_sparse_matrix_input(limit=10000)
    targets = TrainingFactory.build_sparse_matrix_target(limit=10000)
    split = train_test_split(features, targets, test_size=0.1)
    input_train, input_test, target_train, target_test = split

    rbf_svc = SVC(kernel='rbf', tol=0.001, probability=True)
    classif = OneVsRestClassifier(rbf_svc)
    classif.fit(input_train, target_train)
    output_targets = classif.predict_proba(input_test)

    print(ClassifierFactory.output_function(output_targets))
    print(ClassifierFactory.output_function(target_test.todense()))
    print(log_loss(target_test, output_targets))
    print('')
def conduct_test(base_clf, test_predict_proba=False):
    """Run the shared OvR checks for ``base_clf`` on the enclosing fixtures.

    Checks the learned classes and a single prediction, optionally the
    predict_proba output, and finally a fit on the label indicator matrix.
    """
    probe = np.array([[0, 0, 4]])

    ovr = OneVsRestClassifier(base_clf).fit(X, y)
    assert_equal(set(ovr.classes_), classes)
    first_pred = ovr.predict(probe)[0]
    assert_equal(set(first_pred), set("eggs"))

    if test_predict_proba:
        X_test = np.array([[0, 0, 4]])
        probabilities = ovr.predict_proba(X_test)
        assert_equal(2, len(probabilities[0]))
        most_likely = ovr.classes_[np.argmax(probabilities, axis=1)]
        assert_equal(most_likely, ovr.predict(X_test))

    # Same estimator, this time fit on the label indicator matrix.
    ovr = OneVsRestClassifier(base_clf).fit(X, Y)
    first_pred = ovr.predict([[3, 0, 0]])[0]
    assert_equal(first_pred, 1)
def model(train_data, train_label, test_data, test_label, n_classes):
    """Score test data with a one-vs-rest Gaussian naive Bayes model.

    Both label vectors are binarized against classes ``0..n_classes-1``.
    Returns ``(test_score, test_label)``: the ``predict_proba`` output on
    ``test_data`` and the binarized test labels.
    """
    class_ids = list(np.arange(n_classes))

    # Binarize the output
    train_label = label_binarize(train_label, classes=class_ids)
    test_label = label_binarize(test_label, classes=class_ids)

    # Base estimator. Alternatives left by the author:
    #   LogisticRegression(C=1.0), SVC(), KNeighborsClassifier()
    basic_clf = GaussianNB()

    # Multi-class wrapper
    classifier = OneVsRestClassifier(basic_clf)
    classifier.fit(train_data, train_label)

    # classifier.decision_function(test_data) was the alternative scorer.
    test_score = classifier.predict_proba(test_data)
    return test_score, test_label
class Classifier(object):
    '''Classifier base class. Uses OneVsRest for multiclass problems'''

    def __init__(self, clf, x_train, y_train):
        """Fit ``clf`` on the training data.

        ``clf`` is wrapped in ``OneVsRestClassifier`` when ``y_train`` holds
        more than two distinct labels; otherwise it is used directly.
        """
        n_classes = len(set(y_train))
        if n_classes > 2:
            self.clf = OneVsRestClassifier(clf)
        else:
            self.clf = clf
        self.clf.fit(x_train, y_train)

    def __call__(self, x_val):
        """Return the fitted model's ``predict_proba`` output for ``x_val``."""
        return self.clf.predict_proba(x_val)

    def describe(self):
        """Return the model's non-callable parameters as a dict."""
        # ``items()`` exists on both Python 2 and 3; ``iteritems()`` is
        # Python-2-only and raises AttributeError on Python 3.
        return dict(
            (k, v)
            for k, v in self.clf.get_params().items()
            if not callable(v)
        )
def process_data_set(X_train, y_train, X_val, X_test, c=1.0):
    """Fit one-vs-rest logistic regression on the concatenated layer features.

    The feature columns are laid out as four consecutive layer groups:
        "fc6" "fc7" "flatten4" "flatten5"
        4096 + 4096 + 384*4 + 256*4
    giving boundaries [0, 4096, 8192, 9728, 10752]. All four groups are used.

    Returns ``(y_vl, y_ts)``: predicted probabilities for the validation and
    test sets.
    """
    cls = OneVsRestClassifier(LogisticRegression(C=c))

    layer_widths = np.array((4096, 4096, 384*4, 256*4))
    boundaries = np.concatenate(([0], np.cumsum(layer_widths)))
    columns = range(boundaries[0], boundaries[4])

    x_tr, x_vl, x_ts = [matrix[:, columns] for matrix in (X_train, X_val, X_test)]

    cls.fit(x_tr, y_train)
    y_vl = cls.predict_proba(x_vl)
    y_ts = cls.predict_proba(x_ts)
    return y_vl, y_ts
def test_ovr_single_label_predict_proba():
    """Single-label OvR: probabilities sum to one and agree with predict."""
    nb = MultinomialNB(alpha=1)
    X, Y = iris.data, iris.target
    X_train, Y_train = X[:80], Y[:80]
    X_test, Y_test = X[80:], Y[80:]
    clf = OneVsRestClassifier(nb).fit(X_train, Y_train)

    # An estimator offering only a decision function: asking for
    # predict_proba is expected to raise AttributeError.
    svr_ovr = OneVsRestClassifier(svm.SVR()).fit(X_train, Y_train)
    assert_raises(AttributeError, svr_ovr.predict_proba, X_test)

    Y_pred = clf.predict(X_test)
    Y_proba = clf.predict_proba(X_test)

    assert_almost_equal(Y_proba.sum(axis=1), 1.0)

    # The index of the largest probability in each row must be the
    # predicted label.
    most_likely = np.array([row.argmax() for row in Y_proba])
    assert_false((most_likely - Y_pred).any())
def multiclass_svm(training_feature_array, training_label_array, test_feature_array,
                   test_label_array, kernel_type="rbf", grid_search=True, n_fold=5,
                   costs=None, gammas=None):
    """
    Multi-class classification with an SVC (one-vs-rest).

    @param training_feature_array: training data
    @param training_label_array: training data labels
    @param test_feature_array: test data
    @param test_label_array: test data labels
    @keyword kernel_type: kernel type
    @keyword grid_search: whether to optimize the parameters
    @keyword n_fold: number of folds
    @keyword costs: list of cost values
    @keyword gammas: list of gamma values

    NOTE(review): the original docstring advertised a return value of
    (accuracy, list of predictions, list of distances from the decision
    surface, SVC object), but the visible body contains no ``return``
    statement, so the function returns None. Confirm the intended contract.
    """
    # Build the multi-class classifier.
    multi_svm_model = OneVsRestClassifier(sksvm.SVC(kernel=kernel_type, probability=True))
    # print multi_svm_model.get_params()

    if grid_search:
        # Parameter optimization.
        ret_c, ret_gamma = optimizeParameter(multi_svm_model, kernel_type,
                                             training_feature_array, training_label_array,
                                             _fold=n_fold, _costs=costs, _gammas=gammas)
    else:
        # Without optimization fall back to common defaults:
        # cost 1.0 and gamma = 1 / (number of feature dimensions).
        ret_c = 1.0
        # Float literal forces true division; on Python 2 ``1/len(...)`` was
        # integer division and evaluated to 0 for more than one feature.
        ret_gamma = 1.0 / len(training_feature_array[0,])

    # Apply the chosen cost and gamma to the wrapped SVC.
    # multi_svm_model.set_params(C=ret_c, gamma=ret_gamma)
    multi_svm_model.estimator.set_params(C=ret_c, gamma=ret_gamma)

    # Training.
    multi_svm_model.fit(training_feature_array, training_label_array)

    # Prediction.
    result_class_list = multi_svm_model.predict(test_feature_array)

    # Class likelihoods.
    result_probability_list = multi_svm_model.predict_proba(test_feature_array)

    # Accuracy.
    try:
        precision = skmet.accuracy_score(test_label_array, result_class_list)
    except DeprecationWarning:
        # Original behaviour kept: a DeprecationWarning raised as an
        # exception is ignored (``precision`` is then left unset).
        pass
def test_ovr_multilabel_predict_proba():
    """Multilabel OvR: predict_proba availability and agreement with predict."""
    nb = MultinomialNB(alpha=1)
    for allow_unlabeled in (False, True):
        X, Y = datasets.make_multilabel_classification(
            n_samples=100,
            n_features=20,
            n_classes=5,
            n_labels=3,
            length=50,
            allow_unlabeled=allow_unlabeled,
            random_state=0,
        )
        X_train, Y_train = X[:80], Y[:80]
        X_test = X[80:]
        clf = OneVsRestClassifier(nb).fit(X_train, Y_train)

        # Decision-function-only estimator: no predict_proba is exposed.
        svr_ovr = OneVsRestClassifier(svm.SVR(gamma='scale')).fit(X_train, Y_train)
        assert_false(hasattr(svr_ovr, 'predict_proba'))

        # predict_proba disabled through a parameter, before and after fit.
        svc_ovr = OneVsRestClassifier(svm.SVC(gamma='scale', probability=False))
        assert_false(hasattr(svc_ovr, 'predict_proba'))
        svc_ovr.fit(X_train, Y_train)
        assert_false(hasattr(svc_ovr, 'predict_proba'))
        assert_true(hasattr(svc_ovr, 'decision_function'))

        # predict_proba that only becomes available once fitting is done.
        search = GridSearchCV(svm.SVC(gamma='scale', probability=False),
                              param_grid={'probability': [True]})
        proba_after_fit = OneVsRestClassifier(search)
        assert_false(hasattr(proba_after_fit, 'predict_proba'))
        proba_after_fit.fit(X_train, Y_train)
        assert_true(hasattr(proba_after_fit, 'predict_proba'))

        Y_pred = clf.predict(X_test)
        Y_proba = clf.predict_proba(X_test)

        # A label is predicted exactly when its probability exceeds 0.5.
        above_half = Y_proba > .5
        assert_array_equal(above_half, Y_pred)
def test_ovr_single_label_predict_proba():
    """Single-label OvR: probabilities sum to one and agree with predict."""
    nb = MultinomialNB(alpha=1)
    X, Y = iris.data, iris.target
    X_train, Y_train = X[:80], Y[:80]
    X_test = X[80:]
    clf = OneVsRestClassifier(nb).fit(X_train, Y_train)

    # Decision-function-only estimator: no predict_proba is exposed.
    svr_ovr = OneVsRestClassifier(svm.SVR(gamma='scale')).fit(X_train, Y_train)
    assert_false(hasattr(svr_ovr, 'predict_proba'))

    Y_pred = clf.predict(X_test)
    Y_proba = clf.predict_proba(X_test)

    assert_almost_equal(Y_proba.sum(axis=1), 1.0)

    # The index of the largest probability in each row must be the
    # predicted label.
    most_likely = np.array([row.argmax() for row in Y_proba])
    assert_false((most_likely - Y_pred).any())
def conduct_test(base_clf, test_predict_proba=False):
    """Run the shared OvR checks for ``base_clf`` on the enclosing fixtures.

    Checks the learned classes, a single prediction, the decision-function
    shape (when available), optionally predict_proba, and finally a fit on
    the label indicator matrix.
    """
    probe = np.array([[0, 0, 4]])

    ovr = OneVsRestClassifier(base_clf).fit(X, y)
    assert_equal(set(ovr.classes_), classes)
    first_pred = ovr.predict(probe)[0]
    assert_array_equal(first_pred, ["eggs"])

    if hasattr(base_clf, 'decision_function'):
        decision = ovr.decision_function(X)
        assert_equal(decision.shape, (5,))

    if test_predict_proba:
        X_test = np.array([[0, 0, 4]])
        probabilities = ovr.predict_proba(X_test)
        assert_equal(2, len(probabilities[0]))
        most_likely = ovr.classes_[np.argmax(probabilities, axis=1)]
        assert_equal(most_likely, ovr.predict(X_test))

    # Same estimator, this time fit on the label indicator matrix.
    ovr = OneVsRestClassifier(base_clf).fit(X, Y)
    first_pred = ovr.predict([[3, 0, 0]])[0]
    assert_equal(first_pred, 1)
def test_ovr_multilabel_predict_proba():
    """Multilabel OvR: labels with probability > 0.5 must equal predict."""
    nb = MultinomialNB(alpha=1)
    for allow_unlabeled in (False, True):
        X, Y = datasets.make_multilabel_classification(
            n_samples=100,
            n_features=20,
            n_classes=5,
            n_labels=3,
            length=50,
            allow_unlabeled=allow_unlabeled,
            random_state=0,
        )
        X_train, Y_train = X[:80], Y[:80]
        X_test, Y_test = X[80:], Y[80:]
        clf = OneVsRestClassifier(nb).fit(X_train, Y_train)

        # An estimator offering only a decision function: asking for
        # predict_proba is expected to raise AttributeError.
        svr_ovr = OneVsRestClassifier(svm.SVR()).fit(X_train, Y_train)
        assert_raises(AttributeError, svr_ovr.predict_proba, X_test)

        Y_pred = clf.predict(X_test)
        Y_proba = clf.predict_proba(X_test)

        # Per sample, collect the indices of the labels whose probability
        # exceeds 0.5 and compare them with the predicted label tuples.
        above_half = Y_proba > 0.5
        pred = [tuple(row.nonzero()[0]) for row in above_half]
        assert_equal(pred, Y_pred)
def multiclass_lda(training_feature_array, training_label_array, test_feature_array,
                   test_label_array):
    """
    Multi-class classification with LDA (one-vs-rest).

    @param training_feature_array: training data
    @param training_label_array: training data labels
    @param test_feature_array: test data
    @param test_label_array: test data labels

    NOTE(review): the original docstring advertised a return value of
    (overall accuracy, list of predictions, list of membership probabilities
    for the predicted class, LDA object) and was marked "operation verified",
    but the visible body contains no ``return`` statement — confirm.
    """
    multi_lda_obj = OneVsRestClassifier(slda.LDA())
    multi_lda_obj.fit(training_feature_array, training_label_array)

    print("test...")
    class_result = multi_lda_obj.predict(test_feature_array)
    proba_result = multi_lda_obj.predict_proba(test_feature_array)
    # Highest class probability per test sample.
    proba_max_result = np.max(proba_result, axis=1)

    try:
        precision = smet.accuracy_score(test_label_array, class_result)
    except DeprecationWarning:
        # A DeprecationWarning raised as an exception is ignored.
        pass
class SVM(Classifier):
    """One-vs-rest SVC with probability outputs behind the Classifier API."""

    def __init__(self):
        self.__clf = OneVsRestClassifier(SVC(probability=True))
        self.__class_num = 0
        self.__class_zero_indexing = True

    @staticmethod
    def name():
        """Return the identifier of this classifier."""
        return "svm"

    def train(self, X, Y, class_number=-1):
        """Fit on ``X``/``Y`` and record the expected number of classes.

        The recorded count is the larger of the number of distinct values
        in ``Y`` and ``class_number``.
        """
        distinct = np.unique(Y).size
        self.__class_num = max(distinct, class_number)
        self.__clf.fit(X, Y)

    def predict(self, X):
        """Return class probabilities, one column per recorded class."""
        probabilities = self.__clf.predict_proba(X)
        assert len(probabilities[0]) == self.__class_num
        return probabilities

    def predict2(self, X):
        """Return the predicted labels for ``X``."""
        labels = self.__clf.predict(X)
        # assert len(out[0]) == self.__class_num
        return labels
# Fragment of a larger script: fits classifiers 2-4, predicts with all four,
# sums their probability scores, and starts a per-class precision/recall
# loop. NOTE(review): clf1's fit, the timer start `st`, and the loop body
# lie outside this excerpt.
clf2.fit( x_train, y_train )
clf3.fit( x_train, y_train )
clf4.fit( x_train, y_train )
print "training ended"
et = time.time()
tt = et - st
print "Training Time = " + str(tt) + "\n"
#predictions
pred1 = clf1.predict( x_test )
pred2 = clf2.predict( x_test )
pred3 = clf3.predict( x_test )
pred4 = clf4.predict( x_test )
# Only the second classifier's hard predictions are kept as `pred`.
pred = pred2;
#NOTE: change to decision_function or predict_proba depending on the classifier
y_score1 = clf1.predict_proba(x_test)
y_score2 = clf2.predict_proba(x_test)
y_score3 = clf3.predict_proba(x_test)
y_score4 = clf4.predict_proba(x_test)
#y_score = clf.decision_function(x_test)
# Ensemble score: unnormalized sum of the four probability matrices.
y_score = y_score1 + y_score2 + y_score3 + y_score4
#################################################################################
#PrecisionRecall-plot
# Per-class containers, presumably filled by the loop below (body not shown).
precision = dict()
recall = dict()
PR_area = dict()
PR_thresholds = dict()
average_precision = dict()
for i in range(n_classes):
# Split `data` / `label` into consecutive train and test slices, fit a
# one-vs-rest linear SVC, turn the predicted probabilities into one-hot
# predictions, and print the training accuracy.
train_num = 1500
test_num = 1500
data_train = data[0:train_num, ]
label_train = label[0:train_num, ]
# label_train=label_train[0:2]
# print(label_train.shape)
data_test = data[train_num:train_num + test_num, ]
label_test = label[train_num:train_num + test_num, ]
# print(label_test.shape)

## multi classification
model_0 = OneVsRestClassifier(SVC(kernel='linear', probability=True, gamma='scale'))
model_0.fit(data_train, label_train)

# One-hot test predictions: set a 1 at the most probable class of each row.
pre_0 = model_0.predict_proba(data_test)
max_ind = np.argmax(pre_0, axis=1)
# print(max_ind)
pre = np.zeros_like(pre_0)
pre[np.arange(pre.shape[0]), max_ind] = 1
# print(pre)

# One-hot training predictions. The buffer is shaped after the *training*
# probabilities; the original used the test matrix shape, which is only
# correct when both splits have the same number of rows.
pre_train0 = model_0.predict_proba(data_train)
max_ind_train = np.argmax(pre_train0, axis=1)
# print(max_ind)
pre_train = np.zeros_like(pre_train0)
pre_train[np.arange(max_ind_train.shape[0]), max_ind_train] = 1
print(metrics.accuracy_score(label_train, pre_train))
def evaluate(graph_path, embedding_file, number_of_shuffles, training_ratios,
             classification_method, file_type="binary"):
    """Evaluate node embeddings on multi-label community classification.

    The graph is read from ``graph_path`` (GML), node-to-community
    assignments are extracted, and a pairwise similarity matrix is computed
    from the embeddings. For every training ratio and shuffle, a one-vs-rest
    SVC with a precomputed kernel is trained on the similarity sub-matrix
    and scored with F1.

    Parameters
    ----------
    graph_path : path of the GML graph file.
    embedding_file : path of the embedding file.
    number_of_shuffles : number of random permutations per training ratio.
    training_ratios : iterable of fractions of nodes used for training.
    classification_method : ``"svm-hamming"`` or ``"svm-cosine"``; selects
        the distance that is turned into a similarity.
    file_type : ``"binary"`` uses ``read_binary_emb_file``; anything else
        uses ``read_embedding_file``.

    Returns
    -------
    dict mapping each entry of ``_score_types`` to an ``OrderedDict`` of
    ``training ratio -> list of scores`` (one score per shuffle).

    Raises
    ------
    ValueError
        If ``classification_method`` is not one of the supported names.
    """
    cache_size = 10240

    g = nx.read_gml(graph_path)
    node2community = get_node2community(g)
    K = detect_number_of_communities(g)

    nodelist = [int(node) for node in node2community]
    N = len(nodelist)

    if file_type == "binary":
        x = read_binary_emb_file(file_path=embedding_file, nodelist=nodelist)
    else:
        x = read_embedding_file(embedding_file, nodelist=nodelist)

    # Binary indicator matrix: row per node, column per community.
    label_matrix = [[1 if k in node2community[str(node)] else 0 for k in range(K)]
                    for node in nodelist]
    label_matrix = csr_matrix(label_matrix)

    results = {}
    for score_t in _score_types:
        results[score_t] = OrderedDict()
        for ratio in training_ratios:
            results[score_t].update({ratio: []})

    print("+ Similarity matrix is begin computed!")
    if classification_method == "svm-hamming":
        sim = 1.0 - cdist(x, x, 'hamming')
    elif classification_method == "svm-cosine":
        sim = 1.0 - cdist(x, x, 'cosine')
    else:
        raise ValueError("Invalid classification method name: {}".format(
            classification_method))

    for train_ratio in training_ratios:
        for shuffleIdx in range(number_of_shuffles):
            print("Current train ratio: {} - shuffle: {}/{}".format(
                train_ratio, shuffleIdx + 1, number_of_shuffles))

            # Shuffle rows and columns of the similarity matrix consistently
            # with the labels.
            shuffled_idx = np.random.permutation(N)
            shuffled_sim = sim[shuffled_idx, :]
            shuffled_sim = shuffled_sim[:, shuffled_idx]
            shuffled_labels = label_matrix[shuffled_idx]

            train_size = int(train_ratio * N)

            # Precomputed kernel: train block is train x train, test block is
            # test x train.
            train_sim = shuffled_sim[0:train_size, :]
            train_sim = train_sim[:, 0:train_size]
            train_labels = shuffled_labels[0:train_size]

            test_sim = shuffled_sim[train_size:, :]
            test_sim = test_sim[:, 0:train_size]
            test_labels = shuffled_labels[train_size:]

            ovr = OneVsRestClassifier(
                SVC(kernel="precomputed", cache_size=cache_size, probability=True))
            ovr.fit(train_sim, train_labels)

            # Each node may carry several labels: predict as many labels as
            # the node truly has, taking the most probable ones.
            test_prob = np.asarray(ovr.predict_proba(test_sim))
            n_test = test_labels.shape[0]
            y_pred = []
            for i in range(n_test):
                k = test_labels[i].getnnz()  # number of labels to predict
                ranked = test_prob[i, :].argsort()
                # Slice from an explicit start index: ``ranked[-k:]`` with
                # k == 0 would return *every* class instead of none.
                y_pred.append(ranked[ranked.size - k:])

            # True labels as lists of column indices.
            y_true = [[] for _ in range(n_test)]
            co = test_labels.tocoo()
            for i, j in zip(co.row, co.col):
                y_true[i].append(j)

            mlb = MultiLabelBinarizer(classes=range(K))
            for score_t in _score_types:
                score = f1_score(y_true=mlb.fit_transform(y_true),
                                 y_pred=mlb.fit_transform(y_pred),
                                 average=score_t)
                results[score_t][train_ratio].append(score)

    return results
def _plot_classifier_panel(position, title, y_true, y_pred, class_names,
                           cv_scores, font):
    """Draw one of the four confusion-matrix panels and annotate its scores."""
    cm = confusion_matrix(y_true, y_pred, labels=list(class_names))
    plt.subplot(1, 4, position)
    plot_confusion_matrix(cm1=cm,
                          classes=class_names,
                          normalize=True,
                          gradientbar=False,
                          title=title)
    cv_scores = ["{:.2f}".format(x) for x in cv_scores]
    # beta=2.0 makes the third value an F2 score for the 'Parasitized' class.
    p_r_fscore = precision_recall_fscore_support(y_true,
                                                 y_pred,
                                                 beta=2.0,
                                                 labels=['Parasitized'],
                                                 pos_label='Parasitized',
                                                 average='binary')
    print(p_r_fscore[:3])
    plt.text(
        0.01,
        -1,
        '\nCV Scores:\n' + str(cv_scores) + '\n' +
        'Precision: {d[0]:.2f}\nRecall: {d[1]:.2f} \nF2 score: {d[2]:.2f} \n'.
        format(d=p_r_fscore[:3]),
        ha='left',
        va='bottom',
        fontdict=font,
        transform=plt.subplot(1, 4, position).transAxes)


def ML_with_BN_feat(bn_feat_file='../data/factors_n_bn_feat.csv',
                    n_comp=100,
                    plotting=False):
    """Train and compare four classifiers on PCA-reduced bottleneck features.

    Rows labelled 'Parasitized' or 'Uninfected' are read from
    ``bn_feat_file``, split 80/20, reduced with PCA and fed to logistic
    regression, one-vs-rest Gaussian naive Bayes, one-vs-rest random forest
    and AdaBoost. The fitted PCA and three of the models are pickled under
    ``../models``; explained-variance, confusion-matrix and ROC plots go to
    ``../plots``; per-sample predictions and ROC data go to ``../data``.

    Parameters
    ----------
    bn_feat_file : CSV with a 'label' column and feature columns 'x0'..'x2047'.
    n_comp : number of principal components; values below 50 are raised to 50.
    plotting : when true, also draw pairwise plots of the first 11 components
        via ``caa_plot_pairs``.

    Returns
    -------
    None. The function works through its printed output and written files.
    """
    plt.close('all')
    if n_comp < 50:
        n_comp = 50

    # Load the bottleneck features of each image, keeping the two classes.
    feat_df = pd.read_csv(bn_feat_file, index_col=0, dtype='unicode')
    print('Data frame shape:', feat_df.shape)
    mask = feat_df.loc[:, 'label'].isin(['Parasitized', 'Uninfected'])
    feat_df = feat_df.loc[mask, :].drop_duplicates()
    print('Number of bottleneck features:', feat_df.shape[1] - 7)

    y = feat_df.loc[:, ['label']].values
    print(type(y), y.shape)
    print('Number of samples for each label \n',
          feat_df.groupby('label')['label'].count())
    X = feat_df.loc[:, 'x0':'x2047'].astype(float).values

    class_names = set(feat_df.loc[:, 'label'])

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=0)

    # --- Dimensionality reduction -------------------------------------
    # First fit with n_components=None (min(n observations, n features)) to
    # inspect how the explained variance decays over the components.
    print('...running PCA analysis...')
    pca_none = PCA(n_components=None)
    pca_none.fit(X_train)
    explained_variance = pca_none.explained_variance_ratio_

    plt.figure(0)
    plt.plot(explained_variance)
    plt.xlabel('n_components')
    plt.ylabel('variance')
    plt.suptitle('Explained Variance of Principle Components')
    plt.savefig('../plots/pca_var_vs_ncomp.png')

    # Then project onto the requested number of components.
    print('...running PCA with', n_comp, 'components')
    pca = PCA(n_components=n_comp)
    X_train = pca.fit_transform(X_train)
    X_test = pca.transform(X_test)
    explained_variance1 = pca.explained_variance_ratio_

    plt.figure(1)
    plt.plot(explained_variance1)
    plt.xlabel('n_components')
    plt.ylabel('variance')
    plt.suptitle('Explained Variance of Principle Components')
    plt.show(block=False)
    plt.savefig('../plots/pca_var_vs_{}_ncomp.png'.format(n_comp))

    # Save the fitted PCA; the context manager closes the file handle.
    save_PCA = '../models/trained_PCA.sav'
    with open(save_PCA, 'wb') as model_file:
        pickle.dump(pca, model_file)

    if plotting:
        # Pairwise plots of 11 components; only works with two labels.
        feat_df_ploting = pd.DataFrame({'label': y_train[:, 0]})
        caa_plot_pairs(X_train[:, :11], feat_df_ploting, 'PCA')
        plt.figure(figsize=(16, 24))
        plt.show(block=False)

    pca_DF = pd.DataFrame(X_train[:, :11])
    df_y_train = pd.DataFrame(y_train, columns=['label'])
    df_pca_train = pd.concat([df_y_train, pca_DF], axis=1)
    feature_names = df_pca_train.columns[1:]
    n_comp_pca = pca_DF.shape[1]
    print('n_comp_pca', n_comp_pca)
    print('feature_names', feature_names)
    print('df_pca_train columns', list(df_pca_train.columns))

    plt.close('all')

    # --- Figure used to compare the confusion matrices ----------------
    # 'axes.titlesize' used to appear twice in this dict; only the later
    # value ('large') ever took effect, so that is the one kept.
    params = {
        'axes.labelsize': 'large',
        'axes.titlesize': 'large',
        'xtick.labelsize': 'medium',
        'ytick.labelsize': 'medium'
    }
    plt.rcParams.update(params)
    fig, _ = plt.subplots(1, 4, sharey=True, figsize=(15, 8.5))
    font = {
        'linespacing': 1.5,
        'color': 'darkred',
        'weight': 'normal',
        'size': 14
    }

    # --- Logistic regression ------------------------------------------
    from sklearn import linear_model
    log_reg_classifier = linear_model.LogisticRegression(penalty='l2',
                                                         tol=0.0001,
                                                         C=1.0,
                                                         fit_intercept=True,
                                                         intercept_scaling=1,
                                                         class_weight=None,
                                                         random_state=None,
                                                         solver='liblinear',
                                                         max_iter=100,
                                                         multi_class='ovr',
                                                         n_jobs=1)
    log_r = log_reg_classifier.fit(X_train, df_y_train['label'].values)
    y_test_predictions_log_r = log_r.predict(X_test)
    y_predict_prob_log_r = log_r.predict_proba(X_test)

    # Per-sample predictions of every model are collected in one frame.
    results = pd.DataFrame()
    results['y_test'] = y_test[:, 0]
    results['log_r_pred'] = list(y_test_predictions_log_r)
    results['log_r_prob'] = y_predict_prob_log_r[:, 0]

    # Cross-validate with the estimator's default scoring and CV splitter.
    cv_scores_lr = cross_val_score(estimator=log_r, X=X_train, y=y_train)
    print('Logistic regression cv_scores', cv_scores_lr)

    save_LR = '../models/trained_log_reg.sav'
    with open(save_LR, 'wb') as model_file:
        pickle.dump(log_reg_classifier, model_file)

    _plot_classifier_panel(1, 'Logistic Regression\n', y_test,
                           y_test_predictions_log_r, class_names,
                           cv_scores_lr, font)

    # --- One-vs-rest Gaussian naive Bayes -------------------------------
    classifier = OneVsRestClassifier(GaussianNB())
    nbclf = classifier.fit(X_train, df_y_train['label'].values)
    y_test_predictions_nbclf = nbclf.predict(X_test)
    y_predict_prob = nbclf.predict_proba(X_test)

    results['NB_pred'] = list(y_test_predictions_nbclf)
    results['NB_r_prob'] = y_predict_prob[:, 0]

    cv_scores = cross_val_score(classifier, X_train, y_train)
    print('NB cv_scores', cv_scores)

    _plot_classifier_panel(2, 'One vs Rest - Naive Bayes\n', y_test,
                           y_test_predictions_nbclf, class_names, cv_scores,
                           font)

    # --- Random forest --------------------------------------------------
    # max_features cannot exceed the number of PCA components.
    f = min(n_comp, 100)
    n = 30
    RFclf = OneVsRestClassifier(
        RandomForestClassifier(n_estimators=n, max_features=f))
    RFclf.fit(X_train, df_y_train['label'].values)
    y_test_predictions_RF = RFclf.predict(X_test)
    y_score_answer_RF = RFclf.predict_proba(X_test)

    results['RF'] = list(y_test_predictions_RF)
    results['RF_prob'] = y_score_answer_RF[:, 0]

    cv_scores_RF = cross_val_score(RFclf, X_train, y_train)
    print('Random Forest cv_scores', cv_scores_RF)

    save_RF = '../models/trained_RF.sav'
    with open(save_RF, 'wb') as model_file:
        pickle.dump(RFclf, model_file)

    _plot_classifier_panel(
        3,
        'Random Forests\nestimators: {0}\n max_features: {1}\n'.format(n, f),
        y_test, y_test_predictions_RF, class_names, cv_scores_RF, font)

    # --- AdaBoost -------------------------------------------------------
    AdaBoost = AdaBoostClassifier()
    AdaBoost.fit(X_train, y_train)
    y_predAB = AdaBoost.predict(X_test)
    y_predAB_prob = AdaBoost.predict_proba(X_test)

    results['AB_pred'] = list(y_predAB)
    results['AB_prob'] = y_predAB_prob[:, 0]
    results.to_csv('../data/y_test_predictions')

    cv_scores_AB = cross_val_score(AdaBoost, X_train, y_train)
    print('Adaptive Boosting cv_scores', cv_scores_AB)

    save_AdaBoost = '../models/trained_AdaBoost.sav'
    with open(save_AdaBoost, 'wb') as model_file:
        pickle.dump(AdaBoost, model_file)

    _plot_classifier_panel(4, 'AdaBoost\n', y_test, y_predAB, class_names,
                           cv_scores_AB, font)

    # --- Mean accuracy of every model on the held-out split -------------
    print('\nLogistic Regression mean accuracy:',
          round(log_reg_classifier.score(X_test, y_test), 4))
    print('One vs Rest - Naive Bayes mean accuracy:',
          round(classifier.score(X_test, y_test), 4))
    print('Random Forest Classifier mean accuracy:',
          round(RFclf.score(X_test, y_test), 4))
    print('Adaptive Boosting Classifier mean accuracy:',
          round(AdaBoost.score(X_test, y_test), 4))

    plt.tight_layout()
    fig.tight_layout()
    plt.savefig('../plots/confusion_matrix_result_1.png')
    plt.show(block=False)

    # --- ROC curve and AUC for logistic regression ----------------------
    plt.figure(12)
    # 'Parasitized' becomes the positive class (1). Column 0 of
    # predict_proba is used as its score.
    # NOTE(review): this relies on 'Parasitized' being classes_[0] of the
    # fitted model - confirm.
    y_test_bin = label_binarize(y_test, classes=['Uninfected', 'Parasitized'])
    fpr, tpr, thresholds = roc_curve(y_test_bin, y_predict_prob_log_r[:, 0])
    roc_df = pd.DataFrame({'fpr': fpr, 'tpr': tpr, 'thresholds': thresholds})
    roc_df.to_csv('../data/roc_data.csv')

    roc_auc = auc(fpr, tpr)
    plt.title('Receiver Operating Characteristic', fontsize=18)
    plt.plot(fpr,
             tpr,
             lw=2,
             color='#3399ff',
             label='AUC = {0:.2f}'.format(roc_auc))
    plt.plot([0, 1], [0, 1],
             linestyle='--',
             lw=2,
             color='gray',
             label='Chance',
             alpha=.8)
    plt.ylabel('True Positive Rate', fontsize=14)
    plt.xlabel('False Positive Rate', fontsize=14)
    plt.tick_params(axis='both', which='major', labelsize=12)
    plt.legend(loc="lower right")
    plt.tight_layout()
    plt.savefig('../plots/ROC_CNN_log_reg.png')
    plt.show()
    plt.close('all')
    print(
        'If launched from command line use ctrl+z to close all plots and finish'
    )
class SceneClassifier:
    """Multi-label scene classifier over fastText sentence embeddings.

    Questions are segmented with jieba, embedded through an HTTP fastText
    service and classified by a one-vs-rest gradient-boosting model. Label
    names are encoded with a ``MultiLabelBinarizer`` stored in ``self.mlb``;
    the probability columns of the model therefore follow
    ``self.mlb.classes_`` (sorted), not the order of ``self.named_labels``.
    """

    def __init__(self):
        self.kernel = None
        # One label per training file, in the order the files are passed.
        self.named_labels = [
            'base', 'greeting', 'qa', 'repeat_user', 'repeat_machine', 'sale'
        ]
        self.fasttext_url = "http://localhost:11425/fasttext/s2v?q={0}&w="
        self.fasttext_url_weighted = "http://localhost:11425/fasttext/s2v?q={0}&w={1}"
        self.weighted = False

    def _add_extra_dict(self, path):
        """Register every comma-separated word after the last ':' of each line with jieba."""
        with open(path, 'r') as inp:
            for line in inp:
                for word in line.split(':')[-1].split(','):
                    jieba.add_word(word)

    def cut(self, input_):
        """Strip Chinese punctuation and return the space-joined jieba tokens."""
        input_ = QueryUtils.static_remove_cn_punct(input_)
        seg = " ".join(jieba.cut(input_, cut_all=False))
        return _uniout.unescape(str(seg), 'utf8')

    def get_w2v_emb(self, tokens):
        """Return the squeezed fastText embedding fetched for ``tokens``."""
        embedding = self._fasttext_vector(tokens)
        return np.squeeze(embedding)

    def _idf_weights(self, tokens):
        """Fetch per-token IDF weights; unknown tokens get the largest weight seen (at least 1)."""
        idf_url = "http://10.89.100.14:3032/s/{0}".format("%7C".join(tokens))
        returned_json = requests.get(url=idf_url).json()
        max_weight = 1
        for value in returned_json.values():
            if value > max_weight:
                max_weight = value
        return [
            str(returned_json[token]) if token in returned_json else
            str(max_weight) for token in tokens
        ]

    def _fasttext_vector(self, tokens):
        """Query the fastText service for the sentence vector of ``tokens``.

        Returns the ``'vector'`` field of the JSON response, or ``None`` when
        the request fails. If the weights cannot be built, the unweighted URL
        is used instead, so ``url`` is always defined before the request.
        """
        joined_tokens = ','.join(tokens)
        try:
            if not self.weighted:
                weights = [
                    str(weight) for weight in np.ones(shape=len(tokens))
                ]
            else:
                weights = self._idf_weights(tokens)
            url = self.fasttext_url_weighted.format(joined_tokens,
                                                    ','.join(weights))
        except Exception:
            traceback.print_exc()
            url = self.fasttext_url.format(joined_tokens)

        try:
            r = requests.get(url=url)
            return r.json()['vector']
        except Exception:
            print_cn(url)
            traceback.print_exc()
            return None

    def _prepare_data(self, files):
        """Read one file per named label and build the training arrays.

        Returns ``(embeddings, labels, queries)``: one embedding per query,
        the binarized label matrix, and the query strings. A question that
        appears under several labels gets all of them on a single row.
        Also fits and stores ``self.mlb``.
        """
        print('prepare data...')
        embeddings = list()
        queries = list()
        queries_ = dict()
        labels = list()
        mlb = MultiLabelBinarizer()

        for index, path in enumerate(files):
            label = self.named_labels[index]
            queries_[label] = list()
            with open(path, 'r') as f:
                for line in f:
                    question = line.replace('\t', '').replace(
                        ' ', '').strip('\n').decode('utf-8')
                    question = QueryUtils.static_remove_cn_punct(str(question))
                    tokens = QueryUtils.static_jieba_cut(question)
                    if len(tokens) == 0:
                        continue
                    queries_[label].append(question)

        # Row of the first occurrence of each question. Replaces the repeated
        # linear `in` / `.index()` scans over `queries`.
        first_row = dict()
        for label, questions in queries_.items():
            for question in questions:
                row = first_row.get(question)
                if row is not None and label not in labels[row]:
                    labels[row].append(label)
                else:
                    first_row.setdefault(question, len(queries))
                    queries.append(question)
                    labels.append([label])
                    tokens = self.cut(question).split(' ')
                    embeddings.append(self.get_w2v_emb(tokens))

        embeddings = np.squeeze(np.array(embeddings))
        self.mlb = mlb.fit(labels)
        labels = self.mlb.transform(labels)
        return embeddings, labels, queries

    def _build(self, files):
        """Load the extra jieba dictionary, then prepare the training data."""
        self._add_extra_dict('../data/sc/belief_graph.txt')
        return self._prepare_data(files)

    def train(self, pkl, files):
        """Fit the classifier on ``files``, pickle ``self`` to ``pkl`` and validate on the training set."""
        embeddings, labels, queries = self._build(files)
        print('train classifier...')
        self.kernel = OneVsRestClassifier(
            GradientBoostingClassifier(max_depth=5, n_estimators=1000))
        self.kernel.fit(embeddings, labels)
        with open(pkl, 'wb') as out:
            pickle.dump(self, out)
        print('train done and saved.')
        print('validation...')
        self.metrics_(labels, queries)

    def metrics_(self, labels, queries):
        """Print every mismatching query and the exact-match accuracy.

        Label sets are compared as sets, so the arbitrary ordering of a set
        can no longer turn a correct multi-label prediction into a mismatch.
        """
        correct = 0.0
        total = 0
        for i, query in enumerate(queries):
            if not query:
                continue
            total += 1
            label = np.expand_dims(labels[i], axis=0)
            real = list(self.mlb.inverse_transform(label)[0])
            label_, probs = self.predict(query)
            if set(real) != set(label_):
                print('{0}: {1}-->{2}'.format(query, ' '.join(real),
                                              ' '.join(sorted(set(label_)))))
            else:
                correct += 1
        # Avoid a ZeroDivisionError when there is nothing to score.
        accuracy = correct / total if total else 0.0
        print('accuracy:{0}'.format(accuracy))

    def validate(self, files):
        """Prepare ``files`` and report the metrics of the current model on them."""
        embeddings, labels, queries = self._prepare_data(files)
        self.metrics_(labels, queries)

    def _classify(self, question):
        """Return the raw ``(prediction, probabilities)`` arrays for one question."""
        line = str(question).replace(" ", "").replace("\t", "")
        tokens = self.cut(line).split(' ')
        embedding = np.reshape(self.get_w2v_emb(tokens), [1, -1])
        return self.kernel.predict(embedding), self.kernel.predict_proba(
            embedding)

    def _select_labels(self, prediction, probs):
        """Turn raw model output for a single sample into ``(labels, probabilities)``."""
        labels = self.mlb.inverse_transform(prediction)[0]
        if len(labels) == 0:
            # Nothing passed the threshold: fall back to the most probable
            # class. Columns follow mlb.classes_ (sorted), which differs from
            # named_labels for 'repeat_user' / 'repeat_machine'.
            index = np.argmax(probs[0])
            return [self.mlb.classes_[index]], [probs[0][index]]
        return labels, probs[0][np.where(prediction[0] == 1)]

    def predict(self, question):
        """Return ``(labels, probabilities)`` for ``question``.

        ``labels`` holds the predicted label names and ``probabilities`` the
        matching probabilities; when no label is predicted, the single most
        probable label is returned instead.
        """
        prediction, probs = self._classify(question)
        return self._select_labels(prediction, probs)

    def interface(self, q):
        """Return ``(labels, probs_dict)`` for ``q``.

        ``labels`` is the list of predicted label names and ``probs_dict``
        maps every known label to its predicted probability.
        """
        prediction, probs = self._classify(q)
        labels, _ = self._select_labels(prediction, probs)
        probs_dict = dict(zip(self.mlb.classes_, probs[0]))
        return list(labels), probs_dict

    @staticmethod
    def get_instance(path):
        """Load a pickled classifier from ``path``.

        SECURITY: unpickling executes code from the file; only load model
        files from a trusted source.
        """
        print('loading model file...')
        # Pickles are binary data and are written with 'wb' in train().
        with open(path, 'rb') as model_file:
            return pickle.load(model_file)
"mouseUp": [3], "usernameWPS": [0.003121452894], "passwordWPS": [2.63E-03], "totalTimeSpent": [7715], "countShift": [1], "countCapslock": [0], "countKey": [23], "dwellTimeAverage": [79.73913043], "flightTimesAverage": [348], "upDownTimeAverage": [205.9047619] }) print(X_test) y_pred = model.predict(X_test) y_pred_prob = model.predict_proba(X_test) print(y_pred) print(y_pred_prob) print(confusion_matrix(y_test, y_pred)) print(classification_report(y_test, y_pred)) print(accuracy_score(y_test, y_pred) * 100) print(model.predict_proba(new_input)) precision = precision_score(y_test, y_pred, average='binary') recall = recall_score(y_test, y_pred, average='binary') score = f1_score(y_test, y_pred, average='binary') print('Recall: %.3f' % recall)
# End of the comprehension that builds the test-set bag-of-words matrix (its
# beginning lies above this fragment).
    for text in x_test
])
print('X_train shape ', x_train_mybag.shape)
print('X_test shape ', x_test_mybag.shape)

# One indicator column per tag, in sorted tag order. The validation labels
# are derived from y_test.
y_train = label_binarize(y_train, classes=sorted(tags_counts.keys()))
y_val = label_binarize(y_test, classes=sorted(tags_counts.keys()))

import itertools

# Small grid: every combination of regularisation strength and penalty.
a = [0.1, 1]
b = ['l1', 'l2']
parameters = list(itertools.product(a, b))
print(parameters)

for C_value, penalty_value in parameters:
    print(C_value, penalty_value)
    clf = OneVsRestClassifier(
        LogisticRegression(penalty=penalty_value, C=C_value))
    clf.fit(x_train_mybag, y_train)
    y_val_predicted_labels_mybag = clf.predict_proba(x_test_mybag)

    # True class index: position of the first column equal to 1 in each row.
    # This does not depend on the loop variables.
    y_val_labels = [[tag for tag in list(enumerate(tags)) if tag[1] == 1][0][0]
                    for tags in y_val]
    print(y_val_labels[:10])

    # Predicted class index: the column with the highest probability.
    y_val_predicted_labels_mybag = [
        sorted(list(enumerate(tags)), key=lambda x: x[1], reverse=True)[0][0]
        for tags in y_val_predicted_labels_mybag
    ]
    print(y_val_predicted_labels_mybag[:10])
    print("Result with parameter: C: {}, penalty: {}".format(
        C_value, penalty_value))
    # NOTE(review): the message says "weighted" but the score is computed with
    # average='micro' - confirm which one is intended.
    print('F1 score weighted: {}'.format(
        f1_score(y_val_labels, y_val_predicted_labels_mybag, average='micro')))
def svm_bagclassifier(sentiment_data,
                      file_name_classifier,
                      file_name_vectorizer,
                      file_name_features,
                      bagging=False):
    """Train a linear SVM sentiment classifier and dump it with joblib.

    Pipeline: stem and pre-process the sentences, count-vectorize them with
    ``HouzierVectorizer``, apply TF-IDF and L2 normalisation, then select
    features with a union of linear KernelPCA and chi2 ``SelectKBest(k=2)``.
    The fitted feature union and the classifier are written to
    ``<base_dir>/CompiledModels/SentimentClassifiers``. Finally a handful of
    example sentences are classified and printed together with the elapsed
    time.

    Parameters
    ----------
    sentiment_data : sequence of ``(sentiment, sentence)`` pairs; only the
        first 1000 entries are used.
    file_name_classifier : file name for the dumped classifier.
    file_name_vectorizer : file name handed to ``HouzierVectorizer``.
    file_name_features : file name for the dumped feature union.
    bagging : when true, wrap the SVC in a one-vs-rest ``BaggingClassifier``.

    Returns
    -------
    None.

    Note: this block uses Python 2 ``print`` statements.
    """
    import time
    start = time.time()
    # Hard cap on the training set: only the first 1000 pairs are used.
    sentiments, sentences = zip(*sentiment_data[0:1000])
    sentences = GeneralMethodsClassifiers.snowball_stemmer(sentences)
    sentences = GeneralMethodsClassifiers.pre_process_text(sentences)
    vectorize_class = HouzierVectorizer(
        sentences, "%s/CompiledModels/SentimentClassifiers" % base_dir,
        file_name_vectorizer, False, False)
    # Raw term counts for every sentence.
    x_vectorize = vectorize_class.count_vectorize()
    tfidf = TfidfTransformer(norm="l2", sublinear_tf=True)
    # Convert the counts into TF-IDF weights, then L2-normalise each row.
    x_transform = tfidf.fit_transform(x_vectorize)
    X_normalized = preprocessing.normalize(x_transform.toarray(), norm='l2')
    print "Feature after vectorization of the data [%s, %s]" % x_transform.shape
    # Feature selection: the data is too high-dimensional, so project it with
    # a linear KernelPCA ...
    pca = KernelPCA(kernel="linear")
    # ... and additionally keep the two original features that score best on
    # a chi2 test.
    selection = SelectKBest(chi2, k=2)
    combined_features = FeatureUnion([("pca", pca), ("univ_select",
                                                     selection)])
    X_features = combined_features.fit_transform(X_normalized, sentiments)
    with cd("%s/CompiledModels/SentimentClassifiers" % base_dir):
        joblib.dump(combined_features, file_name_features, compress=("zlib",
                                                                     9))
    """ dump(combined_features, open('%s/%s'%(SentimentClassifiersPath,SentimentFeatureFileName), 'wb'),HIGHEST_PROTOCOL) """
    # NOTE(review): the literal below was a backslash-continued string in the
    # original layout; the exact whitespace after the backslash cannot be
    # recovered from this copy - confirm against the upstream file.
    print "Feature after feature slection with pca and selectkbest\ of the data [%s, %s]" % X_features.shape
    # See http://stackoverflow.com/questions/32934267/feature-union-of-hetereogenous-features
    n_estimators = 3
    svc_classifier = SVC(kernel='linear',
                         C=1,
                         gamma="auto",
                         probability=True,
                         decision_function_shape="ovr",
                         class_weight="balanced",
                         cache_size=20000)
    if bagging:
        # bootstrap=False with max_samples=1.0 / max_features=1.0: every one
        # of the n_estimators SVCs is given the full training set.
        classifier = OneVsRestClassifier(
            BaggingClassifier(svc_classifier,
                              max_samples=1.0,
                              max_features=1.0,
                              n_jobs=-1,
                              verbose=3,
                              n_estimators=n_estimators,
                              bootstrap=False))
    else:
        classifier = svc_classifier
    classifier.fit(X_features, sentiments)
    print classifier.classes_
    with cd("%s/CompiledModels/SentimentClassifiers" % base_dir):
        joblib.dump(classifier, file_name_classifier, compress=("zlib", 9))
    """ dump(file_name_classifier,open('%s/%s'%(SentimentClassifiersPath, SentimentClassifierFileName ), 'wb'), HIGHEST_PROTOCOL) """
    print "Storing Classifier with joblib"
    # Example of rebuilding a vectorizer from a stored vocabulary, see
    # http://stackoverflow.com/questions/31744519/load-pickled-classifier-data-vocabulary-not-fitted-error
    from sklearn.feature_extraction.text import CountVectorizer
    examples_negative = [
        'Free Viagra call today!', "I am dissapointed in you", "i am not good"
    ]
    examples_neutral = [
        "I dont know", "Sun rises in the east",
        "I'm going to attend theLinux users group tomorrow."
    ]
    examples_positive = [
        "hey there, I am too good to be true", "An Awesome man",
        "A beautiful beautiful lady"
    ]
    examples = examples_positive + examples_negative + examples_neutral
    vocabulary_to_load = vectorize_class.return_vectorizer()
    loaded_vectorizer = CountVectorizer(vocabulary=vocabulary_to_load)
    example_counts = loaded_vectorizer.transform(examples)
    print example_counts, example_counts.shape
    # NOTE(review): the examples are fed to the feature union as raw counts,
    # while training used TF-IDF + L2-normalised rows - confirm whether
    # `tfidf.transform` and `preprocessing.normalize` should be applied here.
    f = combined_features.transform(example_counts.toarray())
    predictions = classifier.predict(f)
    predict_probabilities = classifier.predict_proba(f)
    for sent, prob, tag in zip(examples, predict_probabilities, predictions):
        print sent, prob, tag
    print time.time() - start
    return
def svmBasedClassification():
    """Train and evaluate a one-vs-rest RBF SVM on labelled tweets.

    Reads ``data\\backup\\tweets333-only.csv`` ('|'-separated, with
    ``clean_tweet`` and ``sentimen`` columns), vectorizes the cleaned tweets
    with TF-IDF, trains on 60% of the rows and prints accuracy, recall,
    precision and F1 on the remaining 40%. The fitted model is pickled to
    ``model.sav`` and class probabilities for four sample sentences are
    printed last.

    Returns
    -------
    None.

    Raises
    ------
    KeyError
        If ``sentimen`` holds a value other than 'negatif', 'netral' or
        'positif'.
    """
    tweets = pd.read_csv("data\\backup\\tweets333-only.csv", sep="|")
    tweets = tweets.drop_duplicates()
    tweets = tweets.dropna()

    sentiment_counts = tweets.sentimen.value_counts()
    print(sentiment_counts)

    # Most frequent words of the negative tweets. Building the FreqDist from
    # the DataFrame itself only counted its column names.
    from nltk.probability import FreqDist
    negative_tweets = tweets.clean_tweet[tweets.sentimen == 'negatif']
    fdist = FreqDist(word for tweet in negative_tweets
                     for word in str(tweet).split())
    print(fdist.most_common(50))

    count_vectorizer = TfidfVectorizer()
    vectorized_data = count_vectorizer.fit_transform(tweets.clean_tweet)

    sentiment_to_target = {'negatif': 0, 'netral': 1, 'positif': 2}
    targets = tweets.sentimen.apply(lambda s: sentiment_to_target[s])

    # The row-index column that used to be prepended before the split was
    # stripped again without being used, so the TF-IDF matrix is split
    # directly.
    from sklearn.model_selection import train_test_split
    data_train, data_test, targets_train, targets_test = train_test_split(
        vectorized_data, targets, test_size=0.4, random_state=0)

    from sklearn import svm
    from sklearn.multiclass import OneVsRestClassifier
    clf = OneVsRestClassifier(
        svm.SVC(gamma=0.01,
                C=100.,
                probability=True,
                class_weight='balanced',
                kernel='rbf'))
    clf_output = clf.fit(data_train, targets_train)

    filename = 'model.sav'
    with open(filename, 'wb') as model_file:
        pickle.dump(clf_output, model_file)

    print(clf.score(data_test, targets_test))
    y_pred = clf.predict(data_test)
    print("Predict test data :\n"+str(y_pred))
    print("Accuracy: ", accuracy_score(targets_test, y_pred))
    print("Recall: ", recall_score(targets_test, y_pred, average='weighted'))
    print("Presisi: ", precision_score(targets_test, y_pred,
                                       average='weighted'))
    print("F1 score: ", f1_score(targets_test, y_pred, average='weighted'))

    sentences = count_vectorizer.transform([
        "Negara kita ngutang buat bngun infrastruktur yang udah dipake masyarakat, terus masyarakatnya ngeluh karena negara ngutang, setiap negara itu pasti ngutang, utang bisa dibayar kalo negara dapet penghasilan. Penghasilan negara itu ya dari pajak",
        "Negara kita ngutang sehingga harga mahal dan masyarakat tercekik dan ngeluh",
        "Prabowo-Sandi Sepakat Tak Ambil Gaji karena Negara Sedang Susah",
        "Calon presiden Jokowi menjelaskan program Kartu Pra Kerja akan memberikan insentif dalam kurun waktu tertentu, bukan berarti memberikan gaji secara cuma-cuma bagi masyarakat yang belum berpenghasilan."
    ])
    print(clf.predict_proba(sentences))
def train_model_one_vs_rest(model, vects, target, labels, **kwargs):
    """Cross-validate ``model`` wrapped in a one-vs-rest classifier and plot the results.

    For every split produced by the module-level ``kf``, the wrapped model is
    fitted and ROC-AUC, weighted F1 and accuracy are recorded. The scores are
    plotted per fold together with their means, followed by one
    row-normalised (percent) confusion matrix per label, computed on the
    LAST fold only.

    Parameters
    ----------
    model : estimator to wrap in ``OneVsRestClassifier``.
    vects : feature matrix, indexable by arrays of row indices.
    target : label-indicator matrix with one column per entry of ``labels``.
    labels : label names, used for the confusion-matrix titles.
    **kwargs : accepted for call compatibility; not used.

    Returns
    -------
    (model_performance, cm, model) : the dict of per-fold scores, the list of
    confusion matrices (one per label) and the classifier fitted on the last
    fold.

    Raises
    ------
    ValueError
        If ``kf`` yields no split.
    """
    model_performance = {
        'roauc': [],
        'f1': [],
        'accuracy': [],
    }
    model = OneVsRestClassifier(model)

    for train_indices, test_indices in tqdm(kf.split(vects, target)):
        X_train = vects[train_indices]
        y_train = target[train_indices]
        X_test = vects[test_indices]
        y_test = target[test_indices]

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        y_pred_ = model.predict_proba(X_test)

        model_performance['roauc'].append(roc_auc_score(y_test, y_pred_))
        model_performance['f1'].append(
            f1_score(y_test, y_pred, average='weighted'))
        model_performance['accuracy'].append(accuracy_score(y_test, y_pred))

    n_folds = len(model_performance['roauc'])
    if n_folds == 0:
        raise ValueError("kf.split() produced no folds; nothing to evaluate")

    plt.figure(figsize=(20, 18))
    ax1 = plt.subplot2grid((3, 3), (0, 0), colspan=2)
    for metric in ('roauc', 'f1', 'accuracy'):
        scores = model_performance[metric]
        ax1.plot(scores, label='{} per iteration'.format(metric))
        # The mean line spans exactly the folds that were run; it used to be
        # hard-coded to 10 points regardless of the splitter.
        ax1.plot(np.full(n_folds, np.mean(scores)),
                 '--',
                 label='mean {}'.format(metric))
    ax1.grid()
    ax1.legend()
    ax1.set_xlabel('fold')
    ax1.set_ylabel('value')
    ax1.set_title('Model Performance')

    cm = []
    for idx, label in enumerate(labels):
        cm.append(
            normalize(confusion_matrix(y_test[:, idx], y_pred[:, idx]),
                      axis=1,
                      norm='l1') * 100)
        # The first label sits next to the performance plot; the others fill
        # the two rows below, three per row.
        if idx == 0:
            position = (0, 2)
        else:
            position = ((idx - 1) // 3 + 1, (idx - 1) % 3)
        ax2 = plt.subplot2grid((3, 3), position)
        sns.heatmap(cm[-1], annot=True, square=True, ax=ax2, cmap='Blues')
        ax2.set_title(f"Confusion Matrix '{label}'")
        ax2.set_xlabel('Predicted')
        ax2.set_ylabel('Actual')

    return model_performance, cm, model
import prepare_data as prepare
import evaluate
# `sklearn.lda` was removed from scikit-learn; the estimator now lives in
# `sklearn.discriminant_analysis`. Fall back to the old path on old installs.
try:
    from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
except ImportError:
    from sklearn.lda import LDA
from sklearn.multiclass import OneVsRestClassifier

# Load the data splits and build the feature matrices. The same
# `label_encoder` dict is passed to every prepare call.
train_data, validation_data, test_data, basic_users_info = prepare.get_data()
label_encoder = {}
train_x, train_y = prepare.get_exclude_ndf_x(train_data, basic_users_info,
                                             label_encoder)
validation_x, validation_y = prepare.get_exclude_ndf_x(validation_data,
                                                       basic_users_info,
                                                       label_encoder)

# One-vs-rest LDA; the variable keeps its historical name `rf`.
rf = OneVsRestClassifier(LDA()).fit(train_x, train_y)

# Build candidate classes from the validation probabilities and score them
# with NDCG.
validation_predict_proba = rf.predict_proba(validation_x)
class_order = rf.classes_
predict_list = evaluate.candidate_classes(validation_predict_proba,
                                          class_order)
ndcg = evaluate.ndcg(predict_list, validation_data)
print(ndcg)

# Predict the test set and hand the candidate lists to the prepare module.
test_x = prepare.get_exclude_ndf_test_x(test_data, basic_users_info,
                                        label_encoder)
test_predict_proba = rf.predict_proba(test_x)
test_predict_list = evaluate.candidate_classes(test_predict_proba,
                                               class_order)
prepare.get_test_predict(test_data, test_predict_list)
def plot_roc_k_fold(clf, X, y, n_splits):
    """Train a one-vs-rest classifier per fold and plot per-class ROC curves.

    The data is split with a shuffled ``StratifiedKFold``. For every fold the
    classifier is refit on the training part and scored on the held-out part.
    One figure is shown per class, containing one ROC line per fold, the mean
    ROC curve and a band of +/- one standard deviation.

    Parameters
    ----------
    clf : object
        Base classifier; it is wrapped in ``OneVsRestClassifier``.
    X : array
        Training and test data (indexed positionally as ``X[train]``).
    y : array
        Labels for the training and test data.
    n_splits : int
        Number of folds.
    """
    classes = np.unique(y)
    n_classes = len(classes)
    y_bin = label_binarize(y, classes=classes)
    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)
    classifier = OneVsRestClassifier(clf)

    dic = {'Fold': [], 'Class': [], 'FPR': [], 'TPR': []}
    # NOTE(review): folds are stratified on the first class indicator only,
    # not on the full label vector -- confirm this is intended.
    for i, (train, test) in enumerate(cv.split(X, y_bin[:, 0])):
        classifier.fit(X[train], y_bin[train])
        # Only the held-out rows are scored, so only those are predicted.
        y_score = classifier.predict_proba(X[test])
        # ROC per class
        for i_class in range(n_classes):
            fpr, tpr, _ = roc_curve(y_bin[test, i_class], y_score[:, i_class])
            dic['Fold'].append(i)
            dic['Class'].append(classes[i_class])
            dic['FPR'].append(fpr)
            dic['TPR'].append(tpr)

    df_result = pd.DataFrame(data=dic)

    # Draw one figure per class
    colors = ['orange', 'red', 'green', 'cyan', 'gold']
    mean_fpr = np.linspace(0, 1, 100)
    for class_ in list(df_result['Class'].unique()):
        df_plot = df_result[df_result['Class'] == class_]
        tprs = []
        aucs = []
        for count, (_, row) in enumerate(df_plot.iterrows()):
            fold = row['Fold']
            fpr = row['FPR']
            tpr = row['TPR']
            roc_auc = auc(fpr, tpr)
            plt.plot(fpr, tpr, color=colors[count % len(colors)], lw=1.5,
                     label='ROC fold {0} (AUC = {1:0.3f})'
                     ''.format(fold, roc_auc),
                     alpha=0.5)
            # Resample onto a common grid so the folds can be averaged
            tprs.append(np.interp(mean_fpr, fpr, tpr))
            aucs.append(roc_auc)

        # Mean curve across folds
        mean_tpr = np.mean(tprs, axis=0)
        mean_tpr[-1] = 1.0
        mean_auc = auc(mean_fpr, mean_tpr)
        std_auc = np.std(aucs)
        # Raw string: `\p` is not a valid escape sequence.
        plt.plot(mean_fpr, mean_tpr, color='blue', lw=1.5,
                 label=r'Média ROC (AUC = {:0.3f} $\pm$ {:0.3f})'
                 ''.format(mean_auc, std_auc))

        # Band of one standard deviation, clipped to [0, 1]
        std_tpr = np.std(tprs, axis=0)
        tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
        tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
        plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey',
                         alpha=.2, label=r'$\pm$ 1 desvio padrão')

        plt.plot([0, 1], [0, 1], 'k--', lw=1.5)
        plt.xlim([-0.05, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('Taxa de Falso Positivo')
        plt.ylabel('Taxa de Verdadeiro Positivo')
        # format() instead of `+` so non-string class labels work too
        plt.title('Curva ROC \n (Classe={})'.format(class_))
        plt.legend(loc="lower right")
        plt.show()
class BagOfVisualWords(ImageClassifier):
    """Bag-of-visual-words image classifier.

    SIFT descriptors are quantised against a k-means vocabulary. Each image
    becomes a standardised histogram of visual words, and a one-vs-rest
    linear SVM is trained on those histograms.
    """

    def __init__(self, max_features=30, clusters_num=200, model=None, labels=None, voc=None,
                 scaler=None, img_loader=None):
        """Store the (optionally pre-trained) components of the pipeline.

        Args:
            max_features: Stored and persisted by ``save``.
                NOTE(review): it is not passed to ``cv2.SIFT_create()``.
            clusters_num: Size of the visual vocabulary (histogram length).
            model: Trained classifier, or None before ``fit``.
            labels: Class names, in the classifier's column order.
            voc: Visual vocabulary (k-means code book), or None before ``fit``.
            scaler: Fitted histogram scaler, or None before ``fit``.
            img_loader: Loader used by ``predict`` to read an image.
        """
        super().__init__()
        self.__model = model
        self._labels = labels
        self.__vocabulary = voc
        self.__clusters_num = clusters_num
        self.__extractor_max_features = max_features
        self.__feature_extractor = cv2.SIFT_create()
        self.__scaler = scaler
        self._img_loader = img_loader

    # Train BoVW model
    def fit(self, img_data_generator: ImgDataGenerator):
        """Build the vocabulary, train the SVM and print a test-set report."""
        self._init_img_loader(img_data_generator)

        train_generator = img_data_generator.train_generator
        test_generator = img_data_generator.test_generator

        self._labels = list(train_generator.class_indices.keys())
        shape = train_generator.image_shape
        train_samples_num = train_generator.samples

        print('Number of samples: ', train_samples_num)
        print('Image shape: ', shape)
        print('Labels:', self._labels)
        print()

        # Train samples: vocabulary and scaler are (re)fitted here
        descriptors, y_train = self.__get_data(train_generator)
        X_train, self.__vocabulary = self.__extract_features(descriptors, k=self.__clusters_num)

        # Train the Linear SVM
        self.__model = OneVsRestClassifier(SVC(kernel='linear', probability=True, max_iter=-1), n_jobs=-1)
        self.__model.fit(X_train, y_train)

        # Test samples: reuse the vocabulary AND the scaler fitted on the
        # training data. Previously the scaler was refitted on the test
        # histograms, and that refitted scaler was the one later used by
        # predict() and save().
        test_descriptors, y_test = self.__get_data(test_generator)
        X_test, _ = self.__extract_features(test_descriptors, voc=self.__vocabulary,
                                            scaler=self.__scaler, k=self.__clusters_num)

        y_pred = self.__model.predict_proba(X_test)
        y_pred = [self._labels[y.argmax()] for y in y_pred]
        y_test = [self._labels[y.argmax()] for y in y_test]

        print('\nConfusion Matrix:\n')
        print(confusion_matrix(y_test, y_pred))
        print('\nReport:')
        print(classification_report(y_test, y_pred))

    # Classify image
    def predict(self, img_path):
        """Return ``(labels, probabilities)`` for the image at ``img_path``.

        Raises:
            ValueError: If no descriptors could be extracted from the image.
        """
        image = self._img_loader.load_img(img_path)
        descriptors = self.__descriptors_from_img(image)
        if descriptors is None:
            raise ValueError('No descriptors found in image: {}'.format(img_path))

        # atleast_2d keeps a lone descriptor as a (1, d) matrix for vq
        des = np.atleast_2d(descriptors)

        # Calculate feature histogram
        words, _ = vq(des, self.__vocabulary)
        features = np.bincount(words, minlength=self.__clusters_num).astype("float32")
        features = features.reshape(1, -1)
        features = self.__scaler.transform(features)

        # Perform probability prediction
        probabilities = self.__model.predict_proba(features)
        return self._labels, probabilities[0]

    # Save BoVW model to the given path
    def save(self, path):
        """Pickle all pipeline components to ``<path>/bovw.pkl``."""
        os.makedirs(path, exist_ok=True)
        joblib.dump((self.__model, self._labels, self.__clusters_num,
                     self.__vocabulary, self.__extractor_max_features,
                     self.__scaler, self._img_loader),
                    os.path.join(path, 'bovw.pkl'))

    # Load BoVW model from the given path
    @classmethod
    def load(cls, path):
        """Rebuild a classifier from the file written by ``save``."""
        model, labels, clusters_num, voc, max_features, scl, loader = joblib.load(
            os.path.join(path, 'bovw.pkl'))
        return cls(max_features, clusters_num, model, labels, voc, scl, loader)

    # Get labels and image descriptors from generator
    def __get_data(self, generator):
        """Collect per-image descriptors and labels from one generator pass.

        Images without descriptors are dropped together with their label.
        """
        samples = generator.samples
        batch_size = generator.batch_size
        # Ceiling division: the old `samples // batch_size + 1` read one batch
        # too many when samples is an exact multiple of batch_size.
        # Assumes the generator yields ceil(samples / batch_size) batches per
        # pass -- TODO confirm for the generator type in use.
        n_batches = -(-samples // batch_size)

        descriptors, labels = list(), list()
        for _ in range(n_batches):
            data_batch, labels_batch = generator.next()
            for img_data, label in zip(data_batch, labels_batch):
                des = self.__descriptors_from_img(img_data)
                if des is not None:
                    descriptors.append(des)
                    labels.append(label)
        return descriptors, labels

    # Get descriptors from image
    def __descriptors_from_img(self, image_data):
        """Return the SIFT descriptors of ``image_data`` (None if none found)."""
        # Scale on a copy so the caller's array is not modified in place.
        scaled = image_data * 255
        image8bit = cv2.normalize(scaled, None, 0, 255, cv2.NORM_MINMAX).astype('uint8')
        _, des = self.__feature_extractor.detectAndCompute(image8bit, None)
        return des

    # Calculate the histogram of features
    def __extract_features(self, descriptors, voc=None, scaler=None, k=200):
        """Turn per-image descriptors into standardised word histograms.

        Args:
            descriptors: Sequence with one descriptor matrix per image.
            voc: Existing vocabulary; when None a new one is learnt.
            scaler: Fitted scaler to apply; when None a new one is fitted on
                these histograms and stored on the instance.
            k: Vocabulary size / histogram length.

        Returns:
            Tuple ``(im_features, voc)``.
        """
        if voc is None:
            # Stack all descriptors once (no repeated vstack in a loop) and
            # convert to float so kmeans works properly.
            descriptors_float = np.vstack(descriptors).astype(float)
            # NOTE(review): the code book is learnt on whitened descriptors
            # while vq below is applied to the raw ones -- confirm intended.
            voc, _ = kmeans(whiten(descriptors_float), k, 1)

        # One histogram of visual-word counts per image
        im_features = np.zeros((len(descriptors), k), "float32")
        for i, image_descriptors in enumerate(descriptors):
            words, _ = vq(image_descriptors, voc)
            im_features[i] = np.bincount(words, minlength=k)

        # Standardize features by removing the mean and scaling to unit variance
        if scaler is None:
            scaler = StandardScaler().fit(im_features)
            self.__scaler = scaler
        im_features = scaler.transform(im_features)

        return im_features, voc
''' ##テスト df_test = pd.read_csv("test.csv") #df_test2 = df_test.sample(100, random_state=0) X_test = [] for i, row in df_test.iterrows(): img = Image.open(os.path.join(DIR_IMAGES, row.filename)) img = img.crop((row.left, row.top, row.right, row.bottom)) img = img.convert("L") img = img.resize((IMG_SIZE, IMG_SIZE), resample=Image.BICUBIC) x = np.asarray(img, dtype=np.float) x = x.flatten() X_test.append(x) X_test = np.array(X_test) #X_test.shape#100個のテストデータの10000次元ベクトル #scaler2 = StandardScaler()#変換器の初期化 #scaler2.fit(X_test)#開発データに合わせる,ないとエラー X_test_scaled = scaler.transform(X_test) #標準化されたデータが返される #decomposer2 = PCA(n_components=30, random_state=0)#圧縮先の次元数を指定 #decomposer2.fit(X_scaled2)#使うデータに合わせる X_test_pca = decomposer.transform(X_test_scaled) #PCAの結果を格 Y_test_pred = classifier.predict_proba(X_test_pca) np.savetxt('submission6.dat', Y_test_pred, fmt='%.6f')
# Script: transform the data sets, train a polynomial-kernel one-vs-rest SVC,
# persist it, then write class-probability predictions to disk.

# Apply transformXY to every group of rows that share an index value.
x_validationset = x_validationset.groupby(x_validationset.index).apply(transformXY)
x_testset = x_testset.groupby(x_testset.index).apply(transformXY)
x_traindata = x_traindata.groupby(x_traindata.index).apply(transformXY)

#Normalise the data
# The first column is dropped before normalising.
df = x_traindata.iloc[:,1:]
# NOTE(review): `df - df.mean(axis=1)` aligns the row-wise Series with df's
# *columns* under pandas' default broadcasting. If a per-row normalisation is
# intended, `df.sub(..., axis=0)` / `.div(..., axis=0)` is probably needed --
# confirm.
df_norm = (df - df.mean(axis=1)) / (df.max(axis=1) - df.min(axis=1))
x_traindata = df_norm

#Train classifier
clf = OneVsRestClassifier(SVC(C=0.1,kernel='poly', probability=True))
clf.fit(x_traindata, y_traindata)

# now you can save it to a file
with open('SKINclassifierpolytrainset_SVC_c01.pkl', 'wb') as f:
    pickle.dump(clf, f)

## and later you can load it
# NOTE(review): this loads a *different* file from the one written above, so
# the classifier trained in this script is replaced -- confirm intended.
# pickle.load must only be used on trusted files.
with open('SKINclassifierlineartraindata_onevsone_padded_SVC_rs5.pkl', 'rb') as f:
    clf = pickle.load(f)

#Make predictions
# NOTE(review): `x_testdata` is not assigned in this fragment (only
# `x_testset` is) and it is not normalised here -- verify against the caller.
preds = clf.predict_proba(x_testdata)
predsdf = pd.DataFrame(preds)
predsdf.to_pickle('predictions_SKIN_poly_c01_validationset.pkl') # where to save it, usually as a .pkl

#Write outputfile
# The next two assignments leave both names bound to the same DataFrame.
check = predsdf
predsdf = check
Output.to_outputfile(check,1,'SKINpoly_c01_clean_validationset',clean=True, validation=True)
Output.to_outputfile(check,1,'SKINpoly_c01_testdata',clean=False, validation=False)
# Fit the scaler on the training data only and reuse it for the test data.
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

################################################
# Logistic Regression
################################################
# predict each class against the other
C_parameter = 50. / len(X_train) # inverse regularization strength, scaled by the training-set size
class_parameter = 'multinomial' # parameter for dealing with multiple classes
penalty_parameter = 'l1' # norm used for the regularization penalty
solver_parameter = 'saga' # optimizer (solver) used to fit the model
tolerance_parameter = 0.1 # termination (stopping) tolerance

# NOTE(review): the estimator is wrapped in OneVsRestClassifier, which fits
# one binary problem per class, so the 'multinomial' setting may have no
# effect here -- confirm.
classifier = OneVsRestClassifier(LogisticRegression(C=C_parameter, multi_class=class_parameter,\
    penalty=penalty_parameter, solver=solver_parameter, tol=tolerance_parameter))
classifier.fit(X_train, y_train) # Training the algorithm
y_predict = classifier.predict(X_test) # prediction
probas = classifier.predict_proba(X_test) # probability

# Compute ROC curve and ROC area for each class
fpr = dict( ) # false-positive rates per class (key is the class index)
tpr = dict() # true-positive rates per class
roc_auc = dict() # AUC of each class
th = dict() # probability thresholds returned by roc_curve
CM = dict() # confusion matrix per class
# P, R and F1 are created here but not filled in this fragment.
P = dict()
R = dict()
F1 = dict()
# y_test and y_predict are indexed per column, i.e. one indicator column per class.
for i in range(n_classes):
    fpr[i], tpr[i], th[i] = roc_curve(y_test[:, i], probas[:, i])
    roc_auc[i] = auc(
        fpr[i], tpr[i]) # area under the curve for this class
    CM[i] = confusion_matrix(y_test[:, i], y_predict[:, i])
def coordinates_fe(X_modelar, y_modelar, X_estimar, K=4):
    """Add spatial-context features built from each sample's nearest neighbours.

    A KD-tree is built over columns 1 and 2 (used as X/Y coordinates) of the
    concatenated modelling and estimation sets. For every sample, the K-1
    neighbours after the first query hit contribute to a 7-entry context
    vector: a neighbour from the modelling set adds 1 to the entry of its
    known class, and a neighbour from the estimation set adds the class
    probabilities predicted for it by a one-vs-rest XGBoost model.

    Side effects: column 0 is dropped from ``X_modelar`` and ``X_estimar`` in
    place, the ``coords_*`` columns are added to both, and
    ``violin_plot_kdtree`` is called on the modelling context.

    Args:
        X_modelar: Modelling features; column 0 holds IDs, 1 and 2 the
            coordinates.
        y_modelar: Modelling targets with a ``CLASS`` column, looked up by
            label with the neighbour's position.
        X_estimar: Estimation features with the same layout as ``X_modelar``.
        K: Number of neighbours requested from the KD-tree.

    Returns:
        Tuple ``(X_modelar, X_estimar, est_IDs)``.
    """
    n_classes = 7

    est_IDs = X_estimar[0]
    X_est_mod = pd.concat([X_modelar, X_estimar], sort=False)
    coords = X_est_mod[[1, 2]].rename(columns={1: 'X', 2: 'Y'})
    points = coords[['X', 'Y']].to_numpy()
    spatialTree = cKDTree(points)
    n_total = X_est_mod.shape[0]

    X_estimar.drop([0], inplace=True, axis=1)
    #X_estimar = reduce_colors(X_estimar)
    X_modelar.drop([0], inplace=True, axis=1)
    #X_modelar = reduce_colors(X_modelar)

    classifier = xgb.XGBClassifier()
    ovsr = OneVsRestClassifier(classifier, n_jobs=-1).fit(X_modelar, y_modelar)
    pred_estimar = ovsr.predict_proba(X_estimar)

    # Rows at positions >= offset of the concatenated frame come from X_estimar.
    offset = X_modelar.shape[0]
    classes = get_categories_list()
    col_names = ['coords_' + classes[i] for i in range(n_classes)]

    # One batched query instead of one query per row.
    _, neigh_indices = spatialTree.query(points, k=K)

    cont = []
    for i in range(n_total):
        indices = np.zeros(n_classes)
        # j starts at 1: the first hit is skipped (presumably the sample itself).
        for j in range(1, K):
            neighbour = neigh_indices[i][j]
            # For each neighbour add 1 to the context counter of its class,
            # or, if it lies in X_estimar, add its predicted probabilities.
            if neighbour < offset:
                indices[int(y_modelar.loc[neighbour, 'CLASS'])] += 1
            else:
                indices = np.add(indices, pred_estimar[neighbour - offset, :])
        cont.append(indices)  # without softmax
        #cont.append(softmax(np.array(indices))) # with softmax

    context = pd.DataFrame(data=cont, columns=col_names)
    context_modelar = context.iloc[:offset]
    # Renumber from 0 for whatever size the estimation set has (this index
    # length used to be hard-coded to 5618).
    context_estimar = context.iloc[offset:].reset_index(drop=True)

    violin_plot_kdtree(context_modelar, y_modelar)

    #context.drop('coords_RESIDENTIAL',axis=1,inplace=True) # try with and without
    # Column assignment aligns on the index; assumes both frames are indexed
    # 0..n-1 -- TODO confirm.
    for column in col_names:
        X_modelar[column] = context_modelar[column]
        X_estimar[column] = context_estimar[column]

    #return X_modelar.values, X_estimar.values, est_IDs
    return X_modelar, X_estimar, est_IDs
# Hold out 40% of the data, fit a one-vs-rest logistic regression and report
# cross-validated accuracy, a classification report and the ROC AUC.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)
#clf = svm.SVC(gamma=0.001, C=100.)
clf = OneVsRestClassifier(LogisticRegression())
# clf = RandomForestClassifier(n_estimators=100)
clf.fit(X_train, y_train)

kf = KFold(n_splits=3, shuffle=True)
scores = cross_val_score(clf, X, y, cv=kf)
# Report the real number of folds (the message used to say "10 fold").
print("Accuracy for {} fold CV".format(kf.get_n_splits()), scores)
print("Average accuracy: ", numpy.mean(scores))

y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred, target_names=le.classes_))

# Probability of the second class column.
# NOTE(review): roc_curve on a single column only fits a binary target -- confirm.
preds = clf.predict_proba(X_test)[:,1]
fpr, tpr, _ = metrics.roc_curve(y_test, preds)
df = pd.DataFrame(dict(fpr=fpr, tpr=tpr))
#df.to_csv("/Users/shilpagundrathi/Downloads/RandomForest.csv")
auc = metrics.auc(fpr,tpr)
print(auc)
# g= ggplot(df, aes(x='fpr', y='tpr')) +\
# geom_abline(linetype='dashed')
# ggplot.ggsave(plot = g, filename = "new_test_file")
#print (ggplot(df, aes(x='fpr', y='tpr')) + \
#eom_line(color='black') )
from sklearn import datasets
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC

# One-vs-rest bagging of linear SVCs, trained on xTrainS.
svc = OneVsRestClassifier(BaggingClassifier(SVC(kernel='linear', probability=True)))
svc.fit(xTrainS, yTrain)
Y_pred_SVM = svc.predict(xTestS)

from sklearn.metrics import confusion_matrix
confusion_matrix(yTest, Y_pred_SVM)
#Accuracy:-(58+289)/367----94%

#ROC curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
# Evaluate on xTestS, the counterpart of the xTrainS matrix the model was
# fitted on. The ROC/AUC used to be computed on xTest, which does not match
# the training features.
svm_roc_auc = roc_auc_score(yTest, Y_pred_SVM)
svm_roc_auc
fpr, tpr, thresholds = roc_curve(yTest, svc.predict_proba(xTestS)[:, 1])

###Logistic regression
from sklearn.linear_model import LogisticRegression
logmodel = LogisticRegression()
logmodel.fit(xTrain, yTrain)
#prediction
Y_pred_LG = logmodel.predict(xTest)
#confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(yTest, Y_pred_LG)
def train(self, X, y, test_ratio=0.2):
    """Shuffle the data, train the configured classifier and evaluate it.

    The first ``1 - test_ratio`` share of the shuffled samples is used for
    training and the remainder for evaluation. Which model is built depends
    on ``self.classifier_type``:

    * ``"multiclass"``: one-vs-rest logistic regression on one-hot labels.
    * ``"logistic"``: L2 logistic regression with the ``sag`` solver.
    * ``"mlp"``: MLP with hidden layers (100, 50, 20, 5).
    * ``"multiclass_logistic"``: a three-stage stack (one-vs-rest logistic
      regression, then a small MLP, then logistic regression). Its held-out
      accuracy is printed and a confusion matrix plotted; ``self.model`` and
      ``self.scores`` are not set on this path.

    Args:
        X: Feature array, indexed by a permutation of its rows.
        y: Class labels aligned with ``X`` (passed to ``make_one_hot`` and
            used directly as classifier targets).
        test_ratio: Share of the samples held out for evaluation.

    Returns:
        ``(mean, 2 * std)`` of the 5-fold cross-validation scores computed on
        the held-out samples, or None for ``"multiclass_logistic"``.
    """
    print("\tShuffling arrays...")
    p = np.random.permutation(X.shape[0])
    X, y = X[p], y[p]
    print("\tTraining classifier...")
    train_instances = int((1 - test_ratio) * X.shape[0])
    X_train, y_train = X[:train_instances], y[:train_instances]
    X_test, y_test = X[train_instances:], y[train_instances:]
    kind = self.classifier_type

    if kind == "multiclass_logistic":
        y_hot = make_one_hot(y)
        layer1 = OneVsRestClassifier(LogisticRegression(), n_jobs=-1).fit(
            X_train, y_hot[:train_instances])
        layer1_output = layer1.predict_proba(X_train)
        layer2 = MLPClassifier(hidden_layer_sizes=(3, 3)).fit(
            layer1_output, y_train)
        # layer2 was fitted on layer-1 probabilities, so it must be fed those
        # again (it used to be called on the raw features).
        layer2_output = layer2.predict_proba(layer1_output)
        output_layer = LogisticRegression().fit(layer2_output, y_train)

        l1_pred = layer1.predict_proba(X_test)
        l2_pred = layer2.predict_proba(l1_pred)
        o_pred = output_layer.predict(l2_pred)

        # The integer labels are `y` itself. The undefined names `y_int` and
        # `m2_pred` used here previously raised NameError.
        num_correct = sum(1 for pred, actual in zip(o_pred, y_test)
                          if pred == actual)
        print("Accuracy %0.1f%%" %
              (100.0 * float(num_correct) / float(len(o_pred))))
        print("Plotting confusion matrix...")
        plot_confusion_matrix(y_test, o_pred, self.class_names,
                              train_instances, normalize=True)
        return

    # train on the training samples (as many cpus as avail.)
    if kind == "multiclass":
        y_hot = make_one_hot(y)
        self.model = OneVsRestClassifier(LogisticRegression(), n_jobs=-1).fit(
            X_train, y_hot[:train_instances])
        self.scores = cross_val_score(self.model, X_test,
                                      y_hot[train_instances:], cv=5)
    else:
        if kind == "logistic":
            self.model = LogisticRegression(penalty='l2', solver='sag').fit(
                X_train, y_train)
        elif kind == "mlp":
            self.model = MLPClassifier(
                hidden_layer_sizes=(100, 50, 20, 5)).fit(X_train, y_train)
        # For any other type the existing self.model is scored as-is.
        self.scores = cross_val_score(self.model, X_test, y_test, cv=5)

    # score on the testing samples
    print("Accuracy: %0.1f%% (+/- %0.1f%%)" %
          (100 * self.scores.mean(), 100 * self.scores.std() * 2))
    if self.class_names is not None and kind != "multiclass":
        print("Plotting confusion matrix...")
        y_pred = self.model.predict(X_test)
        plot_confusion_matrix(y_test, y_pred, self.class_names,
                              train_instances, normalize=True)
    return self.scores.mean(), self.scores.std() * 2
accuracy = (y_pred_rt == y_test).sum() / y_test.shape[0] n_estimator = 100 # Supervised transformation based on random forests sc_X = StandardScaler() sc_X.fit(X_train) X_train_std = sc.transform(x_train) rf = RandomForestClassifier(max_depth=3, n_estimators=n_estimator) rf_enc = OneHotEncoder() rf_lm = OneVsRestClassifier(LogisticRegression()) rf.fit(X_train, y_train_2) rf_enc.fit(rf.apply(X_train)) rf_lm.fit(rf_enc.transform(rf.apply(X_train_lr)), y_train_lr) y_pred_rf_lm = rf_lm.predict_proba(rf_enc.transform(rf.apply(x_test))) y_pred_rf_lm = np.argmax(y_pred_rf_lm, axis=1) accuracy_rf_lm = (y_pred_rf_lm == y_test).sum() / y_test.shape[0] from sklearn.base import BaseEstimator from sklearn.base import ClassifierMixin from sklearn.preprocessing import LabelEncoder from sklearn.externals import six from sklearn.base import clone from sklearn.pipeline import _name_estimators import numpy as np import operator class MajorityVoteClassifier(BaseEstimator, ClassifierMixin): """ A majority vote ensemble classifier
def predict_multiclass(X_train,
                       y_train,
                       X_test,
                       y_test,
                       graphTitle="",
                       max_depth=12,
                       n_estimators=140,
                       plot=True,
                       weight=20):
    """Fit a one-vs-rest random forest on classes 0/1/2 and plot its ROC curves.

    The labels are binarised against the fixed classes ``[0, 1, 2]``. The
    per-class, micro-average and macro-average ROC curves are computed from
    the predicted test probabilities.

    Args:
        X_train, y_train: Training features and labels.
        X_test, y_test: Test features and labels.
        graphTitle: Figure title; when empty the default title is used.
        max_depth: ``max_depth`` of each random forest.
        n_estimators: ``n_estimators`` of each random forest.
        plot: When False the figure is not drawn.
        weight: Kept for call compatibility; currently has no effect.

    Returns:
        An empty tuple.
    """
    classes = [0, 1, 2]
    n_classes = len(classes)

    # y manipulating
    y_train = label_binarize(y_train, classes=classes)
    y_test = label_binarize(y_test, classes=classes)

    m = OneVsRestClassifier(
        RandomForestClassifier(max_depth=max_depth,
                               random_state=0,
                               n_estimators=n_estimators))
    y_score = m.fit(X_train, y_train).predict_proba(X_test)

    # Compute ROC curve and area under the curve for every class
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    # Compute micro-average ROC curve and ROC area
    fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

    # First aggregate all false positive rates
    all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))

    # Then interpolate all ROC curves at these points
    mean_tpr = np.zeros_like(all_fpr)
    for i in range(n_classes):
        mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])

    # Finally average it and compute AUC
    mean_tpr /= n_classes
    fpr["macro"] = all_fpr
    tpr["macro"] = mean_tpr
    roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

    if not plot:
        return ()

    # Plot all ROC curves
    plt.figure()
    plt.plot(fpr["micro"],
             tpr["micro"],
             label='micro-average ROC curve (area = {0:0.2f})'
             ''.format(roc_auc["micro"]),
             color='deeppink',
             linestyle=':',
             linewidth=4)
    plt.plot(fpr["macro"],
             tpr["macro"],
             label='macro-average ROC curve (area = {0:0.2f})'
             ''.format(roc_auc["macro"]),
             color='navy',
             linestyle=':',
             linewidth=4)

    colors = cycle(['aqua', 'darkorange', 'cornflowerblue'])
    for i, color in zip(range(n_classes), colors):
        plt.plot(fpr[i],
                 tpr[i],
                 color=color,
                 label='ROC curve of class {0} (area = {1:0.2f})'
                 ''.format(i, roc_auc[i]))

    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(
        graphTitle or
        'Some extension of Receiver operating characteristic to multi-class')
    plt.legend(loc="lower right")
    plt.show()
    return ()
def _svm_load_split(i, split):
    """Read one CSV of fold ``i`` once and return ``(features, labels)``."""
    frame = pd.read_csv(
        f'./CV_Features_631/ClassificationFeatures/{split}_CV_{i}.csv')
    return frame.iloc[:, 9:], frame.iloc[:, 4]


def _svm_fit(x_train, y_train, gamma, c):
    """Fit the one-vs-rest RBF SVC used throughout the search."""
    clf = OneVsRestClassifier(
        SVC(kernel='rbf',
            gamma=gamma,
            C=c,
            degree=1,
            cache_size=5000,
            probability=True,
            class_weight='balanced'))
    return clf.fit(x_train, y_train)


def _svm_score(clf, x, y, n_classes):
    """Return ``(accuracy, weighted F1, micro-averaged AUC)`` of ``clf``."""
    y_pred = clf.predict(x)
    accuracy = accuracy_score(y, y_pred)
    f1 = f1_score(y, y_pred, average="weighted")
    y_binary = label_binarize(y, classes=list(range(n_classes)))  # one-hot
    auc_value = roc_auc_score(y_binary, clf.decision_function(x),
                              average='micro')
    return accuracy, f1, auc_value


def svm(i):
    """Tune and test an RBF one-vs-rest SVC on cross-validation fold ``i``.

    ``gamma`` is selected first (with ``C=1``), then ``C`` (with the best
    gamma). Both are chosen by validation accuracy. The final model is
    evaluated on the test split, and its class probabilities and the encoded
    ground truth are written to CSV files.

    Changes from the previous version: the validation and test features are
    scaled with the scaler fitted on the training features instead of being
    refitted; each CSV is read once, which also removes the corrupted
    validation-label path; the F1/AUC reported after the C search belong to
    the C search; and the class count comes from the label encoder instead
    of a hard-coded 6.

    Args:
        i: Fold number used in the input and output file names.

    Returns:
        ``[test_accuracy, test_f1, test_auc]``, each rounded to 4 decimals.
    """
    train_x, train_y = _svm_load_split(i, 'Train')
    validation_x, validation_y = _svm_load_split(i, 'Validation')
    test_x, test_y = _svm_load_split(i, 'Test')

    # Encode the labels as 0..n_classes-1 with an encoder fitted on train
    encoder = LabelEncoder().fit(train_y)
    y_train = encoder.transform(train_y)
    y_valid = encoder.transform(validation_y)
    y_test = encoder.transform(test_y)
    n_classes = len(encoder.classes_)

    # Standardise X with the training statistics only
    scaler = StandardScaler()
    x_train_std = scaler.fit_transform(train_x)
    x_valid_std = scaler.transform(validation_x)
    x_test_std = scaler.transform(test_x)

    # ------------
    # Gamma
    # ------------
    accuracy_list_valid, f1_list_valid, auc_list_valid = [], [], []
    gamma_range = np.logspace(-10, 1, 10, base=2)
    logger.info(gamma_range)
    for idx, gamma in enumerate(tqdm(gamma_range)):
        time0 = time()
        logger.info(
            f">>>>>>>CV = {i}/10, Start Trainng {idx + 1}/{len(gamma_range)}>>>>>>>"
        )
        print(
            f">>>>>>> CV = {i}/10, Start Training {idx + 1}/{len(gamma_range)}>>>>>>>"
        )
        clf = _svm_fit(x_train_std, y_train, gamma, 1)

        # Validation: fine-tuning on the validation dataset
        accuracy_valid, f1_valid, auc_valid = _svm_score(
            clf, x_valid_std, y_valid, n_classes)
        accuracy_list_valid.append(accuracy_valid)
        f1_list_valid.append(f1_valid)
        auc_list_valid.append(auc_valid)

        logger.info(
            f"Validation Gamma >>> Acc. = {accuracy_valid}, F1-Score = {f1_valid}, AUC = {auc_valid}"
        )
        print(
            f"Validation Gamma >>> Acc. = {accuracy_valid}, F1-Score = {f1_valid}, AUC = {auc_valid}"
        )
        print(
            datetime.datetime.fromtimestamp(time() - time0).strftime("%M:%S:%f"))

    best_idx = accuracy_list_valid.index(max(accuracy_list_valid))
    best_gamma = gamma_range[best_idx]
    best_acc = accuracy_list_valid[best_idx]
    best_f1 = f1_list_valid[best_idx]
    best_auc = auc_list_valid[best_idx]
    print(
        f"Validation >>> Best gamma = {best_gamma}, Acc. ={best_acc}, F1-Score = {best_f1}, AUC = {best_auc}\n"
    )
    logger.info(
        f"Validation >>> Best gamma = {best_gamma}, Acc. ={best_acc}, F1-Score = {best_f1}, AUC = {best_auc}"
    )

    # ------------
    # C
    # ------------
    C = [1, 3, 5, 7, 9, 11, 13, 15, 17, 19]
    accuracy_list_C_valid, f1_list_C_valid, auc_list_C_valid = [], [], []
    for idx, c in enumerate(tqdm(C)):
        time0 = time()
        logger.info(
            f">>>>>>>CV = {i}/10, Fine-Tuining C, Start Trainng {idx + 1}/{len(C)}>>>>>>>"
        )
        print(
            f">>>>>>> CV = {i}/10, Fine-Tuining C, Start Training {idx + 1}/{len(C)}>>>>>>>"
        )
        clf = _svm_fit(x_train_std, y_train, best_gamma, c)

        # Validation: fine-tuning on the validation dataset
        accuracy_valid, f1_valid, auc_valid = _svm_score(
            clf, x_valid_std, y_valid, n_classes)
        accuracy_list_C_valid.append(accuracy_valid)
        f1_list_C_valid.append(f1_valid)
        auc_list_C_valid.append(auc_valid)

        logger.info(
            f"Validation C >>> Acc. = {accuracy_valid}, F1-Score = {f1_valid}, AUC = {auc_valid}"
        )
        print(
            f"Validation C >>> Acc. = {accuracy_valid}, F1-Score = {f1_valid}, AUC = {auc_valid}"
        )
        print(
            datetime.datetime.fromtimestamp(time() - time0).strftime("%M:%S:%f"))

    best_c_idx = accuracy_list_C_valid.index(max(accuracy_list_C_valid))
    best_c = C[best_c_idx]
    best_acc = accuracy_list_C_valid[best_c_idx]
    best_f1 = f1_list_C_valid[best_c_idx]
    best_auc = auc_list_C_valid[best_c_idx]
    print(
        f"Validation >>> Best gamma = {best_gamma}, Best C = {best_c}, Acc. ={best_acc}, F1-Score = {best_f1}, AUC = {best_auc}\n"
    )
    logger.info(
        f"Validation >>> Best gamma = {best_gamma}, Best C = {best_c}, Acc. ={best_acc}, F1-Score = {best_f1}, AUC = {best_auc}"
    )

    # ------------
    # Test: test on the Test dataset with the best gamma and C
    # ------------
    clf_best_test = _svm_fit(x_train_std, y_train, best_gamma, best_c)
    test_accuracy, test_f1, test_auc = (round(value, 4) for value in _svm_score(
        clf_best_test, x_test_std, y_test, n_classes))
    print(
        f"CV = {i}, Test >>> gamma = {best_gamma}, Acc. ={test_accuracy}, F1-Score = {test_f1}, AUC = {test_auc}"
    )
    logger.info(
        f"CV = {i}, Test >>> gamma = {best_gamma}, Acc. ={test_accuracy}, F1-Score = {test_f1}, AUC = {test_auc}"
    )

    # save
    result_test = clf_best_test.predict_proba(x_test_std)
    df = pd.DataFrame(result_test)
    df.to_csv(
        f"./Prediction_202106_Ratio631/categorical_vggish_6pnn_20210621_prediction_CV{i}_Gamma_{round(best_gamma,4)}_C_{round(best_c)}_ACC_{test_accuracy}_F1_{test_f1}_AUC_{test_auc}.csv"
    )
    df2 = pd.DataFrame(y_test)
    df2.to_csv(
        f"./Prediction_202106_Ratio631/categorical_vggish_6pnn_20210324_GT_CV{i}.csv"
    )

    print(f">>>>>>> CV = {i}/10, Over Training >>>>>>>\n")
    logger.info(f">>>>>>> CV = {i}/10,Over Training >>>>>>>")

    return [test_accuracy, test_f1, test_auc]
# Script: fit a one-vs-rest random forest, report NDCG on the validation
# split and hand the test-set candidate lists to prepare.get_test_predict.
import prepare_data as prepare
import evaluate
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier

train_data, validation_data, test_data, basic_users_info = prepare.get_data()
# Shared between the calls below; presumably filled in by the prepare
# helpers -- verify against prepare_data.
label_encoder = {}
train_x, train_y = prepare.get_exclude_ndf_x(train_data, basic_users_info, label_encoder)
# validation_y is unpacked but not used in this script.
validation_x, validation_y = prepare.get_exclude_ndf_x(validation_data, basic_users_info, label_encoder)

rf = OneVsRestClassifier(RandomForestClassifier(n_estimators=11, criterion='gini')).fit(train_x, train_y)
# validation_predict is computed but not used below.
validation_predict = rf.predict(validation_x)
validation_predict_proba = rf.predict_proba(validation_x)
# print validation_predict_proba

# classes_ gives the class that each predict_proba column refers to.
class_order = rf.classes_
predict_list = evaluate.candidate_classes(validation_predict_proba, class_order)
#print predict_list
ndcg = evaluate.ndcg(predict_list, validation_data)
print(ndcg)

# Same pipeline on the test split; the result is passed on, not printed.
test_x = prepare.get_exclude_ndf_test_x(test_data, basic_users_info, label_encoder)
test_predict = rf.predict_proba(test_x)
test_predict_list = evaluate.candidate_classes(test_predict, class_order)
prepare.get_test_predict(test_data, test_predict_list)
# Print the accuracy
print("Accuracy: {}".format(clf.score(X_test, y_test)))

#3
# Instantiate the classifier: clf
clf = OneVsRestClassifier(LogisticRegression())

# Fit it to the training data
clf.fit(X_train, y_train)

# Load the holdout data: holdout
holdout = pd.read_csv('HoldoutData.csv', index_col=0)

# Generate predictions: predictions
# Missing numeric values are replaced with -1000 before predicting.
predictions = clf.predict_proba(holdout[NUMERIC_COLUMNS].fillna(-1000))

#4
# (the identical predict_proba call that was repeated here has been removed)
# Format predictions in DataFrame: prediction_df
# One column per dummy-encoded label, one row per holdout index entry.
prediction_df = pd.DataFrame(columns=pd.get_dummies(df[LABELS]).columns,
                             index=holdout.index,
                             data=predictions)

# Save prediction_df to csv
prediction_df.to_csv('predictions.csv')

# Submit the predictions for scoring: score
score = score_submission(pred_path='predictions.csv')
# class_weight='auto' produces reduced performance, val mrr 0.574 -> 0.527
# (see the notebook)
# We use L1 regularization mainly to minimize the output model size,
# though it seems to yield better precision+recall too.
# time.time() is used because time.clock() no longer exists in Python 3.8+.
t_start = time.time()
# The solver is named explicitly: liblinear supports the L1 penalty, while the
# default solver of newer scikit-learn releases rejects it.
cfier = OneVsRestClassifier(LogisticRegression(penalty='l1', solver='liblinear'), n_jobs=4)
cfier.fit(traindata.X, traindata.Y)
t_end = time.time()
print('// training took %d seconds' % (t_end-t_start,))
sys.stdout.flush()

## Benchmarking
# The validation data is vectorized with the dictionaries of the training data.
with open(valfile, 'r') as f:
    valdata = VectorizedData(json.load(f), traindata.Xdict, traindata.Ydict)
print('// valdata: %d questions' % (np.size(valdata.X, axis=0),))
sys.stdout.flush()

val_score = valdata.cfier_score(cfier, lambda cfier, X: cfier.predict_proba(X))
print('// val sklScore %.3f, qRecallAny %.3f, qRecallAll %.3f, pathPrec %.3f, [qScoreMRR %.3f]' % (
    val_score['sklScore'],
    val_score['qRecallAny'], val_score['qRecallAll'], val_score['pPrec'],
    val_score['qScoreMRR']))
sys.stdout.flush()

## Data Dump
dump_cfier(cfier, traindata.Xdict, traindata.Ydict)
# )
# grid = GridSearchCV(clf_pipeline, param_grid, scoring='f1_samples')
# grid.fit(x2, y3)

# Hold out 10% of the samples for evaluation.
X_train, X_test, y_train, y_test = train_test_split(x2, y3, test_size=0.1, random_state=42)
#.fit(X_train, y_train)
model = OneVsRestClassifier(clf)
model.fit(X_train, y_train)

# parameters = {'n_estimators':[200, 300, 400], 'min_samples_split':[4, 6, 8, 10], 'min_samples_leaf':[4,6,8]}
#
# gscv=GridSearchCV(model, param_grid, scoring="f1_samples")
# gscv.fit(X_train,y_train)
#
predictions = model.predict(X_test)
obs = y_test
predprobs = model.predict_proba(X_test)

sklearn.metrics.hamming_loss(y_test, predictions)
sklearn.metrics.f1_score(y_test, predictions, average="samples")
# Precision and recall use the same sample-wise averaging as the F1 score;
# the default average='binary' is rejected for multilabel targets.
sklearn.metrics.precision_score(y_test, predictions, average="samples")
sklearn.metrics.recall_score(y_test, predictions, average="samples")
aucs = aucvals(predprobs, obs)
# Join the test month with the previous month's rows on the customer id.
test_X = test_df.set_index('ncodpers').join(test_pre_df.set_index('ncodpers'), rsuffix = '_pre')
# Rows without a `products` value get an empty product list.
missing_products = test_X.products.isnull()
test_X.products.loc[missing_products] = test_X.products.loc[missing_products].apply(lambda x: [])
test_X.fillna(0, inplace = True)
test_pre_y = multilabel_encoder.transform(test_X['products'])
test_X.drop('products', axis = 1, inplace = True)
test_X.reset_index(drop = True,inplace = True)
#test_X = test_X.rename(columns = {'products':'pre_products'})

# print(...) with a single argument behaves the same on Python 2 and 3.
print("Random Forest model:")
forest = RandomForestClassifier(n_estimators=250, random_state=1, verbose = 1, criterion='entropy')
multi_label_forest = OneVsRestClassifier(forest, n_jobs=-1)
multi_label_forest.fit(train_X, train_y)

print("Predicting:")
preds = multi_label_forest.predict_proba(test_X)
# Subtract the encoded previous-month products from the probabilities, then
# keep the 7 highest-scoring columns per row.
new_preds = preds - test_pre_y
new_preds = np.argsort(new_preds, axis=1)
new_preds = np.fliplr(new_preds)[:,:7]

if test_month == 18:
    # Write the space-separated product names per customer.
    final_preds = [' '.join([target_cols[pred] for pred in new_pred]) for new_pred in new_preds]
    out_df = pd.DataFrame({'ncodpers':test_id, 'added_products':final_preds})
    out_df.to_csv(output_file, columns = ['ncodpers', 'added_products'], index=False)
else:
    print("Scoring...")
    test_preds = [[target_cols[pred] for pred in new_pred] for new_pred in new_preds]
    # Ground truth: entries where test_y minus the previous-month encoding is 1.
    truth_list = np.array((test_y - test_pre_y)) ==1
    truth_list = [''.join([target_cols[i] if i else '' for i in truth]).split() for truth in truth_list]
    print(mapk(truth_list, test_preds, 7))
x = X.values
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.2,
                                                    stratify=y)

from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

# Fit and train model using training set
model = OneVsRestClassifier(
    LogisticRegression(solver='liblinear',
                       multi_class='ovr'))  # 'lbfgs' , 'liblinear'
model.fit(x_train, y_train)

# Predict using trained model
y_prob = model.predict_proba(x_test)

# Pickle model; the context manager closes the file even if dump() raises
filename = 'model.pkl'
with open(filename, 'wb') as outfile:
    pickle.dump(model, outfile)

# Map labels based on probability: pick the column with the highest score
labels = Y_encode.columns
y_pred = [labels[i.argmax()] for i in y_prob]
y_true = []
plt.ylim([0.0, 1.05]) plt.xlabel("False Positive Rate") plt.ylabel("True Positive Rate") plt.title("Some extension of Receiver operating characteristic to multi-class") plt.legend(loc="lower right") plt.show() # %% # Area under ROC for the multiclass problem # ......................................... # The :func:`sklearn.metrics.roc_auc_score` function can be used for # multi-class classification. The multi-class One-vs-One scheme compares every # unique pairwise combination of classes. In this section, we calculate the AUC # using the OvR and OvO schemes. We report a macro average, and a # prevalence-weighted average. y_prob = classifier.predict_proba(X_test) macro_roc_auc_ovo = roc_auc_score(y_test, y_prob, multi_class="ovo", average="macro") weighted_roc_auc_ovo = roc_auc_score(y_test, y_prob, multi_class="ovo", average="weighted") macro_roc_auc_ovr = roc_auc_score(y_test, y_prob, multi_class="ovr", average="macro") weighted_roc_auc_ovr = roc_auc_score(y_test, y_prob,
def plot_roc_curve(clf, X_train, y_train, X_test, y_test):
    """Plot per-class ROC curves using a one-vs-rest wrapper around ``clf``.

    ``clf`` is wrapped in a ``OneVsRestClassifier``, fitted on the training
    data, and one ROC curve per class is drawn from the predicted
    probabilities on the test data.

    Parameters
    ----------
    clf : object
        Base classifier.
    X_train : array or DataFrame
        Training data.
    y_train : array or DataFrame
        Training labels (one label per sample).
    X_test : array or DataFrame
        Test data.
    y_test : array or DataFrame
        Test labels (one label per sample).
    """
    # Imported locally so the function does not depend on a module-level
    # import that may be missing.
    from itertools import cycle

    # Build and train the one-vs-rest model.
    classifier = OneVsRestClassifier(clf)
    classifier.fit(X_train, y_train)

    # Use the classes the model was trained on: the columns of predict_proba
    # follow `classifier.classes_`, so taking the classes from y_test would
    # misalign the columns whenever the two label sets differ.
    classes = classifier.classes_
    n_classes = len(classes)

    # One indicator column per class.  Built by comparison rather than
    # label_binarize, which returns a single column for two classes.
    y_test_bin = (np.asarray(y_test).reshape(-1, 1) == classes).astype(int)

    # Predicted class probabilities on the test data.
    y_score = classifier.predict_proba(X_test)

    # False/true positive rates and area under the curve for each class.
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(y_test_bin[:, i], y_score[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    # The palette repeats when there are more than five classes.
    colors = cycle(['blue', 'red', 'green', 'cyan', 'gold'])
    for i, color in zip(range(n_classes), colors):
        plt.plot(fpr[i], tpr[i], color=color, lw=1.5,
                 label='ROC curve of class {0} (area = {1:0.3f})'
                 ''.format(classes[i], roc_auc[i]))

    # Diagonal reference line from (0, 0) to (1, 1).
    plt.plot([0, 1], [0, 1], 'k-', lw=1.5)
    plt.xlim([-0.05, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic for multi-class data')
    plt.legend(loc="lower right")
    plt.show()
class OneVsRestLogisticRegression:
    """Implements a Logistic Regression for a multilabel problem using a
    OneVsRest approach.

    Attributes:
        model_: The model to be fit and used for prediction.
        negative_column_index_: The index of the column indicating the
            absence of other classes, if it exists.
    """

    def __init__(self, negative_column_index=None, **kwargs):
        """The constructor for the OneVsRestLogisticRegression class.

        Parameters:
            negative_column_index: The index of the column indicating the
                absence of other classes. None by default.
            kwargs: Arguments to be passed to the LogisticRegression class.
        """
        self.model_ = OneVsRestClassifier(LogisticRegression(**kwargs))
        self.negative_column_index_ = negative_column_index

    def fit(self, X, y):
        """Fits the model to the data and its labels.

        If negative_column_index_ is not None, then the corresponding column
        in y is removed before fitting the model.

        Parameters:
            X: Numpy array with the features of the data to fit the model to.
            y: Numpy array with the one hot encoding of the labels of the
                data to fit the model to. The labels must be in an n x m
                matrix, where n is the number of data points, and m the
                number of classes.

        Returns:
            self: The fitted instance.
        """
        # Compare against None explicitly: a plain truthiness test would
        # treat column index 0 as "no negative column".
        if self.negative_column_index_ is not None:
            y = np.delete(y, self.negative_column_index_, axis=1)
        self.model_.fit(X, y)
        return self

    def predict(self, X, threshold=0.7, max_classes=3):
        """Predicts the labels for a given dataset.

        A class is predicted for a row when it is among the row's
        `max_classes` most probable classes and its probability is at least
        `threshold`.

        If negative_column_index_ is not None, then the corresponding column
        is reinserted after prediction. The values of this column will be
        equal to 1 in the cases where no class has been predicted.

        Parameters:
            X: Numpy array with the data that will be used for prediction.
            threshold: Minimum probability for a class to be predicted.
                0.7 by default.
            max_classes: Maximum number of classes predicted per row.
                3 by default.

        Returns:
            p: Numpy array of 0/1 integers with the predictions.
        """
        prob = self.model_.predict_proba(X)

        # Column indices of the `max_classes` highest probabilities per row
        # (argsort is ascending, hence the reversed slice from the end).
        top = np.argsort(prob, axis=1)[:, :-(max_classes + 1):-1]

        # Boolean mask of the cells that are allowed to be predicted.
        allowed = np.zeros(prob.shape, dtype=bool)
        allowed[np.arange(prob.shape[0])[:, None], top] = True

        # Combining the mask with the threshold test (instead of zeroing the
        # other probabilities first) keeps the max_classes cap effective
        # even for threshold <= 0.
        p = (allowed & (prob >= threshold)).astype(int)

        if self.negative_column_index_ is not None:
            p = np.insert(
                p,
                self.negative_column_index_,
                values=(p.sum(axis=1) == 0).astype(int),
                axis=1)
        return p
class EMRVC(BaseRVM, ClassifierMixin):
    """Relevance Vector Classifier.

    Implementation of Mike Tipping's Relevance Vector Machine for
    classification using the scikit-learn API.

    The multiclass support is handled according to a one-vs-rest scheme.

    For details on the precise mathematical formulation of the provided
    kernel functions and how `gamma`, `coef0` and `degree` affect each
    other, see the corresponding section in the narrative documentation:
    :ref:`svm_kernels`.

    Parameters
    ----------
    n_iter_posterior : int, optional (default=50)
        Number of iterations to calculate posterior.

    kernel : string, optional (default="rbf")
        Specifies the kernel type to be used in the algorithm.
        It must be one of "linear", "poly", "rbf", "sigmoid" or
        "precomputed". If none is given, "rbf" will be used.

    degree : int, optional (default=3)
        Degree of the polynomial kernel function ("poly"). Ignored by all
        other kernels.

    gamma : float or string, optional (default="auto_deprecated")
        Kernel coefficient for "rbf", "poly" and "sigmoid".

        "auto" uses 1 / n_features; ``gamma="scale"`` uses
        1 / (n_features * X.var()). The default "auto_deprecated" behaves
        like "auto" but signals that no explicit value of gamma was passed,
        and may emit a FutureWarning about the planned switch to "scale".

    coef0 : float, optional (default=0.0)
        Independent term in kernel function. It is only significant in
        "poly" and "sigmoid".

    tol : float, optional (default=1e-3)
        Tolerance for stopping criterion.

    threshold_alpha : float, optional (default=1e9)
        Threshold for alpha selection criterion.

    beta_fixed : {"not_fixed"} or float, optional (default="not_fixed")
        Fixed value for beta. If "not_fixed" selected, the beta is updated
        at each iteration.

    alpha_max : float, optional (default=1e10)
        Basis functions associated with alpha value beyond this limit will
        be purged. Must be a positive and big number.

    init_alpha : array-like of shape (n_sample) or None, optional
        (default=None)
        Initial value for alpha. If None is selected, the initial value of
        alpha is defined by init_alpha = 1 / M ** 2.

    bias_used : boolean, optional (default=True)
        Specifies if a constant (a.k.a. bias) should be added to the
        decision function.

    max_iter : int, optional (default=5000)
        Hard limit on iterations within solver.

    compute_score : boolean, optional (default=False)
        Specifies if the objective function is computed at each step of the
        model. Not implemented: fitting a binary problem with this enabled
        raises NotImplementedError.

    epsilon : float, optional (default=1e-08)
        Small positive constant used as a floor and additive guard in the
        alpha/beta updates and inside the logarithms of the convergence
        check.

    verbose : boolean, optional (default=False)
        Enable verbose output.

    Attributes
    ----------
    classes_ : array
        Sorted unique values of the training target.

    multi_ : OneVsRestClassifier
        One-vs-rest wrapper; only set when more than two classes are fitted.

    relevance_ : array-like, shape (n_relevance)
        Indices of relevance vectors.

    relevance_vectors_ : array-like, shape (n_relevance, n_features)
        Relevance vectors (equivalent to X[relevance_]). None when the
        kernel is "precomputed".

    alpha_ : array-like, shape (n_samples)
        Estimated alpha values.

    gamma_ : array-like, shape (n_samples)
        Estimated gamma values.

    Phi_ : array-like
        Scaled kernel (design) matrix, with a leading column of ones when
        `bias_used` is True.

    Sigma_ : array-like
        Estimated covariance matrix of the weights.

    mu_ : array-like
        Coefficients of the classifier (mean of posterior distribution).

    coef_ : array, shape (n_class * (n_class-1) / 2, n_features)
        Coefficients of the classifier (mean of posterior distribution).
        Weights assigned to the features. This is only available in the
        case of a linear kernel.

        `coef_` is a readonly property derived from `mu` and
        `relevance_vectors_`.

    See Also
    --------
    EMRVR
        Relevant Vector Machine for Regression.

    Notes
    -----
    **References:**
    `The relevance vector machine.
    <http://www.miketipping.com/sparsebayes.htm>`__
    """

    def __init__(self, n_iter_posterior=50, kernel="rbf", degree=3,
                 gamma="auto_deprecated", coef0=0.0, tol=1e-3,
                 threshold_alpha=1e9, beta_fixed="not_fixed", alpha_max=1e10,
                 init_alpha=None, bias_used=True, max_iter=5000,
                 compute_score=False, epsilon=1e-08, verbose=False):
        self.n_iter_posterior = n_iter_posterior
        super().__init__(kernel=kernel, degree=degree, gamma=gamma,
                         coef0=coef0, tol=tol,
                         threshold_alpha=threshold_alpha,
                         beta_fixed=beta_fixed, alpha_max=alpha_max,
                         init_alpha=init_alpha, bias_used=bias_used,
                         max_iter=max_iter, compute_score=compute_score,
                         epsilon=epsilon, verbose=verbose)

    def _classify(self, mu, Phi_):
        """Return the sigmoid of the linear predictor ``Phi_ @ mu``."""
        return expit(np.dot(Phi_, mu))

    def _log_posterior(self, mu, alpha, Phi_, t):
        """Return the objective minimized in `_posterior` and its gradient.

        The objective is the negative Bernoulli log-likelihood of the 0/1
        targets `t` plus the quadratic penalty ``0.5 * mu' diag(alpha) mu``.

        Returns
        -------
        log_p : float
            Objective value at `mu`.
        jacobian : array
            Gradient of the objective with respect to `mu`.
        """
        y = self._classify(mu, Phi_)
        log_p = -1 * (np.sum(np.log(y[t == 1]), 0) +
                      np.sum(np.log(1 - y[t == 0]), 0))
        log_p = log_p + 0.5 * np.dot(mu.T, np.dot(np.diag(alpha), mu))
        jacobian = np.dot(np.diag(alpha), mu) - np.dot(Phi_.T, (t - y))
        return log_p, jacobian

    def _compute_hessian(self, mu, alpha, Phi_, t):
        """Return the Hessian of the objective in `_log_posterior`.

        Its inverse is stored as `Sigma_` by `_posterior`. `t` is unused but
        required so the signature matches the `args` passed to `minimize`.
        """
        y = self._classify(mu, Phi_)
        B = np.diag(y * (1 - y))
        return np.diag(alpha) + np.dot(Phi_.T, np.dot(B, Phi_))

    def _posterior(self):
        """Update `mu_` (posterior mode) and `Sigma_` (its covariance)."""
        result = minimize(fun=self._log_posterior,
                          hess=self._compute_hessian,
                          x0=self.mu_,
                          args=(self.alpha_, self.Phi_, self.t),
                          method="Newton-CG",
                          jac=True,  # `fun` returns (value, gradient)
                          options={"maxiter": self.n_iter_posterior})
        self.mu_ = result.x
        hessian = self._compute_hessian(self.mu_, self.alpha_, self.Phi_,
                                        self.t)

        # Calculate Sigma
        # Use Cholesky decomposition for efficiency
        # Ref: https://arxiv.org/abs/1111.4144
        #
        # `np.linalg.LinAlgError` is the exception class raised by both the
        # NumPy and the SciPy routines used here, so catching it does not
        # depend on a separately imported bare `linalg` name.
        try:
            upper = scipy.linalg.cholesky(hessian)
        except np.linalg.LinAlgError:
            warnings.warn("Hessian not positive definite")
            # Fall back to a direct inverse, then to the pseudo-inverse.
            try:
                self.Sigma_ = np.linalg.inv(hessian)
            except np.linalg.LinAlgError:
                warnings.warn("Using Pseudo-Inverse")
                self.Sigma_ = np.linalg.pinv(hessian)
        else:
            # hessian = U' U  =>  inverse = inv(U) inv(U)'
            try:
                upper_inv = np.linalg.inv(upper)
            except np.linalg.LinAlgError:
                warnings.warn("Using Pseudo-Inverse")
                upper_inv = np.linalg.pinv(upper)
            self.Sigma_ = np.dot(upper_inv, upper_inv.conj().T)

    def fit(self, X, y):
        """Fit the RVC model according to the given training data.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training vectors.
        y : array-like, shape (n_samples,)
            Target values.

        Returns
        -------
        self : object

        Raises
        ------
        ValueError
            If the kernel is "precomputed" and `X` is not square, or if `y`
            contains fewer than two classes.
        NotImplementedError
            If `compute_score` is enabled on a binary problem.
        """
        X, y = check_X_y(X, y, y_numeric=True, ensure_min_samples=2,
                         dtype="float64")

        if self.kernel == "precomputed" and X.shape[0] != X.shape[1]:
            raise ValueError("X.shape[0] should be equal to X.shape[1]")

        if self.gamma in ("scale", "auto_deprecated"):
            X_var = X.var()
            if self.gamma == "scale":
                if X_var != 0:
                    self._gamma = 1.0 / (X.shape[1] * X_var)
                else:
                    self._gamma = 1.0
            else:
                kernel_uses_gamma = (not callable(self.kernel) and
                                     self.kernel not in ("linear",
                                                         "precomputed"))
                if kernel_uses_gamma and not np.isclose(X_var, 1.0):
                    # NOTE: when deprecation ends we need to remove explicitly
                    # setting `gamma` in examples (also in tests). See
                    # https://github.com/scikit-learn/scikit-learn/pull/10331
                    # for the examples/tests that need to be reverted.
                    warnings.warn(
                        "The default value of gamma will change "
                        "from 'auto' to 'scale' in version 0.22 to "
                        "account better for unscaled features. Set "
                        "gamma explicitly to 'auto' or 'scale' to "
                        "avoid this warning.", FutureWarning)
                self._gamma = 1.0 / X.shape[1]
        elif self.gamma == 'auto':
            self._gamma = 1.0 / X.shape[1]
        else:
            self._gamma = self.gamma

        self.classes_ = np.unique(y)
        n_classes = len(self.classes_)
        self.scores_ = list()

        if n_classes < 2:
            raise ValueError("Need 2 or more classes.")

        if n_classes > 2:
            # Multiclass: delegate to a one-vs-rest wrapper around this
            # estimator.
            self.multi_ = OneVsRestClassifier(self)
            self.multi_.fit(X, y)
            return self

        # ---- Binary case ----
        # 0/1 targets: 1 marks the second (larger) class label.
        self.t = np.zeros(y.shape)
        self.t[y == self.classes_[1]] = 1

        n_samples = X.shape[0]
        self.Phi_ = self._get_kernel(X)

        # Scale Phi based on PRoNTO implementation
        # http://www.mlnl.cs.ucl.ac.uk/pronto/
        self._scale = np.sqrt(np.sum(self.Phi_) / n_samples**2)
        self.Phi_ = self.Phi_ / self._scale

        if self.bias_used:
            self.Phi_ = np.hstack((np.ones((n_samples, 1)), self.Phi_))

        M = self.Phi_.shape[1]
        self.y = y

        # Resolve the initial alpha into a local instead of overwriting the
        # constructor parameter: a later refit on data of a different size
        # must not reuse a stale 1 / M**2.  `is None` (rather than `== None`)
        # also keeps array-valued init_alpha from raising on truth testing.
        if self.init_alpha is None:
            init_alpha = 1 / M**2
        else:
            init_alpha = self.init_alpha

        self.relevance_ = np.arange(n_samples)
        if self.kernel != "precomputed":
            self.relevance_vectors_ = X
        else:
            self.relevance_vectors_ = None

        if self.beta_fixed == "not_fixed":
            # Suggested in the paper [1].
            self.beta_ = 1e-6
        else:
            self.beta_ = self.beta_fixed

        self.mu_ = np.zeros(M)
        self.alpha_ = init_alpha * np.ones(M)
        self._alpha_old = self.alpha_.copy()

        for i in range(self.max_iter):
            self._posterior()

            # Well-determinedness parameters (gamma)
            self.gamma_ = 1 - self.alpha_ * np.diag(self.Sigma_)
            self.alpha_ = np.maximum(
                self.gamma_, self.epsilon) / (self.mu_**2) + self.epsilon
            self.alpha_ = np.clip(self.alpha_, 0, self.alpha_max)

            # NOTE(review): with the default beta_fixed="not_fixed" (a
            # truthy string) this branch never runs; it only runs for a
            # falsy beta_fixed such as 0.  Confirm whether
            # `self.beta_fixed == "not_fixed"` was intended.
            if not self.beta_fixed:
                ed = np.sum((y - self.Phi_ @ self.mu_)**2)
                self.beta_ = np.maximum(
                    (n_samples - np.sum(self.gamma_)),
                    self.epsilon) / ed + self.epsilon

            if self.compute_score:
                # Raising a bare string is itself a TypeError in Python 3.
                raise NotImplementedError("Score not yet implemented.")

            self._prune()

            if self.verbose:
                print("Iteration: {}".format(i))
                print("Alpha: {}".format(self.alpha_))
                print("Beta: {}".format(self.beta_))
                print("Gamma: {}".format(self.gamma_))
                print("m: {}".format(self.mu_))
                print("Relevance Vectors: {}".format(
                    self.relevance_.shape[0]))
                print()

            # Converged when the largest change in log(alpha) is below tol
            # (never before the third iteration).
            delta = np.amax(
                np.absolute(
                    np.log(self.alpha_ + self.epsilon) -
                    np.log(self._alpha_old + self.epsilon)))
            if delta < self.tol and i > 1:
                break

            self._alpha_old = self.alpha_.copy()

        return self

    def predict_proba(self, X):
        """Return an array of class probabilities.

        For a binary model the result has two columns,
        ``[1 - p, p]``, where ``p`` is the probability of ``classes_[1]``.
        Otherwise the call is forwarded to the one-vs-rest wrapper.
        """
        if len(self.classes_) != 2:
            return self.multi_.predict_proba(X)

        X = check_array(X)
        n_samples = X.shape[0]
        # Apply the same scaling (and bias column) that `fit` applied to the
        # training design matrix.
        K = self._get_kernel(X, self.relevance_vectors_)
        K = K / self._scale
        if self.bias_used:
            K = np.hstack((np.ones((n_samples, 1)), K))
        y = self._classify(self.mu_, K)
        return np.column_stack((1 - y, y))

    def predict(self, X):
        """Predict class labels using the RVC model.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Query points to be evaluated.

        Returns
        -------
        results : array, shape = (n_samples,)
            Predicted class label for each query point. In the binary case
            ``classes_[1]`` is predicted when its probability is at least
            0.5, and ``classes_[0]`` otherwise.
        """
        if len(self.classes_) != 2:
            return self.multi_.predict(X)

        positive = self.predict_proba(X)[:, 1]
        # Index `classes_` with 0/1 so every entry is assigned (the previous
        # two-mask fill into np.empty left garbage where the probability was
        # NaN) and the label dtype is preserved.
        return self.classes_[(positive >= 0.5).astype(int)]