Python OneVsRestClassifier.predict_proba 예제들, sklearn.multiclass.OneVsRestClassifier.predict_proba Python 예제들

예제 #1

0

파일 보기

파일: test_multiclass.py 프로젝트: jaguila/cert

def test_ovr_always_present():
    """Check that OvR copes with labels that are always present or absent.

    Constant label columns are the case where ``_ConstantPredictor`` is
    utilised (per the original author's note); fitting is expected to emit
    a ``UserWarning``.
    """
    X = np.ones((10, 2))
    X[:5, :] = 0

    # Column 0 changes between the two halves of X; columns 1 and 2 are set
    # to 1 for every sample, i.e. those labels are always present.
    y = np.zeros((10, 3))
    y[5:, 0] = 1
    y[:, 1] = 1
    y[:, 2] = 1

    ovr = OneVsRestClassifier(LogisticRegression())
    assert_warns(UserWarning, ovr.fit, X, y)
    y_pred = ovr.predict(X)
    assert_array_equal(np.array(y_pred), np.array(y))
    y_pred = ovr.decision_function(X)
    assert_equal(np.unique(y_pred[:, -2:]), 1)
    y_pred = ovr.predict_proba(X)
    assert_array_equal(y_pred[:, -1], np.ones(X.shape[0]))

    # y has a constantly absent label
    y = np.zeros((10, 2))
    y[5:, 0] = 1  # variable label
    ovr = OneVsRestClassifier(LogisticRegression())
    assert_warns(UserWarning, ovr.fit, X, y)
    y_pred = ovr.predict_proba(X)
    assert_array_equal(y_pred[:, -1], np.zeros(X.shape[0]))

예제 #2

0

파일 보기

파일: multilabel.py 프로젝트: hitalex/CCDM2014-contest

def OneVsRest_multilabel(train_feature, train_label, test_feature, BinaryClassifier, **kwargs):
    """Fit a one-vs-rest multi-label model and return class probabilities.

    ``BinaryClassifier`` is called with ``**kwargs`` to build the base
    estimator. Returns ``(train_pred, test_pred)``: the ``predict_proba``
    output for the training features and for the test features.
    """
    from sklearn.multiclass import OneVsRestClassifier

    base_estimator = BinaryClassifier(**kwargs)
    ovr = OneVsRestClassifier(base_estimator).fit(train_feature, train_label)

    train_pred = ovr.predict_proba(train_feature)
    test_pred = ovr.predict_proba(test_feature)
    return train_pred, test_pred

예제 #3

0

파일 보기

파일: Stage4_KerasXGBoostMEUFsubmission.py 프로젝트: imclab/Yelp

def process_fold(X_train, X_val, y_train, y_val, X_test):
    """Average XGBoost one-vs-rest probabilities with the Keras predictions.

    Returns a pair: the blended predictions for ``X_val`` and for ``X_test``.
    """
    # XGBoost branch, wrapped in a one-vs-rest classifier.
    booster = xgb.XGBClassifier(learning_rate=0.005, n_estimators=500)
    xgb_ovr = OneVsRestClassifier(booster)
    xgb_ovr.fit(X_train, y_train)
    xgb_val = xgb_ovr.predict_proba(X_val)
    xgb_test = xgb_ovr.predict_proba(X_test)

    # Keras branch: the helper returns validation and test predictions.
    keras_val, keras_test = KerasClassifier(X_train, y_train, X_val, y_val, X_test)

    blended_val = (xgb_val + keras_val) / 2.0
    blended_test = (xgb_test + keras_test) / 2.0
    return blended_val, blended_test

예제 #4

0

파일 보기

파일: test_multiclass.py 프로젝트: dsquareindia/scikit-learn

def test_ovr_fit_predict_sparse():
    """OvR fitted on sparse label matrices must agree with the dense fit."""
    sparse_formats = [sp.csr_matrix, sp.csc_matrix, sp.coo_matrix, sp.dok_matrix, sp.lil_matrix]
    for to_sparse in sparse_formats:
        nb = MultinomialNB(alpha=1)

        X, Y = datasets.make_multilabel_classification(
            n_samples=100, n_features=20, n_classes=5, n_labels=3, length=50, allow_unlabeled=True, random_state=0
        )

        X_fit, Y_fit = X[:80], Y[:80]
        X_eval = X[80:]

        # Reference model trained on the dense label matrix.
        dense_clf = OneVsRestClassifier(nb).fit(X_fit, Y_fit)
        dense_pred = dense_clf.predict(X_eval)

        # Same model trained on the labels converted to the sparse format.
        sparse_clf = OneVsRestClassifier(nb).fit(X_fit, to_sparse(Y_fit))
        sparse_pred = sparse_clf.predict(X_eval)

        assert_true(dense_clf.multilabel_)
        assert_true(sp.issparse(sparse_pred))
        assert_array_equal(sparse_pred.toarray(), dense_pred)

        # predict_proba: thresholding the probabilities at 0.5 is expected
        # to reproduce the labels returned by predict.
        proba = sparse_clf.predict_proba(X_eval)
        assert_array_equal(proba > 0.5, sparse_pred.toarray())

        # decision_function: a positive decision value is expected to
        # correspond to a predicted label.
        svc_clf = OneVsRestClassifier(svm.SVC()).fit(X_fit, to_sparse(Y_fit))
        thresholded = (svc_clf.decision_function(X_eval) > 0).astype(int)
        assert_array_equal(thresholded, svc_clf.predict(X_eval).toarray())

예제 #5

0

파일 보기

파일: experiment.py 프로젝트: Vanova/icu_rnn

def train_linear(X, Y, splits, model_config, results_dir, best_k=10, validation_score='f1',
                threshold_score='f1', threshold_criterion='zack', fn_prefix='', label_idx=None):
    """Select C for a one-vs-rest logistic regression and pickle the winner.

    Twenty values of C from ``np.logspace(-3, 3)`` are tried. Each model is
    fitted on ``X[splits[0]]`` / ``Y[splits[0]]`` and scored on ``splits[1]``
    with ``compute_micro_evaluations`` restricted to the ``label_idx``
    columns (all columns when ``label_idx`` is None). The model with the
    highest ``perf[validation_score]`` is kept.

    Side effects: progress is written to stdout, ``model_config['C']`` is set
    to the selected C, and the best model is pickled to
    ``<results_dir>/<fn_prefix>-model.pkl``.

    Returns ``(best_model, model_config)``. If training is interrupted before
    any model has been scored, ``best_model`` and ``model_config['C']`` are
    None.
    """
    label_idx = np.arange(Y.shape[1]) if label_idx is None else label_idx
    best_perf = None
    best_C = None
    best_model = None
    for C in np.logspace(-3,3, num=20):
        sys.stdout.write('Training Ridge Regression with C={0}...'.format(C))
        sys.stdout.flush()
        model = OneVsRestClassifier(LogisticRegression(C=C))
        try:
            model.fit(X[splits[0]], Y[splits[0]])
        except KeyboardInterrupt:
            # Ctrl-C stops the search but keeps the best model found so far.
            sys.stdout.write('training interrupted...')
            break

        Yp = model.predict_proba(X[splits[1]])
        perf = compute_micro_evaluations(Y[splits[1]][:,label_idx], Yp[:,label_idx], k=best_k,
                                        threshold_score=threshold_score, criterion=threshold_criterion)
        sys.stdout.write(' {0}={1:.4f}'.format(validation_score, perf[validation_score]))
        sys.stdout.flush()
        if best_perf is None or perf[validation_score] > best_perf[validation_score]:
            best_perf = perf
            best_model = model
            best_C = C
            sys.stdout.write(' *BEST')
        sys.stdout.write('\n')

    model_config['C'] = best_C
    # Context manager so the pickle file is flushed and closed deterministically.
    with open(os.path.join(results_dir, fn_prefix + '-model.pkl'), 'wb') as model_file:
        cPickle.dump(best_model, model_file)

    return best_model, model_config

예제 #6

0

파일 보기

파일: find_mode.py 프로젝트: IsabellKonrad/SongsOfGod

def make_classifier():
    """Train a one-vs-rest linear SVC and pickle it to ``classifier.bin``.

    Data comes from ``make_X_Y()`` and is cast to int. When the held-out part
    can be scored, the two highest-probability suggestions per sample are
    printed next to the true label, followed by the top-1 accuracy.
    """
    test_size=0
    X, y = make_X_Y()
    X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=test_size)
    X_train = X_train.astype(int)
    X_test = X_test.astype(int)
    y_train = y_train.astype(int)
    y_test = y_test.astype(int)
    clf = OneVsRestClassifier(SVC(kernel='linear', class_weight='auto', probability=True))
    clf.fit(X_train, y_train)
    try:
        y_suggest = clf.predict_proba(X_test)
        nn = 0  # samples whose top suggestion matches the true label
        n = 0   # samples evaluated
        for y_s, y_t in zip(y_suggest, y_test):
            s1 = chords_Y[np.argmax(y_s)]
            # Zero out the best score so the next argmax yields the runner-up.
            y_s[np.argmax(y_s)]=0
            s2 = chords_Y[np.argmax(y_s)]
            t = chords_Y[np.argmax(y_t)]
            print('Suggest: ' + s1 + ' or ' + s2 + '  Real: ' + t)
            n = n+1
            if s1==t:
                nn = nn+1
        if n>0:
            print('Accuracy is ' + str(float(nn)/n))
    except ValueError:
        # Deliberate best-effort: presumably predict_proba rejects the empty
        # test split produced by test_size=0 -- TODO confirm.
        pass
    # Context manager so the pickle file is closed deterministically.
    with open("classifier.bin", "wb") as classifier_file:
        pickle.dump(clf, classifier_file)

예제 #7

0

파일 보기

파일: initial.py 프로젝트: devikad/keyword_extraction_kaggle

def benchmark(clf_current):
    """Fit ``clf_current`` one-vs-rest, score it with F1, and time both steps.

    Reads ``X_train``, ``Y_train``, ``X_test`` and ``Y_test`` from the
    enclosing scope. Returns ``(clf_descr, score, train_time, test_time)``.
    """
    separator = '_' * 80
    print(separator)
    print("Test performance for: ")
    clf_descr = str(clf_current).split('(')[0]
    print(clf_descr)

    fit_start = time()
    classif = OneVsRestClassifier(clf_current)
    classif.fit(X_train, Y_train.toarray())
    train_time = time() - fit_start
    print("train time: %0.3fs" % train_time)

    score_start = time()
    # Use decision values when the base estimator offers them, otherwise
    # fall back to probabilities; both are converted with k = 5.
    if not hasattr(clf_current, "decision_function"):
        probsmatrix = classif.predict_proba(X_test)
        score = metrics.f1_score(Y_test.toarray(), probs_to_preds(probsmatrix, k = 5))
    else:
        dfmatrix = classif.decision_function(X_test)
        score = metrics.f1_score(Y_test.toarray(), df_to_preds(dfmatrix, k = 5))
    test_time = time() - score_start

    print("f1-score:   %0.7f" % score)
    print("test time:  %0.3fs" % test_time)

    print(separator)
    return clf_descr, score, train_time, test_time

예제 #8

0

파일 보기

파일: test_metrics.py 프로젝트: Sandy4321/pandas-ml

    def setUp(self):
        """Fit one-vs-rest SVCs on binarized iris labels for the metric tests.

        Stores the raw data, the binarized target, the ModelFrame, and the
        reference predictions / probabilities / decision values on ``self``.
        """
        import sklearn.svm as svm
        import sklearn.preprocessing as pp
        from sklearn.multiclass import OneVsRestClassifier

        # Iris data with the target expanded by LabelBinarizer; the frame is
        # asserted to be 150 rows by 7 columns.
        iris = datasets.load_iris()
        self.data = iris.data
        self.target = pp.LabelBinarizer().fit_transform(iris.target)
        self.df = pdml.ModelFrame(self.data, target=self.target)
        self.assertEqual(self.df.shape, (150, 7))

        # Estimator driven through the ModelFrame API.
        frame_svc = svm.SVC(probability=True, random_state=self.random_state)
        frame_estimator = OneVsRestClassifier(frame_svc)
        self.df.fit(frame_estimator)
        self.df.predict(frame_estimator)
        self.assertTrue(isinstance(self.df.predicted, pdml.ModelFrame))

        # Identically configured estimator driven directly, used as reference.
        reference_svc = svm.SVC(probability=True, random_state=self.random_state)
        reference_estimator = OneVsRestClassifier(reference_svc)
        reference_estimator.fit(self.data, self.target)
        self.pred = reference_estimator.predict(self.data)
        self.proba = reference_estimator.predict_proba(self.data)
        self.decision = reference_estimator.decision_function(self.data)

        # argument for classification reports
        self.labels = np.array([2, 1, 0])

예제 #9

0

파일 보기

파일: classifier.py 프로젝트: nivm/learningchess

def ml_train(datasetFilePath, falsePredictionsFilePath, unknownPredictionsFilePath, confusionMatricesDir, classifierFilePath):
    """Train, evaluate and export a one-vs-rest linear SVC.

    Loads the data set from ``datasetFilePath``, splits it, fits the
    classifier, runs validation, plots three confusion matrices into
    ``confusionMatricesDir``, writes the prediction reports, and dumps the
    fitted model to ``classifierFilePath`` with joblib.
    """
    logger.info("start of training and testing phase")

    svc = SVC(kernel='linear', probability=True)
    classifier = OneVsRestClassifier(svc, n_jobs=NUMBER_OF_CPUS_TO_USE)

    logger.info("loading data set")
    dataset, features_names = load_dataset(datasetFilePath)

    # The full data set is used; no limiting step is applied.
    limited_dataset = dataset
    ml_dataset = split_dataset(limited_dataset, len(features_names))

    train_shapes = (ml_dataset.X_train.shape, ml_dataset.y_train.shape)
    logger.info("fitting training set X_train - %s, y_train - %s" % train_shapes)
    classifier.fit(ml_dataset.X_train, ml_dataset.y_train)

    test_shapes = (ml_dataset.X_test.shape, ml_dataset.y_test.shape)
    logger.info("predicting test set X_test - %s, y_test - %s" % test_shapes)
    y_pred = classifier.predict(ml_dataset.X_test)
    y_pred_probabilities = classifier.predict_proba(ml_dataset.X_test)

    processed = process_prediction_vector(ml_dataset.y_test, y_pred, y_pred_probabilities)
    y_pred_with_unknown_cls, y_pred_fictive, max_y_pred_probs = processed

    def labels_with_unknown():
        # A fresh list per call, exactly as each call site built its own.
        return list(classifier.classes_) + ["unknown"]

    validation(ml_dataset.y_test, y_pred, y_pred_with_unknown_cls, y_pred_fictive, labels_with_unknown())
    matrix_inputs = ((y_pred, "1"), (y_pred_with_unknown_cls, "2"), (y_pred_fictive, "3"))
    for predictions, tag in matrix_inputs:
        plot_confusion_matrices(ml_dataset.y_test, predictions, labels_with_unknown(), confusionMatricesDir, tag)

    produce_output(ml_dataset.y_test, y_pred, max_y_pred_probs, ml_dataset.test_terms_name, falsePredictionsFilePath, unknownPredictionsFilePath)

    logger.info("exporting classifier model")
    joblib.dump(classifier, classifierFilePath)

    logger.info("end of training and testing phase")

예제 #10

0

파일 보기

파일: labeler.py 프로젝트: hpam1/Machine-Learning

def trainAndPredictLR(trainX, trainY, testX):
    """
    Predict the ten most probable labels per test sample with one-vs-rest
    logistic regression.

    The class probabilities are predicted for every test sample; the index of
    the largest probability is looked up in the external ``classOrder``
    sequence, that entry is masked with -1, and the step is repeated ten
    times.

    Input:
        1. trainX: training data features
        2. trainY: training data labels
        3. testX: test data features
        (the original notes describe 2000 features and 185 labels --
        not verifiable from this function)

    Output:
        numpy array with one row per test sample holding its 10 selected
        labels, most probable first

    """
    model = OneVsRestClassifier(LogisticRegression(C = 1.0))
    model.fit(trainX, trainY)
    probabilities = model.predict_proba(testX)

    top_labels = []
    for row in probabilities:
        picked = []
        for _ in range(10):
            best = np.argmax(row, axis=0)
            picked.append(classOrder[best])
            # Mask the chosen entry so the next pass finds the runner-up.
            row[best] = -1
        top_labels.append(picked)
    return np.array(top_labels)

예제 #11

0

파일 보기

파일: test_multiclass.py 프로젝트: jaguila/cert

def test_ovr_multilabel_predict_proba():
    """Multilabel OvR: probabilities thresholded at 0.5 must match predict.

    Also checks that requesting probabilities from wrapped estimators that
    cannot provide them raises ``AttributeError``.
    """
    nb = MultinomialNB(alpha=1)
    for allow_unlabeled in (False, True):
        X, Y = datasets.make_multilabel_classification(
            n_samples=100, n_features=20, n_classes=5, n_labels=3,
            length=50, allow_unlabeled=allow_unlabeled,
            return_indicator=True, random_state=0)
        X_fit, Y_fit = X[:80], Y[:80]
        X_eval, Y_eval = X[80:], Y[80:]
        ovr = OneVsRestClassifier(nb).fit(X_fit, Y_fit)

        # Base estimator that only offers decision values (SVR).
        no_proba = OneVsRestClassifier(svm.SVR()).fit(X_fit, Y_fit)
        assert_raises(AttributeError, no_proba.predict_proba, X_eval)

        # Base estimator whose predict_proba is switched off by a parameter.
        no_proba = OneVsRestClassifier(svm.SVC(probability=False))
        no_proba.fit(X_fit, Y_fit)
        assert_raises(AttributeError, no_proba.predict_proba, X_eval)

        labels = ovr.predict(X_eval)
        proba = ovr.predict_proba(X_eval)

        # A label is expected exactly where its probability exceeds 0.5.
        assert_array_equal(proba > .5, labels)

예제 #12

0

파일 보기

파일: svm.py 프로젝트: ShrikanthRamanathan/kaggle_otto

 def objective(args):
     """Return the validation log loss of an RBF SVC for one (C, gamma) pair.

     ``args`` unpacks to ``(c, gamma)``. The classifier is passed to
     ``utils.hold_out_evaluation`` with the enclosing ``train`` / ``labels``
     and then scored with ``log_loss`` on ``valid`` / ``valid_labels``.
     NOTE(review): this relies on ``hold_out_evaluation`` fitting ``clf`` in
     place -- confirm against that helper.
     """
     c, gamma = args
     clf = OneVsRestClassifier(svm.SVC(C=c, kernel='rbf', tol=.001, gamma=gamma,
                               probability=True, random_state=23))
     score1 = 0  # placeholder, only reported in the progress line
     score2 = utils.hold_out_evaluation(clf, train, labels, calibrate=False)
     score = log_loss(valid_labels, clf.predict_proba(valid))
     # Parenthesised single-argument print behaves the same on Python 2 and 3.
     print('C=%f, gamma=%f, score1=%f, score2=%f, score=%f' % (c, gamma, score1, score2, score))
     return score

예제 #13

0

파일 보기

파일: common.py 프로젝트: MoRandi91/pracweb-alg-classify

class Classifier(object):
    '''Fit-on-construction wrapper that returns class probabilities when called.

    With more than two distinct training labels the estimator is wrapped in
    OneVsRestClassifier; otherwise it is used as given.
    '''
    def __init__(self, clf, x_train, y_train):
        is_multiclass = len(set(y_train)) > 2
        self.clf = OneVsRestClassifier(clf) if is_multiclass else clf
        self.clf.fit(x_train, y_train)

    def __call__(self, x_val):
        probabilities = self.clf.predict_proba(x_val)
        return probabilities

예제 #14

0

파일 보기

파일: scratch.py 프로젝트: rgerkin/upsit

def fit_models_mc(imps, X, Y, all_props, props=None,
               labels=None, n_splits=5, 
               clf_args={'n_estimators':25, 
                         'max_features':'auto', 
                         'random_state':0}):
    """Fit one-vs-rest random forests per imputation method and CV split.

    For every key in ``imps`` and every shuffle split, a
    ``OneVsRestClassifier(RandomForestClassifier(**clf_args))`` is fitted on
    ``X[imp]`` / ``Y[imp]`` training rows and evaluated on the test rows of
    ``X[imp]`` against ``Y['missing']``.

    Returns ``(rs, feature_importances, ys, ps)`` where, per imputation key:
      * ``rs``: correlation between predicted and true labels, indexed
        (property, split);
      * ``ys``: masked array of true test labels, indexed
        (property, split, test sample);
      * ``ps``: masked array of predicted probabilities, same indexing;
      * ``feature_importances``: always None in this version.

    ``clf_args`` is a mutable default but is only read here, never modified.
    """
    if props is None:
        props = all_props
    n_obs = X['missing'].shape[0] # Number of observations (first axis).
    n_features = X['missing'].shape[1] # Size of the second axis -- presumably the number of features.
    n_props = len(props) # Number of properties to predict.
    test_size = 0.2
    # NOTE(review): the splitters are built with `n_iter=` and iterated
    # directly below, yet `.split(...)` is called on them later; these look
    # like two different scikit-learn API generations -- confirm the version.
    if labels is None:
        shuffle_split = ShuffleSplit(n_iter=n_splits,
                                     test_size=test_size,random_state=0)
    else:
        shuffle_split = LabelShuffleSplit(n_iter=n_splits,
                                          test_size=test_size,random_state=0)
    # Largest test-fold size, used to size the masked result arrays.
    n_test_samples = np.max([len(list(shuffle_split)[i][1]) \
                            for i in range(n_splits)])
    rs = {imp:np.ma.zeros((n_props,n_splits)) for imp in imps}
    ps = {imp:np.ma.masked_all((n_props,n_splits,n_test_samples)) for imp in imps}
    ys = {imp:np.ma.masked_all((n_props,n_splits,n_test_samples)) for imp in imps}
    feature_importances = None#{imp:np.ma.zeros((n_props,n_features,n_splits)) for imp in imps}
    # Column indices of the requested props within all_props (currently
    # only referenced by the commented-out slicing below).
    cols = np.array([i for i in range(len(all_props)) if all_props[i] in props])
    for imp in imps:
        for k,(train,test) in enumerate(shuffle_split.split(range(n_obs),groups=labels)):
            #X_train,X_test = X[imp][train][:,cols],X[imp][test][:,cols]
            #Y_train,Y_test = Y[imp][train][:,cols],Y['missing'][test][:,cols]
            X_train,X_test = X[imp][train,:],X[imp][test,:]
            # Targets for training come from the imputed data; targets for
            # evaluation come from Y['missing'].
            Y_train,Y_test = Y[imp][train,:],Y['missing'][test,:]
            # NOTE(review): `prop` is not bound anywhere in this function;
            # unless it is a module-level global, any dict-valued entry in
            # clf_args will raise NameError here.
            clf_args_ = {key:(value if type(value) is not dict \
                         else value[prop])\
                         for key,value in clf_args.items()}
            # Cap a numeric max_features at the number of available columns.
            if clf_args_['max_features'] not in [None, 'auto']:
               clf_args_['max_features'] = min(X_train.shape[1],
                                               clf_args_['max_features'])
            rfc = RandomForestClassifier(**clf_args_)
            onevsrest = OneVsRestClassifier(rfc)
            onevsrest.fit(X_train,Y_train)
            Y_predict = onevsrest.predict(X_test)#.reshape(-1,n_props)
            probs = onevsrest.predict_proba(X_test)
            # Single-column output whose mean is exactly 1.0 is stored as 0.0.
            if probs.shape[1]<2 and probs.mean()==1.0:
                n_test_samples = len(probs)
                ps[imp][:,k,:n_test_samples] = 0.0
            else:
                n_test_samples = len(probs[:,1])
                ps[imp][:,k,:n_test_samples] = probs.T
            ys[imp][:,k,:n_test_samples] = Y_test.T
            for i in range(n_props):
                rs[imp][i,k] = np.ma.corrcoef(Y_predict[:,i],Y_test[:,i])[0,1]
            #feature_importances[imp][n_prop,:,k] = onevsrest.feature_importances_
    return rs,feature_importances,ys,ps

예제 #15

0

파일 보기

파일: classifier.py 프로젝트: cginestra/san_francisco_crime

    def go():
        """Train a one-vs-rest RBF SVC on 10000 rows and print its log loss.

        Inputs and targets come from ``TrainingFactory``; 10% of the rows are
        held out. Prints the formatted predictions, the formatted true
        targets, the log loss, and a trailing blank line.
        """
        # Named `features` rather than `input` to avoid shadowing the builtin.
        features = TrainingFactory.build_sparse_matrix_input(limit=10000)
        targets = TrainingFactory.build_sparse_matrix_target(limit=10000)

        input_train, input_test, target_train, target_test = train_test_split(features, targets, test_size=0.1)

        classif = OneVsRestClassifier(SVC(kernel='rbf', tol=0.001, probability=True))
        classif.fit(input_train, target_train)

        output_targets = classif.predict_proba(input_test)
        # Parenthesised single-argument prints behave the same on Python 2 and 3.
        print(ClassifierFactory.output_function(output_targets))
        print(ClassifierFactory.output_function(target_test.todense()))

        print(log_loss(target_test, output_targets))
        # print('') emits a blank line on both Python 2 and 3, whereas a bare
        # print() would output "()" under the Python 2 print statement.
        print('')

예제 #16

0

파일 보기

파일: test_multiclass.py 프로젝트: dsquareindia/scikit-learn

    def conduct_test(base_clf, test_predict_proba=False):
        """Run the shared OvR checks for ``base_clf``.

        Uses ``X``, ``y``, ``Y`` and ``classes`` from the enclosing scope.
        When ``test_predict_proba`` is true, also checks that the argmax of
        the predicted probabilities agrees with ``predict``.
        """
        ovr = OneVsRestClassifier(base_clf).fit(X, y)
        assert_equal(set(ovr.classes_), classes)
        first_prediction = ovr.predict(np.array([[0, 0, 4]]))[0]
        assert_equal(set(first_prediction), set("eggs"))

        if test_predict_proba:
            X_test = np.array([[0, 0, 4]])
            probabilities = ovr.predict_proba(X_test)
            assert_equal(2, len(probabilities[0]))
            most_likely = ovr.classes_[np.argmax(probabilities, axis=1)]
            assert_equal(most_likely, ovr.predict(X_test))

        # Same estimator fitted on a label indicator matrix.
        ovr = OneVsRestClassifier(base_clf).fit(X, Y)
        first_prediction = ovr.predict([[3, 0, 0]])[0]
        assert_equal(first_prediction, 1)

예제 #17

0

파일 보기

파일: plotCurve.py 프로젝트: ShangruZhong/pattern-recognition-course

def model(train_data, train_label, test_data, test_label, n_classes):
    """Score test data with a one-vs-rest Gaussian naive Bayes classifier.

    Both label vectors are binarized over the classes ``0 .. n_classes - 1``.
    Returns ``(test_score, test_label)``: the ``predict_proba`` output for
    ``test_data`` and the binarized test labels.
    """
    # One indicator column per class id.
    train_label = label_binarize(train_label, classes=list(np.arange(n_classes)))
    test_label = label_binarize(test_label, classes=list(np.arange(n_classes)))

    # Base learner; LogisticRegression(C=1.0), SVC() and
    # KNeighborsClassifier() were the alternatives tried by the author.
    basic_clf = GaussianNB()

    ovr = OneVsRestClassifier(basic_clf)
    ovr.fit(train_data, train_label)

    # Probabilities are used as scores (decision_function was the alternative).
    test_score = ovr.predict_proba(test_data)
    return test_score, test_label

예제 #18

0

파일 보기

파일: common.py 프로젝트: beral/pracweb-alg-classify

class Classifier(object):
    '''Classifier base class. Uses OneVsRest for multiclass problems.

    The estimator is fitted at construction time; calling the instance
    returns ``predict_proba`` for the given samples.
    '''
    def __init__(self, clf, x_train, y_train):
        # More than two distinct labels: wrap the estimator in one-vs-rest.
        n_classes = len(set(y_train))
        if n_classes > 2:
            self.clf = OneVsRestClassifier(clf)
        else:
            self.clf = clf
        self.clf.fit(x_train, y_train)

    def __call__(self, x_val):
        return self.clf.predict_proba(x_val)

    def describe(self):
        '''Return the estimator parameters whose values are not callable.'''
        # items() works on both Python 2 and 3; iteritems() is Python 2-only.
        return {
            k: v
            for k, v in self.clf.get_params().items()
            if not callable(v)
        }

예제 #19

0

파일 보기

파일: Stage3_BlendLRModelsCV.py 프로젝트: imclab/Yelp

def process_data_set(X_train, y_train, X_val, X_test, c=1.0):
    """Fit one-vs-rest logistic regression and return class probabilities.

    ``c`` is passed to ``LogisticRegression`` as ``C``. Only the columns
    covered by the four feature groups below are used. Returns
    ``(y_vl, y_ts)``: the probabilities for ``X_val`` and for ``X_test``.
    """
    ovr = OneVsRestClassifier(LogisticRegression(C=c))

    # Group widths for "fc6", "fc7", "flatten4", "flatten5":
    # 4096 + 4096 + 384*4 + 256*4
    group_widths = np.array((4096, 4096, 384*4, 256*4))
    # Cumulative boundaries: [0, 4096, 8192, 9728, 10752]
    boundaries = np.concatenate(([0], np.cumsum(group_widths)))

    # Columns from the start of the first group to the end of the last one.
    columns = range(boundaries[0], boundaries[4])
    train_part = X_train[:, columns]
    val_part = X_val[:, columns]
    test_part = X_test[:, columns]

    ovr.fit(train_part, y_train)
    y_vl = ovr.predict_proba(val_part)
    y_ts = ovr.predict_proba(test_part)
    return y_vl, y_ts

예제 #20

0

파일 보기

파일: test_multiclass.py 프로젝트: jaguila/cert

def test_ovr_single_label_predict_proba():
    """Single-label OvR: probabilities sum to one and argmax matches predict."""
    nb = MultinomialNB(alpha=1)
    X, Y = iris.data, iris.target
    X_fit, Y_fit = X[:80], Y[:80]
    X_eval, Y_eval = X[80:], Y[80:]
    ovr = OneVsRestClassifier(nb).fit(X_fit, Y_fit)

    # Base estimator that only offers decision values (SVR): asking for
    # probabilities is expected to raise AttributeError.
    no_proba = OneVsRestClassifier(svm.SVR()).fit(X_fit, Y_fit)
    assert_raises(AttributeError, no_proba.predict_proba, X_eval)

    predicted = ovr.predict(X_eval)
    proba = ovr.predict_proba(X_eval)

    assert_almost_equal(proba.sum(axis=1), 1.0)
    # The predicted class is expected to be the one with the largest
    # probability in each row.
    most_likely = np.array([row.argmax() for row in proba])
    assert_false((most_likely - predicted).any())

예제 #21

0

파일 보기

파일: svm.py 프로젝트: micchu/TwoChannelNIRS

def multiclass_svm(training_feature_array, training_label_array, test_feature_array, test_label_array, 
        kernel_type = "rbf", grid_search = True, n_fold = 5, costs = None, gammas = None):
    """
    Multi-class classification with a one-vs-rest SVC.
    @param training_feature_array: training data
    @param training_label_array: training data labels
    @param test_feature_array: test data
    @param test_label_array: test data labels
    @keyword kernel_type: kernel type
    @keyword grid_search: whether to optimize the parameters
    @keyword n_fold: number of folds
    @keyword costs: list of cost values
    @keyword gammas: list of gamma values

    NOTE(review): the original docstring lists a return value of (accuracy,
    list of predicted classes, list of distances from the decision surface,
    SVC object), but no return statement is visible in this excerpt.
    """
    # Build the multi-class classifier.
    multi_svm_model = OneVsRestClassifier(sksvm.SVC(kernel=kernel_type, probability=True))
    if grid_search:
        # Parameter optimization.
        ret_c, ret_gamma = optimizeParameter(multi_svm_model, kernel_type, training_feature_array, training_label_array, _fold=n_fold, _costs=costs, _gammas=gammas)
    else:
        # Without optimization, use common defaults:
        # cost 1.0 and gamma 1 / (number of feature dimensions).
        ret_c = 1.0
        # Float numerator: under Python 2, 1/len(...) is integer division
        # and would silently yield gamma = 0.
        ret_gamma = 1.0 / len(training_feature_array[0,])
    # Apply the selected cost and gamma to the wrapped SVC.
    multi_svm_model.estimator.set_params(C=ret_c, gamma=ret_gamma)

    # Training.
    multi_svm_model.fit(training_feature_array, training_label_array)
    # Prediction.
    result_class_list = multi_svm_model.predict(test_feature_array)
    # Class likelihoods.
    result_probability_list = multi_svm_model.predict_proba(test_feature_array)

    # Accuracy.
    try:
        precision = skmet.accuracy_score(test_label_array, result_class_list)
    except DeprecationWarning:
        # Deliberately ignored, as in the original; this form of `except`
        # is valid on both Python 2 and 3.
        pass

예제 #22

0

파일 보기

파일: test_multiclass.py 프로젝트: AlexisMignon/scikit-learn

def test_ovr_multilabel_predict_proba():
    """Multilabel OvR: predict_proba availability and agreement with predict.

    ``predict_proba`` is expected to be exposed only when the wrapped
    estimator provides it, including when that changes after fitting.
    """
    nb = MultinomialNB(alpha=1)
    for allow_unlabeled in (False, True):
        X, Y = datasets.make_multilabel_classification(
            n_samples=100, n_features=20, n_classes=5, n_labels=3,
            length=50, allow_unlabeled=allow_unlabeled, random_state=0)
        X_fit, Y_fit = X[:80], Y[:80]
        X_eval = X[80:]
        ovr = OneVsRestClassifier(nb).fit(X_fit, Y_fit)

        # Base estimator that only offers decision values (SVR).
        svr_ovr = OneVsRestClassifier(svm.SVR(gamma='scale')).fit(X_fit, Y_fit)
        assert_false(hasattr(svr_ovr, 'predict_proba'))

        # SVC with probability=False: no predict_proba before or after fit,
        # but decision_function is available once fitted.
        svc_ovr = OneVsRestClassifier(
            svm.SVC(gamma='scale', probability=False))
        assert_false(hasattr(svc_ovr, 'predict_proba'))
        svc_ovr.fit(X_fit, Y_fit)
        assert_false(hasattr(svc_ovr, 'predict_proba'))
        assert_true(hasattr(svc_ovr, 'decision_function'))

        # Grid search whose only candidate enables probabilities:
        # predict_proba is expected to appear after fitting.
        search = GridSearchCV(svm.SVC(gamma='scale', probability=False),
                              param_grid={'probability': [True]})
        search_ovr = OneVsRestClassifier(search)
        assert_false(hasattr(search_ovr, 'predict_proba'))
        search_ovr.fit(X_fit, Y_fit)
        assert_true(hasattr(search_ovr, 'predict_proba'))

        labels = ovr.predict(X_eval)
        proba = ovr.predict_proba(X_eval)

        # A label is expected exactly where its probability exceeds 0.5.
        assert_array_equal(proba > .5, labels)

예제 #23

0

파일 보기

파일: test_multiclass.py 프로젝트: AlexisMignon/scikit-learn

def test_ovr_single_label_predict_proba():
    """Single-label OvR: probabilities sum to one and argmax matches predict."""
    nb = MultinomialNB(alpha=1)
    X, Y = iris.data, iris.target
    X_fit, Y_fit = X[:80], Y[:80]
    X_eval = X[80:]
    ovr = OneVsRestClassifier(nb).fit(X_fit, Y_fit)

    # Base estimator that only offers decision values (SVR): the wrapper is
    # expected not to expose predict_proba.
    svr_ovr = OneVsRestClassifier(svm.SVR(gamma='scale')).fit(X_fit, Y_fit)
    assert_false(hasattr(svr_ovr, 'predict_proba'))

    predicted = ovr.predict(X_eval)
    proba = ovr.predict_proba(X_eval)

    assert_almost_equal(proba.sum(axis=1), 1.0)
    # The predicted class is expected to be the one with the largest
    # probability in each row.
    most_likely = np.array([row.argmax() for row in proba])
    assert_false((most_likely - predicted).any())

예제 #24

0

파일 보기

파일: test_multiclass.py 프로젝트: hmshan/scikit-learn

    def conduct_test(base_clf, test_predict_proba=False):
        """Run the shared OvR checks for ``base_clf``.

        Uses ``X``, ``y``, ``Y`` and ``classes`` from the enclosing scope.
        Checks the decision-function shape when the base estimator has one
        and, when ``test_predict_proba`` is true, that the argmax of the
        probabilities agrees with ``predict``.
        """
        ovr = OneVsRestClassifier(base_clf).fit(X, y)
        assert_equal(set(ovr.classes_), classes)
        first_prediction = ovr.predict(np.array([[0, 0, 4]]))[0]
        assert_array_equal(first_prediction, ["eggs"])

        if hasattr(base_clf, 'decision_function'):
            decision_values = ovr.decision_function(X)
            assert_equal(decision_values.shape, (5,))

        if test_predict_proba:
            X_test = np.array([[0, 0, 4]])
            probabilities = ovr.predict_proba(X_test)
            assert_equal(2, len(probabilities[0]))
            most_likely = ovr.classes_[np.argmax(probabilities, axis=1)]
            assert_equal(most_likely, ovr.predict(X_test))

        # Same estimator fitted on a label indicator matrix.
        ovr = OneVsRestClassifier(base_clf).fit(X, Y)
        first_prediction = ovr.predict([[3, 0, 0]])[0]
        assert_equal(first_prediction, 1)

예제 #25

0

파일 보기

def test_ovr_multilabel_predict_proba():
    """Multilabel OvR: labels with probability above 0.5 must match predict."""
    nb = MultinomialNB(alpha=1)
    for allow_unlabeled in (False, True):
        X, Y = datasets.make_multilabel_classification(
            n_samples=100, n_features=20, n_classes=5, n_labels=3, length=50,
            allow_unlabeled=allow_unlabeled, random_state=0
        )
        X_fit, Y_fit = X[:80], Y[:80]
        X_eval, Y_eval = X[80:], Y[80:]
        ovr = OneVsRestClassifier(nb).fit(X_fit, Y_fit)

        # Base estimator that only offers decision values (SVR): asking for
        # probabilities is expected to raise AttributeError.
        no_proba = OneVsRestClassifier(svm.SVR()).fit(X_fit, Y_fit)
        assert_raises(AttributeError, no_proba.predict_proba, X_eval)

        labels = ovr.predict(X_eval)
        proba = ovr.predict_proba(X_eval)

        # Per sample, collect the indices of labels whose probability is
        # greater than 0.5 and compare them with the predicted label tuples.
        above_half = [tuple(row.nonzero()[0]) for row in (proba > 0.5)]
        assert_equal(above_half, labels)

예제 #26

0

파일 보기

파일: lda.py 프로젝트: micchu/TwoChannelNIRS

def multiclass_lda(training_feature_array, training_label_array,
               test_feature_array, test_label_array):
    """
    Multi-class classification with one-vs-rest LDA.
    @param training_feature_array: training data
    @param training_label_array: training data labels
    @param test_feature_array: test data
    @param test_label_array: test data labels

    NOTE(review): the original docstring lists a return value of (overall
    accuracy, list of predicted classes, list of membership probabilities
    for the predicted class, LDA object), but no return statement is visible
    in this excerpt.

    Verified to work (original author's note).
    """
    multi_lda_obj = OneVsRestClassifier(slda.LDA())
    multi_lda_obj.fit(training_feature_array, training_label_array)

    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print("test...")
    class_result = multi_lda_obj.predict(test_feature_array)
    proba_result = multi_lda_obj.predict_proba(test_feature_array)
    # Highest class probability per test sample.
    proba_max_result = np.max(proba_result, axis=1)
    try:
        precision = smet.accuracy_score(test_label_array, class_result)
    except DeprecationWarning:
        # Deliberately ignored, as in the original; this form of `except`
        # is valid on both Python 2 and 3.
        pass

예제 #27

0

파일 보기

파일: svm.py 프로젝트: grzesiekzajac/ziwm

class SVM(Classifier):
    """One-vs-rest SVC (probabilities enabled) behind the Classifier interface."""

    def __init__(self):
        self.__class_zero_indexing = True
        self.__class_num = 0
        self.__clf = OneVsRestClassifier(SVC(probability=True))

    @staticmethod
    def name():
        """Return the identifier of this classifier."""
        return "svm"

    def train(self, X, Y, class_number=-1):
        """Fit on X/Y and record the expected number of classes.

        The class count is the larger of the number of distinct values in
        ``Y`` and ``class_number``.
        """
        distinct_labels = np.unique(Y).size
        self.__class_num = max(distinct_labels, class_number)
        self.__clf.fit(X, Y)

    def predict(self, X):
        """Return class probabilities; asserts one column per recorded class."""
        probabilities = self.__clf.predict_proba(X)
        assert len(probabilities[0]) == self.__class_num
        return probabilities

    def predict2(self, X):
        """Return the labels predicted by the wrapped classifier."""
        labels = self.__clf.predict(X)
        return labels

예제 #28

0

파일 보기

파일: Experiment_ALL_plot30_NN_out.py 프로젝트: raybenchen/DiabeticsReadmissionPrediction

clf2.fit( x_train, y_train )
clf3.fit( x_train, y_train )
clf4.fit( x_train, y_train )
# Parenthesised single-argument prints behave the same on Python 2 and 3.
print("training ended")
et = time.time()
tt = et - st
print("Training Time = " + str(tt) + "\n")

# Hard predictions from each classifier; clf2's are used as the final labels.
pred1 = clf1.predict( x_test )
pred2 = clf2.predict( x_test )
pred3 = clf3.predict( x_test )
pred4 = clf4.predict( x_test )
pred = pred2
# NOTE: change to decision_function or predict_proba depending on the classifier
y_score1 = clf1.predict_proba(x_test)
y_score2 = clf2.predict_proba(x_test)
y_score3 = clf3.predict_proba(x_test)
y_score4 = clf4.predict_proba(x_test)
# Ensemble score: the unnormalised sum of the four probability matrices.
y_score = y_score1 + y_score2 + y_score3 + y_score4


#################################################################################
# Precision-recall plot: containers filled by the loop that follows.
precision = dict()
recall = dict()
PR_area = dict()
PR_thresholds = dict()
average_precision = dict()
for i in range(n_classes):

예제 #29

0

파일 보기

# Split sizes: the first train_num rows train the model, the following
# test_num rows are held out.
train_num=1500
test_num=1500
data_train=data[0:train_num,]
label_train=label[0:train_num,]

data_test=data[train_num:train_num+test_num,]
label_test=label[train_num:train_num+test_num,]

## multi classification
model_0 =OneVsRestClassifier(SVC(kernel='linear', probability=True,gamma='scale'))

model_0.fit(data_train, label_train)
pre_0 = model_0.predict_proba(data_test)

# One-hot encode the most probable class of every test sample.
max_ind=np.argmax(pre_0,axis=1)
pre=np.zeros_like(pre_0)
pre[np.arange(pre.shape[0]), max_ind] = 1

# Same encoding for the training samples. The buffer is shaped after the
# training predictions; the original sized it from pre_0, which only worked
# because train_num == test_num.
pre_train0=model_0.predict_proba(data_train)
max_ind_train=np.argmax(pre_train0,axis=1)
pre_train=np.zeros_like(pre_train0)
pre_train[np.arange(pre_train.shape[0]), max_ind_train] = 1

print(metrics.accuracy_score(label_train,pre_train))

예제 #30

0

파일 보기

파일: classification.py 프로젝트: abdcelikkanat/node2sig

def evaluate(graph_path,
             embedding_file,
             number_of_shuffles,
             training_ratios,
             classification_method,
             file_type="binary"):
    """Score node embeddings on multi-label community classification.

    A similarity kernel is computed between all node embeddings and, for
    every training ratio and every random shuffle, a one-vs-rest SVC with a
    precomputed kernel is trained on the first part of the shuffled nodes
    and evaluated on the rest.  For each test node the ``k`` most probable
    labels are predicted, where ``k`` is that node's true number of labels.

    Args:
        graph_path: Path of the GML graph file.
        embedding_file: Path of the embedding file.
        number_of_shuffles: Number of random shuffles per training ratio.
        training_ratios: Iterable of training fractions of the node set.
        classification_method: ``"svm-hamming"`` or ``"svm-cosine"``;
            selects the distance the similarity kernel is derived from.
        file_type: ``"binary"`` to read the embedding with
            ``read_binary_emb_file``; anything else uses
            ``read_embedding_file``.

    Returns:
        dict mapping every entry of the module-level ``_score_types`` to an
        ``OrderedDict`` of ``{ratio: [f1 score per shuffle]}``.

    Raises:
        ValueError: If ``classification_method`` is not a supported name.
    """
    cache_size = 10240

    g = nx.read_gml(graph_path)
    node2community = get_node2community(g)

    K = detect_number_of_communities(g)
    nodelist = [int(node) for node in node2community]
    N = len(nodelist)

    if file_type == "binary":
        x = read_binary_emb_file(file_path=embedding_file, nodelist=nodelist)
    else:
        x = read_embedding_file(embedding_file, nodelist=nodelist)

    # Binary indicator matrix: row = node, column = community.
    label_matrix = [[
        1 if k in node2community[str(node)] else 0 for k in range(K)
    ] for node in nodelist]
    label_matrix = csr_matrix(label_matrix)

    results = {}
    for score_t in _score_types:
        results[score_t] = OrderedDict()
        for ratio in training_ratios:
            results[score_t].update({ratio: []})

    print("+ Similarity matrix is begin computed!")
    if classification_method == "svm-hamming":
        sim = 1.0 - cdist(x, x, 'hamming')
    elif classification_method == "svm-cosine":
        sim = 1.0 - cdist(x, x, 'cosine')
    else:
        raise ValueError("Invalid classification method name: {}".format(
            classification_method))

    # Passing `classes` by keyword keeps this working on scikit-learn
    # releases where the constructor arguments are keyword-only.
    mlb = MultiLabelBinarizer(classes=list(range(K)))

    for train_ratio in training_ratios:

        for shuffleIdx in range(number_of_shuffles):

            print("Current train ratio: {} - shuffle: {}/{}".format(
                train_ratio, shuffleIdx + 1, number_of_shuffles))

            # Shuffle rows and columns of the kernel with the same
            # permutation so it stays aligned with the labels.
            shuffled_idx = np.random.permutation(N)
            shuffled_sim = sim[shuffled_idx, :]
            shuffled_sim = shuffled_sim[:, shuffled_idx]
            shuffled_labels = label_matrix[shuffled_idx]

            # Get the training size
            train_size = int(train_ratio * N)

            # Precomputed kernels are (n_samples, n_train): both splits keep
            # only the columns of the training nodes.
            train_sim = shuffled_sim[0:train_size, :]
            train_sim = train_sim[:, 0:train_size]
            train_labels = shuffled_labels[0:train_size]

            test_sim = shuffled_sim[train_size:, :]
            test_sim = test_sim[:, 0:train_size]
            test_labels = shuffled_labels[train_size:]

            # Train the classifier
            ovr = OneVsRestClassifier(
                SVC(kernel="precomputed",
                    cache_size=cache_size,
                    probability=True))

            ovr.fit(train_sim, train_labels)

            # Find the predictions, each node can have multiple labels
            test_prob = np.asarray(ovr.predict_proba(test_sim))
            y_pred = []
            for i in range(test_labels.shape[0]):
                # The number of labels to be predicted
                k = test_labels[i].getnnz()
                ranked = test_prob[i, :].argsort()
                # Slice from an explicit start index: `ranked[-k:]` would
                # return *every* label when k == 0.
                y_pred.append(ranked[len(ranked) - k:])

            # Find the true labels
            y_true = [[] for _ in range(test_labels.shape[0])]
            co = test_labels.tocoo()
            for i, j in zip(co.row, co.col):
                y_true[i].append(j)

            # Binarise once; the result does not depend on the score type.
            y_true_bin = mlb.fit_transform(y_true)
            y_pred_bin = mlb.fit_transform(y_pred)
            for score_t in _score_types:
                score = f1_score(y_true=y_true_bin,
                                 y_pred=y_pred_bin,
                                 average=score_t)

                results[score_t][train_ratio].append(score)

    return results

예제 #31

0

파일 보기

def ML_with_BN_feat(bn_feat_file='../data/factors_n_bn_feat.csv',
                    n_comp=100,
                    plotting=False):
    """Train and compare four classifiers on PCA-reduced bottleneck features.

    The function reads the feature CSV, keeps the rows labelled
    'Parasitized' or 'Uninfected', splits them 80/20, reduces columns
    'x0'..'x2047' with PCA and then fits logistic regression, one-vs-rest
    Gaussian naive Bayes, one-vs-rest random forest and AdaBoost.  For every
    model it cross-validates on the training split, draws a confusion matrix
    into a shared 1x4 figure and annotates precision / recall / F2 for the
    'Parasitized' class.  Finally a ROC curve is drawn for the logistic
    regression probabilities.

    Side effects (all paths are relative to the working directory):
      * plots written under '../plots/' (PCA variance curves, confusion
        matrices, ROC curve);
      * pickled PCA, logistic regression, random forest and AdaBoost models
        written under '../models/' (the naive Bayes model is not saved);
      * '../data/y_test_predictions' and '../data/roc_data.csv' written;
      * the final `plt.show()` is called without `block=False`.

    Args:
        bn_feat_file: Path of the CSV holding labels and bottleneck features.
        n_comp: Number of PCA components; values below 50 are raised to 50.
        plotting: If true, also draw pairwise plots of the first 11
            components through `caa_plot_pairs`.

    Returns:
        None.
    """
    plt.close('all')
    # NOTE(review): requests for fewer than 50 components are silently
    # overridden here.
    if n_comp < 50:
        n_comp = 50
    # Importing the bottleneck features for each image
    feat_df = pd.read_csv(bn_feat_file, index_col=0, dtype='unicode')
    #    feat_df = feat_df.sample(frac=0.05)
    print('Data frame shape:', feat_df.shape)
    #    feat_df = feat_df.iloc[0:300,:]
    # Keep only the two labels of interest and drop exact duplicate rows.
    mask = feat_df.loc[:, 'label'].isin(['Parasitized', 'Uninfected'])
    feat_df = feat_df.loc[mask, :].drop_duplicates()
    # NOTE(review): the "- 7" presumably accounts for seven non-feature
    # columns in the CSV -- confirm against the file layout.
    print('Number of bottleneck features:', feat_df.shape[1] - 7)
    # y is selected with a list of columns, so it is a 2-D (n, 1) array.
    y = feat_df.loc[:, ['label']].values
    print(type(y), y.shape)

    print('Number of samples for each label \n',
          feat_df.groupby('label')['label'].count())
    # Everything was read as text (dtype='unicode'), hence the float cast.
    X = feat_df.loc[:, 'x0':'x2047'].astype(float).values
    #    print(list(feat_df.loc[:, 'x0':].columns))

    ##-- Dealing with imbalanced data

    #    from imblearn.over_sampling import RandomOverSampler
    #    ros = RandomOverSampler(random_state=0)
    #
    #    X_resampled, y_resampled = ros.fit_sample(X, y[:,0])
    #
    #    from collections import Counter
    #    print(sorted(Counter(y_resampled).items()))
    #
    #    X, y = X_resampled, y_resampled
    # checking for nulls in DF
    #nulls = BN_featues.isnull().any(axis=1)

    # In[3]:

    # A set has no guaranteed iteration order, so the order of the
    # confusion-matrix labels derived from it can vary between runs.
    class_names = set(feat_df.loc[:, 'label'])
    # Binarize the labels
    # print(class_names)
    #    lb = label_binarize(y = y, classes = list(class_names))
    # classes.remove('unknown')
    # lb.fit(y) #for LabelBinarizer not label_binarize()
    # lb.classes_ #for LabelBinarizer not label_binarize

    # Split the data into a training set and a held-out test set
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=0)

    ##### Dimensionality Reduction ####

    # In[4]:

    # Principal Component Analysis
    # Use n_components = None first to determine variability of principal components
    # Then limit the number of principal components that are reasonable
    # n_components=None --> min(n observation, n features)
    print('...running PCA analysis...' '')
    pca_none = PCA(n_components=None)
    # The transformed arrays are discarded; this PCA is only fitted to read
    # its explained-variance ratios.
    pca_none.fit_transform(X_train)
    #    print(X_test.shape, type(X_test))
    #    arr_index = np.where(X_test == '0.1465795w85188675')
    #    print('arr_index', arr_index)
    #    print('X_test[arr_index]',X_test[arr_index])
    pca_none.transform(X_test)
    explained_variance = pca_none.explained_variance_ratio_
    plt.figure(0)
    plt.plot(explained_variance)
    plt.xlabel('n_components')
    plt.ylabel('variance')
    plt.suptitle('Explained Variance of Principle Components')
    #    plt.show(block=False)
    plt.savefig('../plots/pca_var_vs_ncomp.png')
    # #### After about 70 components there is very little variance gain  ####
    # Applying Principal Component Decomposition

    # In[5]:

    #    n_comp = 11 # the number of Principal Components to project/decompose the data into
    print('...running PCA with', n_comp, 'components')
    pca = PCA(n_components=n_comp)
    # From here on X_train / X_test hold the PCA-projected features.
    X_train = pca.fit_transform(X_train)
    X_test = pca.transform(X_test)
    explained_variance1 = pca.explained_variance_ratio_
    plt.figure(1)
    plt.plot(explained_variance1)
    plt.xlabel('n_components')
    plt.ylabel('variance')
    plt.suptitle('Explained Variance of Principle Components')
    plt.show(block=False)
    plt.savefig('../plots/pca_var_vs_{}_ncomp.png'.format(n_comp))
    # Save feature reduction PCA
    # NOTE(review): the file object passed to pickle.dump is never closed
    # explicitly (same pattern for every model saved below).
    save_PCA = '../models/trained_PCA.sav'
    pickle.dump(pca, open(save_PCA, 'wb'))

    # In[6]:
    if plotting:
        # Pairwise plots of 11 PCA, note this only works with two labels
        feat_df_ploting = pd.DataFrame({'label': y_train[:, 0]})
        caa_plot_pairs(X_train[:, :11], feat_df_ploting, 'PCA')
        plt.figure(figsize=(16, 24))
        plt.show(block=False)

    # In[70]:
    # seaborn plot of PCA
    # need to add columns to pca X_train
    # convert to a dataframe
    # Pairwise plots of 11 components
    pca_DF = pd.DataFrame(X_train[:, :11])

    df_y_train = pd.DataFrame(y_train,
                              columns=['label'])  #,'Date','group_idx'])
    df_pca_train = pd.concat([df_y_train, pca_DF], axis=1)
    #    dates = list(set(df_pca_train['Date']))

    #    print(list(feat_df.columns))
    feature_names = df_pca_train.columns[1:]
    n_comp_pca = pca_DF.shape[1]
    print('n_comp_pca', n_comp_pca)
    print('feature_names', feature_names)
    print('df_pca_train columns', list(df_pca_train.columns))

    plt.close('all')

    # Set up plot to compare confusion matrices
    # NOTE(review): 'axes.titlesize' appears twice in this literal; the later
    # value ('large') is the one that takes effect.
    params = {
        'axes.titlesize': 'x-large',
        #            'legend.fontsize': 'large',
        #          'figure.figsize': (15, 5),
        'axes.labelsize': 'large',
        'axes.titlesize': 'large',
        'xtick.labelsize': 'medium',
        'ytick.labelsize': 'medium'
    }
    plt.rcParams.update(params)

    # One subplot per classifier, sharing the y axis.
    fig, axs = plt.subplots(1, 4, sharey=True, figsize=(15, 8.5))
    font = {
        'linespacing':
        1.5,  #'family': 'serif', 'color':  'darkred', 'weight': 'normal',
        'size': 14
    }

    # ## Exploring Different Algorithms For Multiclass Classification

    # Metric in this case is F2
    # NOTE(review): ftwo_scorer is built but only referenced from a
    # commented-out `scoring=` argument below.
    from sklearn.metrics import fbeta_score, make_scorer
    ftwo_scorer = make_scorer(fbeta_score, beta=2)
    # In[7.5]:
    # Let's scale the features and plug into logistic regression classifier
    #    from sklearn.preprocessing import StandardScaler
    #    X_scaled = StandardScaler().fit_transform(X_train)

    from sklearn import linear_model
    log_reg_classifier = linear_model.LogisticRegression(penalty='l2',
                                                         tol=0.0001,
                                                         C=1.0,
                                                         fit_intercept=True,
                                                         intercept_scaling=1,
                                                         class_weight=None,
                                                         random_state=None,
                                                         solver='liblinear',
                                                         max_iter=100,
                                                         multi_class='ovr',
                                                         n_jobs=1)
    log_r = log_reg_classifier.fit(X_train, df_y_train['label'].values)

    y_test_predictions_log_r = log_r.predict(X_test)
    y_predict_prob_log_r = log_r.predict_proba(X_test)
    # save results into a DF
    # The *_prob columns store predict_proba column 0, i.e. the probability
    # of the first entry of the estimator's classes_.
    results = pd.DataFrame()
    results['y_test'] = y_test[:, 0]
    results['log_r_pred'] = list(y_test_predictions_log_r)
    results['log_r_prob'] = y_predict_prob_log_r[:, 0]

    # Cross-validate on the training split with the library's default number
    # of folds (3 in older scikit-learn releases, 5 in newer ones).
    cv_scores_lr = cross_val_score(estimator=log_r, X=X_train,
                                   y=y_train)  #, scoring = ftwo_scorer)
    print('Logistic regression cv_scores', cv_scores_lr)

    save_LR = '../models/trained_log_reg.sav'
    pickle.dump(log_reg_classifier, open(save_LR, 'wb'))

    # Confusion Matrix for Logistic Regression
    cmNB = confusion_matrix(y_test,
                            y_test_predictions_log_r,
                            labels=list(class_names))
    plt.subplot(1, 4, 1)
    plot_confusion_matrix(cm1=cmNB,
                          classes=class_names,
                          normalize=True,
                          gradientbar=False,
                          title='Logistic Regression\n')
    # Format the fold scores as strings for the text annotation.
    cv_scores_lr = ["{:.2f}".format(x) for x in cv_scores_lr]

    # Precision / recall / F2 with 'Parasitized' as the positive class.
    p_r_fscore_lr = precision_recall_fscore_support(y_test,
                                                    y_test_predictions_log_r,
                                                    beta=2.0,
                                                    labels=['Parasitized'],
                                                    pos_label='Parasitized',
                                                    average='binary')

    print(p_r_fscore_lr[:3])
    # Write the metrics below the subplot (axes coordinates, y = -1).
    plt.text(
        0.01,
        -1,
        '\nCV Scores:\n' + str(cv_scores_lr) + '\n' +
        'Precision: {d[0]:.2f}\nRecall: {d[1]:.2f} \nF2 score: {d[2]:.2f} \n'.
        format(d=p_r_fscore_lr[:3]),
        ha='left',
        va='bottom',
        fontdict=font,
        transform=plt.subplot(1, 4, 1).transAxes)

    # In[7]:

    # ### OneVsRestClassifier with Naive Bayes

    classifier = OneVsRestClassifier(GaussianNB())
    nbclf = classifier.fit(X_train, df_y_train['label'].values)
    y_test_predictions_nbclf = nbclf.predict(X_test)
    y_predict_prob = nbclf.predict_proba(X_test)
    # save results into a DF
    results['NB_pred'] = list(y_test_predictions_nbclf)
    results['NB_r_prob'] = y_predict_prob[:, 0]

    # Cross-validate on the training split (library default fold count)
    cv_scores = cross_val_score(classifier, X_train,
                                y_train)  # default cross validation
    print('NB cv_scores', cv_scores)
    #    answer = pd.DataFrame(y_predict_prob, columns = class_names).round(decimals=3) # index= pd.DataFrame(X_test).index.tolist())
    #print('One vs Rest - Naive Bayes\n', answer.head())

    # Confusion Matrix for Naive Bayes
    cmNB = confusion_matrix(y_test,
                            y_test_predictions_nbclf,
                            labels=list(class_names))
    plt.subplot(1, 4, 2)
    plot_confusion_matrix(cm1=cmNB,
                          classes=class_names,
                          normalize=True,
                          gradientbar=False,
                          title='One vs Rest - Naive Bayes\n')
    cv_scores = ["{:.2f}".format(x) for x in cv_scores]

    p_r_fscore_NB = precision_recall_fscore_support(y_test,
                                                    y_test_predictions_nbclf,
                                                    beta=2.0,
                                                    labels=['Parasitized'],
                                                    pos_label='Parasitized',
                                                    average='binary')
    print(p_r_fscore_NB[:3])
    plt.text(
        0.01,
        -1,
        '\nCV Scores:\n' + str(cv_scores) + '\n' +
        'Precision: {d[0]:.2f}\nRecall: {d[1]:.2f} \nF2 score: {d[2]:.2f} \n'.
        format(d=p_r_fscore_NB[:3]),
        ha='left',
        va='bottom',
        fontdict=font,
        transform=plt.subplot(1, 4, 2).transAxes)

    # ### Random Forest Classification

    # In[8]:

    # Next, let's try Random Forest Classifier
    # max_features is capped at 100 (n_comp is already at least 50 here).
    if n_comp < 100:
        f = n_comp
    else:
        f = 100
    n = 30
    RFclf = OneVsRestClassifier(
        RandomForestClassifier(n_estimators=n, max_features=f))
    RFclf.fit(X_train, df_y_train['label'].values)
    y_test_predictions_RF = RFclf.predict(X_test)
    #    y_score_RF = RFclf.predict_proba(X_test)
    y_score_answer_RF = RFclf.predict_proba(X_test)

    # save results into a DF
    results['RF'] = list(y_test_predictions_RF)
    results['RF_prob'] = y_score_answer_RF[:, 0]

    # Cross-validate on the training split (library default fold count)
    cv_scores_RF = cross_val_score(RFclf, X_train,
                                   y_train)  # default cross validation
    print('Random Forest cv_scores', cv_scores_RF)
    #    answer_RF = pd.DataFrame(y_score_answer_RF)
    save_RF = '../models/trained_RF.sav'
    pickle.dump(RFclf, open(save_RF, 'wb'))
    #print('Random Forest\n', answer_RF.head())

    # confusion matrix
    cmRF = confusion_matrix(y_test,
                            y_test_predictions_RF,
                            labels=list(class_names))
    plt.subplot(1, 4, 3)
    plot_confusion_matrix(
        cm1=cmRF,
        classes=class_names,
        normalize=True,
        gradientbar=False,
        title='Random Forests\nestimators: {0}\n max_features: {1}\n'.format(
            n, f))
    cv_scores_RF = ["{:.2f}".format(x) for x in cv_scores_RF]

    p_r_fscore_RF = precision_recall_fscore_support(y_test,
                                                    y_test_predictions_RF,
                                                    beta=2.0,
                                                    labels=['Parasitized'],
                                                    pos_label='Parasitized',
                                                    average='binary')
    print(p_r_fscore_RF[:3])
    plt.text(
        0.01,
        -1,
        '\nCV Scores:\n' + str(cv_scores_RF) + '\n' +
        'Precision: {d[0]:.2f}\nRecall: {d[1]:.2f} \nF2 score: {d[2]:.2f} \n'.
        format(d=p_r_fscore_RF[:3]),
        ha='left',
        va='bottom',
        fontdict=font,
        transform=plt.subplot(1, 4, 3).transAxes)

    # ### Adaptive Boosting Classifier
    # http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html

    # In[9]:

    # Unlike the models above, AdaBoost is fitted on the 2-D y_train array
    # rather than on df_y_train['label'].values.
    AdaBoost = AdaBoostClassifier()
    AdaBoost.fit(X_train, y_train)
    y_predAB = AdaBoost.predict(X_test)
    y_predAB_prob = AdaBoost.predict_proba(X_test)
    #    y_predAB_binarized = label_binarize(y_predAB,
    #                                     classes=['single_product','market_place'])
    # save results into a DF
    results['AB_pred'] = list(y_predAB)
    results['AB_prob'] = y_predAB_prob[:, 0]

    # Persist the per-sample predictions of all four models.
    results.to_csv('../data/y_test_predictions')
    # Cross-validate on the training split (library default fold count)
    cv_scores_AB = cross_val_score(AdaBoost, X_train,
                                   y_train)  # default cross validation
    print('Adaptive Boosting cv_scores', cv_scores_AB)
    save_AdaBoost = '../models/trained_AdaBoost.sav'
    pickle.dump(AdaBoost, open(save_AdaBoost, 'wb'))

    plt.subplot(1, 4, 4)
    cmAdaBoost = confusion_matrix(y_test, y_predAB, labels=list(class_names))
    plot_confusion_matrix(cm1=cmAdaBoost,
                          normalize=True,
                          classes=class_names,
                          title='AdaBoost\n',
                          gradientbar=False)
    cv_scores_AB = ["{:.2f}".format(x) for x in cv_scores_AB]

    p_r_fscore_AB = precision_recall_fscore_support(y_test,
                                                    y_predAB,
                                                    beta=2.0,
                                                    labels=['Parasitized'],
                                                    pos_label='Parasitized',
                                                    average='binary')
    print(p_r_fscore_AB[:3])

    plt.text(
        0.01,
        -1,
        '\nCV Scores:\n' + str(cv_scores_AB) + '\n' +
        'Precision: {d[0]:.2f}\nRecall: {d[1]:.2f} \nF2 score: {d[2]:.2f} \n'.
        format(d=p_r_fscore_AB[:3]),
        ha='left',
        va='bottom',
        fontdict=font,
        transform=plt.subplot(1, 4, 4).transAxes)

    # #### Comparing mean accuracy and confusion matrices of different classification algorithms

    # In[10]:
    print('\nLogistic Regression mean accuracy:',
          round(log_reg_classifier.score(X_test, y_test), 4))
    print('One vs Rest - Naive Bayes mean accuracy:',
          round(classifier.score(X_test, y_test), 4))
    print('Random Forest Classifier mean accuracy:',
          round(RFclf.score(X_test, y_test), 4))
    print('Adaptive Boosting Classifier mean accuracy:',
          round(AdaBoost.score(X_test, y_test), 4))
    plt.tight_layout()
    fig.tight_layout()
    plt.savefig('../plots/confusion_matrix_result_1.png')
    plt.show(block=False)

    ### -- ROC and AUC
    # Compute ROC curve and area under the curve
    plt.figure(12)
    #    print('y_test before binarization', y_test[0:4])
    # After this call 'Parasitized' is encoded as 1 and 'Uninfected' as 0.
    y_test = label_binarize(y_test, classes=['Uninfected', 'Parasitized'])
    #    print('y_test after binarization', y_test[0:4])

    #    print(y_predict_prob_log_r[1:4, 0])
    # NOTE(review): column 0 is used as the positive-class score; this assumes
    # 'Parasitized' is the first entry of log_r.classes_ -- confirm.
    fpr, tpr, thresholds = roc_curve(y_test, y_predict_prob_log_r[:, 0])
    roc_df = pd.DataFrame({'fpr': fpr, 'tpr': tpr, 'thresholds': thresholds})
    roc_df.to_csv('../data/roc_data.csv')
    #    tprs = [interp(mean_fpr, fpr, tpr)]
    #    tprs[-1][0] = 0.0
    roc_auc = auc(fpr, tpr)
    plt.title('Receiver Operating Characteristic', fontsize=18)
    plt.plot(fpr,
             tpr,
             lw=2,
             color='#3399ff',
             label='AUC = {0:.2f}'.format(roc_auc))

    # Diagonal reference line for a random classifier.
    plt.plot([0, 1], [0, 1],
             linestyle='--',
             lw=2,
             color='gray',
             label='Chance',
             alpha=.8)

    plt.ylabel('True Positive Rate', fontsize=14)
    plt.xlabel('False Positive Rate', fontsize=14)
    plt.tick_params(axis='both', which='major', labelsize=12)
    plt.legend(loc="lower right")
    plt.tight_layout()
    plt.savefig('../plots/ROC_CNN_log_reg.png')
    plt.show()
    plt.close('all')
    print(
        'If launched from command line use ctrl+z to close all plots and finish'
    )

예제 #32

0

파일 보기

class SceneClassifier:
    """Multi-label scene classifier on top of fastText sentence embeddings.

    A question is segmented with jieba, turned into one sentence vector by a
    fastText HTTP service and classified by a one-vs-rest gradient boosting
    model.  Label names are encoded with a ``MultiLabelBinarizer`` stored in
    ``self.mlb``; the columns of every prediction follow ``self.mlb.classes_``
    (sorted label names), *not* the order of ``self.named_labels``.
    """

    def __init__(self):
        self.kernel = None
        # Label of the i-th training file handed to train() / validate().
        self.named_labels = [
            'base', 'greeting', 'qa', 'repeat_user', 'repeat_machine', 'sale'
        ]
        self.fasttext_url = "http://localhost:11425/fasttext/s2v?q={0}&w="
        self.fasttext_url_weighted = "http://localhost:11425/fasttext/s2v?q={0}&w={1}"
        self.weighted = False

    def _add_extra_dict(self, path):
        """Register the comma separated words of every line with jieba.

        Only the text after the last ':' of a line is used.
        """
        with open(path, 'r') as inp:
            for line in inp:
                line = line.split(':')[-1]
                for word in line.split(','):
                    jieba.add_word(word)

    def cut(self, input_):
        """Strip Chinese punctuation and return space separated tokens."""
        input_ = QueryUtils.static_remove_cn_punct(input_)
        seg = " ".join(jieba.cut(input_, cut_all=False))
        tokens = _uniout.unescape(str(seg), 'utf8')
        return tokens

    def get_w2v_emb(self, tokens):
        """Return the squeezed sentence embedding of ``tokens``."""
        embedding = self._fasttext_vector(tokens)
        return np.squeeze(embedding)

    def _fasttext_vector(self, tokens):
        """Fetch the sentence vector of ``tokens`` from the fastText service.

        With ``self.weighted`` unset every token gets weight 1; otherwise
        the weights come from the IDF service and tokens it does not know
        get the largest returned weight.

        Returns:
            The ``'vector'`` entry of the service response, or ``None`` when
            no request URL could be built or the request failed.
        """
        # Stays None when URL construction fails, so the request step below
        # can bail out instead of hitting an unbound name.
        url = None
        if not self.weighted:
            try:
                weights = np.ones(shape=len(tokens))
                url = self.fasttext_url_weighted.format(
                    ','.join(tokens),
                    ",".join([str(weight) for weight in weights]))
            except Exception:
                traceback.print_exc()
        else:
            try:
                idf_url = "http://10.89.100.14:3032/s/{0}".format(
                    "%7C".join(tokens))
                idf_r = requests.get(url=idf_url)
                weights = []
                returned_json = idf_r.json()
                max_weight = 1
                for value in returned_json.values():
                    if value > max_weight:
                        max_weight = value
                for token in tokens:
                    if token not in returned_json:
                        weights.append(str(max_weight))
                    else:
                        weights.append(str(returned_json[token]))

                url = self.fasttext_url_weighted.format(
                    ','.join(tokens), ','.join(weights))
            except Exception:
                # Best effort: fall back to the unweighted query.
                traceback.print_exc()
                url = self.fasttext_url.format(','.join(tokens))

        if url is None:
            return None
        try:
            r = requests.get(url=url)
            return r.json()['vector']
        except Exception:
            print_cn(url)
            traceback.print_exc()
            return None

    def _prepare_data(self, files):
        """Read the training files and build embeddings and label matrix.

        ``files[i]`` holds one question per line for label
        ``self.named_labels[i]``.  A question that occurs under several
        labels becomes a single multi-label sample.  The fitted binarizer is
        stored in ``self.mlb``.

        Returns:
            ``(embeddings, labels, queries)`` -- the embedding array, the
            binarized label matrix and the list of distinct questions, all in
            the same order.
        """
        print('prepare data...')

        embeddings = list()
        queries = list()
        queries_ = dict()
        labels = list()
        mlb = MultiLabelBinarizer()

        for index, path in enumerate(files):
            label = self.named_labels[index]
            queries_[label] = list()
            with open(path, 'r') as f:
                for line in f:
                    question = line.replace('\t', '').replace(
                        ' ', '').strip('\n').decode('utf-8')
                    question = QueryUtils.static_remove_cn_punct(str(question))
                    tokens = QueryUtils.static_jieba_cut(question)
                    if len(tokens) == 0:
                        continue
                    queries_[label].append(question)

        # question -> position in `queries`; replaces repeated list scans.
        position = {}
        for label, questions in queries_.items():
            for question in questions:
                seen_at = position.get(question)
                if seen_at is not None:
                    # Known question: only extend its label list.  (The
                    # original re-added the question as a new sample when it
                    # was repeated under the same label.)
                    if label not in labels[seen_at]:
                        labels[seen_at].append(label)
                    continue
                position[question] = len(queries)
                queries.append(question)
                labels.append([label])
                tokens = self.cut(question).split(' ')
                embeddings.append(self.get_w2v_emb(tokens))

        embeddings = np.array(embeddings)
        embeddings = np.squeeze(embeddings)
        self.mlb = mlb.fit(labels)
        labels = self.mlb.transform(labels)

        return embeddings, labels, queries

    def _build(self, files):
        """Load the extra jieba dictionary, then prepare the data."""
        self._add_extra_dict('../data/sc/belief_graph.txt')
        return self._prepare_data(files)

    def train(self, pkl, files):
        """Fit the classifier on ``files``, pickle ``self`` to ``pkl`` and
        print the accuracy on the training questions."""
        embeddings, labels, queries = self._build(files)
        print('train classifier...')

        self.kernel = OneVsRestClassifier(
            GradientBoostingClassifier(max_depth=5, n_estimators=1000))
        self.kernel.fit(embeddings, labels)

        with open(pkl, 'wb') as out:
            pickle.dump(self, out)

        print('train done and saved.')

        print('validation...')
        self.metrics_(labels, queries)

    def metrics_(self, labels, queries):
        """Print every misclassified query and the exact-match accuracy.

        ``labels`` is the binarized label matrix aligned with ``queries``;
        empty queries are skipped.
        """
        correct = 0.0
        total = 0
        for query, label in zip(queries, labels):
            if not query:
                continue
            total += 1
            label = np.expand_dims(label, axis=0)
            # Sort both sides so the comparison does not depend on the
            # arbitrary iteration order of a set.
            real = sorted(self.mlb.inverse_transform(label)[0])
            label_, probs = self.predict(query)
            predicted = sorted(set(label_))

            if real != predicted:
                print('{0}: {1}-->{2}'.format(query, ' '.join(real),
                                              ' '.join(predicted)))
            else:
                correct += 1
        # Guard against an empty evaluation set.
        accuracy = correct / total if total else 0.0
        print('accuracy:{0}'.format(accuracy))

    def validate(self, files):
        """Evaluate the current model on the labelled ``files``.

        Note that this re-fits ``self.mlb`` on the validation labels.
        """
        embeddings, labels, queries = self._prepare_data(files)
        self.metrics_(labels, queries)

    def _predict_full(self, question):
        """Classify one question.

        Returns:
            ``(labels, selected_probs, all_probs)`` where ``all_probs`` is
            the probability vector over ``self.mlb.classes_`` and the first
            two items are what :meth:`predict` returns.
        """
        line = str(question).replace(" ", "").replace("\t", "")
        tokens = self.cut(line).split(' ')
        embedding = np.reshape(self.get_w2v_emb(tokens), [1, -1])
        prediction = self.kernel.predict(embedding)
        probs = self.kernel.predict_proba(embedding)
        ## note that in prediction stage n_sample==1
        label_ = self.mlb.inverse_transform(prediction)
        if len(label_[0]) == 0:
            # No label passed the decision threshold: fall back to the most
            # probable one.  Columns follow mlb.classes_, whose sorted order
            # differs from named_labels.
            index = np.argmax(probs[0])
            return [self.mlb.classes_[index]], [probs[0][index]], probs[0]
        picked = np.where(prediction[0] == 1)
        return label_[0], probs[0][picked], probs[0]

    def predict(self, question):
        """Return ``(labels, probs)`` for one question.

        ``labels`` are the predicted label names and ``probs`` the matching
        probabilities; when the model predicts no label at all, the single
        most probable label is returned instead.
        """
        labels, selected, _ = self._predict_full(question)
        return labels, selected

    def interface(self, q):
        """Return ``(labels, probs_dict)`` for one question.

        ``probs_dict`` maps every known label name to its probability.
        NOTE(review): the previous implementation always raised (it called
        ``len()`` on a scalar); this shape is the presumed intent -- confirm
        with the callers.
        """
        labels, _, all_probs = self._predict_full(q)
        probs_dict = {}
        for name, prob in zip(self.mlb.classes_, all_probs):
            probs_dict[name] = prob
        return labels, probs_dict

    @staticmethod
    def get_instance(path):
        """Load a classifier previously pickled by :meth:`train`.

        SECURITY: unpickling executes arbitrary code; only load trusted
        model files.
        """
        print('loading model file...')
        # Pickles are written with 'wb', so they must be read in binary mode.
        with open(path, 'rb') as inp:
            return pickle.load(inp)

예제 #33

0

파일 보기

    "mouseUp": [3],
    "usernameWPS": [0.003121452894],
    "passwordWPS": [2.63E-03],
    "totalTimeSpent": [7715],
    "countShift": [1],
    "countCapslock": [0],
    "countKey": [23],
    "dwellTimeAverage": [79.73913043],
    "flightTimesAverage": [348],
    "upDownTimeAverage": [205.9047619]
})

# Predict hard labels and class probabilities for the held-out samples.
print(X_test)
y_pred = model.predict(X_test)

y_pred_prob = model.predict_proba(X_test)
print(y_pred)
print(y_pred_prob)

# Standard classification reports on the test split.
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Accuracy as a percentage.
print(accuracy_score(y_test, y_pred) * 100)

# Class probabilities for `new_input`, which is defined outside this excerpt
# (presumably the single-row frame built just above -- confirm).
print(model.predict_proba(new_input))

# Binary-averaged metrics.
# NOTE(review): only `recall` is printed in this excerpt; `precision` and
# `score` are computed but not used here.
precision = precision_score(y_test, y_pred, average='binary')
recall = recall_score(y_test, y_pred, average='binary')
score = f1_score(y_test, y_pred, average='binary')

print('Recall: %.3f' % recall)

예제 #34

0

파일 보기

파일: text_classification.py 프로젝트: FerdinandZhong/shopee_contest

     for text in x_test
 ])
 # Report the shapes of the vectorised train / test feature matrices.
 print('X_train shape ', x_train_mybag.shape)
 print('X_test shape ', x_test_mybag.shape)
 # One indicator column per tag, in sorted tag order.
 y_train = label_binarize(y_train, classes=sorted(tags_counts.keys()))
 y_val = label_binarize(y_test, classes=sorted(tags_counts.keys()))
 import itertools
 # Hyper-parameter grid: every (C, penalty) combination.
 a = [0.1, 1]
 b = ['l1', 'l2']
 parameters = list(itertools.product(a, b))
 print(parameters)
 for C_value, penalty_value in parameters:
     print(C_value, penalty_value)
     clf = OneVsRestClassifier(
         LogisticRegression(penalty=penalty_value, C=C_value))
     clf.fit(x_train_mybag, y_train)
     y_val_predicted_labels_mybag = clf.predict_proba(x_test_mybag)
     # True class index of each sample: position of the first column set to 1
     # (does not depend on the loop variables, yet is recomputed every pass).
     y_val_labels = [[tag for tag in list(enumerate(tags))
                      if tag[1] == 1][0][0] for tags in y_val]
     print(y_val_labels[:10])
     # Predicted class index of each sample: column with the highest
     # probability.
     y_val_predicted_labels_mybag = [
         sorted(list(enumerate(tags)), key=lambda x: x[1],
                reverse=True)[0][0] for tags in y_val_predicted_labels_mybag
     ]
     print(y_val_predicted_labels_mybag[:10])
     print("Result with parameter: C: {}, penalty: {}".format(
         C_value, penalty_value))
     # NOTE(review): the message says "weighted" but the score is computed
     # with average='micro'.
     print('F1 score weighted: {}'.format(
         f1_score(y_val_labels,
                  y_val_predicted_labels_mybag,
                  average='micro')))

예제 #35

0

파일 보기

    def svm_bagclassifier(sentiment_data,
                          file_name_classifier,
                          file_name_vectorizer,
                          file_name_features,
                          bagging=False):
        """Train a linear SVC sentiment classifier and persist it with joblib.

        Only the first 1000 entries of ``sentiment_data`` are used. Each entry
        is unpacked as a ``(sentiment, sentence)`` pair. The sentences are
        stemmed and pre-processed, count-vectorized through
        ``HouzierVectorizer``, converted with a TF-IDF transformer,
        L2-normalized, and reduced with a ``FeatureUnion`` of a linear
        ``KernelPCA`` and ``SelectKBest(chi2, k=2)``. The fitted feature union
        and the fitted classifier are dumped (zlib level 9) inside
        ``<base_dir>/CompiledModels/SentimentClassifiers``. Finally a few
        hard-coded example sentences are classified and printed as a smoke
        test, followed by the elapsed time.

        Args:
            sentiment_data: sequence of ``(sentiment, sentence)`` pairs.
            file_name_classifier: file name for the dumped classifier.
            file_name_vectorizer: file name handed to ``HouzierVectorizer``.
            file_name_features: file name for the dumped feature union.
            bagging: when true, the SVC is wrapped in
                ``OneVsRestClassifier(BaggingClassifier(...))`` with three
                estimators; otherwise the plain SVC is used.

        Returns:
            None. Results are only printed and written to disk.

        Note:
            This is Python 2 code (``print`` statements).

        Legacy vectorization snippet kept from the original docstring::

            vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')
            X_train = vectorizer.fit_transform(sentences)
        """
        import time
        start = time.time()
        # Hard cap: only the first 1000 (sentiment, sentence) pairs are used.
        sentiments, sentences = zip(*sentiment_data[0:1000])
        sentences = GeneralMethodsClassifiers.snowball_stemmer(sentences)
        sentences = GeneralMethodsClassifiers.pre_process_text(sentences)
        vectorize_class = HouzierVectorizer(
            sentences, "%s/CompiledModels/SentimentClassifiers" % base_dir,
            file_name_vectorizer, False, False)

        # Count-vectorize the sentences to get the raw feature matrix.
        x_vectorize = vectorize_class.count_vectorize()
        tfidf = TfidfTransformer(norm="l2", sublinear_tf=True)

        # Re-weight the counts with (sublinear) TF-IDF.
        x_transform = tfidf.fit_transform(x_vectorize)

        # Densify and L2-normalize each row.
        X_normalized = preprocessing.normalize(x_transform.toarray(),
                                               norm='l2')
        print "Feature after vectorization of the data [%s, %s]" % x_transform.shape
        # Feature selection.
        # This dataset is way too high-dimensional. Better do PCA:
        #pca = PCA()
        pca = KernelPCA(kernel="linear")
        #pca = RandomizedPCA()
        #pca = NMF()
        #
        # Some of the original features may be good too, so the two best
        # features according to a chi2 test are kept alongside the PCA output.

        selection = SelectKBest(chi2, k=2)
        combined_features = FeatureUnion([("pca", pca),
                                          ("univ_select", selection)])

        X_features = combined_features.fit_transform(X_normalized, sentiments)
        # Persist the fitted feature union next to the classifier.
        with cd("%s/CompiledModels/SentimentClassifiers" % base_dir):
            joblib.dump(combined_features,
                        file_name_features,
                        compress=("zlib", 9))
        """
                dump(combined_features,
                     open('%s/%s'%(SentimentClassifiersPath,SentimentFeatureFileName), 'wb'),HIGHEST_PROTOCOL)

                """
        #X_pca = pca.fit_transform(x_transform)

        print "Feature after feature slection with pca and selectkbest\
                    of the data [%s, %s]" % X_features.shape

        #http://stackoverflow.com/questions/32934267/feature-union-of-hetereogenous-features

        #clf = SVC(C=1, kernel="linear", gamma=.001, probability=True, class_weight='auto')

        # Number of SVCs in the bagging ensemble (only used when bagging=True).
        n_estimators = 3
        svc_classifier = SVC(kernel='linear',
                             C=1,
                             gamma="auto",
                             probability=True,
                             decision_function_shape="ovr",
                             class_weight="balanced",
                             cache_size=20000)

        if bagging:
            # bootstrap=False with max_samples=1.0 and max_features=1.0 is
            # what the ensemble is configured with here.
            classifier = OneVsRestClassifier(
                BaggingClassifier(svc_classifier,
                                  max_samples=1.0,
                                  max_features=1.0,
                                  n_jobs=-1,
                                  verbose=3,
                                  n_estimators=n_estimators,
                                  bootstrap=False))
        else:
            classifier = svc_classifier

        classifier.fit(X_features, sentiments)

        print classifier.classes_
        # Persist the fitted classifier.
        with cd("%s/CompiledModels/SentimentClassifiers" % base_dir):
            joblib.dump(classifier, file_name_classifier, compress=("zlib", 9))
        """
                dump(file_name_classifier,open('%s/%s'%(SentimentClassifiersPath,
                                                       SentimentClassifierFileName
                                                        ),
                                               'wb'), HIGHEST_PROTOCOL)
                """

        print "Storing Classifier with joblib"
        # Smoke test: rebuild a vectorizer from the stored vocabulary and
        # classify a handful of hand-written sentences.
        # See http://stackoverflow.com/questions/31744519/load-pickled-classifier-data-vocabulary-not-fitted-error
        from sklearn.feature_extraction.text import CountVectorizer
        #count_vectorizer = CountVectorizer()
        examples_negative = [
            'Free Viagra call today!', "I am dissapointed in you",
            "i am not good"
        ]
        examples_neutral = [
            "I dont know", "Sun rises in the east",
            "I'm going to attend theLinux users group tomorrow."
        ]
        examples_positive = [
            "hey there, I am too good to be true", "An Awesome man",
            "A beautiful beautiful lady"
        ]

        examples = examples_positive + examples_negative + examples_neutral

        #example_counts= example_counts.toarray()
        vocabulary_to_load = vectorize_class.return_vectorizer()
        #vectorize_class = HouzierVectorizer(examples, True, False)
        #x_vectorize = vectorize_class.count_vectorize()

        loaded_vectorizer = CountVectorizer(vocabulary=vocabulary_to_load)
        example_counts = loaded_vectorizer.transform(examples)

        print example_counts, example_counts.shape

        # NOTE(review): the training matrix went through TF-IDF, L2
        # normalization, stemming and pre-processing, but these examples are
        # fed to the feature union as raw counts -- confirm this is intended.
        f = combined_features.transform(example_counts.toarray())

        predictions = classifier.predict(f)
        predict_probabilities = classifier.predict_proba(f)
        for sent, prob, tag in zip(examples, predict_probabilities,
                                   predictions):
            print sent, prob, tag

        # Elapsed wall-clock time in seconds.
        print time.time() - start
        return

예제 #36

0

파일 보기

def svmBasedClassification():
    """Train, persist and evaluate a one-vs-rest RBF SVM on labelled tweets.

    Reads the pipe-separated tweet file, drops duplicate and incomplete rows,
    vectorizes the ``clean_tweet`` column with TF-IDF, maps the ``sentimen``
    column to integer targets (negatif=0, netral=1, positif=2), trains on a
    60/40 split, pickles the fitted model to ``model.sav`` and prints the
    test metrics plus class probabilities for a few sample sentences.

    Returns:
        None. All results are printed or written to disk.
    """
    # Alternative input files used during experimentation:
    # tweets = pd.read_csv("data\\matilampu-label.csv")
    # tweets = pd.read_csv("data\\tweetclean600-only.csv", sep="|")
    # tweets = pd.read_csv("data\\backup\\tweets333-only-withattribute.csv", sep="|")
    tweets = pd.read_csv("data\\backup\\tweets333-only.csv", sep="|")
    tweets = tweets.drop_duplicates()
    tweets = tweets.dropna()

    sentiment_counts = tweets.sentimen.value_counts()
    print(sentiment_counts)

    from nltk.probability import FreqDist

    # NOTE(review): FreqDist is fed the filtered DataFrame itself rather than
    # its tokens, so this probably does not count words -- confirm the intent.
    fdist = FreqDist(tweets[(tweets.sentimen == 'negatif')])
    print(fdist.most_common(50))

    # count_vectorizer = CountVectorizer(ngram_range=(1,2))
    count_vectorizer = TfidfVectorizer()

    vectorized_data = count_vectorizer.fit_transform(tweets.clean_tweet)
    # Prepend a row-index column so the split below keeps track of positions;
    # the column is stripped again before training.
    row_index = np.arange(vectorized_data.shape[0])[:, None]
    indexed_data = hstack((row_index, vectorized_data))

    def sentiment2target(sentiment):
        """Map a sentiment label to its integer class (KeyError if unknown)."""
        return {
            'negatif': 0,
            'netral': 1,
            'positif': 2,
        }[sentiment]

    targets = tweets.sentimen.apply(sentiment2target)

    from sklearn.model_selection import train_test_split
    data_train, data_test, targets_train, targets_test = train_test_split(
        indexed_data, targets, test_size=0.4, random_state=0)
    # Column 0 is the helper row index, not a feature.
    data_train = data_train[:, 1:]
    data_test = data_test[:, 1:]

    from sklearn import svm
    from sklearn.multiclass import OneVsRestClassifier
    clf = OneVsRestClassifier(svm.SVC(gamma=0.01, C=100., probability=True, class_weight='balanced', kernel='rbf'))
    # clf = OneVsRestClassifier(svm.SVC(gamma=0.01, C=100., probability=True, class_weight='balanced', kernel='linear'))
    clf_output = clf.fit(data_train, targets_train)
    filename = 'model.sav'
    # Use a context manager so the file is flushed and closed deterministically.
    with open(filename, 'wb') as model_file:
        pickle.dump(clf_output, model_file)

    print(clf.score(data_test, targets_test))

    y_pred = clf.predict(data_test)
    print("Predict test data :\n"+str(y_pred))
    print("Accuracy: ",accuracy_score(targets_test, y_pred))
    print("Recall: ",recall_score(targets_test, y_pred, average='weighted'))
    print("Presisi: ",precision_score(targets_test, y_pred, average='weighted'))
    print("F1 score: ",f1_score(targets_test, y_pred, average='weighted'))

    # Sample sentences pushed through the fitted vectorizer and classifier.
    sentences = count_vectorizer.transform([
        "Negara kita ngutang buat bngun infrastruktur yang udah dipake masyarakat, terus masyarakatnya ngeluh karena negara ngutang, setiap negara itu pasti ngutang,  utang bisa dibayar kalo negara dapet penghasilan. Penghasilan negara itu ya dari pajak",
        "Negara kita ngutang sehingga harga mahal dan masyarakat tercekik dan ngeluh",
        "Prabowo-Sandi Sepakat Tak Ambil Gaji karena Negara Sedang Susah",
        "Calon presiden Jokowi menjelaskan program Kartu Pra Kerja akan memberikan insentif dalam kurun waktu tertentu, bukan berarti memberikan gaji secara cuma-cuma bagi masyarakat yang belum berpenghasilan."
    ])
    print(clf.predict_proba(sentences))

예제 #37

0

파일 보기

파일: ml.py 프로젝트: Ritvik19/Toxic-Comment-Classification

def train_model_one_vs_rest(model, vects, target, labels, **kwargs):
    """Cross-validate a one-vs-rest wrapper around ``model`` and plot the results.

    The estimator is wrapped in ``OneVsRestClassifier`` and fitted once per
    fold produced by the module-level ``kf`` splitter. ROC AUC, weighted F1
    and accuracy are recorded per fold and plotted together with their means.
    One row-normalized confusion matrix (in percent) is drawn per label.

    Args:
        model: base estimator to wrap.
        vects: feature matrix, indexable by an array of row indices.
        target: label-indicator matrix, one column per entry of ``labels``.
        labels: label names, in the column order of ``target``.
        **kwargs: accepted for call compatibility; not used.

    Returns:
        Tuple ``(model_performance, cm, model)``: the per-fold metric lists,
        the list of confusion matrices, and the wrapper as fitted on the
        last fold.
    """
    metric_names = ('roauc', 'f1', 'accuracy')
    model_performance = {name: [] for name in metric_names}

    model = OneVsRestClassifier(model)

    for train_indices, test_indices in tqdm(kf.split(vects, target)):
        X_train = vects[train_indices]
        y_train = target[train_indices]

        X_test = vects[test_indices]
        y_test = target[test_indices]

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        y_pred_ = model.predict_proba(X_test)
        model_performance['roauc'].append(roc_auc_score(y_test, y_pred_))
        model_performance['f1'].append(
            f1_score(y_test, y_pred, average='weighted'))
        model_performance['accuracy'].append(accuracy_score(y_test, y_pred))

    plt.figure(figsize=(20, 18))

    ax1 = plt.subplot2grid((3, 3), (0, 0), colspan=2)

    for name in metric_names:
        scores = model_performance[name]
        ax1.plot(scores, label=f'{name} per iteration')
        # The mean line spans exactly the folds that were run, instead of
        # assuming a 10-fold splitter.
        ax1.plot(np.full(len(scores), np.mean(scores)),
                 '--',
                 label=f'mean {name}')

    ax1.grid()
    ax1.legend()
    ax1.set_xlabel('fold')
    ax1.set_ylabel('value')
    ax1.set_title('Model Performance')

    # The confusion matrices use y_test / y_pred of the LAST fold only.
    cm = []
    for idx, label in enumerate(labels):
        cm.append(
            normalize(confusion_matrix(y_test[:, idx], y_pred[:, idx]),
                      axis=1,
                      norm='l1') * 100)
        # The first label sits to the right of the performance plot; the
        # remaining ones fill rows 1 and 2 of the 3x3 grid from the left.
        if idx == 0:
            cell = (0, 2)
        else:
            cell = ((idx - 1) // 3 + 1, (idx - 1) % 3)
        ax2 = plt.subplot2grid((3, 3), cell)
        sns.heatmap(cm[-1], annot=True, square=True, ax=ax2, cmap='Blues')
        ax2.set_title(f'Confusion Matrix \'{label}\'')
        ax2.set_xlabel('Predicted')
        ax2.set_ylabel('Actual')

    return model_performance, cm, model

예제 #38

0

파일 보기

import prepare_data as prepare
import evaluate
from sklearn.lda import LDA
from sklearn.multiclass import OneVsRestClassifier

# Load the train / validation / test splits and the basic user info table.
train_data, validation_data, test_data, basic_users_info = prepare.get_data()
# Shared dict passed to every prepare.* call below.
# presumably filled with fitted encoders so all splits share one encoding --
# verify against prepare.get_exclude_ndf_x.
label_encoder = {}
train_x, train_y = prepare.get_exclude_ndf_x(train_data, basic_users_info,
                                             label_encoder)
validation_x, validation_y = prepare.get_exclude_ndf_x(validation_data,
                                                       basic_users_info,
                                                       label_encoder)

# Despite its name, `rf` is a one-vs-rest wrapper around LDA, not a random forest.
rf = OneVsRestClassifier(LDA()).fit(train_x, train_y)
#validation_predict = rf.predict(validation_x)
validation_predict_proba = rf.predict_proba(validation_x)
#print validation_predict_proba
# Column order of the probability matrices returned by predict_proba.
class_order = rf.classes_

# Turn the validation probabilities into candidate class lists and score them.
predict_list = evaluate.candidate_classes(validation_predict_proba,
                                          class_order)
ndcg = evaluate.ndcg(predict_list, validation_data)
print(ndcg)

# Same pipeline on the test split; the candidates are handed to
# prepare.get_test_predict.
test_x = prepare.get_exclude_ndf_test_x(test_data, basic_users_info,
                                        label_encoder)
test_predict_proba = rf.predict_proba(test_x)
test_predict_list = evaluate.candidate_classes(test_predict_proba, class_order)
prepare.get_test_predict(test_data, test_predict_list)

예제 #39

0

파일 보기

def plot_roc_k_fold(clf, X, y, n_splits):
    """
    Train one one-vs-rest classifier per stratified fold and show, for every
    class, a figure with one ROC curve per fold plus the mean curve.

    Parameters
    ----------
    clf : object
        Base classifier; it is wrapped in ``OneVsRestClassifier``.

    X : array
        Feature matrix; it is indexed with arrays of row positions.

    y : array or DataFrame
        Class labels for the rows of ``X``.

    n_splits : int
        Number of folds.

    Notes
    -----
    NOTE(review): the per-class column indexing assumes ``label_binarize``
    yields one column per class, i.e. more than two classes -- confirm.
    """
    classes = np.unique(y)
    n_classes = len(classes)
    y_bin = label_binarize(y, classes=classes)

    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)
    classifier = OneVsRestClassifier(clf)
    dic = {'Fold': [], 'Class': [], 'FPR': [], 'TPR': []}

    # Stratify on the real multi-class labels so that every class is spread
    # over the folds (not only "first class vs. rest").
    for fold, (train, test) in enumerate(cv.split(X, y)):
        classifier.fit(X[train], y_bin[train])
        # Only the held-out rows are scored.
        y_score = classifier.predict_proba(X[test])

        # ROC per class
        for i_class in range(n_classes):
            fpr, tpr, _ = roc_curve(y_bin[test, i_class], y_score[:, i_class])
            dic['Fold'].append(fold)
            dic['Class'].append(classes[i_class])
            dic['FPR'].append(fpr)
            dic['TPR'].append(tpr)

    df_result = pd.DataFrame(data=dic)

    # Draw one figure per class
    colors = ['orange', 'red', 'green', 'cyan', 'gold']
    for class_ in list(df_result['Class'].unique()):

        df_plot = df_result[df_result['Class'] == class_]

        tprs = []
        aucs = []
        mean_fpr = np.linspace(0, 1, 100)

        for count, (_, row) in enumerate(df_plot.iterrows()):
            fold = row['Fold']
            fpr = row['FPR']
            tpr = row['TPR']
            roc_auc = auc(fpr, tpr)

            plt.plot(fpr,
                     tpr,
                     color=colors[count % len(colors)],
                     lw=1.5,
                     label='ROC fold {0} (AUC = {1:0.3f})'.format(
                         fold, roc_auc),
                     alpha=0.5)

            # Resample on a common FPR grid so the folds can be averaged
            tprs.append(np.interp(mean_fpr, fpr, tpr))
            aucs.append(roc_auc)

        # Mean curve; the last point is pinned to 1.0
        mean_tpr = np.mean(tprs, axis=0)
        mean_tpr[-1] = 1.0
        mean_auc = auc(mean_fpr, mean_tpr)
        # Standard deviation of the per-fold AUCs
        std_auc = np.std(aucs)
        plt.plot(mean_fpr,
                 mean_tpr,
                 color='blue',
                 lw=1.5,
                 # raw string: "\p" is not a valid escape sequence
                 label=r'Média ROC (AUC = {:0.3f} $\pm$ {:0.3f})'.format(
                     mean_auc, std_auc))

        std_tpr = np.std(tprs, axis=0)
        tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
        tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
        plt.fill_between(mean_fpr,
                         tprs_lower,
                         tprs_upper,
                         color='grey',
                         alpha=.2,
                         label=r'$\pm$ 1 desvio padrão')

        plt.plot([0, 1], [0, 1], 'k--', lw=1.5)
        plt.xlim([-0.05, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('Taxa de Falso Positivo')
        plt.ylabel('Taxa de Verdadeiro Positivo')
        # format() also accepts non-string class labels (e.g. integers)
        plt.title('Curva ROC \n (Classe={})'.format(class_))
        plt.legend(loc="lower right")
        plt.show()

예제 #40

0

파일 보기

파일: BagOfVisualWords.py 프로젝트: BoysGang/SemanticRecognition

class BagOfVisualWords(ImageClassifier):
    """Bag-of-visual-words image classifier.

    Pipeline: SIFT descriptors -> k-means visual vocabulary -> per-image
    histogram of visual words -> standardization -> one-vs-rest linear SVM.
    """

    def __init__(self, max_features=30, clusters_num=200, model=None, labels=None, voc=None, scaler=None, img_loader=None):
        """Create an untrained classifier, or rebuild one from saved parts.

        Args:
            max_features: stored and persisted with the model.
                NOTE(review): it is not passed to ``cv2.SIFT_create()`` --
                confirm whether it should limit the number of keypoints.
            clusters_num: size of the visual vocabulary (histogram length).
            model: fitted classifier, when restoring a saved model.
            labels: class names, when restoring a saved model.
            voc: visual vocabulary (code book), when restoring a saved model.
            scaler: fitted feature scaler, when restoring a saved model.
            img_loader: image loader used by ``predict``.
        """
        super().__init__()

        self.__model = model
        self._labels = labels
        self.__vocabulary = voc
        self.__clusters_num = clusters_num
        self.__extractor_max_features = max_features
        self.__feature_extractor = cv2.SIFT_create()
        self.__scaler = scaler
        self._img_loader = img_loader

    # Train BoVW model
    def fit(self, img_data_generator: ImgDataGenerator):
        """Build the vocabulary, train the SVM and print a test-set report."""
        self._init_img_loader(img_data_generator)

        train_generator = img_data_generator.train_generator
        test_generator = img_data_generator.test_generator

        self._labels = list(train_generator.class_indices.keys())

        shape = train_generator.image_shape
        train_samples_num = train_generator.samples

        print('Number of samples: ', train_samples_num)
        print('Image shape: ', shape)
        print('Labels:', self._labels)
        print()

        # Train samples: this call builds the vocabulary and fits the scaler
        descriptors, y_train = self.__get_data(train_generator)
        X_train, self.__vocabulary = self.__extract_features(descriptors, k=self.__clusters_num)

        # Train the Linear SVM
        self.__model = OneVsRestClassifier(SVC(kernel='linear',probability=True, max_iter=-1), n_jobs=-1)
        self.__model.fit(X_train, y_train)

        # Test samples: reuse the vocabulary AND the scaler fitted on the
        # training data, so the test set is transformed like the training
        # set and the stored scaler is not overwritten.
        test_descriptors, y_test = self.__get_data(test_generator)
        X_test, _ = self.__extract_features(test_descriptors,
                                            voc=self.__vocabulary,
                                            scaler=self.__scaler,
                                            k=self.__clusters_num)

        y_pred = self.__model.predict_proba(X_test)

        y_pred = [self._labels[y.argmax()] for y in y_pred]
        y_test = [self._labels[y.argmax()] for y in y_test]

        print('\nConfusion Matrix:\n')
        print(confusion_matrix(y_test, y_pred))

        print('\nReport:')
        print(classification_report(y_test, y_pred))

    # Classify image
    def predict(self, img_path):
        """Return ``(labels, probabilities)`` for the image at ``img_path``.

        An image without any detectable keypoint is represented by an
        all-zero word histogram instead of raising.
        """
        image = self._img_loader.load_img(img_path)

        # (n_keypoints, descriptor_size) array, or None without keypoints
        descriptors = self.__descriptors_from_img(image)

        # Calculate feature histogram
        features = np.zeros((1, self.__clusters_num), "float32")
        if descriptors is not None and len(descriptors) > 0:
            # The 2-D descriptor array is quantized directly; this also
            # covers images with a single keypoint.
            words, _ = vq(descriptors, self.__vocabulary)
            for w in words:
                features[0][w] += 1

        features = self.__scaler.transform(features)

        # Perform probability prediction
        probabilities = self.__model.predict_proba(features)

        return self._labels, probabilities[0]

    # Save BoVW model to the given path
    def save(self, path):
        """Dump every part of the model to ``<path>/bovw.pkl``."""
        os.makedirs(path, exist_ok=True)

        joblib.dump((self.__model,
                     self._labels,
                     self.__clusters_num,
                     self.__vocabulary,
                     self.__extractor_max_features,
                     self.__scaler,
                     self._img_loader),
                    os.path.join(path, 'bovw.pkl'))

    # Load BoVW model from the given path
    @classmethod
    def load(cls, path):
        """Rebuild a classifier from ``<path>/bovw.pkl`` written by ``save``."""
        model, labels, clusters_num, voc, max_features, scl, loader = joblib.load(os.path.join(path, 'bovw.pkl'))
        return cls(max_features, clusters_num, model, labels, voc, scl, loader)

    # Get labels and image descriptors from generator
    def __get_data(self, generator):
        """Collect descriptors and labels for one pass over ``generator``.

        Images for which no descriptor is found are skipped.
        """
        samples = generator.samples
        batch_size = generator.batch_size

        # Ceiling division: exactly enough batches to see every sample once.
        # The previous `samples // batch_size + 1` pulled one extra batch
        # whenever samples was a multiple of batch_size.
        # assumes the generator starts over after its last batch (Keras-style
        # iterator), which is what made the extra batch a duplicate -- TODO confirm
        batches = (samples + batch_size - 1) // batch_size

        descriptors, labels = list(), list()
        for _ in range(batches):
            data_batch, labels_batch = generator.next()

            for img_data, label in zip(data_batch, labels_batch):
                des = self.__descriptors_from_img(img_data)

                if des is not None:
                    descriptors.append(des)
                    labels.append(label)

        return descriptors, labels

    # Get descriptors from image
    def __descriptors_from_img(self, image_data):
        """Return the SIFT descriptors of ``image_data`` (None if there are none)."""
        # Scale into a new array so the caller's image is left untouched.
        scaled = image_data * 255
        image8bit = cv2.normalize(scaled, None, 0, 255, cv2.NORM_MINMAX).astype('uint8')
        _, des = self.__feature_extractor.detectAndCompute(image8bit, None)

        return des

    # Calculate the histogram of features
    def __extract_features(self, descriptors, voc=None, scaler=None, k=200):
        """Turn per-image descriptors into standardized word histograms.

        Args:
            descriptors: list with one descriptor array per image.
            voc: existing vocabulary; when None a new one is built with k-means.
            scaler: fitted scaler to apply; when None a new ``StandardScaler``
                is fitted on these histograms and stored on the instance.
            k: number of visual words (histogram length).

        Returns:
            Tuple ``(im_features, voc)``.
        """
        if voc is None:
            # Stack all the descriptors vertically in one call and convert
            # them to float so kmeans works properly.
            descriptors_float = np.vstack(descriptors).astype(float)

            # Perform k-means clustering
            # NOTE(review): the code book is learnt on whitened descriptors
            # while vq() below receives the raw ones -- confirm this is intended.
            voc, _ = kmeans(whiten(descriptors_float), k, 1)

        # Calculate the histogram of features and represent them as vector
        im_features = np.zeros((len(descriptors), k), "float32")
        for i, image_descriptors in enumerate(descriptors):
            words, _ = vq(image_descriptors, voc)
            for w in words:
                im_features[i][w] += 1

        # Standardize features by removing the mean and scaling to unit variance
        if scaler is None:
            scaler = StandardScaler().fit(im_features)
            self.__scaler = scaler
        im_features = scaler.transform(im_features)

        return im_features, voc

예제 #41

0

파일 보기

파일: SVC_roc2.py 프로젝트: takuzoo121/some

'''

## Test set
df_test = pd.read_csv("test.csv")
#df_test2 = df_test.sample(100, random_state=0)

X_test = []
for i, row in df_test.iterrows():
    # The context manager closes the image file as soon as the pixels have
    # been copied into the numpy vector.
    with Image.open(os.path.join(DIR_IMAGES, row.filename)) as source_img:
        img = source_img.crop((row.left, row.top, row.right, row.bottom))
        img = img.convert("L")
        img = img.resize((IMG_SIZE, IMG_SIZE), resample=Image.BICUBIC)

        # The builtin float is the dtype the removed `np.float` alias pointed to.
        x = np.asarray(img, dtype=float)
        x = x.flatten()
    X_test.append(x)

X_test = np.array(X_test)
#X_test.shape  # one flattened pixel vector per test image

#scaler2 = StandardScaler()  # initialise a transformer
#scaler2.fit(X_test)  # fit it to the data; errors out otherwise
# Reuse the scaler fitted earlier; the standardized data is returned.
X_test_scaled = scaler.transform(X_test)

#decomposer2 = PCA(n_components=30,  random_state=0)  # target number of dimensions
#decomposer2.fit(X_scaled2)  # fit to the data in use
# Reuse the decomposer fitted earlier to project the standardized test data.
X_test_pca = decomposer.transform(X_test_scaled)

Y_test_pred = classifier.predict_proba(X_test_pca)
np.savetxt('submission6.dat', Y_test_pred, fmt='%.6f')

예제 #42

0

파일 보기

파일: skinSVM.py 프로젝트: natanaeladit/TeamGreaterThanBrains

# Apply transformXY to every index group of the three data sets.
x_validationset = x_validationset.groupby(x_validationset.index).apply(transformXY)
x_testset = x_testset.groupby(x_testset.index).apply(transformXY)
x_traindata = x_traindata.groupby(x_traindata.index).apply(transformXY)

# Normalise the training data; the first column is dropped before normalising.
# NOTE(review): only x_traindata is normalised here, not the validation/test
# sets -- confirm this is intended.
# NOTE(review): `df - df.mean(axis=1)` subtracts a per-row Series from a
# DataFrame; pandas aligns a Series on the columns by default, so this may
# not be the row-wise operation it looks like -- verify the result.
df = x_traindata.iloc[:,1:]
df_norm = (df - df.mean(axis=1)) / (df.max(axis=1) - df.min(axis=1))
x_traindata = df_norm

# Train classifier: one-vs-rest polynomial-kernel SVC with probability output
clf = OneVsRestClassifier(SVC(C=0.1,kernel='poly', probability=True))
clf.fit(x_traindata, y_traindata)

# Save the fitted classifier to a file
with open('SKINclassifierpolytrainset_SVC_c01.pkl', 'wb') as f:
    pickle.dump(clf, f)

# Load a classifier from disk.
# NOTE(review): this loads a DIFFERENT file than the one written above and
# rebinds `clf`, so the predictions below come from the loaded model, not the
# one just trained -- confirm.
with open('SKINclassifierlineartraindata_onevsone_padded_SVC_rs5.pkl', 'rb') as f:
    clf = pickle.load(f)
    
# Make predictions
# NOTE(review): `x_testdata` is not defined in this fragment (the set
# transformed above is `x_testset`) -- verify which data is meant.
preds = clf.predict_proba(x_testdata)
predsdf = pd.DataFrame(preds)
predsdf.to_pickle('predictions_SKIN_poly_c01_validationset.pkl')  # where to save it, usually as a .pkl

# Write output files; the same predictions are written to both files.
# `check` and `predsdf` refer to the same DataFrame.
check = predsdf
predsdf = check
Output.to_outputfile(check,1,'SKINpoly_c01_clean_validationset',clean=True, validation=True)
Output.to_outputfile(check,1,'SKINpoly_c01_testdata',clean=False, validation=False)

예제 #43

0

파일 보기

파일: Dermatology_dataset.py 프로젝트: gmashouf/UCI_dermatology_dataset

# Fit the scaler on the training data only, then apply it to both splits.
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
################################################
# Logistic Regression
################################################
# predict each class against the others (one-vs-rest wrapper)
C_parameter = 50. / len(X_train)  # regularization parameter C, scaled by the training-set size
class_parameter = 'multinomial'  # value passed as multi_class
# NOTE(review): a 'multinomial' estimator is wrapped in OneVsRestClassifier
# below -- confirm this combination is intended.
penalty_parameter = 'l1'  # norm used for the regularization penalty
solver_parameter = 'saga'  # optimization algorithm (solver) used
tolerance_parameter = 0.1  # stopping tolerance
classifier = OneVsRestClassifier(LogisticRegression(C=C_parameter, multi_class=class_parameter,\
                                penalty=penalty_parameter, solver=solver_parameter, tol=tolerance_parameter))
classifier.fit(X_train, y_train)  # Training the algorithm
y_predict = classifier.predict(X_test)  # predicted labels
probas = classifier.predict_proba(X_test)  # predicted probabilities
# Compute ROC curve and ROC area for each class.
# Every dictionary below is keyed by the class column index i.
fpr = dict(
)  # false positive rates per class
tpr = dict()  # true positive rates per class
roc_auc = dict()  # area under the ROC curve per class
th = dict()  # probability thresholds returned by roc_curve per class
CM = dict()  # confusion matrix per class
# P, R and F1 are only initialised within this fragment.
P = dict()
R = dict()
F1 = dict()
# y_test and y_predict are indexed column-wise, one column per class.
for i in range(n_classes):
    fpr[i], tpr[i], th[i] = roc_curve(y_test[:, i], probas[:, i])
    roc_auc[i] = auc(
        fpr[i], tpr[i])  # area under the curve for class i
    CM[i] = confusion_matrix(y_test[:, i], y_predict[:, i])

예제 #44

0

파일 보기

파일: feature_engineering.py 프로젝트: Polaryti/UniversityHack-2020

def coordinates_fe(X_modelar, y_modelar, X_estimar, K=4, n_classes=7):
    """Add spatial-context features derived from each row's nearest neighbours.

    For every row of the combined (modelar + estimar) data the ``K - 1``
    nearest neighbours by (X, Y) coordinates are looked up. A neighbour from
    ``X_modelar`` adds 1 to the counter of its known class; a neighbour from
    ``X_estimar`` adds its predicted class probabilities. The resulting
    ``coords_<class>`` columns are appended to both frames.

    The input frames are modified in place: column ``0`` is dropped and the
    new columns are added.

    Args:
        X_modelar: labelled feature frame; column 0 is dropped, columns 1 and
            2 are used as the coordinates.
        y_modelar: frame with a ``'CLASS'`` column, looked up by row label.
            assumes its labels are 0..n-1 in row order -- TODO confirm
        X_estimar: unlabelled feature frame with the same layout.
        K: number of neighbours queried per row; the closest hit is skipped
            (presumably the row itself).
        n_classes: number of classes / context columns (default 7, the
            previously hard-coded value).

    Returns:
        Tuple ``(X_modelar, X_estimar, est_IDs)`` where ``est_IDs`` is the
        original column 0 of ``X_estimar``.
    """
    est_IDs = X_estimar[0]
    X_est_mod = pd.concat([X_modelar, X_estimar], sort=False)
    coords = X_est_mod[[1, 2]].rename(columns={1:'X', 2:'Y'})
    points = coords[['X', 'Y']].values

    spatialTree = cKDTree(points)

    X_est_mod.drop([0],inplace=True,axis=1)
    #X_est_mod = reduce_colors(X_est_mod)

    X_estimar.drop([0],inplace=True,axis=1)
    #X_estimar = reduce_colors(X_estimar)

    X_modelar.drop([0],inplace=True,axis=1)
    #X_modelar = reduce_colors(X_modelar)

    classifier = xgb.XGBClassifier()
    ovsr = OneVsRestClassifier(classifier,n_jobs=-1).fit(X_modelar,y_modelar)
    pred_estimar = ovsr.predict_proba(X_estimar)

    # Rows at positions >= offset in the combined data come from X_estimar.
    offset = X_modelar.shape[0]
    classes = get_categories_list()
    col_names = ['coords_' + classes[i] for i in range(n_classes)]

    # One batched query instead of one KD-tree query per row.
    _, all_neighbours = spatialTree.query(points, k=K)

    cont = []
    for neighbours in all_neighbours:
        context_row = np.zeros(n_classes)

        for j in range(1, K):
            neighbour = neighbours[j]
            # For a labelled neighbour add 1 to the counter of its class;
            # for a neighbour from X_estimar add its predicted probabilities.
            if neighbour < offset:
                context_row[int(y_modelar.loc[neighbour, 'CLASS'])] += 1
            else:
                context_row = np.add(context_row,
                                     pred_estimar[neighbour - offset, :])

        cont.append(context_row)  # without softmax
        #cont.append(softmax(np.array(context_row)))  # with softmax

    context = pd.DataFrame(data=cont, columns=col_names)
    context_modelar = context.loc[:offset-1]

    # Renumber the estimar part from 0 so it lines up with X_estimar,
    # whatever its number of rows.
    context_estimar = context.loc[offset:].reset_index(drop=True)
    violin_plot_kdtree(context_modelar, y_modelar)

    #context.drop('coords_RESIDENTIAL',axis=1,inplace=True)  # try with and without

    for column in col_names:
        X_modelar[column] = context_modelar[column]
        X_estimar[column] = context_estimar[column]

    #return X_modelar.values, X_estimar.values, est_IDs
    return X_modelar, X_estimar, est_IDs

예제 #45

0

파일 보기

파일: Precision_Recall.py 프로젝트: anurita/IR-Project

# Hold out 40% of the data for the final report and the ROC curve.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)



#clf = svm.SVC(gamma=0.001, C=100.)
clf = OneVsRestClassifier(LogisticRegression())
# clf = RandomForestClassifier(n_estimators=100)

clf.fit(X_train, y_train)
# Cross-validation runs on the full data set with shuffled folds.
kf = KFold(n_splits=3, shuffle=True)
scores = cross_val_score(clf, X, y, cv=kf)
# Report the fold count actually used instead of a hard-coded "10".
print("Accuracy for {} fold CV".format(kf.get_n_splits()), scores)
print("Average accuracy: ", numpy.mean(scores))
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred, target_names=le.classes_))

# Probability of the class in column 1, used as the ROC score.
# NOTE(review): this presumably assumes a binary target -- confirm.
preds = clf.predict_proba(X_test)[:,1]
fpr, tpr, _ = metrics.roc_curve(y_test, preds)

df = pd.DataFrame(dict(fpr=fpr, tpr=tpr))
#df.to_csv("/Users/shilpagundrathi/Downloads/RandomForest.csv")
auc = metrics.auc(fpr,tpr)
print(auc)

# g=  ggplot(df, aes(x='fpr', y='tpr')) +\
#     geom_abline(linetype='dashed')
# ggplot.ggsave(plot = g, filename = "new_test_file")

#print (ggplot(df, aes(x='fpr', y='tpr')) + \
    #eom_line(color='black') )

예제 #46

0

파일 보기

파일: creditRisk_validate.py 프로젝트: sonalia2/PythonCodes

from sklearn import datasets
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC

# One-vs-rest bagging ensemble of linear SVCs, trained on the scaled features.
svc=OneVsRestClassifier(BaggingClassifier(SVC(kernel='linear',probability=True)))
svc.fit(xTrainS,yTrain)
Y_pred_SVM=svc.predict(xTestS)
from sklearn.metrics import confusion_matrix
confusion_matrix(yTest,Y_pred_SVM)
#Accuracy:-(58+289)/367----94%
#ROC curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
# The model was fitted on the scaled features (xTrainS), so it must also be
# evaluated on the scaled test features (xTestS), not on the raw xTest.
svm_roc_auc=roc_auc_score(yTest,Y_pred_SVM)
svm_roc_auc
fpr,tpr,thresholds=roc_curve(yTest,svc.predict_proba(xTestS)[:,1])
# The "auc 48%" previously noted here was measured on the unscaled xTest.



### Logistic regression (fitted and evaluated on the unscaled features)
from sklearn.linear_model import LogisticRegression
logmodel=LogisticRegression()
logmodel.fit(xTrain,yTrain)

#prediction
Y_pred_LG=logmodel.predict(xTest)

#confusion matrix
from sklearn.metrics import confusion_matrix 
confusion_matrix(yTest,Y_pred_LG)

예제 #47

0

파일 보기

    def train(self, X, y, test_ratio=0.2):
        """Shuffle the data, fit the configured classifier and report accuracy.

        The first ``1 - test_ratio`` fraction of the shuffled rows is used for
        fitting, the remainder for evaluation. Which model is built depends on
        ``self.classifier_type``:

        * ``"multiclass"``: one-vs-rest logistic regression on one-hot labels.
        * ``"logistic"``: a single L2 logistic regression (``sag`` solver).
        * ``"mlp"``: an ``MLPClassifier`` with hidden layers (100, 50, 20, 5).
        * ``"multiclass_logistic"``: a three-stage stack (one-vs-rest logistic
          regression -> small MLP -> logistic regression).

        Parameters:
            X: Feature array; must support ``X.shape[0]`` and indexing with a
                permutation array.
            y: Label array aligned with ``X`` and indexable the same way.
            test_ratio: Fraction of rows held out for evaluation.

        Returns:
            ``(mean, 2 * std)`` of the 5-fold cross-validation scores computed
            on the held-out rows, or ``None`` for ``"multiclass_logistic"``,
            which only prints its accuracy and plots a confusion matrix.
        """
        print("\tShuffling arrays...")
        order = np.random.permutation(X.shape[0])
        X, y = X[order], y[order]

        print("\tTraining classifier...")
        train_instances = int((1 - test_ratio) * X.shape[0])
        X_fit, y_fit = X[:train_instances], y[:train_instances]
        X_eval, y_eval = X[train_instances:], y[train_instances:]

        if self.classifier_type == "multiclass_logistic":
            y_hot = make_one_hot(y)
            layer1 = OneVsRestClassifier(LogisticRegression(), n_jobs=-1).fit(
                X_fit, y_hot[:train_instances])
            layer1_output = layer1.predict_proba(X_fit)

            layer2 = MLPClassifier(hidden_layer_sizes=(3, 3)).fit(
                layer1_output, y_fit)
            # layer2 was fitted on layer-1 probabilities, so it has to be fed
            # those same probabilities here (not the raw features), exactly as
            # the evaluation path below does.
            layer2_output = layer2.predict_proba(layer1_output)

            output_layer = LogisticRegression().fit(layer2_output, y_fit)

            # Evaluate the stacked model on the held-out rows.
            l1_pred = layer1.predict_proba(X_eval)
            l2_pred = layer2.predict_proba(l1_pred)
            o_pred = output_layer.predict(l2_pred)

            # The held-out labels are taken from y: no separate integer label
            # array exists in this method.
            num_correct = sum(
                1 for predicted, actual in zip(o_pred, y_eval)
                if predicted == actual)

            # Accuracy is relative to the number of held-out predictions.
            print("Accuracy %0.1f%%" %
                  (100.0 * float(num_correct) / float(len(o_pred))))

            print("Plotting confusion matrix...")
            plot_confusion_matrix(y_eval,
                                  o_pred,
                                  self.class_names,
                                  train_instances,
                                  normalize=True)
            return

        # train on the training samples (as many cpus as avail.)
        if self.classifier_type == "multiclass":
            y_hot = make_one_hot(y)
            self.model = OneVsRestClassifier(LogisticRegression(),
                                             n_jobs=-1).fit(
                                                 X_fit,
                                                 y_hot[:train_instances])
            eval_targets = y_hot[train_instances:]
        else:
            if self.classifier_type == "logistic":
                self.model = LogisticRegression(penalty='l2',
                                                solver='sag').fit(X_fit, y_fit)
            elif self.classifier_type == "mlp":
                self.model = MLPClassifier(
                    hidden_layer_sizes=(100, 50, 20, 5)).fit(X_fit, y_fit)
            eval_targets = y_eval

        # cross_val_score refits clones of self.model on folds of the held-out
        # rows; the instance fitted above is left untouched.
        self.scores = cross_val_score(self.model, X_eval, eval_targets, cv=5)

        # score on the testing samples
        print("Accuracy: %0.1f%% (+/- %0.1f%%)" %
              (100 * self.scores.mean(), 100 * self.scores.std() * 2))
        # Identity comparison: "!= None" misbehaves when class_names is an array.
        if self.class_names is not None and self.classifier_type != "multiclass":
            print("Plotting confusion matrix...")
            y_pred = self.model.predict(X_eval)
            plot_confusion_matrix(y_eval,
                                  y_pred,
                                  self.class_names,
                                  train_instances,
                                  normalize=True)

        return self.scores.mean(), self.scores.std() * 2

예제 #48

0

파일 보기

# Fraction of test rows whose earlier prediction (y_pred_rt) matches y_test.
accuracy = (y_pred_rt == y_test).sum() / y_test.shape[0]

n_estimator = 100
# Supervised transformation based on random forests
sc_X = StandardScaler()
sc_X.fit(X_train)
# NOTE(review): the scaler fitted above is sc_X, but the transform is called on
# `sc` with `x_train`; neither name is defined in this excerpt - confirm whether
# sc_X / X_train were intended. X_train_std is not used further in this excerpt.
X_train_std = sc.transform(x_train)

rf = RandomForestClassifier(max_depth=3, n_estimators=n_estimator)
rf_enc = OneHotEncoder()
rf_lm = OneVsRestClassifier(LogisticRegression())
rf.fit(X_train, y_train_2)
# rf.apply returns the leaf index reached in every tree; those indices are
# one-hot encoded and used as features for the linear model.
rf_enc.fit(rf.apply(X_train))
# NOTE(review): X_train_lr / y_train_lr come from outside this excerpt -
# presumably a separate split reserved for the linear model; verify.
rf_lm.fit(rf_enc.transform(rf.apply(X_train_lr)), y_train_lr)

y_pred_rf_lm = rf_lm.predict_proba(rf_enc.transform(rf.apply(x_test)))
# Pick the most probable column per row.
# NOTE(review): comparing the column index with y_test assumes the labels are
# 0..k-1 in the same order as the probability columns - confirm.
y_pred_rf_lm = np.argmax(y_pred_rf_lm, axis=1)
accuracy_rf_lm = (y_pred_rf_lm == y_test).sum() / y_test.shape[0]

from sklearn.base import BaseEstimator
from sklearn.base import ClassifierMixin
from sklearn.preprocessing import LabelEncoder
from sklearn.externals import six
from sklearn.base import clone
from sklearn.pipeline import _name_estimators
import numpy as np
import operator


class MajorityVoteClassifier(BaseEstimator, ClassifierMixin):
    """ A majority vote ensemble classifier

예제 #49

0

파일 보기

def predict_multiclass(X_train,
                       y_train,
                       X_test,
                       y_test,
                       graphTitle="",
                       max_depth=12,
                       n_estimators=140,
                       plot=True,
                       weight=20):
    """Fit a one-vs-rest random forest on three classes and plot its ROC curves.

    Labels are binarized against the fixed class list ``[0, 1, 2]``. Per-class,
    micro-averaged and macro-averaged ROC curves are computed from the
    predicted probabilities on ``X_test``.

    Parameters
    ----------
    X_train, y_train : training features and labels (labels in {0, 1, 2}).
    X_test, y_test : evaluation features and labels (labels in {0, 1, 2}).
    graphTitle : title of the figure; when empty, the generic default title
        is used.
    max_depth, n_estimators : forwarded to ``RandomForestClassifier``.
    plot : when false, the ROC figure is not drawn.
    weight : accepted for backward compatibility; it has no effect on the
        model (the original code only built an unused dict from it).

    Returns
    -------
    An empty tuple, as before.
    """
    n_classes = 3
    class_labels = list(range(n_classes))

    # Binarize the labels so each column is one one-vs-rest target.
    y_train = label_binarize(y_train, classes=class_labels)
    y_test = label_binarize(y_test, classes=class_labels)

    m = OneVsRestClassifier(
        RandomForestClassifier(max_depth=max_depth,
                               random_state=0,
                               n_estimators=n_estimators))
    y_score = m.fit(X_train, y_train).predict_proba(X_test)

    # Compute ROC curve and area under the curve for every class
    fpr = {}
    tpr = {}
    roc_auc = {}
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    # Compute micro-average ROC curve and ROC area
    fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

    # First aggregate all false positive rates
    all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))

    # Then interpolate all ROC curves at these points. np.interp is the
    # function the removed SciPy alias `interp` pointed to.
    mean_tpr = np.zeros_like(all_fpr)
    for i in range(n_classes):
        mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])

    # Finally average it and compute AUC
    mean_tpr /= n_classes

    fpr["macro"] = all_fpr
    tpr["macro"] = mean_tpr
    roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

    if plot:
        # Plot all ROC curves
        plt.figure()
        plt.plot(fpr["micro"],
                 tpr["micro"],
                 label='micro-average ROC curve (area = {0:0.2f})'
                 ''.format(roc_auc["micro"]),
                 color='deeppink',
                 linestyle=':',
                 linewidth=4)

        plt.plot(fpr["macro"],
                 tpr["macro"],
                 label='macro-average ROC curve (area = {0:0.2f})'
                 ''.format(roc_auc["macro"]),
                 color='navy',
                 linestyle=':',
                 linewidth=4)

        colors = cycle(['aqua', 'darkorange', 'cornflowerblue'])
        for i, color in zip(range(n_classes), colors):
            plt.plot(fpr[i],
                     tpr[i],
                     color=color,
                     label='ROC curve of class {0} (area = {1:0.2f})'
                     ''.format(i, roc_auc[i]))

        # Chance diagonal for reference.
        plt.plot([0, 1], [0, 1], 'k--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title(
            graphTitle or
            'Some extension of Receiver operating characteristic to multi-class')
        plt.legend(loc="lower right")
        plt.show()

    return ()

예제 #50

0

파일 보기

def _load_cv_split(path):
    """Read one CV split CSV once and return (feature columns, label column)."""
    frame = pd.read_csv(path)
    # Features start at column 9; the class label is column 4.
    return frame.iloc[:, 9:], frame.iloc[:, 4]


def _fit_ovr_svc(gamma, C, x, y):
    """Fit the one-vs-rest RBF SVC used throughout the search with the given gamma and C."""
    clf = OneVsRestClassifier(
        SVC(
            kernel='rbf',
            gamma=gamma,
            C=C,
            degree=1,
            cache_size=5000,
            probability=True,
            class_weight='balanced'))
    clf.fit(x, y)
    return clf


def _score_ovr(clf, x, y_true):
    """Return (accuracy, weighted F1, micro-averaged AUC) of clf on (x, y_true)."""
    y_hat = clf.predict(x)
    accuracy = accuracy_score(y_true, y_hat)
    f1 = f1_score(y_true, y_hat, average="weighted")
    # One-hot encode the six classes so a micro-averaged AUC can be computed
    # from the per-class decision values.
    y_binary = label_binarize(y_true, classes=list(range(6)))
    auc_value = roc_auc_score(y_binary,
                              clf.decision_function(x),
                              average='micro')
    return accuracy, f1, auc_value


def svm(i):
    """Tune and evaluate a one-vs-rest RBF SVC on cross-validation split ``i``.

    Gamma is selected first (with C=1), then C (with the best gamma), both by
    validation accuracy. The final model is refitted on the training split,
    scored on the test split, and its class probabilities and the encoded
    ground truth are written to CSV files.

    Parameters:
        i: Identifier of the cross-validation split; it selects the
            ``Train_CV_{i}`` / ``Validation_CV_{i}`` / ``Test_CV_{i}`` files.

    Returns:
        ``[test_accuracy, test_f1, test_auc]``, each rounded to 4 decimals.
    """
    feature_dir = './CV_Features_631/ClassificationFeatures'
    train_x, train_y = _load_cv_split(f'{feature_dir}/Train_CV_{i}.csv')
    # The validation labels are read from the same file as the validation
    # features (the label path used to be corrupted and pointed nowhere).
    validation_x, validation_y = _load_cv_split(
        f'{feature_dir}/Validation_CV_{i}.csv')
    test_x, test_y = _load_cv_split(f'{feature_dir}/Test_CV_{i}.csv')

    # Fit the LabelEncoder on the training labels, then encode every split
    # with it. transform() already returns flat 1-D arrays.
    encoder = LabelEncoder().fit(train_y)
    y_train = encoder.transform(train_y)
    y_valid = encoder.transform(validation_y)
    y_test = encoder.transform(test_y)

    # Standardize X: statistics come from the training split only and are
    # reused for validation/test, so all splits share one feature scale and
    # no held-out information leaks into preprocessing.
    scaler = StandardScaler()
    x_train_std = scaler.fit_transform(train_x)
    x_valid_std = scaler.transform(validation_x)
    x_test_std = scaler.transform(test_x)

    # ------------
    # Gamma
    # ------------
    accuracy_list_valid, f1_list_valid, auc_list_valid = [], [], []
    gamma_range = np.logspace(-10, 1, 10, base=2)
    logger.info(gamma_range)
    for idx, gamma in enumerate(tqdm(gamma_range)):
        # ------------
        # Training
        # ------------
        time0 = time()
        logger.info(
            f">>>>>>>CV = {i}/10, Start Trainng {idx + 1}/{len(gamma_range)}>>>>>>>"
        )
        print(
            f">>>>>>> CV = {i}/10, Start Training {idx + 1}/{len(gamma_range)}>>>>>>>"
        )
        clf = _fit_ovr_svc(gamma, 1, x_train_std, y_train)
        # ------------
        # Validation: Fine-tuning on Validation dataset
        # ------------
        accuracy_valid, f1_valid, auc_valid = _score_ovr(
            clf, x_valid_std, y_valid)
        accuracy_list_valid.append(accuracy_valid)
        f1_list_valid.append(f1_valid)
        auc_list_valid.append(auc_valid)
        # Logger
        logger.info(
            f"Validation Gamma >>> Acc. = {accuracy_valid}, F1-Score = {f1_valid}, AUC = {auc_valid}"
        )
        print(
            f"Validation Gamma >>> Acc. = {accuracy_valid}, F1-Score = {f1_valid}, AUC = {auc_valid}"
        )
        print(
            datetime.datetime.fromtimestamp(time() -
                                            time0).strftime("%M:%S:%f"))

    # First index with the highest validation accuracy wins.
    best_gamma_idx = accuracy_list_valid.index(max(accuracy_list_valid))
    best_gamma = gamma_range[best_gamma_idx]
    best_acc = accuracy_list_valid[best_gamma_idx]
    best_f1 = f1_list_valid[best_gamma_idx]
    best_auc = auc_list_valid[best_gamma_idx]
    print(
        f"Validation >>> Best gamma = {best_gamma}, Acc. ={best_acc}, F1-Score = {best_f1}, AUC = {best_auc}\n"
    )
    logger.info(
        f"Validation >>> Best gamma = {best_gamma}, Acc. ={best_acc}, F1-Score = {best_f1}, AUC = {best_auc}"
    )

    # ------------
    # C
    # ------------
    C = [1, 3, 5, 7, 9, 11, 13, 15, 17, 19]
    accuracy_list_C_valid, f1_list_C_valid, auc_list_C_valid = [], [], []
    for idx, c in enumerate(tqdm(C)):
        time0 = time()
        logger.info(
            f">>>>>>>CV = {i}/10, Fine-Tuining C, Start Trainng {idx + 1}/{len(C)}>>>>>>>"
        )
        print(
            f">>>>>>> CV = {i}/10, Fine-Tuining C, Start Training {idx + 1}/{len(C)}>>>>>>>"
        )
        clf = _fit_ovr_svc(best_gamma, c, x_train_std, y_train)
        # ------------
        # Validation: Fine-tuning on Validation dataset
        # ------------
        accuracy_valid, f1_valid, auc_valid = _score_ovr(
            clf, x_valid_std, y_valid)
        accuracy_list_C_valid.append(accuracy_valid)
        f1_list_C_valid.append(f1_valid)
        auc_list_C_valid.append(auc_valid)
        # Logger
        logger.info(
            f"Validation C >>> Acc. = {accuracy_valid}, F1-Score = {f1_valid}, AUC = {auc_valid}"
        )
        print(
            f"Validation C >>> Acc. = {accuracy_valid}, F1-Score = {f1_valid}, AUC = {auc_valid}"
        )
        print(
            datetime.datetime.fromtimestamp(time() -
                                            time0).strftime("%M:%S:%f"))

    # Report the metrics of the chosen (gamma, C) pair; F1/AUC used to be
    # taken from the gamma search and did not belong to the selected C.
    best_c_idx = accuracy_list_C_valid.index(max(accuracy_list_C_valid))
    best_c = C[best_c_idx]
    best_acc = accuracy_list_C_valid[best_c_idx]
    best_f1 = f1_list_C_valid[best_c_idx]
    best_auc = auc_list_C_valid[best_c_idx]
    print(
        f"Validation >>> Best gamma = {best_gamma}, Best C = {best_c}, Acc. ={best_acc}, F1-Score = {best_f1}, AUC = {best_auc}\n"
    )
    logger.info(
        f"Validation >>> Best gamma = {best_gamma}, Best C = {best_c}, Acc. ={best_acc}, F1-Score = {best_f1}, AUC = {best_auc}"
    )

    # ------------
    # Test: Test on Test dataset with best gamma and best C
    # ------------
    clf_best_test = _fit_ovr_svc(best_gamma, best_c, x_train_std, y_train)
    # accuracy & F1 & AUC on Test dataset
    test_accuracy, test_f1, test_auc = (
        round(value, 4)
        for value in _score_ovr(clf_best_test, x_test_std, y_test))
    print(
        f"CV = {i}, Test >>> gamma = {best_gamma}, Acc. ={test_accuracy}, F1-Score = {test_f1}, AUC = {test_auc}"
    )
    logger.info(
        f"CV = {i}, Test >>> gamma = {best_gamma}, Acc. ={test_accuracy}, F1-Score = {test_f1}, AUC = {test_auc}"
    )

    # save the predicted class probabilities and the encoded ground truth
    result_test = clf_best_test.predict_proba(x_test_std)
    df = pd.DataFrame(result_test)
    df.to_csv(
        f"./Prediction_202106_Ratio631/categorical_vggish_6pnn_20210621_prediction_CV{i}_Gamma_{round(best_gamma,4)}_C_{round(best_c)}_ACC_{test_accuracy}_F1_{test_f1}_AUC_{test_auc}.csv"
    )
    df2 = pd.DataFrame(y_test)
    df2.to_csv(
        f"./Prediction_202106_Ratio631/categorical_vggish_6pnn_20210324_GT_CV{i}.csv"
    )
    print(f">>>>>>> CV = {i}/10, Over Training >>>>>>>\n")
    logger.info(f">>>>>>> CV = {i}/10,Over Training >>>>>>>")
    return [test_accuracy, test_f1, test_auc]

예제 #51

0

파일 보기

파일: onevsrest_multiclassifier.py 프로젝트: joseph5wu/255-assignment2

import prepare_data as prepare
import evaluate
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier

# Load the three data splits plus the shared user table from the project helper.
train_data, validation_data, test_data, basic_users_info = prepare.get_data()
# The same dict object is handed to every prepare.* call below.
# NOTE(review): presumably the helpers fill/reuse it so all splits share one
# label encoding - verify in prepare_data.
label_encoder = {}
train_x, train_y = prepare.get_exclude_ndf_x(train_data, basic_users_info, label_encoder)
validation_x, validation_y = prepare.get_exclude_ndf_x(validation_data, basic_users_info, label_encoder)

# One-vs-rest ensemble: one 11-tree random forest per class.
rf = OneVsRestClassifier(RandomForestClassifier(n_estimators=11, criterion='gini')).fit(train_x, train_y)
validation_predict = rf.predict(validation_x)
validation_predict_proba = rf.predict_proba(validation_x)
# print validation_predict_proba
# classes_ gives the label order of the predict_proba columns.
class_order = rf.classes_

# Turn the per-class probabilities into candidate class lists and score them.
predict_list = evaluate.candidate_classes(validation_predict_proba, class_order)

#print predict_list
ndcg = evaluate.ndcg(predict_list, validation_data)
print(ndcg)

# Repeat the probability -> candidate list step on the test split and hand
# the result to the project helper.
test_x = prepare.get_exclude_ndf_test_x(test_data, basic_users_info, label_encoder)
test_predict = rf.predict_proba(test_x)
test_predict_list = evaluate.candidate_classes(test_predict, class_order)
prepare.get_test_predict(test_data, test_predict_list)

예제 #52

0

파일 보기

파일: Creating_a_simple_first_model.py 프로젝트: akgarhwal/DataCamp-Data-Scientist-Track-Python

# Print the accuracy
# (clf here is whatever classifier was fitted before this excerpt.)
print("Accuracy: {}".format(clf.score(X_test, y_test)))

#3
# Instantiate the classifier: clf
clf = OneVsRestClassifier(LogisticRegression())

# Fit it to the training data
clf.fit(X_train, y_train)

# Load the holdout data: holdout
holdout = pd.read_csv('HoldoutData.csv', index_col=0)

# Generate predictions: predictions
# Missing numeric values are replaced with the sentinel -1000 before predicting.
predictions = clf.predict_proba(holdout[NUMERIC_COLUMNS].fillna(-1000))

#4
# Generate predictions: predictions
# (Same call as in step 3 above; the result is simply recomputed.)
predictions = clf.predict_proba(holdout[NUMERIC_COLUMNS].fillna(-1000))

# Format predictions in DataFrame: prediction_df
# Columns are named after the dummy-encoded label columns of df, rows keep
# the holdout index.
prediction_df = pd.DataFrame(columns=pd.get_dummies(df[LABELS]).columns,
                             index=holdout.index,
                             data=predictions)

# Save prediction_df to csv
prediction_df.to_csv('predictions.csv')

# Submit the predictions for scoring: score
score = score_submission(pred_path='predictions.csv')

예제 #53

0

파일 보기

파일: fbpath_train_logistic.py 프로젝트: AmitShah/yodaqa

    # class_weight='auto' produces reduced performance, val mrr 0.574 -> 0.527
    # (see the notebook)
    # We use L1 regularization mainly to minimize the output model size,
    # though it seems to yield better precision+recall too.
    # time.clock() was removed in Python 3.8; time.time() is available on
    # every interpreter version and measures the wall-clock duration, which is
    # what the message below reports.
    t_start = time.time()
    cfier = OneVsRestClassifier(LogisticRegression(penalty='l1'), n_jobs=4)
    cfier.fit(traindata.X, traindata.Y)
    t_end = time.time()
    print('// training took %d seconds' % (t_end-t_start,))
    sys.stdout.flush()


    ## Benchmarking

    # The validation set is vectorized with the dictionaries of the training
    # data so both share the same feature and label indices.
    with open(valfile, 'r') as f:
        valdata = VectorizedData(json.load(f), traindata.Xdict, traindata.Ydict)
    print('// valdata: %d questions' % (np.size(valdata.X, axis=0),))
    sys.stdout.flush()

    # The scoring helper receives the classifier and a callable that produces
    # per-label probabilities for a feature matrix.
    val_score = valdata.cfier_score(cfier, lambda cfier, X: cfier.predict_proba(X))
    print('// val sklScore %.3f, qRecallAny %.3f, qRecallAll %.3f, pathPrec %.3f, [qScoreMRR %.3f]' % (
          val_score['sklScore'],
          val_score['qRecallAny'], val_score['qRecallAll'], val_score['pPrec'],
          val_score['qScoreMRR']))
    sys.stdout.flush()


    ## Data Dump

    dump_cfier(cfier, traindata.Xdict, traindata.Ydict)

예제 #54

0

파일 보기

    #                   )
    # grid = GridSearchCV(clf_pipeline, param_grid, scoring='f1_samples')
    # grid.fit(x2, y3)

    # Hold out 10% of the rows for evaluation; the fixed seed keeps the split
    # reproducible.
    X_train, X_test, y_train, y_test = train_test_split(x2,
                                                        y3,
                                                        test_size=0.1,
                                                        random_state=42)

    #.fit(X_train, y_train)

    # Wrap the base estimator `clf` (built before this excerpt) so that one
    # copy is fitted per label column.
    model = OneVsRestClassifier(clf)
    model.fit(X_train, y_train)

    # parameters = {'n_estimators':[200, 300, 400], 'min_samples_split':[4, 6, 8, 10], 'min_samples_leaf':[4,6,8]}
    #
    # gscv=GridSearchCV(model, param_grid, scoring="f1_samples")
    # gscv.fit(X_train,y_train)
    #

    predictions = model.predict(X_test)
    obs = y_test
    predprobs = model.predict_proba(X_test)

    # NOTE(review): the four metric results below are computed but neither
    # stored nor printed - confirm whether they were meant to be reported.
    sklearn.metrics.hamming_loss(y_test, predictions)
    sklearn.metrics.f1_score(y_test, predictions, average="samples")
    # NOTE(review): precision/recall are called with the default `average`,
    # unlike f1_score above; if y_test is a multilabel indicator matrix an
    # explicit average is presumably required - verify.
    sklearn.metrics.precision_score(y_test, predictions)
    sklearn.metrics.recall_score(y_test, predictions)

    # Project helper that derives AUC values from the predicted probabilities
    # and the observed labels.
    aucs = aucvals(predprobs, obs)

예제 #55

0

파일 보기

파일: multilabel_RF.py 프로젝트: taipeifx/bootcamp007_project


# NOTE: this script uses Python 2 print statements.
# Join the current test rows with the previous-period rows per customer id;
# overlapping column names from the right side get the '_pre' suffix.
test_X = test_df.set_index('ncodpers').join(test_pre_df.set_index('ncodpers'), rsuffix = '_pre')
# Replace missing product lists with an empty list so the encoder can transform them.
test_X.products.loc[test_X.products.isnull()] = test_X.products.loc[test_X.products.isnull()].apply(lambda x: [])
test_X.fillna(0, inplace = True)
# Indicator matrix built from the joined table's 'products' column.
# NOTE(review): presumably the products held in the previous period - confirm.
test_pre_y = multilabel_encoder.transform(test_X['products'])
test_X.drop('products', axis = 1, inplace = True)
test_X.reset_index(drop = True,inplace = True)

#test_X = test_X.rename(columns = {'products':'pre_products'})
print "Random Forest model:"
forest = RandomForestClassifier(n_estimators=250, random_state=1, verbose = 1, criterion='entropy')
# One forest per product label, fitted in parallel on all cores.
multi_label_forest = OneVsRestClassifier(forest, n_jobs=-1)
multi_label_forest.fit(train_X, train_y)
print "Predicting:"
preds = multi_label_forest.predict_proba(test_X)
# Subtracting the indicator matrix lowers the score of every product it marks.
new_preds = preds - test_pre_y

# Column indices of the 7 highest-scoring products per row, best first.
new_preds = np.argsort(new_preds, axis=1)
new_preds = np.fliplr(new_preds)[:,:7]
if test_month == 18:
	# Write one space-separated product list per customer to output_file.
	final_preds = [' '.join([target_cols[pred] for pred in new_pred]) for new_pred in new_preds]
	out_df = pd.DataFrame({'ncodpers':test_id, 'added_products':final_preds})
	out_df.to_csv(output_file, columns = ['ncodpers', 'added_products'], index=False)
else:
	print "Scoring..."
	test_preds = [[target_cols[pred] for pred in new_pred] for new_pred in new_preds]
	# True where a product is set in test_y but not in test_pre_y.
	truth_list = np.array((test_y - test_pre_y)) ==1
	# NOTE(review): the names are joined with '' and then split on whitespace,
	# so adjacent names only separate if target_cols entries carry their own
	# whitespace - verify.
	truth_list = [''.join([target_cols[i]  if i else '' for i in truth]).split() for truth in truth_list]
	print mapk(truth_list, test_preds, 7)

예제 #56

0

파일 보기

x = X.values
# Stratified 80/20 split so both parts keep the class proportions of y.
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.2,
                                                    stratify=y)

from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

# Fit and train model using training set
model = OneVsRestClassifier(
    LogisticRegression(solver='liblinear',
                       multi_class='ovr'))  # 'lbfgs' , 'liblinear'
model.fit(x_train, y_train)
# Predict using trained model
y_prob = model.predict_proba(x_test)

# Pickle model. The context manager closes the file even when dump() raises,
# so no half-written handle is left open.
filename = 'model.pkl'
with open(filename, 'wb') as outfile:
    pickle.dump(model, outfile)

# Map labels based on probability: each row gets the label whose column has
# the highest predicted probability.
labels = Y_encode.columns
y_pred = [labels[row.argmax()] for row in y_prob]
y_true = []

예제 #57

0

파일 보기

plt.ylim([0.0, 1.05])
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Some extension of Receiver operating characteristic to multi-class")
plt.legend(loc="lower right")
plt.show()

# %%
# Area under ROC for the multiclass problem
# .........................................
# The :func:`sklearn.metrics.roc_auc_score` function can be used for
# multi-class classification. The multi-class One-vs-One scheme compares every
# unique pairwise combination of classes. In this section, we calculate the AUC
# using the OvR and OvO schemes. We report a macro average, and a
# prevalence-weighted average.
y_prob = classifier.predict_proba(X_test)

macro_roc_auc_ovo = roc_auc_score(y_test,
                                  y_prob,
                                  multi_class="ovo",
                                  average="macro")
weighted_roc_auc_ovo = roc_auc_score(y_test,
                                     y_prob,
                                     multi_class="ovo",
                                     average="weighted")
macro_roc_auc_ovr = roc_auc_score(y_test,
                                  y_prob,
                                  multi_class="ovr",
                                  average="macro")
weighted_roc_auc_ovr = roc_auc_score(y_test,
                                     y_prob,

예제 #58

0

파일 보기

def plot_roc_curve(clf, X_train, y_train, X_test, y_test):
    """
    Plot one ROC curve per class after fitting the estimator one-vs-rest.

    Parameters
    ----------
    clf : object
        Base classifier; it is wrapped in a OneVsRestClassifier.

    X_train : array or DataFrame
        Training data

    y_train : array or DataFrame
        Labels of the training data

    X_test : array or DataFrame
        Test data

    y_test : array or DataFrame
        Labels of the test data

    Raises
    ------
    KeyError
        If y_test contains a label that was not present in y_train.

    """
    # Classes to plot are the ones that occur in the test labels
    classes = np.unique(y_test)
    n_classes = len(classes)
    # Binarize the test labels: one indicator column per class
    y_test_bin = label_binarize(y_test, classes=classes)
    if n_classes == 2:
        # For two classes label_binarize returns a single column (the
        # indicator of classes[1]); expand it so every class has a column.
        y_test_bin = np.hstack((1 - y_test_bin, y_test_bin))

    # Build the one-vs-rest model
    classifier = OneVsRestClassifier(clf)
    # Train the model
    classifier.fit(X_train, y_train)
    # Predict class probabilities for the test data
    y_score = classifier.predict_proba(X_test)
    # predict_proba columns follow classifier.classes_ (learned from y_train),
    # which can differ from the classes found in y_test; look each score
    # column up by label instead of assuming the same position.
    score_column = {label: idx for idx, label in enumerate(classifier.classes_)}

    # Plotting and estimation of FPR, TPR
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    for i, label in enumerate(classes):
        fpr[i], tpr[i], _ = roc_curve(y_test_bin[:, i],
                                      y_score[:, score_column[label]])
        roc_auc[i] = auc(fpr[i], tpr[i])
    #
    colors = cycle(['blue', 'red', 'green', 'cyan', 'gold'])
    for i, color in zip(range(n_classes), colors):
        plt.plot(fpr[i],
                 tpr[i],
                 color=color,
                 lw=1.5,
                 label='ROC curve of class {0} (area = {1:0.3f})'
                 ''.format(classes[i], roc_auc[i]))

    # Chance diagonal for reference
    plt.plot([0, 1], [0, 1], 'k-', lw=1.5)
    plt.xlim([-0.05, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic for multi-class data')
    plt.legend(loc="lower right")
    plt.show()

예제 #59

0

파일 보기

파일: classifiers.py 프로젝트: MinisterioPublicoRJ/apimpmapas

class OneVsRestLogisticRegression:
    """Implements a Logistic Regression for a multilabel problem
    using a OneVsRest approach.
    Attributes:
        model_: The model to be fit and used for prediction.
        negative_column_index_: The index of the column indicating the absence
            of other classes, if it exists.
    """

    def __init__(self, negative_column_index=None, **kwargs):
        """The constructor for the OneVsRestLogisticRegression class.
        Parameters:
            negative_column_index: The index of the column indicating the
                absence of other classes. None by default.
            kwargs: Arguments to be passed to the LogisticRegression class.
        """
        self.model_ = OneVsRestClassifier(LogisticRegression(**kwargs))
        self.negative_column_index_ = negative_column_index

    def fit(self, X, y):
        """Fits the model to the data and its labels.
        If negative_column_index_ is not None, then the corresponding column
        in y is removed before fitting the model.
        Parameters:
            X: Numpy array with the features of the data to fit the model to.
            y: Numpy array with the one hot encoding of the labels of the data
                to fit the model to. The labels must be in an n x m matrix,
                where n is the number of data points, and m the number of
                classes.
        """
        # Compare against None explicitly: index 0 is a valid column position
        # but is falsy, so a plain truth test would silently ignore it.
        if self.negative_column_index_ is not None:
            y = np.delete(y, self.negative_column_index_, axis=1)
        self.model_.fit(X, y)

    def predict(self, X, threshold=0.7, max_classes=3):
        """Predicts the labels for a given dataset.
        If negative_column_index_ is not None, then the corresponding column
        is reinserted after prediction. The values of this column will be
        equal to 1 in the cases where no class has been predicted.
        Parameters:
            X: Numpy array with the data that will be used for prediction.
            threshold: Minimum probability a class needs to be predicted.
            max_classes: Maximum number of classes predicted per row; only
                the max_classes most probable ones are considered.
        Returns:
            p: Numpy array with the predictions.
        """
        prob = self.model_.predict_proba(X)
        # Column indices of the max_classes most probable classes per row
        # (reversed tail of the ascending argsort).
        top_columns = np.argsort(prob)[:, :-(max_classes+1):-1]
        # Mask that keeps only those columns; every other probability is
        # zeroed so it cannot pass the threshold.
        keep = np.zeros(prob.shape)
        for mask_row, columns in zip(keep, top_columns):
            mask_row[columns] = 1
        prob = keep*prob

        p = (prob >= threshold).astype(int)

        # Same explicit None check as in fit, so index 0 is honoured.
        if self.negative_column_index_ is not None:
            p = np.insert(
                p,
                self.negative_column_index_,
                values=(p.sum(axis=1) == 0).astype(int),
                axis=1)
        return p

예제 #60

0

파일 보기

파일: em_rvm.py 프로젝트: coursekevin/sklearn-rvm

class EMRVC(BaseRVM, ClassifierMixin):
    """Relevance Vector Classifier.

    Implementation of Mike Tipping's Relevance Vector Machine for
    classification using the scikit-learn API.

    The multiclass support is handled according to a one-vs-rest scheme.

    For details on the precise mathematical formulation of the provided
    kernel functions and how `gamma`, `coef0` and `degree` affect each
    other, see the corresponding section in the narrative documentation:
    :ref:`svm_kernels`.

    Parameters
    ----------
    n_iter_posterior : int, optional (default=50)
        Maximum number of Newton-CG iterations used to compute the posterior
        mode at each outer iteration.

    kernel : string, optional (default="rbf")
        Specifies the kernel type to be used in the algorithm.
        It must be one of "linear", "poly", "rbf", "sigmoid" or "precomputed".
        If none is given, "rbf" will be used.

    degree : int, optional (default=3)
        Degree of the polynomial kernel function ("poly"). Ignored by all other
        kernels.

    gamma : float or string, optional (default="auto_deprecated")
        Kernel coefficient for "rbf", "poly" and "sigmoid".

        "auto" uses 1 / n_features; if ``gamma="scale"`` is passed then it
        uses 1 / (n_features * X.var()) as value of gamma. The default,
        "auto_deprecated", behaves like "auto" but indicates that no explicit
        value of gamma was passed, in which case a FutureWarning may be
        emitted announcing the change of the default to "scale" in
        version 0.22.

    coef0 : float, optional (default=0.0)
        Independent term in kernel function. It is only significant in "poly"
        and "sigmoid".

    tol : float, optional (default=1e-3)
        Tolerance for stopping criterion.

    threshold_alpha : float, optional (default=1e9)
        Threshold for alpha selection criterion.

    beta_fixed : {"not_fixed"} or float, optional (default="not_fixed")
        Fixed value for beta. If "not_fixed" selected, the beta is updated at
        each iteration.

    alpha_max : float, optional (default=1e10)
        Upper bound applied to the alpha values at each iteration. Must be a
        positive and big number.

    init_alpha : array-like of shape (n_sample) or None, optional (default=None)
        Initial value for alpha. If None is selected, the initial value of
        alpha is defined by init_alpha = 1 / M ** 2.

    bias_used : boolean, optional (default=True)
        Specifies if a constant (a.k.a. bias) should be added to the decision
        function.

    max_iter : int, optional (default=5000)
        Hard limit on iterations within solver.

    compute_score : boolean, optional (default=False)
        Specifies if the objective function is computed at each step of the model.
        Not yet implemented for the classifier: enabling it makes `fit` raise
        an error.

    epsilon : float, optional (default=1e-08)
        Small positive constant added in the alpha and beta updates and inside
        the logarithms of the convergence check to keep them away from zero.

    verbose : boolean, optional (default=False)
        Enable verbose output.

    Attributes
    ----------
    relevance_ : array-like, shape (n_relevance)
        Indices of relevance vectors.

    relevance_vectors_ : array-like, shape (n_relevance, n_features)
        Relevance vectors (equivalent to X[relevance_]).

    alpha_ : array-like, shape (n_samples)
        Estimated alpha values.

    gamma_ : array-like, shape (n_samples)
        Estimated gamma values.

    Phi_ : array-like, shape (n_samples, n_features)
        Estimated phi values.

    Sigma_ : array-like, shape (n_samples, n_features)
        Estimated covariance matrix of the weights.

    mu_ : array-like, shape (n_relevance, n_features)
        Coefficients of the classifier (mean of posterior distribution)

    coef_ : array, shape (n_class * (n_class-1) / 2, n_features)
        Coefficients of the classifier (mean of posterior distribution).
        Weights assigned to the features. This is only available in the case
        of a linear kernel. `coef_` is a readonly property derived from `mu`
        and `relevance_vectors_`.

    See Also
    --------
    EMRVR
        Relevant Vector Machine for Regression.

    Notes
    -----
    **References:**
    `The relevance vector machine.
    <http://www.miketipping.com/sparsebayes.htm>`__
    """
    def __init__(self,
                 n_iter_posterior=50,
                 kernel="rbf",
                 degree=3,
                 gamma="auto_deprecated",
                 coef0=0.0,
                 tol=1e-3,
                 threshold_alpha=1e9,
                 beta_fixed="not_fixed",
                 alpha_max=1e10,
                 init_alpha=None,
                 bias_used=True,
                 max_iter=5000,
                 compute_score=False,
                 epsilon=1e-08,
                 verbose=False):
        """Store the classifier-specific setting and delegate the rest.

        ``n_iter_posterior`` is kept on this instance; every other argument
        is forwarded unchanged, by keyword, to the parent initializer.
        """
        self.n_iter_posterior = n_iter_posterior

        # Hyper-parameters handled by the parent class.
        shared_params = {
            "kernel": kernel,
            "degree": degree,
            "gamma": gamma,
            "coef0": coef0,
            "tol": tol,
            "threshold_alpha": threshold_alpha,
            "beta_fixed": beta_fixed,
            "alpha_max": alpha_max,
            "init_alpha": init_alpha,
            "bias_used": bias_used,
            "max_iter": max_iter,
            "compute_score": compute_score,
            "epsilon": epsilon,
            "verbose": verbose,
        }
        super().__init__(**shared_params)

    def _classify(self, mu, Phi_):
        """ Perform Sigmoid Classification."""
        return expit(np.dot(Phi_, mu))

    def _log_posterior(self, mu, alpha, Phi_, t):
        """ Calculate log posterior."""
        y = self._classify(mu, Phi_)

        log_p = -1 * (np.sum(np.log(y[t == 1]), 0) +
                      np.sum(np.log(1 - y[t == 0]), 0))
        log_p = log_p + 0.5 * np.dot(mu.T, np.dot(np.diag(alpha), mu))

        jacobian = np.dot(np.diag(alpha), mu) - np.dot(Phi_.T, (t - y))

        return log_p, jacobian

    def _compute_hessian(self, mu, alpha, Phi_, t):
        """ Perform the Inverse of Covariance."""
        y = self._classify(mu, Phi_)
        B = np.diag(y * (1 - y))
        return np.diag(alpha) + np.dot(Phi_.T, np.dot(B, Phi_))

    def _posterior(self):
        """ Calculate the posterior likelihood."""
        result = minimize(fun=self._log_posterior,
                          hess=self._compute_hessian,
                          x0=self.mu_,
                          args=(self.alpha_, self.Phi_, self.t),
                          method="Newton-CG",
                          jac=True,
                          options={"maxiter": self.n_iter_posterior})

        self.mu_ = result.x

        hessian = self._compute_hessian(self.mu_, self.alpha_, self.Phi_,
                                        self.t)

        # Calculate Sigma
        # Use Cholesky decomposition for efficiency
        # Ref: https://arxiv.org/abs/1111.4144
        chol_fail = False
        try:
            upper = scipy.linalg.cholesky(hessian)
        except linalg.LinAlgError:
            warnings.warn("Hessian not positive definite")
            chol_fail = True

        if chol_fail:
            try:
                self.Sigma_ = np.linalg.inv(hessian)
            except linalg.LinAlgError:
                warnings.warn("Using Pseudo-Inverse")
                self.Sigma_ = np.linalg.pinv(hessian)

        else:
            try:
                upper_inv = np.linalg.inv(upper)
            except linalg.LinAlgError:
                warnings.warn("Using Pseudo-Inverse")
                upper_inv = np.linalg.pinv(upper)

            self.Sigma_ = np.dot(upper_inv, upper_inv.conj().T)

    def fit(self, X, y):
        """Fit the RVC model according to the given training data.

        With exactly two classes the model is fitted directly; with more
        classes the work is delegated to a ``OneVsRestClassifier`` wrapping
        this estimator, stored in ``multi_``.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training vectors.

        y : array-like, shape (n_samples,)
            Target values.

        Returns
        -------
        self : object

        Raises
        ------
        ValueError
            If ``kernel="precomputed"`` and ``X`` is not square, or if ``y``
            contains fewer than two classes.
        NotImplementedError
            If ``compute_score`` is enabled (binary case).
        """
        X, y = check_X_y(X,
                         y,
                         y_numeric=True,
                         ensure_min_samples=2,
                         dtype="float64")

        if self.kernel == "precomputed" and X.shape[0] != X.shape[1]:
            raise ValueError("X.shape[0] should be equal to X.shape[1]")

        if self.gamma in ("scale", "auto_deprecated"):
            X_var = X.var()
            if self.gamma == "scale":
                if X_var != 0:
                    self._gamma = 1.0 / (X.shape[1] * X_var)
                else:
                    self._gamma = 1.0
            else:
                kernel_uses_gamma = (not callable(self.kernel) and self.kernel
                                     not in ("linear", "precomputed"))
                if kernel_uses_gamma and not np.isclose(X_var, 1.0):
                    # NOTE: when deprecation ends we need to remove explicitly
                    # setting `gamma` in examples (also in tests). See
                    # https://github.com/scikit-learn/scikit-learn/pull/10331
                    # for the examples/tests that need to be reverted.
                    warnings.warn(
                        "The default value of gamma will change "
                        "from 'auto' to 'scale' in version 0.22 to "
                        "account better for unscaled features. Set "
                        "gamma explicitly to 'auto' or 'scale' to "
                        "avoid this warning.", FutureWarning)
                self._gamma = 1.0 / X.shape[1]
        elif self.gamma == 'auto':
            self._gamma = 1.0 / X.shape[1]
        else:
            self._gamma = self.gamma

        self.classes_ = np.unique(y)
        n_classes = len(self.classes_)
        self.scores_ = list()

        if n_classes < 2:
            raise ValueError("Need 2 or more classes.")

        if n_classes > 2:
            # Multiclass: one binary copy of this estimator per class.
            self.multi_ = None
            self.multi_ = OneVsRestClassifier(self)
            self.multi_.fit(X, y)
            return self

        # ---- Binary case ----
        # Encode the targets as 0 / 1, with classes_[1] as the positive class.
        self.t = np.zeros(y.shape)
        self.t[y == self.classes_[1]] = 1

        n_samples = X.shape[0]
        self.Phi_ = self._get_kernel(X)

        # Scale Phi based on PRoNTO implementation
        # http://www.mlnl.cs.ucl.ac.uk/pronto/
        self._scale = np.sqrt(np.sum(self.Phi_) / n_samples**2)
        self.Phi_ = self.Phi_ / self._scale

        if self.bias_used:
            self.Phi_ = np.hstack((np.ones((n_samples, 1)), self.Phi_))

        M = self.Phi_.shape[1]

        self.y = y

        # Resolve the default into a local so the constructor parameter is
        # left untouched (a refit on data of a different size must not reuse
        # a stale value), and test with `is None` so that an array-like
        # init_alpha does not trigger an element-wise comparison.
        init_alpha = self.init_alpha
        if init_alpha is None:
            init_alpha = 1 / M**2

        self.relevance_ = np.arange(n_samples)
        if self.kernel != "precomputed":
            self.relevance_vectors_ = X
        else:
            self.relevance_vectors_ = None

        if self.beta_fixed == "not_fixed":
            # Suggested in the paper [1].
            self.beta_ = 1e-6
        else:
            self.beta_ = self.beta_fixed

        self.mu_ = np.zeros(M)

        self.alpha_ = init_alpha * np.ones(M)

        self._alpha_old = self.alpha_.copy()

        for i in range(self.max_iter):
            self._posterior()

            # Well-determinedness parameters (gamma)
            self.gamma_ = 1 - self.alpha_ * np.diag(self.Sigma_)
            self.alpha_ = np.maximum(
                self.gamma_, self.epsilon) / (self.mu_**2) + self.epsilon
            self.alpha_ = np.clip(self.alpha_, 0, self.alpha_max)

            # NOTE(review): with the default beta_fixed="not_fixed" this
            # condition is False (non-empty string), so beta_ is never
            # re-estimated; the branch only runs for a falsy beta_fixed.
            # Left unchanged pending confirmation of the intended behavior.
            if not self.beta_fixed:
                ed = np.sum((y - self.Phi_ @ self.mu_)**2)
                self.beta_ = np.maximum((n_samples - np.sum(self.gamma_)),
                                        self.epsilon) / ed + self.epsilon

            if self.compute_score:
                # Raising a bare string is itself a TypeError; raise a real
                # exception that states what is missing.
                raise NotImplementedError("Score not yet implemented.")

            self._prune()

            if self.verbose:
                print("Iteration: {}".format(i))
                print("Alpha: {}".format(self.alpha_))
                print("Beta: {}".format(self.beta_))
                print("Gamma: {}".format(self.gamma_))
                print("m: {}".format(self.mu_))
                print("Relevance Vectors: {}".format(
                    self.relevance_.shape[0]))
                print()

            # Largest change of log(alpha) since the previous iteration.
            delta = np.amax(
                np.absolute(
                    np.log(self.alpha_ + self.epsilon) -
                    np.log(self._alpha_old + self.epsilon)))

            if delta < self.tol and i > 1:
                break

            self._alpha_old = self.alpha_.copy()

        return self

    def predict_proba(self, X):
        """Return an array of class probabilities."""
        #check_is_fitted(self, ["relevance_", "mu_", "Sigma_"])

        if len(self.classes_) == 2:
            X = check_array(X)
            n_samples = X.shape[0]

            K = self._get_kernel(X, self.relevance_vectors_)
            K = K / self._scale

            if self.bias_used:
                K = np.hstack((np.ones((n_samples, 1)), K))

            y = self._classify(self.mu_, K)
            return np.column_stack((1 - y, y))
        else:
            return self.multi_.predict_proba(X)

    def predict(self, X):
        """Predict using the RVC model.

        In addition to the mean of the predictive distribution, its
        standard deviation can also be returned.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Query points to be evaluate.

        Returns
        -------
        results : array, shape = (n_samples, [n_output_dims])
            Mean of predictive distribution at query points
        """
        # Check is fit had been called
        #check_is_fitted(self, ["relevance_", "mu_", "Sigma_"])

        if len(self.classes_) == 2:
            y = self.predict_proba(X)

            results = np.empty(y.shape[0], dtype=self.classes_.dtype)
            results[y[:, 1] <= 0.5] = self.classes_[0]
            results[y[:, 1] >= 0.5] = self.classes_[1]
            return results

        else:
            return self.multi_.predict(X)