예제 #1
0
def calculate(A, X, Y):
    """Score MLPs trained on two differently filtered versions of ``X``.

    ``A`` is converted to a sparse matrix, symmetrised, and scaled on both
    sides by the inverse square root of its row sums (clipped below at 1).
    Two dense filters, ``0.5*I + A`` and ``0.5*I - A``, are each applied
    twice to ``X``.  For each filtered signal an MLP is fitted on the first
    100 rows and evaluated with ``acc`` on the remaining rows.

    Returns:
        ``(low_score, high_score)`` for the ``+`` and ``-`` filters.
    """
    adj = sp.coo_matrix(A)
    # Symmetrise: wherever the transposed entry is larger, take it instead.
    transpose_larger = adj.T > adj
    adj = adj + adj.T.multiply(transpose_larger) - adj.multiply(transpose_larger)

    # Row sums are clipped at 1 so the inverse square root stays finite.
    row_totals = np.array(adj.sum(1)).clip(min=1)
    scaling = sp.diags(np.power(row_totals, -0.5).flatten())
    adj = adj.dot(scaling).transpose().dot(scaling)

    size = adj.shape[0]
    low_filter = (0.5 * sp.eye(size) + adj).todense()
    high_filter = (0.5 * sp.eye(size) - adj).todense()

    filtered_signals = (
        np.dot(np.dot(low_filter, low_filter), X),
        np.dot(np.dot(high_filter, high_filter), X),
    )

    scores = []
    for signal in filtered_signals:
        mlp = MLPClassifier(hidden_layer_sizes=16,
                            activation='relu',
                            max_iter=2000)
        mlp.fit(signal[:100, :], Y[:100])
        scores.append(acc(Y[100:], mlp.predict(signal[100:, :])))

    return scores[0], scores[1]
예제 #2
0
 def on_epoch_end(self,epoch,logs=None):
     """Compute validation accuracy after an epoch, store it and print it.

     Reads the two inputs and the targets from ``self.validation_data``
     (indices 0, 1 and 2), predicts with ``self.model``, reduces targets
     and predictions with ``argmax`` over the last axis, and appends the
     resulting accuracy to ``self.acc_val``.
     """
     x_1 = self.validation_data[0]
     x_2 = self.validation_data[1]
     y_test = self.validation_data[2]

     # Interpolate the shapes into the message; passing them as extra
     # print() arguments would leave the '%s' placeholders in the output.
     print('Dims Validation data: %s %s %s'
           % (x_1.shape, x_2.shape, y_test.shape))
     # predicting outputs for val data
     y_pred = self.model.predict([x_1,x_2])

     # select the index of the top value for targets and predictions
     y_test = np.argmax(y_test, axis=-1)
     y_pred = np.argmax(y_pred, axis=-1)

     # Score once and reuse the value for both the history and the log line.
     accuracy = acc(y_test, y_pred)
     self.acc_val.append(accuracy)
     print ('Acc: ', accuracy)
예제 #3
0
def decision_tree_accuracy(X, y, random, depth, test, crit):
    """Fit a decision tree on a random split and print both accuracies.

    Args:
        X, y: data and targets handed to ``train_test_split``.
        random: seed used for both the split and the tree.
        depth: ``max_depth`` of the tree.
        test: ``test_size`` of the split.
        crit: split ``criterion`` of the tree.
    """
    split = train_test_split(X, y, test_size=test, random_state=random)
    X_train, X_test, y_train, y_test = split

    classifier = tree.DecisionTreeClassifier(criterion=crit,
                                             max_depth=depth,
                                             random_state=random)
    classifier.fit(X_train, y_train)

    train_accuracy = acc(y_train, classifier.predict(X_train))
    test_accuracy = acc(y_test, classifier.predict(X_test))

    print(f'Accuracy test = {test_accuracy}')
    print(f'Accuracy train = {train_accuracy}')
예제 #4
0
def main():
    """Select features with SFS and report KNN results for several sizes.

    For each target feature count the selector is fitted on the training
    data, a 3-nearest-neighbour classifier is trained on the selected
    columns, and training/testing accuracy, the confusion matrix and the
    classification report are printed.  For two features the selected
    columns are also shown as scatter plots (train on top, test below).
    """
    x_train, y_train, x_test, y_test = get_data()

    for feature_count in [2, 3, 5, 10, 16]:
        selector = SFS(KNeighborsClassifier(n_neighbors=7),
                       k_features=feature_count,
                       forward=False,
                       floating=True,
                       scoring='accuracy',
                       cv=0)
        selector = selector.fit(x_train, y_train)

        # NOTE(review): the message says "Forward" while the selector is
        # configured with forward=False -- confirm which one is intended.
        print('\nSequential Floating Forward Selection: ', feature_count)
        selected = list(selector.k_feature_idx_)
        print(selected)

        classifier = KNeighborsClassifier(n_neighbors=3)
        classifier.fit(x_train[:, selected], y_train)

        train_pred = classifier.predict(x_train[:, selected])
        train_accuracy = acc(y_train, train_pred)
        print('Training accuracy on selected features: %.3f' % train_accuracy)

        test_pred = classifier.predict(x_test[:, selected])
        test_accuracy = acc(y_test, test_pred)
        print('Testing accuracy on selected features: %.3f' % test_accuracy)

        print(confusion_matrix(y_test, test_pred))
        print(classification_report(y_test, test_pred))

        if feature_count != 2:
            continue

        fig, axs = plt.subplots(2)
        fig.suptitle("SFS(KNN) Scatter Plot", fontsize='small')
        panels = ((x_train, y_train), (x_test, y_test))
        for axis, (points, colours) in zip(axs, panels):
            axis.scatter(points[:, selected[0]],
                         points[:, selected[1]],
                         marker='o',
                         c=colours,
                         s=25,
                         edgecolor='k')

        plt.show()
예제 #5
0
def evaluate(model,
             iterator_function,
             _batch_count,
             cuda_device,
             output_buffer=sys.stderr):
    """Run ``model`` over ``_batch_count`` batches and score the predictions.

    Args:
        model: called as ``model(example_batch=features)``; the first of its
            three return values is arg-maxed over the last axis.
        iterator_function: zero-argument callable returning
            ``(features, targets)`` for the next batch.
        _batch_count: number of batches to draw.
        cuda_device: device index for the features; ``-1`` leaves them where
            they are.
        output_buffer: when not ``None`` the batch count is printed to it and
            the batch loop is wrapped in ``tqdm``.

    Returns:
        A 4-tuple of the ``acc``, ``pr``, ``rc`` and ``f1`` scores, each
        multiplied by 100.
    """
    verbose = output_buffer is not None
    if verbose:
        print(_batch_count, file=output_buffer)

    model.eval()
    with torch.no_grad():
        predicted = []
        expected = []
        batches = range(_batch_count)
        if verbose:
            batches = tqdm(batches)
        for _step in batches:
            features, targets = iterator_function()
            if cuda_device != -1:
                features = features.cuda(device=cuda_device)
            probs, _second, _third = model(example_batch=features)
            predicted.extend(
                np.argmax(probs.detach().cpu().numpy(), axis=-1).tolist())
            expected.extend(targets.detach().cpu().numpy().tolist())
        # Switch back to training mode before handing the scores back.
        model.train()
        scores = (
            acc(expected, predicted),
            pr(expected, predicted),
            rc(expected, predicted),
            f1(expected, predicted),
        )
        return tuple(score * 100 for score in scores)
예제 #6
0
def combinationPredict(predict, samples_test):
    """Print the accuracy of the min/max/sum/product fusions of ``predict``.

    The fused outputs and the reference labels come from
    ``add.fusoesDiego``; each fused output is converted to class indices
    with ``np_utils.categorical_probas_to_classes`` and scored with ``acc``.
    """
    fused = add.fusoesDiego(predict, samples_test)
    labels_samples, merger_min, merger_max, merger_sum, merger_pro = fused

    to_classes = np_utils.categorical_probas_to_classes
    # The classes of the raw predictions are computed but not reported.
    to_classes(predict)
    rows = [
        ("Min: ", to_classes(merger_min)),
        ("Max: ", to_classes(merger_max)),
        ("Sum: ", to_classes(merger_sum)),
        ("Product: ", to_classes(merger_pro)),
    ]

    print()
    for caption, classes in rows:
        print(caption + str(acc(labels_samples, classes)))
    print()
    print()
예제 #7
0
    def test_converter(self):
        """Run the converted TFLite model over the test database.

        Loads ``model.tflite`` from ``self.model_path``, reads the test data
        from ``self.data_path``, scales it by 1/255, adds a trailing axis and
        feeds the samples through the interpreter one at a time.  Prints the
        Cohen kappa score, the accuracy and the confusion matrix.

        Returns:
            The accuracy computed by ``acc``.
        """
        tflite_file = os.path.join(self.model_path, 'model.tflite')
        interpreter = tf.lite.Interpreter(tflite_file)
        interpreter.allocate_tensors()
        input_index = interpreter.get_input_details()[0]['index']
        output_index = interpreter.get_output_details()[0]['index']

        # load test data
        samples, test_labels = read_database(self.data_path)
        samples = (samples / 255.0)[..., tf.newaxis].astype("float32")
        sample_count = samples.shape[0]
        predData = np.ndarray(shape=(sample_count), dtype='uint8')

        for row in range(sample_count):
            # Index with a list so the sample keeps its leading batch axis.
            single = np.array(samples[[row], :, :, :], dtype='float32')
            interpreter.set_tensor(input_index, single)
            interpreter.invoke()
            predData[row] = np.argmax(interpreter.get_tensor(output_index))

        print('Kappa:', metrics.cohen_kappa_score(test_labels, predData))
        accuracy = acc(test_labels, predData)
        print('Accuracy:', accuracy)
        print('Confusion matrix:')
        print(metrics.confusion_matrix(test_labels, predData))

        return accuracy
예제 #8
0
 def actual_prediction_accuracy(self):
     """Print the accuracy of ``self.y_pred`` against ``self.y_test``.

     Raises:
         TypeError: when ``self.predicted`` is falsy.
     """
     if not self.predicted:
         raise TypeError(
             "Predictions for model are not available, use 'predict' method first!"
         )
     print("Prediction accuracy:", acc(self.y_test, self.y_pred))
예제 #9
0
def predict(X_spam, X_ham, X_test, y_test):
    """Label each test sample by comparing its ``nb`` scores, then report.

    A sample is labelled 1 when ``nb(X_spam, sample)`` is greater than
    ``nb(X_ham, sample)`` and 0 otherwise.  The predictions, the expected
    labels and the accuracy are printed; nothing is returned.
    """
    pred = [
        1 if nb(X_spam, sample) > nb(X_ham, sample) else 0
        for sample in X_test
    ]
    print(pred, y_test)
    print('Accuracy:', acc(pred, y_test))
def cluster_acc(Y, clusterLabels):
    """Accuracy obtained when each cluster predicts its most common label.

    Args:
        Y: true labels.
        clusterLabels: cluster assignment per sample; must have the same
            shape as ``Y``.

    Returns:
        ``acc(Y, pred)`` where ``pred`` holds, for every sample, the most
        frequent true label of the cluster it belongs to.
    """
    assert (Y.shape == clusterLabels.shape)
    majority_pred = np.empty_like(Y)
    for cluster_id in set(clusterLabels):
        members = clusterLabels == cluster_id
        label_counts = Counter(Y[members])
        majority_pred[members] = label_counts.most_common(1)[0][0]
    return acc(Y, majority_pred)
예제 #11
0
def build_classifier_and_test(train_X,
                              train_y,
                              test_X,
                              test_y,
                              clf,
                              print_train_result=True):
    """Fit ``clf`` and print accuracy, precision, recall and F-score.

    The test-set scores are always printed; the training-set scores are
    printed first when ``print_train_result`` compares equal to ``True``.
    """
    clf.fit(train_X, train_y)

    # Kept as an equality test so only True (or 1) enables the report.
    if print_train_result == True:
        train_pred = clf.predict(train_X)
        train_rows = (
            ("Train Accuracy:\t", acc),
            ("Train Precision:\t", pr),
            ("Train Recall_score:\t", rc),
            ("Train F-score:\t", f1),
        )
        for caption, metric in train_rows:
            print(caption, metric(train_y, train_pred))

    test_pred = clf.predict(test_X)
    test_rows = (
        ("Accuracy:\t", acc),
        ("Precision:\t", pr),
        ("Recall_score:\t", rc),
        ("F-score:\t", f1),
    )
    for caption, metric in test_rows:
        print(caption, metric(test_y, test_pred))
예제 #12
0
def clone_analysis(data_paths):
    """Cluster and classify tokenized code examples read from JSON files.

    Each file in ``data_paths`` must contain a list of dicts with a
    ``'tokenized'`` entry and, optionally, a label stored under ``'label'``
    or one of the misspellings ``'lebel'``, ``'leble'``, ``'lable'``.
    Labels greater than 1 are clamped to 1; a missing label counts as 0.

    The function prints the corpus statistics, the per-cluster
    positive/negative counts and positive ratio for a 10-cluster k-means on
    TF-IDF features, and then the accuracy/precision/recall/F1 (in percent)
    of a random forest over five random 80/20 splits.
    """
    label_keys = ('label', 'lebel', 'leble', 'lable')
    n_clusters = 10

    code = []
    labels = []
    for file_name in data_paths:
        # Close the file promptly instead of leaking the handle.
        with open(file_name) as handle:
            data = json.load(handle)
        for example in data:
            code.append(example['tokenized'])
            value = 0
            # The first spelling that is present wins.
            for key in label_keys:
                if key in example:
                    value = int(example[key])
                    break
            labels.append(min(value, 1))
    positives = sum(labels)
    print(len(code), len(labels), positives, len(labels) - positives)

    # ``input`` selects how documents are supplied ('content' by default);
    # the corpus itself is passed to fit_transform, not to the constructor.
    vectorizer = TfidfVectorizer(lowercase=False, ngram_range=(1, 3))
    X = vectorizer.fit_transform(code)

    model = KMeans(n_clusters=n_clusters, max_iter=100)
    model.fit(X)
    y = model.predict(X)
    cluster_to_positive = [0] * n_clusters
    cluster_to_negative = [0] * n_clusters
    for pred, label in zip(y, labels):
        if label == 1:
            cluster_to_positive[pred] += 1
        else:
            cluster_to_negative[pred] += 1
    print(cluster_to_positive)
    print(cluster_to_negative)
    # A cluster without members reports 0.0 rather than dividing by zero.
    percentages = [
        float(p) / (p + n) if (p + n) else 0.0
        for p, n in zip(cluster_to_positive, cluster_to_negative)
    ]
    for p in percentages:
        print(p)

    for _ in range(5):
        XTrain, XTest, YTrain, YTest = train_test_split(X,
                                                        labels,
                                                        test_size=0.2)
        model = RandomForestClassifier()
        model.fit(XTrain, YTrain)
        predicted = model.predict(XTest)
        print('%.3f\t%.3f\t%.3f\t%.3f' %
              (acc(YTest, predicted) * 100, pr(YTest, predicted) * 100,
               rc(YTest, predicted) * 100, f1(YTest, predicted) * 100))
예제 #13
0
def cluster_acc(Y, clusterLabels):  # used in clustering.py
    """Score a clustering by mapping each cluster to its majority label.

    Both arguments must have the same shape.  Every sample is assigned the
    most frequent true label found in its cluster, and the result is
    compared with ``Y`` using ``acc``.
    """
    assert (Y.shape == clusterLabels.shape)
    mapped = np.empty_like(Y)
    for cluster in set(clusterLabels):
        in_cluster = clusterLabels == cluster
        # most_common(1) yields a single (label, count) pair.
        top_label = Counter(Y[in_cluster]).most_common(1)[0][0]
        mapped[in_cluster] = top_label
    return acc(Y, mapped)
예제 #14
0
def cluster_acc(y, cluster_labels):
    """Return the accuracy of majority-vote labels derived from clusters.

    For each distinct value in ``cluster_labels`` the most common entry of
    ``y`` inside that cluster becomes the prediction for all its members.
    ``y`` and ``cluster_labels`` must share the same shape.
    """
    assert (y.shape == cluster_labels.shape)
    voted = np.empty_like(y)
    for cluster_value in set(cluster_labels):
        selector = cluster_labels == cluster_value
        ranking = Counter(y[selector]).most_common(1)
        voted[selector] = ranking[0][0]
    return acc(y, voted)
예제 #15
0
파일: helpers.py 프로젝트: wanlipu/dsmain
def cluster_acc(Y,clusterLabels):
    """Accuracy of the per-cluster majority label against the true labels.

    Args:
        Y: true labels.
        clusterLabels: cluster id per sample, same shape as ``Y``.

    Returns:
        The ``acc`` score between ``Y`` and the majority-label prediction.
    """
    assert (Y.shape == clusterLabels.shape)
    prediction = np.empty_like(Y)
    for current in set(clusterLabels):
        where = clusterLabels == current
        winners = Counter(Y[where]).most_common(1)
        winner_label = winners[0][0]
        prediction[where] = winner_label
    return acc(Y, prediction)
예제 #16
0
    def train(self):
        """Train the clustering model on the 64px images and save its weights.

        The ``clustering`` layer is initialised with k-means centres fitted
        on the encoder output.  Every ``update_interval`` iterations the
        target distribution is recomputed from the current model output,
        clustering metrics are printed, and training stops early once the
        fraction of samples whose label changed drops below ``tol``.
        Between updates the model is trained on random batches.  Finally the
        weights are written to ``DEC_model_final_64px.h5`` and
        ``self.test_model()`` is called.
        """
        x = np.load('images/64px_image_x.npy')
        y = np.load('images/64px_image_y.npy')
        x = np.reshape(x, (40000, 64, 64, 1))

        kmeans = KMeans(n_clusters=2, n_init=20)
        y_pred = kmeans.fit_predict(self.encoder.predict(x))
        y_pred_last = np.copy(y_pred)
        self.model.get_layer(name='clustering').set_weights(
            [kmeans.cluster_centers_])

        loss = 0
        maxiter = 80000
        update_interval = 100
        batch_size = 16
        tol = 0.001

        for ite in range(int(maxiter)):
            if ite % update_interval == 0:
                q = self.model.predict(x, verbose=0)
                # update the auxiliary target distribution p
                p = self.target_distribution(q)

                # evaluate the clustering performance
                y_pred = q.argmax(1)
                if y is not None:
                    accuracy = np.round(metrics.acc(y, y_pred), 5)
                    nmi = np.round(metrics.nmi(y, y_pred), 5)
                    ari = np.round(metrics.ari(y, y_pred), 5)
                    loss = np.round(loss, 5)
                    print(
                        'Iter %d: acc = %.5f, nmi = %.5f, ari = %.5f, loss=%.5f'
                        % (ite, accuracy, nmi, ari, loss))

                # check stop criterion - model convergence
                delta_label = np.sum(y_pred != y_pred_last).astype(
                    np.float32) / y_pred.shape[0]
                y_pred_last = np.copy(y_pred)
                if ite > 0 and delta_label < tol:
                    print('delta_label ', delta_label, '< tol ', tol)
                    print('Reached tolerance threshold. Stopping training.')
                    break

            # Batches are sampled with replacement from the whole data set.
            idx = np.random.randint(low=0, high=x.shape[0], size=batch_size)
            loss = self.model.train_on_batch(x=x[idx], y=p[idx])

        self.model.save_weights('DEC_model_final_64px.h5')
        self.test_model()
예제 #17
0
 def on_epoch_end(self, epoch, logs=None):
     """Print clustering accuracy and NMI of the encoder features.

     When ``int(epochs/10)`` is non-zero the report only runs on epochs that
     are a multiple of it (``epochs`` is read from the enclosing scope).
     Features are taken from the layer named ``encoder_<k>`` with
     ``k = int(len(self.model.layers) / 2) - 1`` and clustered by k-means
     with one cluster per distinct value in ``self.y``.
     """
     report_every = int(epochs/10)
     if report_every != 0 and epoch % report_every != 0:
         return
     feature_model = Model(self.model.input,
                           self.model.get_layer(
                               'encoder_%d' % (int(len(self.model.layers) / 2) - 1)).output)
     features = feature_model.predict(self.x)
     # ``n_jobs`` is omitted: current scikit-learn KMeans rejects it.
     km = KMeans(n_clusters=len(np.unique(self.y)), n_init=20)
     y_pred = km.fit_predict(features)
     print(' '*8 + '|==>  acc: %.4f,  nmi: %.4f  <==|'
           % (metrics.acc(self.y, y_pred), metrics.nmi(self.y, y_pred)))
예제 #18
0
def main():
    """Report 3-NN accuracy on PCA projections of several dimensionalities.

    For each component count a PCA is fitted on the training data, a
    3-nearest-neighbour classifier is trained on the projected training
    data, and training/testing accuracy is printed.  For two components the
    projected train and test sets are also shown as scatter plots.
    """
    x_train, y_train, x_test, y_test = get_data()

    for n_components in [2, 3, 5, 10, 15]:
        reducer = decomposition.PCA(n_components=n_components)
        reducer.fit(x_train)
        train_proj = reducer.transform(x_train)

        classifier = KNeighborsClassifier(n_neighbors=3)
        classifier.fit(train_proj, y_train)

        print('\nPCA: ', n_components)

        train_accuracy = acc(y_train, classifier.predict(train_proj))
        print('Training accuracy on selected features: %.3f' % train_accuracy)

        test_proj = reducer.transform(x_test)
        test_accuracy = acc(y_test, classifier.predict(test_proj))
        print('Testing accuracy on selected features: %.3f' % test_accuracy)

        if n_components != 2:
            continue

        fig, axs = plt.subplots(2)
        fig.suptitle("PCA Scatter Plot", fontsize='small')
        panels = ((train_proj, y_train), (test_proj, y_test))
        for axis, (points, colours) in zip(axs, panels):
            axis.scatter(points[:, 0],
                         points[:, 1],
                         marker='o',
                         c=colours,
                         s=25,
                         edgecolor='k')

        plt.show()
예제 #19
0
def cal_cost_tree(x, trn, trg):
    """Cost of a feature subset, scored with a decision tree on ``trn``.

    Args:
        x: per-feature selection values; each is rounded and a feature is
            used when its rounded value is 1.
        trn: training data; reshaped to ``(trn.shape[1], -1)`` before the
            selected rows are taken and transposed back.
            NOTE(review): this is a reshape, not a transpose -- confirm the
            expected layout of ``trn`` against the caller.
        trg: training targets.

    Returns:
        ``(cost, error, ratio)`` where ``error`` is one minus the training
        accuracy, ``ratio`` is the fraction of selected features and
        ``cost = (1 - alpha) * error + alpha * ratio`` (``alpha`` is read
        from module scope).  When no feature is selected the result is
        ``(inf, inf, 1)``, the same shape the knn/svm variants return.
    """
    x = [int(a) for a in np.round(x)]
    selected_count = sum(x)
    if selected_count == 0:
        # Always hand back a 3-tuple so callers can unpack unconditionally.
        return np.inf, np.inf, 1
    x_index = [i for i, flag in enumerate(x) if flag == 1]
    trn = trn.reshape(trn.shape[1], -1)
    trn = trn[x_index, :]
    trn = np.transpose(trn)
    clf = tree.DecisionTreeClassifier()
    clf.fit(trn, trg)
    pre = clf.predict(trn)
    error = 1 - acc(pre, trg)
    ratio = selected_count * 1.0 / len(x)
    return (1-alpha)*error + alpha * ratio, error, ratio
예제 #20
0
def updatePointsAtk(xc, clf, Xtr, ytr=None, Xtt=None, ytt=None, err_type='loss'):
    """Refit ``clf`` with attack points appended and return an error value.

    Args:
        xc: attack points; reshaped to ``(xc.size // d, d)`` where ``d`` is
            the number of columns of ``Xtr``.
        clf: model exposing ``fit``, ``fval``, ``C``, ``r``, ``kdist2``,
            ``y`` and ``predict_y`` as used below.
        Xtr: training data the attack points are appended to.
        ytr: true training labels (needed for 'f1', 'fn', 'fp').
        Xtt, ytt: test data and labels (needed for 'acc').
        err_type: one of 'fval', 'r', 'xi', 'f1', 'acc', 'loss', 'fn', 'fp'.

    Returns:
        The requested quantity, or ``None`` when required labels/data are
        missing or ``err_type`` is not recognised.
    """
    d = Xtr.shape[1]
    # Floor division keeps the row count an integer on Python 3 as well.
    m = xc.size // d
    xc = xc.reshape(m, d)
    X0 = np.concatenate([Xtr, xc], axis=0)
    # TODO: update SVC instead of retrain with xc
    clf.fit(X0)        # <---- this is just a lazy update
    # maximize the obj value (generalization error)
    # Strings are compared with ==; identity (`is`) only matches when the
    # interpreter happens to share the string objects.
    if err_type == 'fval':
        # objective values on untainted dataset Dtr
        return 1*(clf.fval - clf.C*(clf.kdist2(xc) - clf.r).sum())
    elif err_type == 'r':
        # the squared radius
        return 1*clf.r
    elif err_type == 'xi':
        return 1*(clf.fval - clf.C*(clf.kdist2(xc) - clf.r).sum() - clf.r)
    elif err_type == 'f1':
        if ytr is not None:
            return f1_score(ytr, clf.y[:ytr.size])
        else:
            print('You need give the true labels!')
            return None
    elif err_type == 'acc':
        if Xtt is not None and ytt is not None:
            y_clf = clf.predict_y(Xtt)
            return acc(ytt, y_clf)
        else:
            print('You need give the test dataset!')
            return None
    elif err_type == 'loss':
        # min(\sum_xi - R^2) means let less samples lie out meanwhile maximize the ball
        # note: xc are excluded
        sum_xi_c = (clf.kdist2(xc) - clf.r).sum()
        return (clf.fval - clf.r)/clf.C - sum_xi_c - clf.r
    elif err_type == 'fn':
        if ytr is not None:
            pid = np.where(ytr==1)[0]
            return 0.5*((ytr[pid] - clf.y[pid]).sum())/pid.size
        else:
            print('You need give the true labels!')
            return None
    elif err_type == 'fp':
        if ytr is not None:
            nid = np.where(ytr==-1)[0]
            return 0.5*((clf.y[nid] - ytr[nid]).sum())/nid.size
        else:
            print('You need give the true labels!')
            return None
    # Unrecognised err_type: nothing to report.
    return None
예제 #21
0
def cal_cost_knn(x, trn, trg):
    """Cost of a feature subset, scored with a k-NN classifier on ``trn``.

    ``x`` is rounded to integers; features whose rounded value is 1 are
    used.  With no feature selected the result is ``(inf, inf, 1)``.
    Otherwise returns ``(cost, error, ratio)`` with ``error`` one minus the
    training accuracy, ``ratio`` the fraction of selected features and
    ``cost = (1 - alpha) * error + alpha * ratio``.  ``alpha`` and ``nn``
    are read from module scope.
    """
    flags = [int(value) for value in np.round(x)]
    chosen_total = sum(flags)
    if chosen_total == 0:
        return np.inf, np.inf, 1

    chosen = [pos for pos, flag in enumerate(flags) if flag == 1]
    # NOTE(review): reshape rather than transpose -- confirm trn's layout.
    subset = np.transpose(trn.reshape(trn.shape[1], -1)[chosen, :])

    model = knn(n_neighbors=nn)
    model.fit(subset, trg)
    error = 1 - acc(model.predict(subset), trg)
    return ((1-alpha)*error + alpha * (chosen_total*1.0/len(flags)),
            error,
            chosen_total*1.0/len(flags))
예제 #22
0
def cal_cost_svm(x, trn, trg):
    """Cost of a feature subset, scored with an SVC on ``trn``.

    Features are used where the rounded entry of ``x`` equals 1.  When none
    is selected the function returns ``(inf, inf, 1)``; otherwise it returns
    ``(cost, error, ratio)`` where ``error`` is one minus the training
    accuracy, ``ratio`` is the selected fraction, and the cost blends the
    two with the module-level ``alpha``.  The kernel comes from the
    module-level ``svm_kernel``.
    """
    picks = [int(entry) for entry in np.round(x)]
    n_picked = sum(picks)
    if n_picked == 0:
        return np.inf, np.inf, 1

    rows = [idx for idx, pick in enumerate(picks) if pick == 1]
    # NOTE(review): reshape rather than transpose -- confirm trn's layout.
    reshaped = trn.reshape(trn.shape[1], -1)
    features = np.transpose(reshaped[rows, :])

    classifier = SVC(gamma="auto", kernel=svm_kernel)
    classifier.fit(features, trg)
    predicted = classifier.predict(features)
    error = 1 - acc(predicted, trg)
    cost = (1-alpha)*error + alpha * (n_picked*1.0/len(picks))
    return cost, error, n_picked*1.0/len(picks)
예제 #23
0
def cluster_acc(Y,clusterLabels):
    """Accuracy when every cluster is labelled with its most common class.

    The dependencies are imported inside the function.  ``Y`` and
    ``clusterLabels`` must have identical shapes.
    """
    import numpy as np
    from collections import Counter
    from sklearn.metrics import accuracy_score as acc

    assert (Y.shape == clusterLabels.shape)
    assigned = np.empty_like(Y)
    for cluster_key in set(clusterLabels):
        hits = clusterLabels == cluster_key
        tally = Counter(Y[hits])
        assigned[hits] = tally.most_common(1)[0][0]
    return acc(Y, assigned)
예제 #24
0
def dae_svm():
    """Train an SVC on the stored DAE features and persist its accuracy.

    Loads the first 10000 training rows and first 5000 test rows from
    ``data/``, fits the classifier against the module-level ``label_train``,
    scores the predictions against ``label_test``, prints the accuracy and
    pickles it to ``PATHS + 'dae_svm'``.
    """
    dae_train = np.load('data/train_dae.npy')[:10000]
    dae_test = np.load('data/test_dae.npy')[:5000]

    svm_dae = svm.SVC(C=2.0, gamma=0.05, cache_size=2000)
    svm_dae.fit(dae_train, label_train)
    predicted_dae = svm_dae.predict(dae_test)
    daeacc = acc(label_test, predicted_dae)

    # Single-argument call form prints the same text on Python 2 and 3.
    print('DAE accuracy - ' + str(daeacc))

    with open(PATHS + 'dae_svm', 'wb') as f:
        pickle.dump(daeacc, f)
예제 #25
0
def reportStats(weight, current_iteration, X_train, y_train, X_test, y_test):
    """Print and return in-sample and out-of-sample classification metrics.

    Predictions come from ``predict_all(X, weight)``.  Accuracy, MCC, F1,
    MSE, the confusion-matrix counts and AUPRC are computed for the training
    (IS) and test (OOS) data and printed in two summary lines.

    Note:
        Negative entries of ``y_train`` and ``y_test`` are set to 0 in
        place, so the caller's arrays are modified.

    Returns:
        ``(is_acc, is_mcc, is_f1, is_mse, is_auprc,
        oos_acc, oos_mcc, oos_f1, oos_mse, oos_auprc)``.
    """
    y_train[y_train < 0] = 0
    y_test[y_test < 0] = 0

    ypred_is = predict_all(X_train, weight)
    ypred_oos = predict_all(X_test, weight)

    # Silence "invalid value" floating-point warnings only while the metrics
    # run; the context manager restores the previous settings even when a
    # metric raises.
    with np.errstate(invalid='ignore'):
        is_acc = acc(y_train, ypred_is)
        is_mcc = mcc(y_train, ypred_is)
        is_f1 = f1(y_train, ypred_is)
        is_mse = mse(y_train, ypred_is)

        oos_acc = acc(y_test, ypred_oos)
        oos_mcc = mcc(y_test, ypred_oos)
        oos_f1 = f1(y_test, ypred_oos)
        oos_mse = mse(y_test, ypred_oos)

        is_tn, is_fp, is_fn, is_tp = confusion_matrix(y_train,
                                                      ypred_is).ravel()
        oos_tn, oos_fp, oos_fn, oos_tp = confusion_matrix(y_test,
                                                          ypred_oos).ravel()
        is_auprc = auprc(y_train, ypred_is)
        oos_auprc = auprc(y_test, ypred_oos)

    print(
        f"Consensus {current_iteration}: IS acc {is_acc:0.5f}.  IS MCC {is_mcc:0.5f}.  IS F1 {is_f1:0.5f}.  IS MSE {is_mse:0.5f}.  OOS acc {oos_acc:0.5f}.  OOS MCC {oos_mcc:0.5f}.  OOS F1 {oos_f1:0.5f}.  OOS MSE {oos_mse:0.5f}."
    )
    print(
        f"Confusion {current_iteration}: IS TP: {is_tp}, IS FP: {is_fp}, IS TN: {is_tn}, IS FN: {is_fn}, IS AUPRC: {is_auprc:0.5f}.  OOS TP: {oos_tp}, OOS FP: {oos_fp}, OOS TN: {oos_tn}, OOS FN: {oos_fn}, OOS AUPRC: {oos_auprc:0.5f}."
    )

    return is_acc, is_mcc, is_f1, is_mse, is_auprc, oos_acc, oos_mcc, oos_f1, oos_mse, oos_auprc
예제 #26
0
def test_acc_knn(x, tst, tst_trg, trn, trn_trg):
    """Test accuracy of a k-NN classifier restricted to the selected features.

    Features are used where the rounded entry of ``x`` equals 1; with no
    feature selected the function returns 0.  The neighbour count comes
    from the module-level ``nn``.
    """
    flags = [int(value) for value in np.round(x)]
    if sum(flags) == 0:
        return 0
    chosen = [pos for pos, flag in enumerate(flags) if flag == 1]

    # NOTE(review): reshape rather than transpose -- confirm data layout.
    tst = np.transpose(tst.reshape(tst.shape[1], -1)[chosen, :])
    trn = np.transpose(trn.reshape(trn.shape[1], -1)[chosen, :])

    model = knn(n_neighbors=nn)
    model.fit(trn, trn_trg)
    return acc(tst_trg, model.predict(tst))
예제 #27
0
def test_acc_svm(x, tst, tst_trg, trn, trn_trg):
    """Test accuracy of an SVC restricted to the selected features.

    A feature is used when its rounded entry in ``x`` equals 1.  Returns 0
    when nothing is selected.  The kernel comes from the module-level
    ``svm_kernel``.
    """
    picks = [int(entry) for entry in np.round(x)]
    if sum(picks) == 0:
        return 0
    rows = [idx for idx, pick in enumerate(picks) if pick == 1]

    # NOTE(review): reshape rather than transpose -- confirm data layout.
    test_view = tst.reshape(tst.shape[1], -1)
    test_features = np.transpose(test_view[rows, :])
    train_view = trn.reshape(trn.shape[1], -1)
    train_features = np.transpose(train_view[rows, :])

    classifier = SVC(gamma="auto", kernel=svm_kernel)
    classifier.fit(train_features, trn_trg)
    predicted = classifier.predict(test_features)
    return acc(tst_trg, predicted)
예제 #28
0
def test_acc_tree(x, tst, tst_trg, trn, trn_trg):
    """Test accuracy of a decision tree restricted to the selected features.

    Args:
        x: per-feature selection values; a feature is used when its rounded
            value is 1.
        tst, trn: test and training data, each reshaped to
            ``(shape[1], -1)`` before the selected rows are taken.
            NOTE(review): this is a reshape, not a transpose -- confirm the
            expected layout against the caller.
        tst_trg, trn_trg: test and training targets.

    Returns:
        The ``acc`` score on the test data, or 0 when no feature is
        selected (matching the knn and svm variants).
    """
    x = [int(a) for a in np.round(x)]
    if sum(x) == 0:
        # Return a number, not None, so callers can always compare scores.
        return 0
    x = [i for i, flag in enumerate(x) if flag == 1]
    tst = tst.reshape(tst.shape[1], -1)
    tst = tst[x, :]
    tst = np.transpose(tst)
    trn = trn.reshape(trn.shape[1], -1)
    trn = trn[x, :]
    trn = np.transpose(trn)
    clf = tree.DecisionTreeClassifier()
    clf.fit(trn, trn_trg)
    tst_pred = clf.predict(tst)
    return acc(tst_trg, tst_pred)
예제 #29
0
def cluster_acc(y, cluster_labels):
    """Accuracy of a clustering scored as a majority-label classifier.

    Each cluster "predicts" the most common true label among the samples
    assigned to it; the returned value is ``acc`` between ``y`` and those
    predictions.  ``y`` and ``cluster_labels`` must have the same shape.
    """
    assert (y.shape == cluster_labels.shape)
    # Filled cluster by cluster with that cluster's majority label.
    majority = np.empty_like(y)
    for cluster in set(cluster_labels):
        # Samples assigned to this cluster.
        inside = cluster_labels == cluster
        # True labels of those samples, and the most frequent one.
        labels_inside = y[inside]
        most_frequent = Counter(labels_inside).most_common(1)[0][0]
        majority[inside] = most_frequent
    return acc(y, majority)
예제 #30
0
def get_accuracy(predictions, y, std_price = 0, mean_price = 0):
    """Score predictions by direction accuracy and regression errors.

    Predictions and targets are passed through ``unnormalize`` with
    ``std_price`` and ``mean_price``.  A value counts as direction 1 when
    its unnormalized value is ``>= 0`` and 0 otherwise.  The confusion
    matrix of the directions is printed.

    If any unnormalized value is NaN, "NAN FOUND" is printed and the
    process exits.

    Returns:
        ``(direction, _mae, _rmse, _r2)`` where ``direction`` is the
        direction accuracy rounded to 4 decimals and multiplied by 100.
    """
    from sklearn.metrics import confusion_matrix

    def _unnormalize_all(values):
        # Unnormalize each value once and stop the program on the first NaN.
        restored = []
        for value in values:
            restored_value = unnormalize(value, std_price, mean_price)
            if math.isnan(restored_value):
                print("NAN FOUND")
                exit()
            restored.append(restored_value)
        return restored

    unnorm_predictions = _unnormalize_all(predictions)
    unnorm_y = _unnormalize_all(y)

    # 1 where the value is non-negative, 0 otherwise.
    direction_pred = [1 if value >= 0 else 0 for value in unnorm_predictions]
    direction_test = [1 if value >= 0 else 0 for value in unnorm_y]

    # Share of samples whose predicted direction matches the actual one.
    direction = acc(direction_test, direction_pred)
    direction = round(direction,4)*100
    _mae = mae(unnorm_y, unnorm_predictions)
    # NOTE(review): RMSE is computed on the normalized inputs while the
    # other errors use unnormalized values -- confirm this is intended.
    _rmse = np.sqrt(mse(y, predictions))
    _r2 = r2(unnorm_y, unnorm_predictions)
    print("CONFUSION MATRIX")
    print(confusion_matrix(direction_test,direction_pred).ravel())
    return (direction,_mae,_rmse,_r2)
예제 #31
0
    def test(self, x_test, y_test, params, n_centers, width):
        """Predict every row of ``x_test`` and score against ``y_test``.

        Targets and predictions are collected as lists, converted with
        ``util.inverse_transform`` using ``self.n_classes`` and scored.

        Returns:
            ``(acc, tpr with average='macro', 0, ppv with
            average='weighted')``.
        """
        sample_count, _ = x_test.shape
        expected_rows = []
        predicted_rows = []
        for row in range(sample_count):
            target = y_test[row]
            output = self.predict(x_test[row], params, n_centers, width)
            expected_rows.append(list(target))
            predicted_rows.append(list(output))

        truth = util.inverse_transform(expected_rows, self.n_classes)
        guess = util.inverse_transform(predicted_rows, self.n_classes)

        accuracy = acc(truth, guess)
        tpr_macro = tpr(truth, guess, average='macro')
        ppv_weighted = ppv(truth, guess, average='weighted')
        return accuracy, tpr_macro, 0, ppv_weighted
예제 #32
0
#########################################################
### your code goes here ###

from sklearn.svm import SVC
from sklearn.metrics import accuracy_score as acc


# Fit an RBF SVC for C = 10**1 .. 10**19 and then once more for C = 10000,
# printing the test accuracy of each run.  The message is built with
# %-formatting so the output is the same on Python 2 and Python 3.
for C in [pow(10, exponent) for exponent in range(1, 20)] + [10000]:
    clf = SVC(kernel="rbf", C=C)
    clf.fit(features_train, labels_train)
    pred = clf.predict(features_test)
    print("C: %s Accuracy: %s" % (C, acc(labels_test, pred)))


from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn.metrics import accuracy_score as accu

# Decision tree fitted on the same split; its predictions are left in
# ``pred`` without being scored here.
clf = DTC(min_samples_split=2)
clf.fit(features_train,labels_train)

pred = clf.predict(features_test)
예제 #33
0
features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(word_data, authors, test_size=0.1, random_state=42)


# Strip the listed words from every training document.
for outcast in ["sshacklensf", "cgermannsf"]:
    features_train = [x.replace(outcast,"") for x in features_train]

from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                             stop_words='english')
features_train = vectorizer.fit_transform(features_train)
features_test  = vectorizer.transform(features_test).toarray()


### a classic way to overfit is to use a small number
### of data points and a large number of features;
### train on only 150 events to put ourselves in this regime
features_train = features_train[:150].toarray()
labels_train = labels_train[:150]



### your code goes here
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score as acc

clf = DecisionTreeClassifier()
clf.fit(features_train,labels_train)
pred = clf.predict(features_test)
# Single-argument print calls give the same output on Python 2 and 3.
print(acc(pred,labels_test))
# (index, importance) of every feature with importance above 0.2
print([(i,x) for i,x in enumerate(clf.feature_importances_) if x > 0.2])
# NOTE(review): 21323 is a hard-coded feature index -- presumably taken
# from the list printed above; confirm after any change to the data.
print(vectorizer.get_feature_names()[21323])
예제 #34
0

#### initial visualization
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.0)
# Both point sets use bumpiness on x and grade on y, matching the axis
# labels set below.
plt.scatter(bumpy_fast, grade_fast, color="b", label="fast")
plt.scatter(bumpy_slow, grade_slow, color="r", label="slow")
plt.legend()
plt.xlabel("bumpiness")
plt.ylabel("grade")
plt.show()
################################################################################


### your code here!  name your classifier object clf if you want the
### visualization code (prettyPicture) to show you the decision boundary

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score as acc

clf = RandomForestClassifier(n_jobs=-1, criterion="gini", n_estimators=100, min_samples_leaf=5, max_features=1)
clf.fit(features_train, labels_train)
pred = clf.predict(features_test)
# %-formatting prints the same text on Python 2 and Python 3.
print("Accuracy %s" % acc(pred, labels_test))


# prettyPicture is optional; skip the plot when it is not defined.
try:
    prettyPicture(clf, features_train, labels_train)
except NameError:
    pass