Example #1
def metrics_helper(human_scores, system_scores):
    """
    This is a helper function that computes some basic
    metrics for the system_scores against the human_scores.
    """

    # compute the kappas
    unweighted_kappa = kappa(human_scores, system_scores)
    quadratic_weighted_kappa = kappa(human_scores,
                                     round(system_scores),
                                     weights='quadratic')

    # compute the agreement statistics
    human_system_agreement = agreement(human_scores, system_scores)
    human_system_adjacent_agreement = agreement(human_scores,
                                                system_scores,
                                                tolerance=1)

    # compute the pearson correlation after removing
    # any cases where either of the scores are NaNs.
    df = pd.DataFrame({'human': human_scores,
                       'system': system_scores}).dropna(how='any')
    correlations = pearsonr(df['human'], df['system'])[0]

    # compute the min/max/mean/std. dev. for the system and human scores
    min_system_score = np.min(system_scores)
    min_human_score = np.min(human_scores)

    max_system_score = np.max(system_scores)
    max_human_score = np.max(human_scores)

    mean_system_score = np.mean(system_scores)
    mean_human_score = np.mean(human_scores)

    system_score_sd = np.std(system_scores, ddof=1)
    human_score_sd = np.std(human_scores, ddof=1)

    # compute standardized mean difference as recommended
    # by Williamson et al (2012)
    numerator = mean_system_score - mean_human_score
    denominator = np.sqrt((system_score_sd**2 + human_score_sd**2)/2)
    SMD = numerator/denominator

    # return everything as a series
    return pd.Series({'kappa': unweighted_kappa,
                      'wtkappa': quadratic_weighted_kappa,
                      'exact_agr': human_system_agreement,
                      'adj_agr': human_system_adjacent_agreement,
                      'SMD': SMD,
                      'corr': correlations,
                      'sys_min': min_system_score,
                      'sys_max': max_system_score,
                      'sys_mean': mean_system_score,
                      'sys_sd': system_score_sd,
                      'h_min': min_human_score,
                      'h_max': max_human_score,
                      'h_mean': mean_human_score,
                      'h_sd': human_score_sd,
                      'N': len(system_scores)})
Example #2
def stats(list1, list2):
    print "Predictions:"
    print list1
    print list(reversed(list2)) #COMPARABLE ORDER
    print

    list1fl=[class2float(i) for i in list1]
    list2fl=[class2float(i) for i in list(reversed(list2))]

    print list1fl
    print list2fl

    print
    print kappa(list1fl,list2fl) #http://skll.readthedocs.org/en/latest/_modules/skll/metrics.html
    print
    print list2
Example #3
def train_model(train, folds):
    y = train.median_relevance.values
    x = train.drop(["median_relevance", "doc_id"], 1).values

    clf = Pipeline([
        ('scl', StandardScaler(copy=True, with_mean=True, with_std=True)),
        ('svm', SVC(C=10.0, kernel='rbf', degree=3, gamma=0.0, coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=200, class_weight=None, verbose=False, max_iter=-1, random_state=None))
        ])


    scores = []
    for train_index, test_index in cross_validation.StratifiedKFold(
            y=y,
            n_folds=int(folds),
            shuffle=True,
            random_state=42):

        x_train, x_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]

        clf.fit(x_train, y_train)
        predicted = transform_regression(clf.predict(x_test))

        s = kappa(y_test, predicted, weights="quadratic")
        print s
        scores.append(s)

    warn("cv scores:")
    warn(scores)
    warn(np.mean(scores))
    warn(np.std(scores))

    clf.fit(x, y)

    return clf
Example #4
def agreementtest(path1, path2):
    #1. import the labels
    from utils import loadLabels
    label_human = loadLabels(path1,0,2)
    label_machine = loadLabels(path2,0,2)
    #2. transfer them into the list
    y = []
    y_pred = []

    for key in label_human:
        y += [label_human[key]]
        y_pred += [label_machine[key]]
    print len(y),len(y_pred)
    #3. get the raw agreement
    from pandas import DataFrame
    from pandas import crosstab
    result = DataFrame({'y_pred' : y_pred,
                        'y_human' : y})
    crosstable = crosstab(result['y_pred'], result['y_human'])

    print crosstable

    acc = float(crosstable['1']['1']+crosstable['0']['0'])/len(y_pred)
    prec = float(crosstable['1']['1'])/(crosstable['1']['1']+crosstable['0']['1'])
    recall = float(crosstable['1']['1'])/(crosstable['1']['1']+crosstable['1']['0'])
    F1_hand = 2 * prec * recall/( prec + recall)

    #4. use the skll to get the kappa
    from skll import metrics
    kappa = metrics.kappa(y,y_pred)

    return crosstable,acc,recall,prec,F1_hand,kappa
Example #5
def testing(file):
	"""
	To test whether the quadratic weighted kappa function is working properly.
	"""
	f = open(file, 'r')
	f.readline()

	labels, estimate = [], []
	for row in f:
		label = row.strip().split("\t")[6]
		if random() > 0.5:
			estimate.append(int(4*int(label)*random()))
		else:
			estimate.append(int(int(label)*random()))
		labels.append(int(label))

	print kappa(labels, estimate, weights='quadratic')
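A quick sanity check of how quadratic weighting behaves, as a sketch with toy labels rather than the tab-separated file used above; it assumes only skll.metrics.kappa:

from skll.metrics import kappa

gold = [0, 1, 2, 3, 4]
print(kappa(gold, gold, weights='quadratic'))              # perfect agreement -> 1.0
print(kappa(gold, [0, 1, 2, 3, 3], weights='quadratic'))   # one off-by-one error: mild penalty
print(kappa(gold, [4, 3, 2, 1, 0], weights='quadratic'))   # reversed scores: heavy penalty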
Example #6
 def runSVM(self, y_test, y_train, x_train, x_test):
     clf = svm.LinearSVC(class_weight="auto")
     clf.fit(x_train, y_train)
     direction = clf.coef_.tolist()[0]
     y_pred = clf.predict(x_test)
     y_pred = y_pred.tolist()
     kappa_score = kappa(y_test, y_pred)
     return kappa_score,  direction
Example #7
    def print_kappa(self, method, one_off=False):
        mean_kappa_same = []
        mean_kappa_diff = []

        for i in range(0,50):

            checked_pairs = []
            checked_pairs_same = []
            checked_pairs_diff = []
            kappas_same = []
            kappas_diff = []

            # calculating agreement for pairs from the same batches and different batches
            while len(checked_pairs_same) < 20 or len(checked_pairs_diff) < 20:
                id1 = random.choice(self.ids)
                id2 = random.choice(self.ids)
                pair = sorted([id1, id2])
                if pair not in checked_pairs and id1 != id2:
                    values_first = self.get_rating_values(id1)
                    values_second = self.get_rating_values(id2)
                    if len(values_first) != len(values_second) or len(values_first) == 0:
                        continue

                    if method == 'standard':
                        kappa = metrics.kappa(values_first, values_second)
                    else:
                        kappa = metrics.kappa(values_first, values_second, method, one_off)

                    if self.batch_hash[id1] == self.batch_hash[id2]:
                        kappas_same.append(kappa)
                        checked_pairs_same.append(pair)
                    else:
                        kappas_diff.append(kappa)
                        checked_pairs_diff.append(pair)

                    checked_pairs.append(pair)

            mean_kappa_same.append(numpy.mean(kappas_same))
            mean_kappa_diff.append(numpy.mean(kappas_diff))

        print("Kappa same group: " + str(numpy.mean(mean_kappa_same)) + " different groups: " + str(numpy.mean(mean_kappa_diff)))
        print("Confidence same: " + str(stats.norm.interval(0.999, loc=numpy.mean(mean_kappa_same), scale=numpy.std(mean_kappa_same)/math.sqrt(50))) + " different: " + str(stats.norm.interval(0.999, loc=numpy.mean(mean_kappa_diff), scale=numpy.std(mean_kappa_diff)/math.sqrt(50))))
Example #8
 def eval(self):
     sys.stderr.write('Evaluating\n')
     folds = StratifiedKFold(y=self.y_train, n_folds=self.folds, shuffle=True, random_state=1337)
     scores = []
     for train_index, test_index in folds:
         self.fit(train_index)
         predicted, y_test = self.predict(test_index)
         k = kappa(y_test, transform(predicted), weights='quadratic')
         print(k)
         scores.append(k)
     print(scores)
     print(np.mean(scores))
     print(np.std(scores))
Example #9
def evalerror(preds, dtrain):
    labels = dtrain.get_label()

    # TODO: delete
    # print 'evalerror'
    # print max(preds)
    preds = np.round(preds, 0)
    # print max(preds)
    # print len(preds), preds

    # return a pair (metric_name, result)
    # since preds are raw margins (before the logistic transformation, cutoff at 0)
    return 'kappa', 1.0 - kappa(labels, preds, weights='quadratic')
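xgboost calls a custom feval with the raw margin predictions and the evaluation DMatrix, and for early stopping it treats the returned value as lower-is-better unless maximize=True is passed, which is presumably why the function returns 1.0 - kappa. A minimal sketch of wiring this metric into training on random toy data (assuming an xgboost version that still accepts feval and the reg:linear objective; Example #15 below shows the project's actual usage):

import numpy as np
import xgboost as xgb
from skll.metrics import kappa  # needed by evalerror above

x = np.random.rand(200, 5)
y = np.random.randint(0, 4, size=200).astype(float)

dtrain = xgb.DMatrix(x, label=y)
bst = xgb.train({"objective": "reg:linear", "eta": 0.1},
                dtrain,
                num_boost_round=50,
                evals=[(dtrain, "train")],
                feval=evalerror)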
Example #10
def kNNClass(train_idx,test_idx,n_neighbors):
	training_data=input_kmers_counts.loc[train_idx]
	testing_data=input_kmers_counts.loc[test_idx]
	clf = neighbors.KNeighborsClassifier(n_neighbors, weights="uniform")
	clf.fit(training_data[kmer_colums], training_data["class"])
	#print "predicting"
	predicted_classes= clf.predict(testing_data[kmer_colums])
	# compute kappa stat 
	confusion_matrix(testing_data["class"],predicted_classes)
	# make a mapping 
	class_map=dict(zip(set(testing_data["class"]),range(0,4)))
	kapp=kappa([class_map[x] for x in testing_data["class"]],[class_map[x] for x in predicted_classes])
	cm=caret.confusionMatrix(robjects.FactorVector(predicted_classes),robjects.FactorVector(testing_data["class"]))
	return kapp,cm
Example #11
def get_average_kappa(arr_act, arr_pred):
	"""
	Calculates the average quadratic kappa
	over the entire essay set
	"""

	assert(len(arr_act) == len(arr_pred))
	total = len(arr_act)
	kappa_val = 0

	for i in xrange(0, total):
		kappa_val += met.kappa([arr_act[i]], [arr_pred[i]], \
					'quadratic')
#		print arr_act[i], '-', arr_pred[i]

	kappa_val  = float(kappa_val) / float(total)

	return kappa_val
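For reference, the competition-style quadratic weighted kappa is usually computed once over the whole essay set rather than averaged over per-essay calls on single-element lists; a short sketch of that variant on toy data, assuming the same skll.metrics import aliased as met:

from skll import metrics as met
print(met.kappa([1, 2, 3, 4, 2], [1, 3, 3, 4, 2], weights='quadratic'))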
Example #12
def kNNClass(train_idx,test_idx,n_neighbors,k_mer_subset):
	logger.info('computing for %s'%(k_mer_subset))
	train_idx=train_idx
	test_idx=test_idx
	training_subset=normalized_counts.loc[train_idx][np.append(k_mer_subset,"class")]
	testing_subset=normalized_counts.loc[test_idx][np.append(k_mer_subset,"class")]
	clf = neighbors.KNeighborsClassifier(n_neighbors, weights="uniform")
	clf.fit(training_subset[k_mer_subset], training_subset["class"])
	#print "predicting"
	predicted_classes = clf.predict(testing_subset[k_mer_subset])
	# compute kappa stat
	confusion_matrix(testing_subset["class"], predicted_classes)
	# make a mapping from class labels to integers for kappa
	class_map = dict(zip(set(testing_subset["class"]), range(0, 4)))
	kapp = kappa([class_map[x] for x in testing_subset["class"]], [class_map[x] for x in predicted_classes])
	cm = caret.confusionMatrix(robjects.FactorVector(predicted_classes), robjects.FactorVector(testing_subset["class"]))
	logger.info("Finished for %s with kappa==%f"%(k_mer_subset,kapp))
	return kapp,cm
Example #13
def accuracy_stats(Ypred, Ytest):
    
    stats = {}
    
    statkeys = ['AA', 'AP', 'f1', 'recall', 'kappa']
    for key in statkeys:
        stats[key] = []
   

    for ypred, ytest in zip(Ypred, Ytest):
        
        stats['AA'].append(accuracy_score(ytest.ravel(), ypred.ravel()))
        stats['AP'].append(precision_score(ytest.ravel(), ypred.ravel()))
        stats['f1'].append(f1_score(ytest.ravel(), ypred.ravel()))
        stats['recall'].append(recall_score(ytest.ravel(), ypred.ravel()))
        stats['kappa'].append(kappa(ytest.ravel(), ypred.ravel()))
        
    return stats
Example #14
def scores(X,y,y_proba,name="nan",to_plot=False):
#    print(name+' Classifier:\n {}'.format(metrics.classification_report(X,y)))
    cm= metrics.confusion_matrix(X,y)
    print cm
    if(to_plot):
        plt_cm(X,y,[-1,1])
        auc_compute(X,y)
    auc=roc_auc_score(X,y_proba)
    print(name+' Classifier auc:  %f' % auc)
    accuracy=metrics.accuracy_score(X,y)
    print(name+' Classifier accuracy:  %f' % (accuracy))
    f1=metrics.f1_score(X,y,pos_label=1)
    print(name+' Classifier f1: %f' % (f1))
    precision=metrics.precision_score(X,y)
    print(name+' Classifier precision_score: %f' % (precision))
    recall=metrics.recall_score(X,y)
    print(name+' Classifier recall_score: %f' % (recall))
    kappa_score=kappa(X,y)
    
    print(name+' Classifier kappa_score:%f' % (kappa_score))
    return [auc,f1.mean(),accuracy.mean(),precision.mean(),recall.mean(),kappa_score]
Example #15
def train_model(train, folds):
    y = train.median_relevance.values
    x = train.drop(["median_relevance", "doc_id"], 1).values

    xg_params = {
        "silent": 1,
        "objective": "reg:linear",
        "nthread": 4,
        "bst:max_depth": 10,
        "bst:eta": 0.1,
        "bst:subsample": 0.5
    }
    num_round = 600

    scores = []
    for train_index, test_index in cross_validation.StratifiedKFold(
            y=y,
            n_folds=int(folds),
            shuffle=True,
            random_state=42):

        x_train, x_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]

        xg_train = xg.DMatrix(x_train, label=y_train)
        xg_test  = xg.DMatrix(x_test,  label=y_test)

        watchlist = [(xg_train, "train"), (xg_test, "test")]
        bst = xg.train(xg_params, xg_train, num_round, watchlist, feval=evalerror)

        predicted = transform_regression(bst.predict(xg_test))

        s = kappa(y_test, predicted, weights="quadratic")
        print s
        scores.append(s)

    warn("cv scores:")
    warn(scores)
    warn(np.mean(scores))
    warn(np.std(scores))
Example #16
    def run(self):
        """
        run Forrest
        """
        
        for loopcount in range(self.ntasks):
            seed = time.time()
            resultsline = []
            # do stuff
            training = random.sample(range(self.matrix.shape[0]), int(self.trainingratio*self.matrix.shape[0]))
            testing = list(set(range(self.matrix.shape[0])).difference(training))
            
            print  self.matrix[training,:].shape, len([targ[i] for i in training]), self.matrix[testing,:].shape
            
            clf = neighbors.KNeighborsClassifier(n_neighbors, weights="distance")
            clf.fit(self.matrix[training,:].todense(), [targ[i] for i in training])
            #~ clf.fit(self.matrix[training,:], [targ[i] for i in training])
            classes=clf.predict(self.matrix[testing,:].todense())
            #~ classes=clf.predict(self.matrix[testing,:])
            
#            print(confusion_matrix(classes,[targ[i] for i in testing]))
            
#            print(kappa(classes,[targ[i] for i in testing]))

            resultsline = []
            resultsline = resultsline+info_log
            resultsline.append(seed)
            resultsline.append(kappa(classes,[targ[i] for i in testing]))
            conf = []
            for row in confusion_matrix(classes,[targ[i] for i in testing]):
                conf = conf+list(row)
            resultsline = resultsline+conf
            
            self.result_queue.put([str(item) for item in resultsline])
            
        # store the results in the results queue once all the contigs are processed
        # please run please run please run please run please run please run 
#        
        sys.stdout.write("Done with worker for %d tasks: %d loops done\n" % (self.ntasks, loopcount+1))
Example #17
        line = line.split()
        if str(g) in line[1]:
            batches[int(line[0])].append(line)

print "### gamma=%d ###"%g
moy = 0

for classifier in batches:
    classes = []
    predicted = []
    for contig in classifier:
#        print contig[1], contig[3], contig[4]
        classes.append(int(contig[3]))
        predicted.append(int(contig[4]))

    kappas.append(kappa(classes,predicted))

print "tous classifieurs:"
print "kappa moyen = %f" % np.mean(kappas)
print "écart-type = %f" % np.std(kappas)

best_batchid = kappas.index(max(kappas))
print "meilleur kappa = %f" % kappas[best_batchid]

print "loading best classifier..."
with open(clffile, "r") as clffh:
    bestclf = pickle.load(clffh)[best_batchid]
    
# Re-run the prediction with the best classifier

print "loading data..."
Example #18
    def test(self, samples, test_labels,label_names=None):
        # test each using held-out data
        test = samples

        # if test_labels is None:
        #     return self.predict(test_samples)

        label_test = test_labels
        print("\nTesting...")
        print "Test Samples:", len(test)

        classes = []
        p_count = 0

        avg_class_err = []
        avg_err = self.test_network(test, label_test)

        predictions = self.predict_network(test)

        for i in range(0, len(label_test)):
            p_count += 1
            classes.append(label_test[i].tolist())


        predictions = np.round(predictions, 3).tolist()

        actual = []
        pred = []
        cor = []

        # get the percent correct for the predictions
        # how often the prediction is right when it is made
        for i in range(0, len(predictions)):
            c = classes[i].index(max(classes[i]))
            actual.append(c)

            p = predictions[i].index(max(predictions[i]))
            pred.append(p)
            cor.append(int(c == p))

        # calculate a naive unfair baseline using averages
        avg_class_pred = np.mean(label_test, 0)

        print "Predicting:", avg_class_pred, "for baseline*"
        for i in range(0, len(label_test)):
            res = FFNNet.AverageCrossEntropy(np.array(avg_class_pred), np.array(classes[i]))
            avg_class_err.append(res)
            # res = RNN_GRU.AverageCrossEntropy(np.array(predictions_GRU[i]), np.array(classes[i]))
            # avg_err_GRU.append(res)
        print "*This is calculated from the TEST labels"

        from sklearn.metrics import roc_auc_score, f1_score
        from skll.metrics import kappa

        kpa = []
        auc = []
        f1s = []
        t_pred = du.transpose(predictions)
        t_lab = du.transpose(label_test)

        for i in range(0, len(t_lab)):
            # if i == 0 or i == 3:
            #    t_pred[i] = du.normalize(t_pred[i],method='max')
            kpa.append(kappa(t_lab[i], t_pred[i]))
            auc.append(roc_auc_score(t_lab[i], t_pred[i]))
            temp_p = [round(j) for j in t_pred[i]]
            if np.nanmax(temp_p) == 0:
                f1s.append(0)
            else:
                f1s.append(f1_score(t_lab[i], temp_p))

        print "\nBaseline Average Cross-Entropy:", "{0:.4f}".format(np.nanmean(avg_class_err))
        print "\nNetwork Performance:"
        print "Average Cross-Entropy:", "{0:.4f}".format(np.nanmean(avg_err))
        print "AUC:", "{0:.4f}".format(np.nanmean(auc))
        print "Kappa:", "{0:.4f}".format(np.nanmean(kpa))
        print "F1 Score:", "{0:.4f}".format(np.nanmean(f1s))
        print "Percent Correct:", "{0:.2f}%".format(np.nanmean(cor) * 100)

        print "\n{:<15}".format("  Label"), \
            "{:<9}".format("  AUC"), \
            "{:<9}".format("  Kappa"), \
            "{:<9}".format("  F Stat"), \
            "\n=============================================="

        if label_names is None or len(label_names) != len(t_lab):
            label_names = []
            for i in range(0, len(t_lab)):
                label_names.append("Label " + str(i + 1))

        for i in range(0, len(t_lab)):
            print "{:<15}".format(label_names[i]), \
                "{:<9}".format("  {0:.4f}".format(auc[i])), \
                "{:<9}".format("  {0:.4f}".format(kpa[i])), \
                "{:<9}".format("  {0:.4f}".format(f1s[i]))
        print "\n=============================================="
        actual = []
        predicted = []
        for i in range(0, len(predictions)):
            actual.append(label_test[i].tolist().index(max(label_test[i])))
            predicted.append(predictions[i].index(max(predictions[i])))

        from sklearn.metrics import confusion_matrix
        print confusion_matrix(actual, predicted)

        return predictions
Example #19
input_kmers_counts["class"]=all_classes
input_kmers_counts["species"]=all_species
normalized_counts["class"]=all_classes
normalized_counts["species"]=all_species

# PCA 
normalized_counts[kmer_colums]=scale(normalized_counts[kmer_colums])
normalized_counts[kmer_colums].apply(scipy.mean,0)
normalized_counts[kmer_colums].apply(scipy.std,0)
non_zero=(normalized_counts[kmer_colums]!=0).apply(scipy.sum,0)
too_abundant_kmers=list(non_zero.order()[-10:].index)
kmer_colums_filt=list(set(kmer_colums).difference(too_abundant_kmers))


pca_trans=PCA(n_components=160)
pca_fitted=pca_trans.fit(normalized_counts[kmer_colums])
pca_coord=pca_fitted.transform(normalized_counts[kmer_colums])

# SVM classification and kappa estimates 

X_train,X_test,Y_train,Y_test=train_test_split(pca_coord,normalized_counts["class"],test_size=0.5,random_state=421)
clf=SVC(C=4.152687927300392,gamma=0.002448996894369464,kernel='rbf')
clf.fit(X_train,Y_train)
predictions=clf.predict(X_test)
mmat=confusion_matrix(predictions,Y_test)
print mmat
class_map=dict(zip(set(input_kmers_counts["class"]),range(0,4)))
kappa([class_map[x] for x in Y_test],[class_map[x] for x in predictions])

Example #20
def test_invalid_weighted_kappa():
    kappa([1, 2, 1], [1, 2, 1], weights='invalid', allow_off_by_one=False)
    kappa([1, 2, 1], [1, 2, 1], weights='invalid', allow_off_by_one=True)
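As pasted, these calls simply invoke kappa with an unrecognized weights value; in the original test suite the resulting exception is presumably expected (e.g. via a nose @raises decorator that was stripped here). A sketch of one way to make that expectation explicit, assuming an invalid weights string raises ValueError:

from nose.tools import assert_raises
from skll.metrics import kappa

def test_invalid_weighted_kappa_raises():
    # an unrecognized weights value should be rejected (assumed: ValueError)
    assert_raises(ValueError, kappa, [1, 2, 1], [1, 2, 1], weights='invalid')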
Example #21
def test_invalid_lists_kappa():
    kappa(['a', 'b', 'c'], ['a', 'b', 'c'])
Example #22
def quadratic_kappa(true, predicted):
    if params.REGRESSION:
        return kappa(true, predicted, weights='quadratic')
    else:
        return kappa(true, np.argmax(predicted, axis=1), weights='quadratic')
Example #23
def check_kappa(y_true, y_pred, weights, allow_off_by_one, expected):
    assert_almost_equal(kappa(y_true, y_pred, weights=weights,
                              allow_off_by_one=allow_off_by_one), expected)
Example #24
def test_invalid_weighted_kappa():
    kappa([1, 2, 1], [1, 2, 1], weights='invalid', allow_off_by_one=False)
    kappa([1, 2, 1], [1, 2, 1], weights='invalid', allow_off_by_one=True)
Example #25
        predictions = np.round(predictions)
        print min(predictions)
        print max(predictions)


        # Get error and best iteration
        # TODO: delete
        # print 'predictions'
        # print max(predictions)
        # predictions = np.round(predictions, 0)
        # print max(predictions)
        # print min(predictions)
        # print len(predictions), predictions

        kappa_score = kappa(test_Y, predictions, weights='quadratic')
        print "Kappa: %f" % kappa_score
        print "Confusion matrix:"
        print confusion_matrix(test_Y, predictions)
        print "Classification report:"
        print classification_report(test_Y, predictions)

        errors.append(kappa_score)
        best_iterations.append(bst.best_iteration)

    # Append new grid error
    grid_errors.append(np.mean(errors))
    grid_best_iterations.append(list(best_iterations))

# Show results
for i in xrange(len(params_space)):
Example #26
File: models.py Project: Mosar/EEG
    #         train_test_split(x_all, y_all, train_size=0.8)
    #
    #     # Train the models
    #     for m in models.keys():
    #         models[m]['cl'].fit(x_train, y_train)
    #
    #     # Predict
    #     for t in models.keys():
    #         models[t]['pred'] = models[t]['cl'].predict(x_test)
    #
    #     tmp = metrics.kappa(y_test, models['rf']['pred'])
    #     print(tmp)
    #     kappa.append(tmp)
    # print(sum(kappa) / len(kappa))

    tmp = metrics.kappa(y_test, models['rf']['pred'])
    print(tmp)

    # quit()

    # Visualize
    models_num = len(models)

    fig, axes = plt.subplots(nrows=2, ncols=models_num, squeeze=False)
    if True:
        # Build the confusion matrices
        for (name, cm), ax in zip([(x['title'], x['cm'])
                                   for x in models.values()],
                                  axes.flat[:models_num]):
            m = ax.matshow(cm, cmap='Oranges')
            ax.set_title(name)
Example #27
def quadratic_kappa(true, predicted):
    return kappa(true, predicted, weights='quadratic')
Example #28
training_ratio = 0.8
training_set = indices[0:int(n_rows * training_ratio)]
testing_set = indices[-int(n_rows * training_ratio):]

training_data = input_kmers_counts.loc[training_set]
testing_data = input_kmers_counts.loc[testing_set]

clf = neighbors.KNeighborsClassifier(15, weights="uniform")
clf.fit(training_data[count_colums], training_data["class"])
#print "predicting"
predicted_classes = clf.predict(testing_data[count_colums])
# compute kappa stat
confusion_matrix(testing_data["class"], predicted_classes)
# make a mapping
class_map = dict(zip(set(testing_data["class"]), range(0, 4)))
kappa([class_map[x] for x in testing_data["class"]],
      [class_map[x] for x in predicted_classes])

# fit a KNN on the normalized_counts
# kNNClass(training_set,testing_set,15,count_colums)

# We focus on the ambiguous k-mers, approx. 15k; basically all k-mers that appear more than once

ambiguous_kmers = all_nodes_df[all_nodes_df["degree"] > 2]
len(set(ambiguous_kmers['kmer']))
len(set(all_nodes_df['kmer']))

# We do a PCA on that
amb_kmers_counts = pandas.pivot_table(ambiguous_kmers,
                                      values="degree",
                                      index=['sequence_description'],
                                      columns=["kmer"],
Example #29
def quadratic_kappa(true, predicted):
    return kappa(true, predicted, weights='quadratic')
Example #30
X = iris.data[:, :2]  # we only take the first two features. We could
                      # avoid this ugly slicing by using a two-dim dataset
y = iris.target

training = random.sample(range(150), 100)
testing = list(set(range(150)).difference(training))

X_train = iris.data[training,:]
y_train = [y[i] for i in training]
X_test = iris.data[testing,:]
y_test = [y[i] for i in testing]

clf3 = neighbors.KNeighborsClassifier(n_neighbors, weights=weights)
clf3.fit(X_train, y_train)

print "kappa =", kappa(y_test,clf3.predict(X_test))

metric = LMNN(X_train, y_train)
metric.fit()
new_X_train = metric.transform()
new_X_test = metric.transform(X_test)
clf4 = neighbors.KNeighborsClassifier(n_neighbors, weights=weights)
clf4.fit(new_X_train, y_train)

print "kappa =", kappa(y_test,clf4.predict(new_X_test))


#
## Create color maps
#cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
#cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])
Example #31
    def test(self, test, test_labels=None, label_names=None):
        if test_labels is None:
            return self.predict(test)
        test_cpy = list(test)
        if not du.len_deepest(test_cpy) == self.num_input:
            if self.covariates is not None:
                for a in range(0, len(test_cpy)):
                    if type(test_cpy[a]) is not list:
                        test_cpy[a] = test_cpy[a].tolist()
                    for e in range(0, len(test_cpy[a])):
                        c = []
                        for i in range(0, len(self.covariates)):
                            c.append(test_cpy[a][e][self.covariates[i]])
                        test_cpy[a][e] = c

        if len(self.cov_mean) == 0 or len(self.cov_stdev) == 0:
            print "Scaling factors have not been generated: calculating using test sample"
            t_tr = du.transpose(RNN.flatten_sequence(test_cpy))
            self.cov_mean = []
            self.cov_stdev = []

            for a in range(0, len(t_tr)):
                mn = np.nanmean(t_tr[a])
                sd = np.nanstd(t_tr[a])
                self.cov_mean.append(mn)
                self.cov_stdev.append(sd)

        test_samples = []

        import math
        for a in range(0, len(test_cpy)):
            sample = []
            for e in range(0, len(test_cpy[a])):
                covariates = []
                for i in range(0, len(test_cpy[a][e])):
                    cov = 0
                    if self.cov_stdev[i] == 0:
                        cov = 0
                    else:
                        cov = (test_cpy[a][e][i] -
                               self.cov_mean[i]) / self.cov_stdev[i]

                    if math.isnan(cov) or math.isinf(cov):
                        cov = 0

                    covariates.append(cov)
                sample.append(covariates)
            test_samples.append(sample)

        label_test = test_labels
        print("\nTesting...")
        print "Test Samples:", len(test_samples)

        classes = []
        p_count = 0

        avg_class_err = []
        avg_err_RNN = []

        if self.scale_output:
            print "Scaling output..."

        predictions_RNN = []
        for i in range(0, len(test_samples)):
            # get the prediction and calculate cost
            prediction_RNN = self.pred_RNN([test_samples[i]])
            #prediction_RNN += .5-self.avg_preds
            if self.scale_output:
                prediction_RNN -= self.min_preds
                prediction_RNN /= (self.max_preds - self.min_preds)
                prediction_RNN = np.clip(prediction_RNN, 0, 1)
                prediction_RNN = [(x * [
                    1 if c == self.majorityclass else 0.9999
                    for c in range(0, self.num_output)
                ]) if np.sum(x) == 4 else x for x in prediction_RNN]
            avg_err_RNN.append(
                self.compute_cost_RNN([test_samples[i]], label_test[i]))

            for j in range(0, len(label_test[i])):
                p_count += 1

                classes.append(label_test[i][j].tolist())
                predictions_RNN.append(prediction_RNN[j].tolist())

        predictions_RNN = np.round(predictions_RNN, 3).tolist()

        actual = []
        pred_RNN = []
        cor_RNN = []

        # get the percent correct for the predictions
        # how often the prediction is right when it is made
        for i in range(0, len(predictions_RNN)):
            c = classes[i].index(max(classes[i]))
            actual.append(c)

            p_RNN = predictions_RNN[i].index(max(predictions_RNN[i]))
            pred_RNN.append(p_RNN)
            cor_RNN.append(int(c == p_RNN))

        # calculate a naive baseline using averages
        flattened_label = []
        for i in range(0, len(label_test)):
            for j in range(0, len(label_test[i])):
                flattened_label.append(label_test[i][j])
        flattened_label = np.array(flattened_label)
        avg_class_pred = np.mean(flattened_label, 0)

        print "Predicting:", avg_class_pred, "for baseline*"
        for i in range(0, len(flattened_label)):
            res = RNN.AverageCrossEntropy(np.array(avg_class_pred),
                                          np.array(classes[i]))
            avg_class_err.append(res)
            # res = RNN.AverageCrossEntropy(np.array(predictions_RNN[i]), np.array(classes[i]))
            # avg_err_RNN.append(res)
        print "*This is calculated from the TEST labels"

        from sklearn.metrics import roc_auc_score, f1_score
        from skll.metrics import kappa

        kpa = []
        auc = []
        f1s = []
        apr = []
        t_pred = du.transpose(predictions_RNN)
        t_lab = du.transpose(flattened_label)

        for i in range(0, len(t_lab)):
            #if i == 0 or i == 3:
            #    t_pred[i] = du.normalize(t_pred[i],method='max')
            temp_p = [round(j) for j in t_pred[i]]

            kpa.append(kappa(t_lab[i], t_pred[i]))
            apr.append(du.Aprime(t_lab[i], t_pred[i]))
            auc.append(roc_auc_score(t_lab[i], t_pred[i]))

            if np.nanmax(temp_p) == 0:
                f1s.append(0)
            else:
                f1s.append(f1_score(t_lab[i], temp_p))

        if label_names is None or len(label_names) != len(t_lab):
            label_names = []
            for i in range(0, len(t_lab)):
                label_names.append("Label " + str(i + 1))

        RNN.print_label_distribution(label_test, label_names)

        self.eval_metrics = [
            np.nanmean(avg_err_RNN),
            np.nanmean(auc),
            np.nanmean(kpa),
            np.nanmean(f1s),
            np.nanmean(cor_RNN) * 100
        ]

        print "\nBaseline Average Cross-Entropy:", "{0:.4f}".format(
            np.nanmean(avg_class_err))
        print "\nNetwork Performance:"
        print "Average Cross-Entropy:", "{0:.4f}".format(
            np.nanmean(avg_err_RNN))
        print "AUC:", "{0:.4f}".format(np.nanmean(auc))
        print "A':", "{0:.4f}".format(np.nanmean(apr))
        print "Kappa:", "{0:.4f}".format(np.nanmean(kpa))
        print "F1 Score:", "{0:.4f}".format(np.nanmean(f1s))
        print "Percent Correct:", "{0:.2f}%".format(np.nanmean(cor_RNN) * 100)

        print "\n{:<15}".format("  Label"), \
            "{:<9}".format("  AUC"), \
            "{:<9}".format("  A'"), \
            "{:<9}".format("  Kappa"), \
            "{:<9}".format("  F Stat"), \
            "\n=============================================="

        for i in range(0, len(t_lab)):
            print "{:<15}".format(label_names[i]), \
                "{:<9}".format("  {0:.4f}".format(auc[i])), \
                "{:<9}".format("  {0:.4f}".format(apr[i])), \
                "{:<9}".format("  {0:.4f}".format(kpa[i])), \
                "{:<9}".format("  {0:.4f}".format(f1s[i]))
        print "\n=============================================="

        print "Confusion Matrix:"
        actual = []
        predicted = []
        flattened_label = flattened_label.tolist()
        for i in range(0, len(predictions_RNN)):
            actual.append(flattened_label[i].index(max(flattened_label[i])))
            predicted.append(predictions_RNN[i].index(max(predictions_RNN[i])))

        from sklearn.metrics import confusion_matrix
        conf_mat = confusion_matrix(actual, predicted)
        for cm in conf_mat:
            cm_row = "\t"
            for element in cm:
                cm_row += "{:<6}".format(element)
            print cm_row
        print "\n=============================================="

        return predictions_RNN
Example #32
def check_kappa(y_true, y_pred, weights, allow_off_by_one, expected):
    assert_almost_equal(
        kappa(y_true,
              y_pred,
              weights=weights,
              allow_off_by_one=allow_off_by_one), expected)
Example #33
training_set=indices[0:int(n_rows*training_ratio)]
testing_set=indices[-int(n_rows*training_ratio):]

training_data=input_kmers_counts.loc[training_set]
testing_data=input_kmers_counts.loc[testing_set]


clf = neighbors.KNeighborsClassifier(15, weights="uniform")
clf.fit(training_data[count_colums], training_data["class"])
#print "predicting"
predicted_classes= clf.predict(testing_data[count_colums])
# compute kappa stat 
confusion_matrix(testing_data["class"],predicted_classes)
# make a mapping 
class_map=dict(zip(set(testing_data["class"]),range(0,4)))
kappa([class_map[x] for x in testing_data["class"]],[class_map[x] for x in predicted_classes])


# fit a KNN on the normalized_counts
# kNNClass(training_set,testing_set,15,count_colums)


# We focus on the ambiguous k-mers, approx. 15k; basically all k-mers that appear more than once

ambiguous_kmers=all_nodes_df[all_nodes_df["degree"]>2]
len(set(ambiguous_kmers['kmer']))
len(set(all_nodes_df['kmer']))

# We do a PCA on that 
amb_kmers_counts=pandas.pivot_table(ambiguous_kmers,values="degree",index=['sequence_description'],columns=["kmer"],fill_value=0)
kmer_colums=amb_kmers_counts.columns
Example #34
def test_invalid_lists_kappa():
    kappa(['a', 'b', 'c'], ['a', 'b', 'c'])
Example #35
    documents, classes, train_size=0.7)

classifier = NaiveBayesTextClassifier(
    categories=categories,
    min_df=1,
    lowercase=True,
    stop_words=stopwords.words('english')
)

print('> Train classifier')
classifier.train(train_docs, train_classes)

print('> Classify test data...')
predicted_classes = classifier.classify(test_docs)

print('> Complete.')
print(classification_report(test_classes, predicted_classes))

print('-' * 42)
print("{:<25}: {:>4} articles".format("Test data size", len(test_classes)))
print("{:<25}: {:>6.2f} %".format(
    "Accuracy", 100 * accuracy_score(test_classes, predicted_classes))
)
print("{:<25}: {:>6.2f} %".format(
    "Kappa statistics", 100 * kappa(
        category_to_number(test_classes, categories),
        category_to_number(predicted_classes, categories)
    )
))
print('-' * 42)
Example #36
# -------------- Classify --------------- #

print("> Start classify data")
start_time = time.time()

if options.test:
    predicted_classes = classifier.classify(test_docs)

    print(classification_report(test_classes, predicted_classes))
    print('-' * 42)
    print("{:<25}: {:>6} articles".format("Test data size", len(test_classes)))
    print("{:<25}: {:>6.2f} %".format(
        "Accuracy", 100 * accuracy_score(test_classes, predicted_classes)))
    print("{:<25}: {:>6.2f} %".format(
        "Kappa statistics", 100 * kappa(test_classes, predicted_classes)))

elif options.predict:
    predicted_classes = classifier.classify(test_data.review)

    print("> Save predicted results")
    print("> {}".format(PREDICTED_DATA_FILE))
    np.savetxt(PREDICTED_DATA_FILE,
               np.concatenate(
                   (test_data.values[:, 0:1], np.matrix(predicted_classes).T),
                   axis=1),
               delimiter=',',
               header='id,sentiment',
               comments='',
               fmt="%s")
    print('-' * 42)
Example #37
#clf.fit(training_matrix, training_targets)
#print "predicting"
#classes=clf.predict(testing_matrix)
#
#print(confusion_matrix(classes,testing_targets))
            #
#print(kappa(classes,testing_targets))

#print ("### avec norm")

for i in range(len(length)):
    matrix[i,:]=matrix[i,:]/length[i]

training_matrix = matrix[training,:]
training_targets = [targ[i] for i in training]

testing_matrix = matrix[testing,:]
testing_targets = [targ[i] for i in testing]


#print "fitting"

clf = neighbors.KNeighborsClassifier(n_neighbors, weights="distance")
clf.fit(training_matrix, training_targets)
#print "predicting"
classes=clf.predict(testing_matrix)

print(confusion_matrix(classes,testing_targets))
            
print(kappa(classes,testing_targets))
Example #38
#clf = neighbors.KNeighborsClassifier(n_neighbors, weights="distance")
#clf.fit(training_matrix, training_targets)
#print "predicting"
#classes=clf.predict(testing_matrix)
#
#print(confusion_matrix(classes,testing_targets))
            #
#print(kappa(classes,testing_targets))

#print ("### avec norm")

for i in range(len(length)):
    matrix[i,:]=matrix[i,:]/length[i]

training_matrix = matrix[training,:]
training_targets = [targ[i] for i in training]

testing_matrix = matrix[testing,:]
testing_targets = [targ[i] for i in testing]


#print "fitting"
rbf_svc = svm.SVC(kernel='rbf', gamma=0.7, C=1.0).fit(training_matrix, training_targets)

#print "predicting"
classes=rbf_svc.predict(testing_matrix)

print(confusion_matrix(classes,testing_targets))
            
print(kappa(classes,testing_targets))
Example #39
csvfile = "nbayes-k6_arch_bact_euk_virus_3.csv"

for n in nvals:
    print "### n=%s ###"%str(n)
    batches = []
    for i in range(50):
        batches.append([])
    c15 = 0
    with open(csvfile, "r") as fh:
        for line in fh:
            line = line.split()
            if str(n) in line[1]:
                batches[int(line[0])].append(line)
                if int(line[3]) == int(line[4]):
                    c15 += 1

    moy15 = 0
    for classifier in batches:
        classes = []
        predicted = []
        for contig in classifier:
    #        print contig[1], contig[3], contig[4]
            classes.append(int(contig[3]))
            predicted.append(int(contig[4]))
    
        print "kappa =", kappa(classes,predicted)
        moy15 += kappa(classes,predicted)
    #    print c15, "/", len(batches)
    #    print confusion_matrix(classes,predicted)
    
    print "moy = ", moy15/50
Example #40
    features_test = np.delete(features_test, 0, 1)
    features_test = np.delete(features_test, len(features_test[1]) - 1, 1)

    # print features_train.shape, dummy_train.shape
    # print features_test.shape, dummy_test.shape

    combined_train = np.column_stack((features_train, dummy_train))
    combined_test = np.column_stack((features_test, dummy_test))

    # "Essay Set 	Classifier	Feature Set		Accuracy "
    # print "Set " + str(no)
    for i, clf in enumerate(classifiers):
        clf.fit(dummy_train, labels)
        prediction = clf.predict(dummy_train)
        prediction1 = clf.predict(dummy_test)
        a[i].append(kappa(test_labels, prediction1, weights='quadratic'))
        a[i].append(kappa(labels, prediction, weights='quadratic'))
    print no, "\tTest\t Stat\t ", a[0][0], '\t', a[0][1]
    #print no,"\tTrain\t Stat\t ",a[1][0],'\t',a[1][1]

    for i, clf in enumerate(classifiers):
        clf.fit(features_train, labels)
        prediction = clf.predict(features_train)
        prediction1 = clf.predict(features_test)
        a[i].append(kappa(test_labels, prediction1, weights='quadratic'))
        a[i].append(kappa(labels, prediction, weights='quadratic'))
    print no, "\tTest\t Prompt\t ", a[0][2], '\t', a[0][3]
    #print no,"\tTrain\t Prompt\t ",a[1][2],'\t',a[1][3]

    for i, clf in enumerate(classifiers):
        clf.fit(combined_train, labels)