def RF_model(X, y):
    """Fit a random forest on a hold-out split and report its test metrics.

    The data is split with ``test_size=0.66`` (so roughly one third is used
    for training), the classifier is fitted, and accuracy, Cohen's kappa,
    the confusion matrix (printed and drawn as a heatmap) and the
    classification report are produced for the held-out part.

    Args:
        X: Feature matrix accepted by ``train_test_split``.
        y: Target labels aligned with ``X``.

    Returns:
        The fitted ``RandomForestClassifier``.
    """
    RF_clf = RandomForestClassifier(
        n_estimators=150,
        max_depth=10,
        min_samples_split=100,
        min_samples_leaf=100,
        n_jobs=-1,
    )

    # Split again: about one third of the data becomes the training set.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.66, random_state=42)

    # Fit the model.
    RF_clf.fit(X_train, y_train)

    # Predict on the held-out samples and report the scores.
    y_pred = RF_clf.predict(X_test)
    print("Accuracy: {0:.3f}\n".format(accuracy_score(y_test, y_pred)))
    print('Kappa:', skll.kappa(y_test, y_pred))

    # The same matrix is printed and then plotted.
    conf_mat = confusion_matrix(y_test, y_pred)
    print("\nConfusion matrix:\n{}\n".format(conf_mat))

    # Tick labels: every class seen in either the truth or the prediction.
    labels = np.unique(np.concatenate((y_test, y_pred)))
    mglearn.tools.heatmap(
        conf_mat,
        xlabel='Predicted label',
        ylabel='True label',
        xticklabels=labels,
        yticklabels=labels,
        cmap='viridis',
        fmt='%d',
    )
    plt.title("Confusion matrix")
    plt.gca().invert_yaxis()

    print(classification_report(y_test, y_pred))

    print("The training of the model ends!")
    return RF_clf
def kappa_score(events, data_student, data_RE):

    for event in events:
        label_student = check_label_event(event, data_student, 'student')
        label_RE = check_label_event(event, data_RE, 'RE')

        print event + '\t' + str(kappa(label_RE, label_student))
示例#3
0
def cal_cohen_kappa_by_hit(results):
    # collect predicate by hit_id
    preds_by_hit = {}
    for sent in results.itervalues():
        for pred in sent.predicates.itervalues():
            if pred.hit_id not in preds_by_hit:
                preds_by_hit[pred.hit_id] = []
            preds_by_hit[pred.hit_id].append(pred)

    kappas = []
    # calculate kappa
    for preds in preds_by_hit.itervalues():
        # get answer list for each worker
        worker_answers_by_hit = {worker_id: [] for worker_id
                                 in preds[0].workers.keys()}
        for pred in preds:
            for worker in pred.workers.itervalues():
                worker_answers_by_hit[worker.workerid].append(worker.answer)

        # calculate kappa for each answer list pair
        worker_answers = worker_answers_by_hit.values()
        pairs_combinations = itertools.combinations(worker_answers, 2)
        for l1, l2 in pairs_combinations:
            kappa_value = skll.kappa(l1, l2)  #, weights='quadratic')
            kappas.append(kappa_value)
    avg_kappa = sum(kappas) / (len(kappas) + 0.0)
    print len(preds_by_hit)
    print "The average Cohen's kappa of all HITs is %f." %(avg_kappa)
def kapp_sentiment_event(path_sent, path_event, sentiment, event):
    load_sent, load_event = load_file(path_sent, 'allTweets_ver3_sentLabel_' + sentiment + '.csv')\
        , load_file(path_event, event + '.csv')
    list_sent, list_gt = list(), list()
    for i in range(0, len(load_sent)):
        split_sent, split_gt = load_sent[i].split('\t'), load_event[i].split('\t')
        label, gt = int(split_sent[0]), int(split_gt[1])
        list_sent.append(label), list_gt.append(gt)
    print 'Kappa score of ' + sentiment + ' and ' + event + ':' + '\t' + str(kappa(list_gt, list_sent))
def Kappa(y_true, y_pred, **kwargs):
    """Compute a kappa score with the implementation selected by ``KAGGLE``.

    When the module-level ``KAGGLE`` flag is falsy, ``skll.kappa`` is
    imported and used. Otherwise the module-level ``kappa`` callable is used
    (presumably a local implementation for environments without skll --
    confirm against the rest of the file).

    Args:
        y_true: Reference ratings.
        y_pred: Predicted ratings.
        **kwargs: Forwarded unchanged to the selected kappa function.

    Returns:
        Whatever the selected kappa function returns.

    Raises:
        NameError: If ``KAGGLE`` is truthy and no module-level ``kappa``
            exists.
    """
    if KAGGLE:
        # Use the module-level name. Importing skll's function as ``kappa``
        # inside this function would make ``kappa`` a local variable, and
        # this branch would then always fail with UnboundLocalError.
        kappa_impl = kappa
    else:
        from skll import kappa as kappa_impl
    return kappa_impl(y_true, y_pred, **kwargs)
# Script fragment (Python 2 print syntax): fit a gradient-boosting regressor
# on a random ~80% of the rows, report quadratic-weighted kappa on the
# training and hold-out parts, and persist the model.
# `d`, `test`, `ftData`, `dataPred`, `adjustResponse` and `kappa` are defined
# outside this excerpt.

# Transform the test rows with `d`; `test.T.to_dict().values()` presumably
# yields one dict per row of a DataFrame -- TODO confirm.
ftTest = d.transform(test.T.to_dict().values())

# Boolean mask with one entry per row of ftData; each entry is True with
# probability 0.8. The mask is not seeded here, so the split changes per run.
msk = np.random.rand(ftData.shape[0]) < 0.8

# Rows selected by the mask form the training part, the rest the
# cross-validation part; all columns are kept.
ftTrain = ftData[msk, 0:ftData.shape[1]]
ftCv = ftData[~msk, 0:ftData.shape[1]]

# Targets split with the same mask so they stay aligned with the features.
yTrain = dataPred[msk]
yCv = dataPred[~msk]

clf = GradientBoostingRegressor(n_estimators=3000,
                                max_depth=8,
                                min_samples_split=10,
                                min_samples_leaf=2,
                                max_features="auto",
                                verbose=1,
                                random_state=1988)
clf.fit(ftTrain, yTrain)

# Score on the training rows. Each raw prediction is passed through
# `adjustResponse` (presumably maps it to a valid rating -- verify).
trainPred = clf.predict(ftTrain)
trainPred = [adjustResponse(resp) for resp in trainPred]
kTrain = kappa(yTrain, trainPred, weights="quadratic")
print "Ktrain : " + str(kTrain)

# Same scoring on the held-out rows; the printed label is "cvPred" but the
# value is the kappa score kCv.
cvPred = clf.predict(ftCv)
cvPred = [adjustResponse(resp) for resp in cvPred]
kCv = kappa(yCv, cvPred, weights="quadratic")
print "cvPred : " + str(kCv)

# Persist the fitted model to disk.
joblib.dump(clf, "GBMModel3000_sameCvMat/GBMModel3000_sameCvMat")
示例#7
0
						if datum['kevin']!='' and datum['amy']!=''])

	filtered_kevin,filtered_amy = zip(*[(int(datum['kevin']),int(datum['amy'])) for datum in filtered_data
						if datum['kevin']!='' and datum['amy']!=''])

	ratings = map(str,range(4))
	kevin_amy_contingency = np.array([[len([item for item in filtered_data if 
								 item['kevin'] == kevin_rating and item['amy'] == amy_rating]) 
								for kevin_rating in ratings] for amy_rating in ratings])

	print>>f, 'Did Kevin and Amy rate differently? %s'%fisher_test(numpy2ri(kevin_amy_contingency),simulate_p_value=True,B=6000)


	# Agreement summary over all data, written to the open file `f`.
	# NOTE(review): the value labelled 'Chance agreement' is the diagonal
	# share of the contingency table (trace / total); the label is kept as is.
	print>>f, '---For all data------'
	print>>f, 'Chance agreement: %.02f'%(np.trace(contingency_table)/float(contingency_table.sum()))
	print>>f,  "Cohen's kappa: %.02f"%skll.kappa(kevin,amy)
	print>>f, '---------'


	# Same summary restricted to the filtered records. The kappa must use the
	# filtered rating lists; using the unfiltered `kevin`/`amy` here would
	# repeat the "all data" value.
	print>>f, '---For full fields-----'
	print>>f, 'Chance agreement: %.02f'%(np.trace(filtered_contingency_table)/float(filtered_contingency_table.sum()))
	print>>f,  "Cohen's kappa: %.02f"%skll.kappa(filtered_kevin,filtered_amy)
	print>>f, '---------'

	#--Does reflection related to grade?

	unfiltered_distribution_grades = [float(item['exam']) for item in data if item['exam'] != '']
	filtered_distribution_grades = [float(item['exam']) for item in filtered_data if item['exam'] != '']

	quartiles = [(0,70),(70,80),(80,90),(90,100)]
	unfiltered_quartiles = [(np.percentile(unfiltered_distribution_grades,lower),np.percentile(unfiltered_distribution_grades,upper))
    # NOTE(review): this excerpt starts inside a `try:` block whose header is
    # not visible here; `sentences_r1`, `sentences_r2`, `db` and `i` are
    # defined outside the excerpt. Python 2 syntax throughout.

    # Triples of (coder id, item id, label) passed to AnnotationTask below.
    data = []

    # Field 5 of each row as an int, one list per rater.
    sentiment_r1_5_scale = []
    sentiment_r2_5_scale = []

    for r1, r2 in zip(sentences_r1, sentences_r2):

        # Rater 1 is recorded under the hard-coded coder id 6.
        sentiment_r1_5_scale.append(int(r1[5]))
        data.append((6, r1[0], r1[5]))

        # Rater 2 is recorded under the hard-coded coder id 7.
        sentiment_r2_5_scale.append(int(r2[5]))
        data.append((7, r2[0], r2[5]))

        # Print the first field when the two paired rows do not share it.
        if (r1[0] != r2[0]):
            print r1[0]

except Exception, e:
    # Any error is printed and execution continues with the code below.
    print e

# disconnect from server
db.close()
print i

# Cohen's kappa between the two raters' integer ratings.
print skll.kappa(sentiment_r1_5_scale, sentiment_r2_5_scale)

# Agreement statistics computed from the (coder, item, label) triples.
annotation = AnnotationTask(data=data)

print annotation.kappa()
print annotation.alpha()
示例#9
0
def Kappa(y_true, y_pred, **kwargs):
    """Compute a kappa score with the implementation selected by ``KAGGLE``.

    When the module-level ``KAGGLE`` flag is falsy, ``skll.kappa`` is
    imported and used. Otherwise the module-level ``kappa`` callable is used
    (presumably a local implementation for environments without skll --
    confirm against the rest of the file).

    Args:
        y_true: Reference ratings.
        y_pred: Predicted ratings.
        **kwargs: Forwarded unchanged to the selected kappa function.

    Returns:
        Whatever the selected kappa function returns.

    Raises:
        NameError: If ``KAGGLE`` is truthy and no module-level ``kappa``
            exists.
    """
    if KAGGLE:
        # Use the module-level name. Importing skll's function as ``kappa``
        # inside this function would make ``kappa`` a local variable, and
        # this branch would then always fail with UnboundLocalError.
        kappa_impl = kappa
    else:
        from skll import kappa as kappa_impl
    return kappa_impl(y_true, y_pred, **kwargs)
示例#10
0
def evalerror(preds, dtrain):
    """Return the quadratic-weighted kappa as a named evaluation result.

    Args:
        preds: Predictions to score.
        dtrain: Object exposing ``get_label()`` with the reference labels.

    Returns:
        The tuple ``('error', score)`` where ``score`` is
        ``kappa(labels, preds, weights="quadratic")``.
    """
    score = kappa(dtrain.get_label(), preds, weights="quadratic")
    return 'error', score
示例#11
0
def compute_kappa(data):
    print "kappa: ",
    print skll.kappa(data[0], data[1])
    #print skll.kappa(data[0], data[1], weights='linear')
    print "quadratic weighted kappa: ",
    print skll.kappa(data[0], data[1], weights='quadratic')
示例#12
0
# Accuracy assessment of a classified raster: read reference points from a
# CSV file, sample the prediction raster at those points, print Cohen's kappa
# and save a labelled confusion matrix.
# `accuracy_fn`, `prediction_fn` and `matrix_fn` are defined outside this
# excerpt.

# Collect the data needed for the accuracy assessment: the first two columns
# of each row are the coordinates, the third is the reference class.
xys = []
classes = []
with open(accuracy_fn) as fp:
    reader = csv.reader(fp)
    next(reader)  # discard the first row (presumably a header)
    for row in reader:
        xys.append([float(n) for n in row[:2]])
        classes.append(int(row[2]))

# Convert the coordinates to raster offsets (presumably georeferenced
# coordinates to pixel column/row -- confirm against the GDAL Transformer
# documentation) and sample the first band at those positions.
ds = gdal.Open(prediction_fn)
pixel_trans = gdal.Transformer(ds, None, [])
offset, ok = pixel_trans.TransformPoints(True, xys)
cols, rows, z = zip(*offset)

data = ds.GetRasterBand(1).ReadAsArray()
# NOTE(review): TransformPoints appears to return floating-point offsets and
# recent NumPy rejects non-integer indices -- confirm and cast if needed.
sample = data[rows, cols]
del ds

# Compute kappa.
print('Kappa:', skll.kappa(classes, sample))

# Create the confusion matrix. `labels` is passed by keyword because it is a
# keyword-only parameter in current scikit-learn releases.
labels = np.unique(np.concatenate((classes, sample)))
matrix = metrics.confusion_matrix(classes, sample, labels=labels)

# Add labels to the matrix and save it: first a header row of labels, then a
# leading column of labels with 0 in the top-left corner.
matrix = np.insert(matrix, 0, labels, 0)
matrix = np.insert(matrix, 0, np.insert(labels, 0, 0), 1)
np.savetxt(matrix_fn, matrix, fmt='%1.0f', delimiter=',')
示例#13
0
def Kappa(y_true, y_pred, **kwargs):
    """Compute ``skll.kappa`` after clipping both rating arrays to [1, 8].

    Args:
        y_true: Reference ratings.
        y_pred: Predicted ratings.
        **kwargs: Forwarded unchanged to ``skll.kappa``.

    Returns:
        The value returned by ``skll.kappa`` for the clipped inputs.
    """
    from numpy import clip
    from skll import kappa

    # Values outside the 1..8 range are pulled to the nearest bound.
    clipped_true = clip(y_true, 1, 8)
    clipped_pred = clip(y_pred, 1, 8)
    return kappa(clipped_true, clipped_pred, **kwargs)
示例#14
0
def compute_kappa(data):
    print "kappa: ",
    print skll.kappa(data[0], data[1])
    #print skll.kappa(data[0], data[1], weights='linear')
    print "quadratic weighted kappa: ",
    print skll.kappa(data[0], data[1], weights='quadratic')
示例#15
0
# Validate a classified raster against labelled sample points: print Cohen's
# kappa and the confusion matrix, and draw the matrix as a heatmap.
# `valid_fn` and `tif_ds` are defined outside this excerpt.
valid_df = pd.read_csv(valid_fn)
class_type = ['裸地', '建筑', '农田', '林地', '工业用地', '潮滩', '稻田', '芦苇草地', '盐田', '水域']

# Image offsets (xoff, yoff) of the validation samples.
xys = np.array(valid_df[['xoff', 'yoff']])
# Reference class label of each validation sample.
y = np.array(valid_df['class'], dtype=int)
del valid_df

# xoff is used as the column index and yoff as the row index.
cols, rows = zip(*xys)
del xys

# Read the first band and pick the predicted class at every sample position.
data = tif_ds.GetRasterBand(1).ReadAsArray()
y_hat = data[rows, cols]

print('Kappa:', skll.kappa(y, y_hat))

# `labels` is passed by keyword because it is a keyword-only parameter in
# current scikit-learn releases.
labels = np.unique(np.concatenate((y, y_hat)))
matrix = confusion_matrix(y, y_hat, labels=labels)
print("\nConfusion matrix:\n{}\n".format(matrix))
scores_image = mglearn.tools.heatmap(matrix,
                                     xlabel='Predicted label',
                                     ylabel='True label',
                                     xticklabels=labels,
                                     yticklabels=labels,
                                     cmap='viridis',
                                     fmt='%d')
plt.title("RF_opt_Confusion matrix")
plt.gca().invert_yaxis()

report = classification_report(y,