Example No. 1
def test_prf(fn1,fn2,sth,L):
    y_true=[]
    y_score=[]
    edges_1=prep.read_edges(fn1)
    edges_2=prep.read_edges(fn2)
    
    # Copy the scores and rank all candidate pairs from highest to lowest;
    # the score of the L-th ranked pair becomes the decision threshold.
    predict_set={}
    for key in sth.keys():
        predict_set[key]=predict_set.get(key,0.)+sth[key]
    predict_set=sorted(predict_set.iteritems(),key=lambda d:d[1],reverse=True)

    threshold=predict_set[L][1]
    for i in edges_1:
        if sth[i]>threshold:
            y_score.append(1)
        else:
            y_score.append(0)

    for i in edges_1:
        if i not in edges_2:
            y_true.append(0)
        else:
            y_true.append(1)

    print classification_report(y_true,y_score)
    print auc_score(y_true,y_score)
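The heart of this example is turning a ranked dictionary of link-prediction scores into binary labels via a top-L cutoff. A minimal, self-contained sketch of that step (the pair names and scores below are illustrative, not from the original module):

# Illustrative candidate edges and their scores; L keeps the top-L pairs when scores are distinct
scores = {('a', 'b'): 0.9, ('a', 'c'): 0.7, ('b', 'd'): 0.4, ('c', 'd'): 0.2}
L = 2

ranked = sorted(scores.items(), key=lambda d: d[1], reverse=True)
threshold = ranked[L][1]  # score of the L-th ranked pair (0-based), as above

# Pairs scoring strictly above the cutoff are predicted as links
y_score = [1 if scores[e] > threshold else 0 for e in scores]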
Example No. 2
def eval_model():
    comments, labels = load_extended_data()

    clf1 = build_base_model()
    clf2 = build_elasticnet_model()
    clf3 = build_stacked_model()
    clf4 = build_nltk_model()
    models = [clf1, clf2, clf3, clf4]
    #models = [clf1]
    cv = ShuffleSplit(len(comments), n_iterations=5, test_size=0.2,
            indices=True)
    scores = []
    for train, test in cv:
        probs_common = np.zeros((len(test), 2))
        for clf in models:
            X_train, y_train = comments[train], labels[train]
            X_test, y_test = comments[test], labels[test]
            clf.fit(X_train, y_train)
            probs = clf.predict_proba(X_test)
            print("score: %f" % auc_score(y_test, probs[:, 1]))
            probs_common += probs
        probs_common /= len(models)
        scores.append(auc_score(y_test, probs_common[:, 1]))
        print("combined score: %f" % scores[-1])

    print(np.mean(scores), np.std(scores))
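Example No. 2 blends the four models by summing their predict_proba outputs and dividing by the number of models before scoring the blend with AUC. A stripped-down sketch of that averaging step, assuming an arbitrary list of already-fitted binary classifiers (the names here are illustrative):

import numpy as np

def blend_probabilities(models, X_test):
    # Average the positive-class probability from several fitted classifiers
    all_probs = [clf.predict_proba(X_test)[:, 1] for clf in models]
    return np.mean(all_probs, axis=0)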
Example No. 3
def test_thresholded_scorers():
    """Test scorers that take thresholds."""
    X, y = make_blobs(random_state=0, centers=2)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    clf = LogisticRegression(random_state=0)
    clf.fit(X_train, y_train)
    score1 = SCORERS['roc_auc'](clf, X_test, y_test)
    score2 = auc_score(y_test, clf.decision_function(X_test))
    score3 = auc_score(y_test, clf.predict_proba(X_test)[:, 1])
    assert_almost_equal(score1, score2)
    assert_almost_equal(score1, score3)

    logscore = SCORERS['log_loss'](clf, X_test, y_test)
    logloss = log_loss(y_test, clf.predict_proba(X_test))
    assert_almost_equal(-logscore, logloss)

    # same for an estimator without decision_function
    clf = DecisionTreeClassifier()
    clf.fit(X_train, y_train)
    score1 = SCORERS['roc_auc'](clf, X_test, y_test)
    score2 = auc_score(y_test, clf.predict_proba(X_test)[:, 1])
    assert_almost_equal(score1, score2)

    # Test that an exception is raised on more than two classes
    X, y = make_blobs(random_state=0, centers=3)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    clf.fit(X_train, y_train)
    assert_raises(ValueError, SCORERS['roc_auc'], clf, X_test, y_test)
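The 'roc_auc' entry exercised here is the same scorer object that this era's cross-validation helpers accept by name. A small sketch, assuming the scoring-string API of that scikit-learn generation (module paths have since changed):

from sklearn.cross_validation import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import make_blobs

X, y = make_blobs(random_state=0, centers=2)
# Each fold is scored with the same thresholded ROC-AUC scorer tested above
scores = cross_val_score(LogisticRegression(random_state=0), X, y,
                         scoring='roc_auc', cv=5)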
Example No. 4
def test_score_scale_invariance():
    # Test that average_precision_score and auc_score are invariant under
    # scaling or shifting of the predicted probabilities
    y_true, _, probas_pred = make_prediction(binary=True)
    roc_auc = auc_score(y_true, probas_pred)
    roc_auc_scaled = auc_score(y_true, 100 * probas_pred)
    roc_auc_shifted = auc_score(y_true, probas_pred - 10)
    assert_equal(roc_auc, roc_auc_scaled)
    assert_equal(roc_auc, roc_auc_shifted)
    pr_auc = average_precision_score(y_true, probas_pred)
    pr_auc_scaled = average_precision_score(y_true, 100 * probas_pred)
    pr_auc_shifted = average_precision_score(y_true, probas_pred - 10)
    assert_equal(pr_auc, pr_auc_scaled)
    assert_equal(pr_auc, pr_auc_shifted)
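The equalities asserted above hold because ROC AUC and average precision depend only on how the scores rank the samples, not on their absolute values. A tiny worked check in the same spirit (auc_score is the deprecated name used throughout this page; current scikit-learn calls it roc_auc_score):

import numpy as np
from sklearn.metrics import auc_score  # roc_auc_score in current releases

y_true = np.array([0, 0, 1, 1])
scores = np.array([0.1, 0.4, 0.35, 0.8])

base = auc_score(y_true, scores)                  # 0.75 for this toy example
assert base == auc_score(y_true, 100 * scores)    # scaling preserves the ranking
assert base == auc_score(y_true, scores - 10)     # shifting preserves the ranking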
Example No. 5
def ScoreClassifier(features, labels, clf=None, score_func=None):
  """Test a learned classifier.

  :type callable score_func: Scoring function (one of accuracy_scorer or
     auc_scorer). This is not a score function from sklearn.metrics.
  :rtype: float
  :returns: Accuracy of classifier on test data.

  """
  # Note: if clf is an already-fitted estimator instance it is used as-is. If
  # clf is a class (type(clf) == type) or None, it is treated as the learning
  # algorithm to apply and is fitted here on a fresh training split.
  if clf is None or type(clf) == type:
    mask = ChooseTrainingSet(labels, 0.5)
    clf = FitClassifier(features[mask], labels[mask], algorithm=clf)
    features = features[~mask]
    labels = labels[~mask]
  features = features.astype(float)
  score_func = score_func or 'accuracy'
  if isinstance(score_func, basestring):
    score_func = score_func.lower()
    if score_func == 'accuracy':
      score_func = accuracy_score
    elif score_func == 'auc':
      predictions = clf.decision_function(features)
      return auc_score(labels, predictions), predictions
  elif not callable(score_func):
    raise ValueError("Score function must be a string or a callable.")
  predictions = clf.predict(features)
  return score_func(labels, predictions), predictions
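A hypothetical way to exercise ScoreClassifier with an already-fitted estimator, based only on its signature and docstring (the toy data and LinearSVC below are placeholders, not part of the original module):

import numpy as np
from sklearn.svm import LinearSVC

rng = np.random.RandomState(0)
features = rng.randn(100, 5)
labels = (features[:, 0] > 0).astype(int)
clf = LinearSVC().fit(features, labels)

acc, preds = ScoreClassifier(features, labels, clf=clf)                     # accuracy (the default)
roc, scores = ScoreClassifier(features, labels, clf=clf, score_func='auc')  # needs decision_function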
Example No. 6
def auc(test_data, index, reverse, test_file):
    pred = [x[index] for x in test_data]

    if reverse:
        pred = [ x * -1 for x in pred]
    testing_Y = [x[0] for x in test_data]
    print "AUC: \n%f\n" % metrics.auc_score(testing_Y, pred)    
Example No. 7
def run_cv(x,y,reg,cv):
     ''' returns mean AUC for this reg using cv splits.'''
     scores = []      
     for sp in cv:
          reg.fit(x[sp[0],:],y[sp[0]])
          scores.append(auc_score(y[sp[1]],reg.predict_proba(x[sp[1],:])[:,1]))
     return np.mean(scores)
def classification_metrics (targets, preds, probs=None):

#    if probs != None and len(probs) > 0:
#        fpr, tpr, thresholds = roc_curve(targets, probs[:, 1], 1)
#        roc_auc = auc_score(fpr, tpr)
#    else:
#        fpr, tpr, thresholds = roc_curve(targets, preds, 1)
#        roc_auc = auc_score(targets, preds)

    auc = 0
    if len(targets) > 1:
        auc = auc_score(targets, preds)

    cm = confusion_matrix(targets, preds)

    #accuracy
    acc = accuracy_score(targets, preds)

    #recall: True Positive Rate (Sensitivity)
    sens = recall_score(targets, preds)

    #precision
    prec = precision_score(targets, preds)

    #f1-score
    f1 = f1_score(targets, preds, np.unique(targets), 1)

    tnr  = 0.0
    spec = 0.0
    #True Negative Rate or Specificity (tn / (tn+fp))
    if len(cm) == 2:
        if (cm[0,0] + cm[0,1]) != 0:
            spec = float(cm[0,0])/(cm[0,0] + cm[0,1])

    return acc, sens, spec, prec, f1, auc
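A minimal call with made-up binary labels, showing the tuple that comes back (accuracy, sensitivity, specificity, precision, F1, AUC):

targets = [0, 0, 1, 1, 1]
preds = [0, 1, 1, 1, 0]
acc, sens, spec, prec, f1, auc = classification_metrics(targets, preds)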
Example No. 9
def test_auc():
    probs = numpy.ravel([test_pred_proba_i(i)[:, 1] for i in xrange(n_test_batches)])
    if numpy.all(test_set_y.get_value()) and numpy.all(probs):
        return 1.
    if numpy.all(test_set_y.get_value() == 0) and numpy.all(probs == 0):
        return 0.
    return auc_score(test_set_y.get_value()[:n_test_batches * batch_size], probs)
def evalSymbReg(individual):
    # Transform the tree expression into a callable function
    func = toolbox.lambdify(expr=individual)
    # Evaluate the individual: the fitness is 1 - AUC of the tree's
    # predictions on the feature matrix X against the labels A (lower is better)
    #X=[[1.0000,0.9231,1.0000,1.0000,1.0000,1.0000,0.9091,1.0000]]
    #X.append([1.0000,0.9231,1.0000,1.0000,0.8333,0.5000,0.9091,0.8333])
    #X.append([1.0000,0.9231,1.0000,1.0000,0.8333,1.0000,0.9091,0.8333])
    #X.append([0.0000,0.9231,0.0000,0.0000,0.8333,0.5000,0.9091,0.8333])
    #X=[[1.0000,0,1.0000,1.0000,1.0000,1.0000,0,1.0000]]
    #X.append([1.0000,0,1.0000,1.0000,0,0.5000,0,0])
    #X.append([1.0000,0,1.0000,1.0000,0,1.0000,0,0])
    #X.append([1.0000,1,1.0000,1.0000,0,0.5000,0,1])
    #X.append([0.0000,1,1.0000,1.0000,0,0.5000,0,1])
    #L
    #A=[1,1,1,0,1];
    t=0
    a=0
    global co 
    co+=1
   # print co
    if(co>6000):
        global A,X
        A,X=getdata()
        co=0
    preds=[]
    for x in X:
        preds.append(func(x[0],x[1],x[2],x[3],x[4],x[5],x[6],x[7]))
        #t+=1
    auc = 1-metrics.auc_score(A, preds)
    return auc,
Example No. 11
def bagging():
    from sklearn.feature_selection import SelectPercentile, chi2

    comments, dates, labels = load_data()
    select = SelectPercentile(score_func=chi2, percentile=4)

    clf = LogisticRegression(tol=1e-8, penalty='l2', C=7)
    #clf = BaggingClassifier(logr, n_estimators=50)
    countvect_char = TfidfVectorizer(ngram_range=(1, 5),
            analyzer="char", binary=False)
    countvect_word = TfidfVectorizer(ngram_range=(1, 3),
            analyzer="word", binary=False)
    badwords = BadWordCounter()

    ft = FeatureStacker([("badwords", badwords), ("chars", countvect_char),
        ("words", countvect_word)])
    #ft = TextFeatureTransformer()
    pipeline = Pipeline([('vect', ft), ('select', select), ('logr', clf)])

    cv = ShuffleSplit(len(comments), n_iterations=20, test_size=0.2,
            indices=True)
    scores = []
    for train, test in cv:
        X_train, y_train = comments[train], labels[train]
        X_test, y_test = comments[test], labels[test]
        pipeline.fit(X_train, y_train)
        probs = pipeline.predict_proba(X_test)
        scores.append(auc_score(y_test, probs[:, 1]))
        print("score: %f" % scores[-1])
    print(np.mean(scores), np.std(scores))
def model_generate_level1(bestc, features, X_train, y) :

    ntrain = X_train.shape[0]
    newdata = np.zeros(ntrain)

    X_train, keymap = utility.OneHotEncoder(X_train[:,features])
    model = linear_model.LogisticRegression()
    model.C = bestc

    cvscores = []
    cvgen = cross_validation.KFold(ntrain, 10, random_state=utility.SEED)
    for train_inds, test_inds in cvgen :
        X_cvtrain = X_train[train_inds]
        X_cvtest = X_train[test_inds]
        y_cvtrain = y[train_inds]
        y_cvtest = y[test_inds]

        model.fit(X_cvtrain, y_cvtrain)
        pred_cvtest = model.predict_proba(X_cvtest)[:,1]
        cvscore = metrics.auc_score(y_cvtest, pred_cvtest)
        cvscores.append(cvscore)

        newdata[test_inds] = pred_cvtest

    print "Average CV Score: {}".format(np.mean(cvscores))
    return newdata
Example No. 13
def run_fest_test(festpath="/Users/bjcohen/dev/fest", **kwargs):
    """
    -c <int>  : committee type:
                1 bagging
                2 boosting (default)
                3 random forest
    -d <int>  : maximum depth of the trees (default: 1000)
    -e        : report out of bag estimates (default: no)
    -n <float>: relative weight for the negative class (default: 1)
    -p <float>: parameter for random forests: (default: 1)
                (ratio of features considered over sqrt(features))
    -t <int>  : number of trees (default: 100)
    """
    idstr = "".join(map(lambda (f, v): f + str(v), kwargs.items()))
    ret = call(
        [
            os.path.join(festpath, "festlearn"),
            " ".join(map(lambda (f, v): "-" + f + str(v), kwargs.items())),
            os.path.join("..", "data", "train_3way_-27000.libsvm"),
            os.path.join("..", "data", "fest_%s_-27000.model" % idstr),
        ]
    )
    if ret != 0:
        raise Exception()
    ret = call(
        [
            os.path.join(festpath, "festclassify"),
            os.path.join("..", "data", "train_3way_-27000.libsvm"),
            os.path.join("..", "data", "fest_%s_-27000.model" % idstr),
            os.path.join("..", "data", "pred_fest_train_-27000_%s" % idstr),
        ]
    )
    if ret != 0:
        raise Exception()
    ret = call(
        [
            os.path.join(festpath, "festclassify"),
            os.path.join("..", "data", "train_3way_27000-.libsvm"),
            os.path.join("..", "data", "fest_%s_-27000.model" % idstr),
            os.path.join("..", "data", "pred_fest_train_27000-_%s" % idstr),
        ]
    )
    if ret != 0:
        raise Exception()
    tr_score = auc_score(ACTION[:27000], pd.read_table("../data/pred_fest_train_-27000_%s" % idstr, header=None))
    te_score = auc_score(ACTION[27000:], pd.read_table("../data/pred_fest_train_27000-_%s" % idstr, header=None))
    return (tr_score, te_score)
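The keyword arguments map directly onto the festlearn flags listed in the docstring; the call below is hypothetical and assumes the data paths and ACTION labels from the surrounding project:

# -c 3 (random forest), -d 10 max depth, -t 200 trees
kwargs = {'c': 3, 'd': 10, 't': 200}
idstr = "".join("%s%s" % (f, v) for f, v in kwargs.items())   # e.g. 'c3d10t200'
flags = ["-%s%s" % (f, v) for f, v in kwargs.items()]         # e.g. ['-c3', '-d10', '-t200']

tr_auc, te_auc = run_fest_test(c=3, d=10, t=200)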
Example No. 14
def calculatePrediction():

    dTr = loadFile('../data/train.csv')
    y_train = dTr[0]
    X_train_A = dTr[1]
    X_train_B = dTr[2]

    dTes = loadFileTest('../data/test.csv')
    X_test_A = dTes[0]
    X_test_B = dTes[1]

    print "train size: {0} {1}".format(X_train_A.shape, X_train_B.shape)
    print "test size: {0} {1}".format(X_test_A.shape, X_test_B.shape)

    #def transform_features(x):
    #    return np.log(1+x)

    X_train_minus = transform_features(X_train_A) - transform_features(X_train_B)
    X_train_div = transform_features(X_train_A) / (transform_features(X_train_B) + 1)
    X_train = np.concatenate((X_train_div, X_train_minus), axis=1)

    X_test_minus = transform_features(X_test_A) - transform_features(X_test_B)
    X_test_div = transform_features(X_test_A) / (transform_features(X_test_B) + 1)
    X_test = np.concatenate((X_test_div, X_test_minus), axis=1)

    # In this case we'll use a random forest, but this could be any classifier.
    # max_features must be an integer (or a fraction <= 1), hence the int().
    cfr = RandomForestClassifier(n_estimators=100,
                                 max_features=int(math.sqrt(X_train.shape[1])),
                                 n_jobs=1)

    # Simple K-Fold cross validation, 10 folds.
    cv = cross_validation.KFold(len(X_train), k=10, indices=False)

    # Iterate through the cross-validation segments, run the classifier on
    # each one, and aggregate the fold AUC scores into a list.
    results = []
    for traincv, testcv in cv:
        probas = cfr.fit(X_train[traincv], y_train[traincv]).predict_proba(X_train[testcv])
        p_train = [x[1] for x in probas]
        results.append(auc_score(y_train[testcv].tolist(), p_train))
        #results.append( logloss.llfun(target[testcv], [x[1] for x in probas]) )

    # Print out the mean of the cross-validated results
    print "Results: " + str(np.array(results).mean())

    # Test set probabilities
    probas = cfr.predict_proba(X_test)
    p_test = [x[1] for x in probas]

    ###########################
    # WRITING SUBMISSION FILE
    ###########################
    predfile = open('predictions_test.csv', 'w+')

    print "label size: test - {0} expected {1}".format(len(p_test), X_test_A.shape[0])

    # Write the test-set probabilities (p_test), not the last fold's p_train
    for item in p_test:
        print >>predfile, "{0}".format(str(item))

    predfile.close()
    def AUROCScore(self):
        try:
            self.__rocarea = auc_score(self.__labels, self.__scores)
        except Exception as e:
            print "auc_score exception"
            print e
            return nan
        return self.__rocarea
Example No. 16
def test_roc_curve():
    """Test Area under Receiver Operating Characteristic (ROC) curve"""
    y_true, _, probas_pred = make_prediction(binary=True)

    fpr, tpr, thresholds = roc_curve(y_true, probas_pred)
    roc_auc = auc(fpr, tpr)
    assert_array_almost_equal(roc_auc, 0.80, decimal=2)
    assert_almost_equal(roc_auc, auc_score(y_true, probas_pred))
Example No. 17
def summary(clf, x, y):
    df = clf.decision_function(x).ravel()
    yp = df > 0

    print 'False Positive: %0.3f' % false_pos(y, yp)
    print 'Recall:         %0.3f' % recall(y, yp)
    print 'AUC:            %0.3f' % auc_score(y, yp)
    print 'Accuracy:       %0.3f' % (yp == y).mean()
Example No. 18
def iterate_Multinomial_alpha(vect):
	auc_training=[]
	auc_oos=[]
	dfs=np.arange(0,3,0.1)
	print dfs
	for n in dfs:
		print n
		train2=vect.fit_transform(train.Comment)
		x_train2,x_test2=train_test_split(train2,random_state=42)
		x_train2=x_train2.tocoo() 
		x_test2=x_test2.tocoo()
		classifier = MultinomialNB(fit_prior=True, alpha=n).fit(x_train2,x_train[:,0]) 
		auc_training.append(auc_score(x_train[:,0],classifier.predict(x_train2)))
		auc_oos.append(auc_score(x_test[:,0],classifier.predict(x_test2)))
	results= zip(dfs,auc_training,auc_oos)
	print results
	return auc_plot(results)
def validation_worker(args):
    X, y, model, j, SEED = args
    X_train, X_cv, y_train, y_cv = cross_validation.train_test_split(
                                       X, y, test_size=.15, 
                                       random_state = j*SEED)
    model.fit(X_train, y_train)
    preds = model.predict_proba(X_cv)[:,1]
    auc = metrics.auc_score(y_cv, preds)
    return auc  
def cv_loop(X, y, model, N, seed):
    mean_auc = 0.0
    k_fold = KFold(len(y), N, indices=True, shuffle=True, random_state=seed)
    for train_ix, test_ix in k_fold:
        model.fit(X[train_ix], y[train_ix])
        preds = model.predict_proba(X[test_ix])[:, 1]
        auc = metrics.auc_score(y[test_ix], preds)
        # print("AUC (fold %d/%d): %f" % (i + 1, N, auc))
        mean_auc += auc
    return mean_auc / N
def cv_loop(X, y, model, N):
    mean_auc = 0.0
    for i in range(N):
        X_train, X_cv, y_train, y_cv = cross_validation.train_test_split(X, y, test_size=0.20, random_state=i * SEED)
        model.fit(X_train, y_train)
        preds = model.predict_proba(X_cv)[:, 1]
        auc = metrics.auc_score(y_cv, preds)
        print "AUC (fold %d/%d): %f" % (i + 1, N, auc)
        mean_auc += auc
    return mean_auc / N
Example No. 22
def iterate_vectorizer(initial_value,iterations,vect_string):
	auc_training=[]
	auc_oos=[]
	dfs=range(initial_value,iterations)
	print dfs
	for n in dfs:
		print n
		vect = eval(vect_string % n)
		train2=vect.fit_transform(train.Comment)
		x_train2,x_test2=train_test_split(train2,random_state=42)
		x_train2=x_train2.tocoo() 
		x_test2=x_test2.tocoo()
		classifier = MultinomialNB(fit_prior=True).fit(x_train2,x_train[:,0]) #the third term has to be a list, or an array)
		auc_training.append(auc_score(x_train[:,0],classifier.predict(x_train2)))
		auc_oos.append(auc_score(x_test[:,0],classifier.predict(x_test2)))
	results= zip(dfs,auc_training,auc_oos)
	print results
	return auc_plot(results)
Example No. 23
def calculate_classification_statistics(dataset):

    # Old format:
    # 'Expected true and predicted labels for each fold, but failed.' +
    # 'If you wish to provide labels for each fold separately it should look like: ' +
    # '[[y_true_1, y_predicted_1], [y_true_2, y_predicted_2], ...]')

    labels = [[],[]]
    for i in range(0,len(dataset.target)):
        labels[0].append(dataset['target'][i])
        labels[1].append(dataset['targetPredicted'][i])

    # Check if we have true and predicted labels for each fold
    if labels and type(labels[0][0]) == list:
        try:
            # Flatten
            y_true, y_pred = [], []
            for fold_labels in labels:
                y_true.extend(fold_labels[0])
                y_pred.extend(fold_labels[1])
            labels = [y_true, y_pred]
        except:
            raise Exception('Expected true and predicted labels for each fold, but failed.' +
                            'If you wish to provide labels for each fold separately it should look like: ' +
                            '[[y_true_1, y_predicted_1], [y_true_2, y_predicted_2], ...]')
    if len(labels) != 2:
        raise Exception('Wrong input structure, this widget accepts labels in the form: [y_true, y_pred]')

    y_true, y_pred = labels

    classes = set()
    classes.update(y_true + y_pred)
    classes = sorted(list(classes))

    # Assign integers to classes
    class_to_int = {}
    for i, cls_label in enumerate(classes):
        class_to_int[cls_label] = i

    y_true = [class_to_int[lbl] for lbl in y_true]
    y_pred = [class_to_int[lbl] for lbl in y_pred]

    accuracy = metrics.accuracy_score(y_true, y_pred)
    precision = metrics.precision_score(y_true, y_pred)
    recall = metrics.recall_score(y_true, y_pred)
    f1 = metrics.f1_score(y_true, y_pred)
    confusion_matrix = metrics.confusion_matrix(y_true, y_pred)

    # AUC is defined only for binary classes
    if len(classes) == 2:
        auc = metrics.auc_score(y_true, y_pred)
    else:
        auc = 'undefined for multiple classes'

    return accuracy, precision, recall, f1, auc, confusion_matrix
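The function expects a container that exposes target and targetPredicted both as attributes and as dictionary keys, which matches scikit-learn's Bunch. A minimal illustrative call (the labels are made up):

from sklearn.datasets.base import Bunch  # sklearn.utils.Bunch in newer releases

dataset = Bunch(target=['spam', 'ham', 'ham', 'spam'],
                targetPredicted=['spam', 'ham', 'spam', 'spam'])

acc, prec, rec, f1, auc, cm = calculate_classification_statistics(dataset)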
Example No. 24
def r_funct(current_key,str_values):
	df_train = pd.DataFrame.from_records(str_values,columns = df_columns)	
	'''
	#0. convert to proper dtypes
	for col,coltype in data_type_dict.iteritems():
		if coltype=='int64':
			df_train[col] = df_train[col].astype(int)
		if coltype=='float64':
			df_train[col] = df_train[col].astype(float)
	'''
	#1. remove constant columns
	remove = []
	for col in df_train.columns:
		if df_train[col].std() == 0:
			remove.append(col)
	df_train = df_train.drop(remove, axis=1)
	
	#2. remove duplicated columns
	remove = []
	c = df_train.columns
	for i in range(len(c)-1):
		v = df_train[c[i]].values
		for j in range(i+1,len(c)):
			if np.array_equal(v,df_train[c[j]].values):
				remove.append(c[j])
	df_train = df_train.drop(remove, axis=1)

	#REMOVE UNWANTED COLUMNS
	y_train = df_train['TARGET'].values
	X_train = df_train.drop(['ID','TARGET'], axis=1).values

	# params for this gradient boosting model
	len_train = len(X_train)
	learning_rate=random.choice([1,.5,.3,.2,.1,.03,.05,.01,.005,.001,.0005,.0001,.00001])
	n_estimators=sp_randint.rvs(100, 5000)
	subsample=random.choice([1,.95,.85,.90,.8])
	min_samples_split=sp_randint.rvs(2, 11)
	min_samples_leaf=sp_randint.rvs(1, 11)
	max_depth=sp_randint.rvs(2, 20)
	min_weight_fraction_leaf=0


	# k-fold cross validation on the train data using the gradient boosting model.
	clf = GradientBoostingClassifier(learning_rate=learning_rate,n_estimators=n_estimators,subsample=subsample,min_samples_split=min_samples_split,min_samples_leaf=min_samples_leaf,max_depth=max_depth)
	k_fold = cross_validation.KFold(len_train, 5)
	auc_scores_list = []
	for k, (train, test) in enumerate(k_fold):
		clf.fit(X_train[train], y_train[train])
		auc_scr = auc_score(y_train[test], clf.predict_proba(X_train[test])[:,1])
		auc_scores_list.append(auc_scr)
	mean = np.mean(auc_scores_list)
	std = np.std(auc_scores_list)

	print "GBT:learning_rate:%s,n_estimators:%s,subsample:%s,min_samples_split:%s,min_samples_leaf:%s,min_weight_fraction_leaf:%s,max_depth:%s,mean:%s,std:%s" %(learning_rate,n_estimators,subsample,min_samples_split,min_samples_leaf,min_weight_fraction_leaf,max_depth,mean,std)
Example No. 25
def demo():
    # Basic demo test
    X_train, y_train = load_svmlight_file("demo.train", query_id=False)
    X_test, y_test = load_svmlight_file("demo.train", query_id=False)
    coef, _ = sgd_train(
        X_train, y_train, np.ones(y_test.shape), alpha=0.1, n_features=150000, model="rank", max_iter=100000
    )
    preds = sgd_predict(X_test, coef, blocks=None)
    preds = np.sign(preds)
    assert accuracy_score(y_test, preds) > 0.98
    assert precision_score(y_test, preds) > 0.98
    assert recall_score(y_test, preds) > 0.98
    assert auc_score(y_test, preds) > 0.98
Example No. 26
def auc(y, y_pred):
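    # Targets take values in {-1, 0, 1}: auc1 merges the 0 class into -1 and
    # auc2 merges it into +1; the reported score averages the two binary AUCs.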
  
    #y_1 = y[(y['Target'] == 0) | (y['Target'] == 1)]
    y_1 = y.copy()
    y_1[y.Target == 0] = -1
    #print y_1
    #predictions = y_p.ix[y_1.index]
    auc1 = metrics.auc_score(np.array(y_1.Target), y_pred)

    #y_2 = y[(y['Target'] == 0) | (y['Target'] == -1)]
    #predictions = y_p.ix[y_2.index]
    #predictions = predictions * (-1)
    y_2 = y.copy()
    y_2[y.Target == 0] = 1
    #y_2 = y_2.replace(-1, 1)
    #print predictions.shape, y_2.shape
    #print len(y_1),len(y_2.Target)

    auc2 = metrics.auc_score(np.array(y_2.Target), y_pred)

    auc = 0.5 * auc1 + 0.5 * auc2
    #print "AUC", auc, auc1, auc2
    return auc, auc1, auc2
Example No. 27
def cv_loop_mt(args):
    f, X, y, model, N = args
    sum_auc = 0.
    for i in range(N):
        X_train, X_cv, y_train, y_cv = cross_validation.train_test_split(
                                       X, y, test_size=.20, 
                                       random_state = i*SEED)
        model.fit(X_train, y_train)
        preds = model.predict_proba(X_cv)[:,1]
        auc = metrics.auc_score(y_cv, preds)
        print 'Feature Set {f} AUC (fold {current}/{total}): {auc}'.format(f = f, current = i + 1, total = N, auc = auc)
        sum_auc += auc
    mean_auc = sum_auc / N
    return (mean_auc, f)
Example No. 28
    def report(self):
        from sklearn.metrics import auc_score
        from sklearn.metrics import classification_report
        from sklearn.metrics import confusion_matrix
        from sklearn.metrics import f1_score
        from sklearn.metrics import precision_recall_curve

        y_pred_probas, y_true, md = self.make_predictions()
        y_pred = y_pred_probas.argmax(1)
        y_pred_probas = y_pred_probas[:, 1]
        y_true = y_true.reshape(-1)

        print
        print "AUC score:", auc_score(y_true, y_pred_probas)
        print "AUC score (binary):", auc_score(y_true, y_pred)
        print

        print "Classification report:"
        print classification_report(y_true, y_pred)
        print

        print "Confusion matrix:"
        print confusion_matrix(y_true, y_pred)
        print
def cv_loop(X, y, model, N):
    mean_auc = 0.
    for i in range(N):
        X_train, X_cv, y_train, y_cv = cross_validation.train_test_split(
                                       X, y, test_size=.10, 
                                       random_state = i*SEED)
        model.fit(X_train, y_train)
        preds = model.predict_proba(X_cv)
 
        pVec = np.reshape(preds, 19 * preds.shape[0])
        yVec = np.reshape(y_cv, 19 * y_cv.shape[0])
        auc = metrics.auc_score(yVec, pVec)
        print "AUC (fold %d/%d): %f" % (i + 1, N, auc)
        mean_auc += auc
    return mean_auc/N
Example No. 30
def perfeval_classification_statistics(input_dict):
    from sklearn import metrics
    labels = input_dict['true_and_predicted_labels']
    pos_label = input_dict.get('pos_label', None)

    # Check if we have true and predicted labels for each fold
    if labels and type(labels[0][0]) == list:
        try:
            # Flatten
            y_true, y_pred = [], []
            for fold_labels in labels:
                y_true.extend(fold_labels[0])
                y_pred.extend(fold_labels[1])
            labels = [y_true, y_pred]
        except:
            raise Exception('Expected true and predicted labels for each fold, but failed.' + 
                            'If you wish to provide labels for each fold separately it should look like: ' + 
                            '[[y_true_1, y_predicted_1], [y_true_2, y_predicted_2], ...]')
    if len(labels) != 2:
        raise Exception('Wrong input structure, this widget accepts labels in the form: [y_true, y_pred]')
    
    y_true, y_pred = labels
    
    classes = set()
    classes.update(y_true + y_pred)
    classes = sorted(list(classes))

    # Assign integers to classes
    class_to_int = {}
    for i, cls_label in enumerate(classes):
        class_to_int[cls_label] = i

    y_true = [class_to_int[lbl] for lbl in y_true]
    y_pred = [class_to_int[lbl] for lbl in y_pred]

    accuracy = metrics.accuracy_score(y_true, y_pred)
    precision = metrics.precision_score(y_true, y_pred)
    recall = metrics.recall_score(y_true, y_pred)
    f1 = metrics.f1_score(y_true, y_pred)
    confusion_matrix = metrics.confusion_matrix(y_true, y_pred)

    # AUC is defined only for binary classes
    if len(classes) == 2:
        auc = metrics.auc_score(y_true, y_pred)
    else:
        auc = 'undefined for multiple classes'
    return {'accuracy': accuracy, 'precision': precision, 'recall': recall, 
            'f1': f1, 'auc': auc, 'confusion_matrix': confusion_matrix}
Example No. 31
def inv_auc_score(y_true, y_scores):
    return 1.0 - metrics.auc_score(y_true, y_scores)
Example No. 32
for e in xrange(epochs):
    print('\nEpoch {:d}/{:d}'.format(e + 1, epochs))
    print('Learning rate: {:6f}'.format(K.eval(ae.optimizer.lr)))
    curr_iter = 0
    train_loss = []
    for batch_adj, batch_train, dummy_f, dummy_y, dummy_m in batch_data:
        # Each iteration/loop is a batch of train_batch_size samples
        res = ae.train_on_batch([batch_adj], [batch_train])
        train_loss.append(res)
        curr_iter += 1
        if curr_iter >= num_iters_per_train_epoch:
            break
    train_loss = np.asarray(train_loss)
    train_loss = np.mean(train_loss, axis=0)
    print('Avg. training loss: {:6f}'.format(train_loss))
    print('\nEvaluating val set...')
    decoded_lp = np.empty(shape=adj.shape, dtype=np.float32)
    predictions = []
    for step in xrange(adj.shape[0] / val_batch_size + 1):
        low = step * val_batch_size
        high = low + val_batch_size
        batch_adj = adj[low:high].toarray()
        if batch_adj.shape[0] == 0:
            break
        decoded_lp[low:high] = ae.predict_on_batch([batch_adj])
    predictions.extend(decoded_lp[test_r, test_c])
    predictions.extend(decoded_lp[test_c, test_r])
    print('Val AUC: {:6f}'.format(auc_score(labels, predictions)))
    print('Val AP: {:6f}'.format(ap_score(labels, predictions)))
print('\nAll Done.')
def target_score(y_true, predictions):
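    # Two-sided AUC: how well predictions rank the +1 class at the top and,
    # with the sign flipped, the -1 class at the bottom; the two AUCs are averaged.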
    return (auc_score(y_true == 1, predictions) + auc_score(y_true == -1, -predictions)) / 2
def GetErrorPrimal(data,
                   K,
                   w,
                   b,
                   giveMCC=True,
                   givePercentError=True,
                   giveAUC=True):

    predictions = []
    ys = data[:, 0].reshape(1, len(data[:, 0]))
    predictions = b + np.dot(K, w)
    '''
    import matplotlib.pyplot as pl
    pl.subplot(1,3,1)
    pl.title("Original space")
    pl.plot([data[i,1] for i in range(len(predictions)) if data[i,0]<0], [data[i,2] for i in range(len(predictions)) if data[i,0]<0], "ro")
    pl.plot([data[i,1] for i in range(len(predictions)) if data[i,0]>0], [data[i,2] for i in range(len(predictions)) if data[i,0]>0], "bo")
    pl.xlabel("x")
    pl.ylabel("y")
    
    pl.subplot(1,3,2)
    pl.title("Kernel space predicted")
    pl.plot([K[i,0] for i in range(len(predictions)) if predictions[i]<0], [K[i,1] for i in range(len(predictions)) if predictions[i]<0], "ro")
    pl.plot([K[i,0] for i in range(len(predictions)) if predictions[i]>0], [K[i,1] for i in range(len(predictions)) if predictions[i]>0], "bo")
    pl.xlabel("first component")
    pl.ylabel("second component")
    
    pl.subplot(1,3,3)
    pl.title("Kernel space actual")
    pl.plot([K[i,0] for i in range(len(predictions)) if data[i,0]<0], [K[i,1] for i in range(len(predictions)) if data[i,0]<0], "ro")
    pl.plot([K[i,0] for i in range(len(predictions)) if data[i,0]>0], [K[i,1] for i in range(len(predictions)) if data[i,0]>0], "bo")
    pl.xlabel("first component")
    pl.ylabel("second component")
    
    pl.show()
    '''

    if giveMCC == True:
        temp = doMCC(predictions, data)
        MCC = round(temp[0], 2)
        percentError = round(temp[1], 4)
        results = temp[2]
    if giveMCC == False:
        temp = ne.evaluate('ys*predictions')
        percentError = round(
            .5 * (len(predictions) - ne.evaluate('sum(temp)')) /
            float(len(predictions)), 4)
    if giveAUC == True:
        AUC = round(sm.auc_score(data[:, 0], np.array(predictions)), 2)
    if giveMCC == True and giveAUC == True:
        return {
            'MCC': MCC,
            'results': results,
            '%E': percentError,
            'AUC': AUC,
            'predictons': None
        }
    if giveMCC == True and giveAUC == False:
        return {
            'MCC': MCC,
            'results': results,
            '%E': percentError,
            'predictons': None
        }
    if giveMCC == False and giveAUC == True:
        return {'AUC': AUC, '%E': percentError}
    if giveMCC == False and giveAUC == False:
        return {'AUC': None, '%E': percentError}
def GetErrorDual(InSampleData,
                 OutSampleData,
                 K,
                 Kdecomp,
                 a,
                 b,
                 giveMCC=True,
                 givePercentError=True,
                 giveAUC=True):

    H = OutSampleData[:, 0].size
    predictions = []
    y_in = InSampleData[:, 0].reshape(len(InSampleData[:, 0]), 1)
    y_out = OutSampleData[:, 0].reshape(1, len(OutSampleData[:, 0]))
    z = ne.evaluate('a*y_in')
    for i in xrange(H):
        X = b + np.dot(K[i], z)
        predictions.append(X)
    '''  
    import matplotlib.pyplot as pl
    data=OutSampleData
    K=Kdecomp
    pl.subplot(1,3,1)
    pl.title("Original space")
    pl.plot([data[i,1] for i in range(len(predictions)) if data[i,0]<0], [data[i,2] for i in range(len(predictions)) if data[i,0]<0], "ro")
    pl.plot([data[i,1] for i in range(len(predictions)) if data[i,0]>0], [data[i,2] for i in range(len(predictions)) if data[i,0]>0], "bo")
    pl.xlabel("x")
    pl.ylabel("y")
    
    pl.subplot(1,3,2)
    pl.title("Kernel space with guessed-predictions")
    pl.plot([K[i,0] for i in range(len(predictions)) if predictions[i]<0], [K[i,1] for i in range(len(predictions)) if predictions[i]<0], "ro")
    pl.plot([K[i,0] for i in range(len(predictions)) if predictions[i]>0], [K[i,1] for i in range(len(predictions)) if predictions[i]>0], "bo")
    pl.xlabel("first component")
    pl.ylabel("second component")
    
    pl.subplot(1,3,3)
    pl.title("Kernel space with true-predictions")
    pl.plot([K[i,0] for i in range(len(predictions)) if data[i,0]<0], [K[i,1] for i in range(len(predictions)) if data[i,0]<0], "ro")
    pl.plot([K[i,0] for i in range(len(predictions)) if data[i,0]>0], [K[i,1] for i in range(len(predictions)) if data[i,0]>0], "bo")
    pl.xlabel("first component")
    pl.ylabel("second component")
    
    pl.show()
    '''

    if giveMCC == True:
        temp = doMCC(predictions, OutSampleData)
        MCC = round(temp[0], 2)
        percentError = round(temp[1], 4)
        results = temp[2]
    if giveMCC == False:
        temp = ne.evaluate('y_out*predictions')
        percentError = round(
            .5 * (len(predictions) - ne.evaluate('sum(temp)')) /
            float(len(predictions)), 4)
    if giveAUC == True:
        AUC = round(sm.auc_score(OutSampleData[:, 0], np.array(predictions)),
                    2)
    if giveMCC == True and giveAUC == True:
        return {
            'MCC': MCC,
            'results': results,
            '%E': percentError,
            'AUC': AUC,
            'predictons': None
        }
    if giveMCC == True and giveAUC == False:
        return {
            'MCC': MCC,
            'results': results,
            '%E': percentError,
            'predictons': None
        }
    if giveMCC == False and giveAUC == True:
        return {'AUC': AUC, '%E': percentError}
    if giveMCC == False and giveAUC == False:
        return {'AUC': None, '%E': percentError}
Example No. 36
def evaluation(y_true, y_pred):
    return metrics.auc_score(y_true, y_pred[:, 1])
Example No. 37
def get_auc(y, y_pred_proba):
    score = auc_score(y, y_pred_proba)
    print score
    return score