예제 #1
0
def run_confusion(clf, X, Y, Y_dist, sample_names, **kwargs):
    """Fit ``clf`` on a shuffled half of the data and score the other half.

    Rows are shuffled with a fixed seed (0) so the split is reproducible.
    The first half is used for fitting and the second half for prediction.
    When the classifier offers ``predict_proba``, held-out samples whose
    predicted probabilities differ from ``Y_dist`` by no more than the
    threshold (summed absolute difference) are excluded from the Pearson
    correlation and the confusion matrix, and their names are logged.

    Args:
        clf: Estimator with ``fit(X, Y)`` (returning the fitted estimator)
            and ``predict(X)``; ``predict_proba(X)`` is optional.
        X: Feature matrix indexable by a list of row indices, exposing
            either ``get_shape()`` (sparse matrix) or ``shape``.
        Y: Labels, indexable by a list of row indices.
        Y_dist: Reference distribution subtracted from every predicted
            probability row (presumably the class prior; confirm with the
            caller).
        sample_names: One name per row of ``X``, in the same order.
        **kwargs: Must contain ``options``, a mapping; its optional
            ``confus_thres`` entry (default 0.01) is the threshold.

    Returns:
        Tuple ``(corr, cm, y_prob, sample_names)``: the ``pearsonr`` result,
        the confusion matrix, the predicted probabilities of the held-out
        half (``None`` when the classifier cannot provide them) and the
        names of the held-out samples.
    """
    thres = kwargs['options'].get('confus_thres', 0.01)
    random.seed(0)

    # The row count comes from X; sparse matrices expose get_shape().
    get_shape = getattr(X, 'get_shape', None)
    if callable(get_shape):
        n_samples = get_shape()[0]
    else:
        n_samples = X.shape[0]

    # A real list is required: shuffle() cannot permute a Python 3 range.
    p = list(range(n_samples))
    random.shuffle(p)
    X, Y = X[p], Y[p]
    half = n_samples // 2

    # Apply the same permutation to the names so that entry i still
    # describes held-out row i of the shuffled data.
    held_out = p[half:]
    try:
        sample_names = sample_names[held_out]  # array-style fancy indexing
    except TypeError:
        # Plain sequences (list/tuple) cannot be indexed by a list.
        sample_names = [sample_names[i] for i in held_out]

    # Run classifier
    y_ = clf.fit(X[:half], Y[:half]).predict(X[half:])

    # Defaults for classifiers without usable probability estimates:
    # keep every held-out sample and report no probabilities.
    y_prob = None
    valid_idx = list(range(len(y_)))

    predict_proba = getattr(clf, 'predict_proba', None)
    if callable(predict_proba):
        try:
            y_prob = predict_proba(X[half:])
        except NotImplementedError:
            y_prob = None
        else:
            # For the confusion matrix we only pick samples whose predicted
            # distribution differs from Y_dist by more than the threshold.
            diff = y_prob - Y_dist
            valid_idx = []
            invalid_name = []
            for i, row in enumerate(diff):
                if sum(abs(x) for x in row) > thres:
                    valid_idx.append(i)
                else:
                    invalid_name.append(sample_names[i])
                    print("too close %s \nversus\n%s" % (y_prob[i], Y_dist))
            print("valid  %d  invalid %d"
                  % (len(valid_idx), len(invalid_name)))
            for name in invalid_name:
                logger.info("samples with class distribution: %s" % name)

    # Compute confusion matrix
    Y_half = Y[half:]
    corr = pearsonr(Y_half[valid_idx], y_[valid_idx])
    print("Pearson correlation %f" % corr[0])
    cm = metrics.confusion_matrix(Y_half[valid_idx], y_[valid_idx])
    print(cm)
    logger.info('confusion matrix : %s' % str(cm))
    return (corr, cm, y_prob, sample_names)
예제 #2
0
 def inner_pickler(*args, **kwargs):
     """Return cached evaluation results, computing and caching them if absent.

     The cache file is ``<name>.pkl`` where ``name`` comes from
     ``build_pickle_name(func, *args, **kwargs)``. On a cache miss the
     classifier is built with ``func(**kwargs)``, evaluated with
     ``run_confusion`` and ``run_cv``, optionally refitted on the first two
     positional arguments (when ``kwargs['options']['train']`` is true or
     missing), and pickled together with its results.

     Returns:
         Tuple ``(name, scores, corr, cm, y_prob, sample_names)``.
     """
     name = build_pickle_name(func, *args, **kwargs)
     print("name  %s" % (name,))
     logger.info('name: %s' % name)
     pickle_path = name + ".pkl"
     if os.path.exists(pickle_path):
         # NOTE: unpickling can execute arbitrary code; this is only safe
         # because the cache file is expected to be written by this function.
         with open(pickle_path, 'rb') as cache_file:
             tup = pickle.load(cache_file)
         # tup[0] is the cached classifier, which is not part of the result.
         scores, corr, cm, y_prob, sample_names = tup[1:6]
     else:
         clf = func(**kwargs)
         logger.info('running confusion matrix')
         (corr, cm, y_prob, sample_names) = run_confusion(clf, *args, **kwargs)
         logger.info('running cross validation')
         scores = run_cv(clf, *args[:2])
         if kwargs['options'].get('train', True):
             clf.fit(*args[:2])
         # Context manager guarantees the file is flushed and closed even if
         # pickling fails part-way through.
         with open(pickle_path, 'wb') as cache_file:
             pickle.dump((clf, scores, corr, cm, y_prob, sample_names),
                         cache_file)
     logger.info('scores. %s' % str(scores))
     return (name, scores, corr, cm, y_prob, sample_names)