Example #1
def classify(title, X, y, keys):
    print 'classify(title=%s, X=%s, y=%s, keys=%s)' % (title, X.shape, y.shape, keys)

    Xr, yr = select_features.resample_equal_y(X, y, 1.0)
       
    # SGD hyper-parameters (power_t_val is defined but not used)
    n_iter_val = 500
    power_t_val = 0.9
    alpha_val = 0.1

    def get_sgd_hinge():
        return SGDClassifier(loss="hinge", alpha=alpha_val, n_iter=n_iter_val, fit_intercept=True)
 
    def get_rbf_svc():
        return svm.SVC(kernel='rbf', C=0.5, gamma=0.1)
        
    # Only the RBF SVC is actually dispatched in this version
    return classify_by_method(title + '_rbf', Xr, yr, keys, get_rbf_svc, True)
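Both examples delegate the fitting and evaluation to classify_by_method, which is not shown in these snippets. Below is a minimal sketch of what such a helper might look like, assuming it takes a title, the resampled data, the feature keys, a factory that returns an unfitted estimator, and a plotting flag; everything beyond the names visible in the calls above is hypothetical, and the sketch uses the newer scikit-learn cross-validation API.

import numpy as np
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold

def classify_by_method(title, X, y, keys, get_classifier, do_plot):
    """Hypothetical cross-validation driver: fit a fresh classifier on each
    fold, accumulate the out-of-fold predictions and print a summary report."""
    skf = StratifiedKFold(n_splits=5)
    y_true_all, y_pred_all = [], []
    for train, test in skf.split(X, y):
        clf = get_classifier()              # new, unfitted estimator per fold
        clf.fit(X[train], y[train])
        y_true_all.append(y[test])
        y_pred_all.append(clf.predict(X[test]))
    y_true_all = np.concatenate(y_true_all)
    y_pred_all = np.concatenate(y_pred_all)
    print('%s:\n%s' % (title, metrics.classification_report(y_true_all, y_pred_all)))
    if do_plot:
        pass  # plotting intentionally omitted in this sketch
    return y_true_all, y_pred_all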
Example #2
def compare_classifiers(title, X, y, keys):
    print 'compare_classifiers(title=%s, X=%s, y=%s, keys=%s)' % (title, X.shape, y.shape, keys)

    Xr, yr = select_features.resample_equal_y(X, y, 1.0)
   
    n_iter_val = 5000
    power_t_val = 0.9
    alpha_val = 0.1
    CACHE_SIZE = 2000  # only referenced by the commented-out SVC call below

    def get_sgd_hinge():
        return SGDClassifier(loss="hinge", alpha=alpha_val, n_iter=n_iter_val, fit_intercept=True)

    def get_svd_linear(): 
        return svm.SVC(kernel='linear')
        
    def get_svd_poly():    
        return svm.SVC(kernel='poly')

    def get_nu_linear():
        return svm.NuSVC(kernel='linear')

    def get_rbf_svc():
        return svm.SVC(kernel='rbf', C=0.5, gamma=0.1)
        
    def get_linear_svc():
        return svm.LinearSVC()
        
    def get_bayes_ridge():
        return linear_model.BayesianRidge()
        
    def get_log_reg_l1():    
        return linear_model.LogisticRegression(C=1.0, penalty='l1', tol=1e-6)
     
    def get_log_reg_l2():    
        return linear_model.LogisticRegression(C=1.0, penalty='l2', tol=1e-6)
        
    def get_lars():
        return linear_model.LassoLars(alpha=0.1)

    def get_lasso():
        return linear_model.Lasso(alpha=0.1)

    def get_ridge():
        return linear_model.Ridge(alpha=0.5)
        
    
    # get_rbf_svc works best followed by get_log_reg_l*
    classifiers = {
        'sgd_hinge': get_sgd_hinge,
        'svd_linear': get_svd_linear, 
        'svd_poly': get_svd_poly,
        'nu_linear': get_nu_linear,
        'linear_svc': get_linear_svc,
        'rbf_svc': get_rbf_svc,
        #'bayes_ridge': get_bayes_ridge,  # excluded: BayesianRidge is a regressor, not a classifier
        'log_reg_l1': get_log_reg_l1,
        'log_reg_l2': get_log_reg_l2,
        'lars': get_lars,
        'lasso': get_lasso,
        #'ridge': get_ridge,
        
    }

    slow = ['svd_poly']
    # alphabetical order, but run the slow classifiers last
    classifier_order = sorted(classifiers.keys(), key=lambda k: (k in slow, k))

    #print svm.SVC.__doc__
    if False:
        for gamma in [0.0, 0.1, 0.2, 0.5]:
            for C in [0.1, 0.2, 0.5, 1.0]:
                def func():
                    return svm.SVC(kernel='rbf', C=C, gamma=gamma)
                    #return svm.SVC(kernel='rbf', cache_size=CACHE_SIZE, C=C, gamma=gamma)
                name = '%s_gamma=%.2f_C=%.2f' % (title, gamma, C)
                classify_by_method(name, Xr, yr, keys, func, False)

    for name in classifier_order:
        func = classifiers[name]
        classify_by_method(title + '_' + name, Xr, yr, keys, func, False)    
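The disabled grid over gamma and C near the end of compare_classifiers can also be expressed with scikit-learn's GridSearchCV. A minimal sketch, assuming the same Xr and yr arrays are available; gamma=0.0, which old scikit-learn treated as 1/n_features, is dropped because newer versions require a positive value or a string setting.

from sklearn import svm
from sklearn.model_selection import GridSearchCV

param_grid = {'C': [0.1, 0.2, 0.5, 1.0], 'gamma': [0.1, 0.2, 0.5]}
search = GridSearchCV(svm.SVC(kernel='rbf'), param_grid, cv=5)
search.fit(Xr, yr)
print('best params = %s, best CV score = %.3f' % (search.best_params_, search.best_score_))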
Example #3
def classify_old(title, X, y, keys, get_classifier):
    print 'classify_old(title=%s, X=%s, y=%s, keys=%s)' % (title, X.shape, y.shape, keys)

    Xr, yr = select_features.resample_equal_y(X, y, 1.0)
    print 'classify: Xr=%s, yr=%s' % (Xr.shape, yr.shape)
    n_samples = Xr.shape[0]

    if False:
        X_train, y_train = Xr[:n_samples/2,:], yr[:n_samples/2]
        X_test, y_test = Xr[n_samples/2:,:], yr[n_samples/2:]

    NUM_FOLDS = 5
    skf = StratifiedKFold(yr, NUM_FOLDS)

    verbose = False

    if verbose:
        def P(s): print s
    else:
        def P(s): pass

    n_iter_val = 500
    for power_t_val in [0.9]:
        for alpha_val in [0.1]: 
            y_test_all = np.zeros(0)
            y_pred_all = np.zeros(0)
            for i,(train, test) in enumerate(skf):
                X_train, y_train = Xr[train,:], yr[train]
                X_test, y_test = Xr[test,:], yr[test]
            
                if verbose: common.SUBHEADING()
                P('Fold %d of %d' % (i + 1, NUM_FOLDS))
                P('classify: X_train=%s, y_train=%s' % (X_train.shape, y_train.shape))
                P('classify:  X_test=%s,  y_test=%s' % (X_test.shape, y_test.shape))

                # fit the model
                classifier = SGDClassifier(loss="hinge", alpha=alpha_val,  
                    n_iter=n_iter_val, fit_intercept=True)
           
                classifier.fit(X_train, y_train)
                y_pred = classifier.predict(X_test)

                P('Classification report for classifier %s:\n%s\n' % (classifier, 
                    metrics.classification_report(y_test, y_pred)))
                P('Confusion matrix:\n%s' % metrics.confusion_matrix(y_test, y_pred))
                
                y_test_all = np.r_[y_test_all, y_test]
                y_pred_all = np.r_[y_pred_all, y_pred]

            common.HEADING()
            print 'Classification report for all %s:\n%s\n' % (
                    classifier, metrics.classification_report(y_test_all, y_pred_all))
            print 'Confusion matrix:\n%s' % metrics.confusion_matrix(y_test_all, y_pred_all)

            # plot the line, the points, and the nearest vectors to the plane
            if False:
                fac = 1.0
                print 'Downsampling by a further factor of %f' % fac
                X_r, y_r = sklearn.utils.resample(X, y, n_samples = int(X.shape[0] * fac)) 
            y_pred = classifier.predict(Xr)
            plot_classification(Xr, yr, y_pred, keys, title, classifier)  
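classify_old builds its folds with the old StratifiedKFold(y, n_folds) signature, which is directly iterable. From scikit-learn 0.18 onward the class lives in sklearn.model_selection, takes n_splits, and the folds come from its split method; the equivalent loop head under that newer API would look like this:

from sklearn.model_selection import StratifiedKFold

NUM_FOLDS = 5
skf = StratifiedKFold(n_splits=NUM_FOLDS)
for i, (train, test) in enumerate(skf.split(Xr, yr)):
    X_train, y_train = Xr[train, :], yr[train]
    X_test, y_test = Xr[test, :], yr[test]
    # ... fit and evaluate exactly as in the fold body above ...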
Example #4
        top_features['f'][AGE_HIGH] = ['DrugCount_DSFS', 'proc_group=SDS', 'specialty=None', 'pcg=MISCL5', 'pcg=NEUMENT', 'pcg=ODaBNCA', 'pcg=SKNAUT', 'pcg=TRAUMA']
   
    # Set the random seed so that each run gives the same results
    random.seed(333)
    np.random.seed(333)

    def P(s):
        """Print string s"""
        print s
        #logfile.write(s + '\n')

    features = 'all2'

    X,y,keys = getXy_by_features_(-1, features)

    Xr, yr = select_features.resample_equal_y(X, y, 1.0)
    Xr, yr = normalize(Xr, yr)

    sex_vals = np.unique(Xr[:, keys.index('Sex')])
    age_vals = np.unique(Xr[:, keys.index('AgeAtFirstClaim')])
    sex_boundary = sex_vals.mean()
    # mid-points between the lowest pair and the highest pair of distinct age values
    age_boundaries = [0.5 * (age_vals[i] + age_vals[i+1]) for i in [0, age_vals.size - 2]]
    print 'sex_vals = %s' % sex_vals
    print 'age_vals = %s' % age_vals
    print 'sex_boundary = %s' % sex_boundary
    print 'age_boundaries = %s' % age_boundaries

    print 'Xr=%s,yr=%s' % (Xr.shape, yr.shape)
    NUM_FOLDS = 2
    skf = StratifiedKFold(yr, NUM_FOLDS)
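select_features.resample_equal_y appears in every example but its implementation is not shown. Judging from its name and how it is called, it seems to rebalance the class frequencies in y by resampling; the sketch below illustrates that idea with sklearn.utils.resample, purely as an assumption (the real helper may differ, and the meaning of the third argument is a guess).

import numpy as np
from sklearn.utils import resample

def resample_equal_y(X, y, fraction):
    """Hypothetical re-implementation: draw the same number of samples from
    every class (the size of the smallest class scaled by `fraction`) so that
    the returned y is balanced."""
    classes, counts = np.unique(y, return_counts=True)
    n_per_class = max(1, int(counts.min() * fraction))
    X_parts, y_parts = [], []
    for c in classes:
        Xc, yc = resample(X[y == c], y[y == c], n_samples=n_per_class, replace=False)
        X_parts.append(Xc)
        y_parts.append(yc)
    return np.vstack(X_parts), np.concatenate(y_parts)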