Пример #1
0
def find_best_features(year, features, sex, age, heavy):
    """Find the most predictive feature set for one population slice.

    Fetches X, y and the column keys from
    getXy_by_features(year, features, sex, age), builds a title string
    describing the slice, and passes everything (plus `heavy`) to
    select_features.get_most_predictive_feature_set.

    year=-1 => both years 2,3 (note carried over from the original
    docstring; the handling lives in getXy_by_features).

    Returns:
        (results, n_samples, keys), where results and n_samples are the two
        values returned by get_most_predictive_feature_set and keys comes
        from getXy_by_features.
    """
    call_args = (year, features, sex, age, heavy)
    print('find_best_features(year=%d,features=%s,sex=%s,age=%s,heavy=%s)' % call_args)

    X, y, keys = getXy_by_features(year, features, sex, age)

    # The title identifies this slice to the selection routine.
    title = 'features=%s,sex=%s,age=%s,year=%d' % (features, sex, age, year)
    results, n_samples = select_features.get_most_predictive_feature_set(
        title, X, y, keys, heavy)

    return results, n_samples, keys
Пример #2
0
def find_best_features(year, features, sex):
    """Select the most predictive feature set for one feature family.

    Loads X, y and the column keys with the loader matching `features`,
    optionally restricts the rows to one sex, drops low-count columns,
    standardizes the remaining columns and passes the result to
    select_features.get_most_predictive_feature_set.

    Args:
        year: Passed through to the getXy_* loader; also formatted with %d
            in the progress message.
        features: 'pcg', 'patient' or 'all'; selects the loader.
        sex: Optional string. When it starts with 'm' or 'f' (any case) and
            'Sex' is one of the keys, only the matching rows are kept
            (male: Sex column < 0.5, female: Sex column > 0.5) and the Sex
            column is dropped. Otherwise no sex filtering is done.

    Returns:
        (result, keys): whatever get_most_predictive_feature_set returns,
        and the list of column names that survived filtering.

    Raises:
        ValueError: if `features` is not one of the supported values.
    """
    # NOTE(review): assumes X is a 2-D numpy array with one column per entry
    # of keys, and y is a numpy array indexable by a boolean row mask --
    # confirm against the getXy_* loaders.
    import select_features

    print('find_best_features(year=%d)' % year)
    if features == 'pcg':
        X, y, keys = getXy_pcg(year)
    elif features == 'patient':
        X, y, keys = getXy_patient(year)
    elif features == 'all':
        X, y, keys = getXy_all(year)
    else:
        # Previously an unknown value fell through and failed later with an
        # unbound-local error on X/y/keys.
        raise ValueError(
            "features must be 'pcg', 'patient' or 'all', got %r" % (features,))

    print('keys=%s' % (keys,))

    # sex_key stays None when no sex filtering happens, so the Sex column is
    # only dropped when it was actually used to split the population.
    sex_key = None
    if sex and sex.lower()[0] in 'mf' and 'Sex' in keys:
        # Get male or female population
        sex_key = keys.index('Sex')
        if sex.lower()[0] == 'm':
            in_population = X[:, sex_key] < 0.5
        else:
            in_population = X[:, sex_key] > 0.5
        X = X[in_population, :]
        y = y[in_population]

    # Remove columns with low counts
    LOW_COUNT_THRESHOLD = 100
    significant = X.sum(axis=0) >= LOW_COUNT_THRESHOLD
    if sex_key is not None:
        # After filtering by sex the Sex column carries no information.
        significant[sex_key] = False

    removed = [key for key, keep in zip(keys, significant) if not keep]
    print('Removing keys < %d: %s' % (LOW_COUNT_THRESHOLD, removed))

    before = 'keys=%d X=%s => ' % (len(keys), X.shape)
    keys = [key for key, keep in zip(keys, significant) if keep]
    X = X[:, significant]
    # Emitted as one line; the '%s %s' join reproduces the separator that the
    # original pair of print statements (first one with a trailing comma)
    # produced.
    print('%s %s' % (before, 'keys=%d X=%s' % (len(keys), X.shape)))

    # Normalize. Cast to float first so that integer-typed input is not
    # truncated when the centred/scaled values are stored.
    X = X.astype(float)
    means = X.mean(axis=0)
    stds = X.std(axis=0)
    # Columns whose spread is (near) zero are only centred, not scaled.
    scale = stds.copy()
    scale[~(abs(stds) > 1e-6)] = 1.0
    X = (X - means) / scale

    return select_features.get_most_predictive_feature_set(X, y, keys), keys