def find_best_features(year, features, sex, age, heavy):
    """Find the most predictive feature set for one slice of the data.

    The slice is selected by year, features, sex and age, which are handed
    to getXy_by_features. year=-1 => both years 2,3.

    heavy is not interpreted here; it is forwarded unchanged to
    select_features.get_most_predictive_feature_set.

    Returns a (results, n_samples, keys) tuple: results and n_samples come
    from select_features.get_most_predictive_feature_set, keys from
    getXy_by_features.

    NOTE(review): a three-argument find_best_features appears later in this
    file; if both live at module level the later one rebinds the name and
    this version cannot be reached by name -- confirm which is intended.
    """
    # A single parenthesised argument prints the same text under the
    # Python 2 print statement used by the rest of this file.
    call_args = (year, features, sex, age, heavy)
    print('find_best_features(year=%d,features=%s,sex=%s,age=%s,heavy=%s)' % call_args)

    X, y, keys = getXy_by_features(year, features, sex, age)

    # The title is passed through to select_features together with the data.
    title = 'features=%s,sex=%s,age=%s,year=%d' % (features, sex, age, year)
    outcome = select_features.get_most_predictive_feature_set(title, X, y, keys, heavy)
    results, n_samples = outcome

    return results, n_samples, keys
def find_best_features(year, features, sex):
    """Select the most predictive feature set for one year of data.

    Steps: load X, y, keys for the requested feature group, optionally keep
    only one sex, drop columns whose column sum is below a fixed threshold,
    standardise the remaining columns, and hand the result to
    select_features.get_most_predictive_feature_set.

    Args:
        year: Year passed to the getXy_* loader (formatted with %d).
        features: Feature group to load: 'pcg', 'patient' or 'all'.
        sex: Optional population filter. A value starting with 'm' or 'f'
            (any case) keeps only the rows for that sex, provided the
            loaded keys include 'Sex'. A falsy value, or one starting with
            any other letter, disables the filter.

    Returns:
        A (result, keys) tuple where result is whatever
        select_features.get_most_predictive_feature_set(X, y, keys) returns
        and keys names the columns that survived pruning.

    Raises:
        ValueError: If features is not one of the supported groups.

    NOTE(review): a five-argument find_best_features appears earlier in this
    file; if both live at module level this later definition is the one the
    name ends up bound to -- confirm which is intended.
    NOTE(review): assumes X is a 2-D numpy.ndarray (samples x features) and
    y a matching 1-D array -- confirm against the getXy_* loaders.
    """
    # A single parenthesised argument prints the same text under the
    # Python 2 print statement used by the rest of this file.
    print('find_best_features(year=%d)' % year)

    # Reject an unknown group before doing any work; previously it fell
    # through the if/elif chain and failed later with an unbound X.
    if features not in ('pcg', 'patient', 'all'):
        raise ValueError(
            "unknown features %r: expected 'pcg', 'patient' or 'all'" % (features,))

    import select_features

    if features == 'pcg':
        X, y, keys = getXy_pcg(year)
    elif features == 'patient':
        X, y, keys = getXy_patient(year)
    else:
        X, y, keys = getXy_all(year)
    print('keys=%s' % keys)

    # sex_key stays None unless the population is actually filtered, so the
    # pruning step below knows whether there is a sex column to drop.
    sex_key = None
    sex_code = sex.lower()[0] if sex else None
    if sex_code in ('m', 'f') and 'Sex' in keys:
        # Get male or female population: values below 0.5 in the Sex column
        # are treated as male, values above 0.5 as female.
        sex_key = keys.index('Sex')
        if sex_code == 'm':
            population = X[:, sex_key] < 0.5
        else:
            population = X[:, sex_key] > 0.5
        X = X[population, :]
        y = y[population]

    # Remove columns with low counts
    LOW_COUNT_THRESHOLD = 100
    significant = X.sum(axis=0) >= LOW_COUNT_THRESHOLD
    if sex_key is not None:
        # The population was selected on this column, so remove it too.
        significant[sex_key] = False

    print('Removing keys < %d: %s' % (
        LOW_COUNT_THRESHOLD,
        [key for key, keep in zip(keys, significant) if not keep]))

    before = 'keys=%d X=%s => ' % (len(keys), X.shape)
    keys = [key for key, keep in zip(keys, significant) if keep]
    X = X[:, significant]
    after = 'keys=%d X=%s' % (len(keys), X.shape)
    # The original used two print statements joined by a trailing comma,
    # which puts one extra space between the two parts.
    print('%s %s' % (before, after))

    # Normalize: centre every column, then scale the non-constant ones to
    # unit standard deviation. X - means allocates a new floating-point
    # array, so the result is not truncated back into an integer dtype.
    means = X.mean(axis=0)
    stds = X.std(axis=0)
    X = X - means
    varying = abs(stds) > 1e-6  # skip (near-)constant columns: avoids /0
    X[:, varying] = X[:, varying] / stds[varying]

    return select_features.get_most_predictive_feature_set(X, y, keys), keys