def load_data(behavior, covariates=True):
    """Build the ML feature matrix and string-label target for *behavior*.

    Parameters
    ----------
    behavior : str
        'TQ_high_low' or 'TQ_Grade' to bin the distress_TQ score, or any
        other behavior column name handled by pu.convert_tin_to_str.
    covariates : bool, default True
        When True, append age plus dummy-coded categorical covariates to
        the connectivity features; when False, use connectivity alone.

    Returns
    -------
    (ml_data, target_data) : (pandas.DataFrame, pandas.DataFrame)
        Feature matrix and single-column string labels, index-aligned with
        the connectivity data.
    """
    behavior_data, conn_data = pu.load_data_full_subjects()

    if behavior == 'TQ_high_low':
        # Two-class split of raw TQ distress scores around the 46 cutoff.
        raw_tq = behavior_data['distress_TQ'].values
        binned = np.digitize(raw_tq, bins=[0, 46, 84], right=True)
        labels = ['TQ_High' if b > 1 else 'TQ_low' for b in binned]
    elif behavior == 'TQ_Grade':
        # Four ordinal severity grades of the raw TQ distress scores.
        raw_tq = behavior_data['distress_TQ'].values
        binned = np.digitize(raw_tq, bins=[0, 30, 46, 59, 84], right=True)
        labels = ['Grade %d' % b for b in binned]
    else:
        # Any other behavior: convert its float scores to string labels.
        scores = behavior_data[behavior].values.astype(float)
        labels = pu.convert_tin_to_str(scores, behavior)

    target_data = pd.DataFrame(labels, index=conn_data.index)

    if covariates:
        cat_cols = [
            'smoking', 'deanxit_antidepressants', 'rivotril_antianxiety', 'sex'
        ]
        dummies = pu.dummy_code_binary(behavior_data[cat_cols])
        covariate_frame = pd.concat([behavior_data['age'], dummies], axis=1)
        ml_data = pd.concat([conn_data, covariate_frame], axis=1)
    else:
        ml_data = conn_data.astype(float)

    return ml_data, target_data
def lars():
    """Fit a cross-validated LARS regression of distress_TQ on EEG features.

    Pipeline: z-score the continuous features, variance-threshold the
    dummy-coded categorical features, keep features whose extra-trees
    importance exceeds 2x the mean, then fit LarsCV in-sample.

    Returns
    -------
    dict
        In-sample fit metrics ('r2', 'explained_variance', 'max_error',
        'mae', 'mse') plus 'selected_features', the names of the columns
        that survived feature selection.  (Originally returned None and
        only printed r2; the print is preserved.)
    """
    behavior_data, conn_data = pu.load_data_full_subjects()
    # BUG FIX: astype returns a new frame; the original discarded it,
    # leaving conn_data unconverted.
    conn_data = conn_data.astype(float)

    categorical_variables = ['smoking', 'deanxit_antidepressants',
                             'rivotril_antianxiety', 'sex']
    categorical_data = behavior_data[categorical_variables]
    dummy_coded_categorical = pu.dummy_code_binary(categorical_data)
    covariate_data = pd.concat([behavior_data['age'], dummy_coded_categorical], axis=1)
    ml_data = pd.concat([conn_data, covariate_data], axis=1)
    target = behavior_data['distress_TQ'].values.astype(float)

    # Dummy-coded columns are tagged with 'categorical' in their names.
    feature_names = list(ml_data)
    continuous_features = [f for f in feature_names if 'categorical' not in f]
    categorical_features = [f for f in feature_names if 'categorical' in f]
    continuous_indices = [ml_data.columns.get_loc(f) for f in continuous_features]
    categorical_indices = [ml_data.columns.get_loc(f) for f in categorical_features]
    ml_continuous = ml_data.values[:, continuous_indices]
    ml_categorical = ml_data.values[:, categorical_indices]

    # Standardize continuous data only; dummy codes stay 0/1.
    preproc = preprocessing.StandardScaler().fit(ml_continuous)
    ml_z = preproc.transform(ml_continuous)

    # Drop constant (zero-variance) categorical columns.
    varthresh = feature_selection.VarianceThreshold(threshold=0).fit(ml_categorical)
    ml_v = varthresh.transform(ml_categorical)
    surviving_categorical = [categorical_features[i]
                             for i in varthresh.get_support(indices=True)]

    ml_preprocessed = np.hstack((ml_z, ml_v))
    # BUG FIX: ml_preprocessed columns are ordered continuous-first, then the
    # surviving categorical columns -- NOT the original ml_data order.  The
    # original indexed feature_names directly, mislabelling the selected
    # features whenever any column was reordered or dropped.
    preprocessed_names = continuous_features + surviving_categorical

    # Importance-based feature selection with extra trees.
    clf = ensemble.ExtraTreesRegressor()
    model = feature_selection.SelectFromModel(clf, threshold="2*mean")
    ml_cleaned = model.fit_transform(ml_preprocessed, target)
    feature_indices = model.get_support(indices=True)
    cleaned_features = [preprocessed_names[i] for i in feature_indices]

    lars_classifier = linear_model.LarsCV(cv=3, normalize=False, fit_intercept=False)
    lars_classifier.fit(ml_cleaned, target)
    predicted = lars_classifier.predict(ml_cleaned)

    # Collect all fit metrics instead of discarding everything but r2.
    scores = {
        'r2': lars_classifier.score(ml_cleaned, target),
        'explained_variance': metrics.explained_variance_score(target, predicted),
        'max_error': metrics.max_error(target, predicted),
        'mae': metrics.mean_absolute_error(target, predicted),
        'mse': metrics.mean_squared_error(target, predicted),
        'selected_features': cleaned_features,
    }
    print(scores['r2'])
    return scores
if __name__ == "__main__":
    import logging
    logging.basicConfig(level=logging.INFO)

    # Output directory for extra-trees regression results.
    output_dir = './../data/eeg_regression/extra_trees/'
    if not os.path.isdir(output_dir):
        os.mkdir(output_dir)

    behavior_data, conn_data = pu.load_data_full_subjects()
    # NOTE(review): astype returns a copy, so this line has no effect --
    # conn_data is left unconverted; likely meant conn_data = conn_data.astype(float).
    conn_data.astype(float)

    # Dummy-code binary covariates and append them, with age, to the
    # connectivity features.
    categorical_variables = [
        'smoking', 'deanxit_antidepressants', 'rivotril_antianxiety', 'sex'
    ]
    categorical_data = behavior_data[categorical_variables]
    dummy_coded_categorical = pu.dummy_code_binary(categorical_data)
    covariate_data = pd.concat([behavior_data['age'], dummy_coded_categorical], axis=1)
    ml_data = pd.concat([conn_data, covariate_data], axis=1)

    # NOTE(review): this target is immediately shadowed by the loop below.
    target = behavior_data['distress_TQ'].values.astype(float)

    # Run a regression for each behavioral target column.
    targets = [
        'loudness_VAS', 'distress_TQ', 'distress_VAS', 'anxiety_score',
        'depression_score'
    ]
    for target in targets:
        target_vect = behavior_data[target].values.astype(float)
        logging.info('%s Running regression on %s' % (pu.ctime(), target))
        # NOTE(review): this call is truncated in this chunk of the file --
        # the remaining keyword arguments are not visible here.
        eeg_regression(eeg_data=ml_data, target_data=target_vect,
def classification_main(covariates=True, n_iters=0):
    """Run EEG connectivity classification over every target/model/resampler combo.

    Parameters
    ----------
    covariates : bool, default True
        When True, classify on connectivity plus age and dummy-coded
        categorical covariates; when False, connectivity alone.
    n_iters : int, default 0
        0 runs the real analyses; any other value runs that many
        permutation (shuffled-target) iterations per combination and
        pickles the resulting scores instead.
    """
    output_dir = './../data/eeg_classification'
    if not isdir(output_dir):
        mkdir(output_dir)

    print('%s: Loading data' % pu.ctime())
    behavior_data, conn_data = pu.load_data_full_subjects()

    # Feature matrix without covariates: connectivity only.
    ml_data_without_covariates = conn_data.astype(float)

    # Feature matrix with covariates: connectivity + age + dummy-coded binaries.
    dummy_coded = pu.dummy_code_binary(behavior_data[[
        'smoking', 'deanxit_antidepressants', 'rivotril_antianxiety', 'sex'
    ]])
    covariate_frame = pd.concat([behavior_data['age'], dummy_coded], axis=1)
    ml_data_with_covariates = pd.concat([conn_data, covariate_frame], axis=1)

    models = ['svm', 'extra_trees', 'knn']
    resample_methods = ['no_resample', 'ROS', 'SMOTE', 'RUS']

    # Classification targets, keyed by the short name used in output paths.
    tq_raw = behavior_data['distress_TQ'].values
    targets = {
        'tin_side': pu.convert_tin_to_str(
            behavior_data['tinnitus_side'].values.astype(float), 'tinnitus_side'),
        'tin_type': pu.convert_tin_to_str(
            behavior_data['tinnitus_type'].values.astype(float), 'tinnitus_type'),
        'TQ_high_low': np.digitize(tq_raw, bins=[0, 46, 84], right=True),
        'TQ_grade': ['Grade_%d' % g for g in
                     np.digitize(tq_raw, bins=[0, 30, 46, 59, 84], right=True)],
    }
    # hads_thresholds = [8, 11, 21]  # 0-7 (normal); 8-10 (borderline); 11-21 (abnormal)
    # anx_binned = np.digitize(behavior_data['anxiety_score'].values.astype(float), bins=hads_thresholds, right=True)
    # dep_binned = np.digitize(behavior_data['depression_score'].values.astype(float), bins=hads_thresholds, right=True)
    # targets['hads_OVR'] = convert_hads_to_single_label(np.vstack((anx_binned, dep_binned)).T)

    if covariates:
        ml_data, cv_check = ml_data_with_covariates, 'with_covariates'
    else:
        ml_data, cv_check = ml_data_without_covariates, 'without_covariates'

    if n_iters != 0:
        # Permutation testing: shuffle the target, classify, pickle scores.
        for model in models:
            for res in resample_methods:
                for target_name, target_vals in targets.items():
                    model_outdir = join(
                        output_dir,
                        '%s %s %s %s' % (target_name, model, cv_check, res))
                    if not isdir(model_outdir):
                        mkdir(model_outdir)
                    perm_scores = {}
                    for it in range(n_iters):
                        permuted = shuffle(target_vals)
                        perm_scores['Iter%05d' % it] = eeg_classify(
                            ml_data, permuted, target_type=target_name,
                            model=model, resample=res)
                    with open(join(model_outdir, 'perm_scores.pkl'), 'wb') as file:
                        pkl.dump(perm_scores, file)
    else:
        # Real analyses: one run per target/model/resampler combination.
        for target_name, target_vals in targets.items():
            for model in models:
                for res in resample_methods:
                    eeg_classify(ml_data, target_vals, target_type=target_name,
                                 model=model, outdir=output_dir, resample=res)

    print('%s: Finished' % pu.ctime())