def load_data(behavior, covariates=True):
    """Load the feature matrix and a string-labelled target for one behavior.

    Args:
        behavior: Target to build. ``'TQ_high_low'`` and ``'TQ_Grade'`` are
            derived by binning the ``'distress_TQ'`` column; any other value
            is read as a column of the behavior table and converted to
            strings with ``pu.convert_tin_to_str``.
        covariates: When true, ``'age'`` plus the dummy-coded binary columns
            are appended to the connectivity features. When false, only the
            connectivity features (cast to float) are returned.

    Returns:
        Tuple ``(ml_data, target_data)`` where ``target_data`` is a
        single-column DataFrame sharing the connectivity data's index.
    """
    behavior_df, conn_df = pu.load_data_full_subjects()

    if behavior == 'TQ_high_low':
        # right=True: bin index 2 and above means a score strictly above 46.
        hl_bins = np.digitize(
            behavior_df['distress_TQ'].values, bins=[0, 46, 84], right=True)
        labels = ['TQ_High' if bin_idx > 1 else 'TQ_low' for bin_idx in hl_bins]
    elif behavior == 'TQ_Grade':
        grade_bins = np.digitize(
            behavior_df['distress_TQ'].values,
            bins=[0, 30, 46, 59, 84],
            right=True)
        labels = ['Grade %d' % bin_idx for bin_idx in grade_bins]
    else:
        raw_values = behavior_df[behavior].values.astype(float)
        labels = pu.convert_tin_to_str(raw_values, behavior)

    labels_df = pd.DataFrame(labels, index=conn_df.index)

    # Guard clause: without covariates the features are the connectivity data.
    if not covariates:
        return conn_df.astype(float), labels_df

    binary_columns = [
        'smoking', 'deanxit_antidepressants', 'rivotril_antianxiety', 'sex'
    ]
    dummy_coded = pu.dummy_code_binary(behavior_df[binary_columns])
    covariate_df = pd.concat([behavior_df['age'], dummy_coded], axis=1)
    features = pd.concat([conn_df, covariate_df], axis=1)
    return features, labels_df
def type_classification_drop_mixed(ml_data, behavior_data, output_dir, models=None):
    """Classify tinnitus type after removing the ``'PT_and_NBN'`` subjects.

    Args:
        ml_data: Feature DataFrame; it is deep-copied, so the caller's frame
            is left untouched.
        behavior_data: Behavior table providing the ``'tinnitus_type'`` column.
        output_dir: Passed through to ``eeg_classify``.
        models: Model names to run; defaults to ``['extra_trees']``.
    """
    print(
        '%s: Running classification on tinnitus type, dropping mixed type subjects'
        % pu.ctime())

    features = deepcopy(ml_data)
    models = ['extra_trees'] if models is None else models

    type_labels = pu.convert_tin_to_str(
        behavior_data['tinnitus_type'].values.astype(float), 'tinnitus_type')
    labels_df = pd.DataFrame(type_labels, index=features.index)

    # Positions of the mixed-type subjects, then the index labels to drop.
    mixed_positions = []
    for position, label in enumerate(type_labels):
        if label == 'PT_and_NBN':
            mixed_positions.append(position)
    mixed_labels = features.iloc[mixed_positions].index

    features.drop(index=mixed_labels, inplace=True)
    labels_df.drop(index=mixed_labels, inplace=True)
    target = np.ravel(labels_df.values)

    for model in models:
        for resample in (None, 'over', 'under'):
            eeg_classify(features, target, 'tinnitus_type_no_mixed', model,
                         output_dir, resample=resample)
def side_classification_drop_asym(ml_data, behavior_data, output_dir, models=None):
    """Classify tinnitus side after removing the asymmetrical subjects.

    Subjects labelled ``'Right>Left'`` or ``'Left>Right'`` are dropped before
    classification.

    Args:
        ml_data: Feature DataFrame; it is deep-copied, so the caller's frame
            is left untouched.
        behavior_data: Behavior table providing the ``'tinnitus_side'`` column.
        output_dir: Passed through to ``eeg_classify``.
        models: Model names to run; defaults to ``['extra_trees']``.
    """
    print(
        '%s: Running classification on tinnitus side, dropping asymmetrical subjects'
        % pu.ctime())

    features = deepcopy(ml_data)
    models = ['extra_trees'] if models is None else models

    side_labels = pu.convert_tin_to_str(
        behavior_data['tinnitus_side'].values.astype(float), 'tinnitus_side')
    labels_df = pd.DataFrame(side_labels, index=features.index)

    # One pass over the labels; the order of positions does not matter
    # because they are only used to select index labels to drop.
    asym_positions = [
        position for position, label in enumerate(side_labels)
        if label in ('Right>Left', 'Left>Right')
    ]
    asym_labels = features.iloc[asym_positions].index

    features.drop(index=asym_labels, inplace=True)
    labels_df.drop(index=asym_labels, inplace=True)
    target = np.ravel(labels_df.values)

    for model in models:
        for resample in (None, 'over', 'under'):
            eeg_classify(features, target, 'tinnitus_side_no_asym', model,
                         output_dir, resample=resample)
def get_variable_data():
    """Count the classes of each categorical target and save them to Excel.

    Tabulates tinnitus side, tinnitus type, TQ high/low, TQ grade and gender,
    then writes the counts to ``tin_variables_classcount.xlsx`` inside
    ``./../data/eeg_classification`` via ``pu.save_xls``.
    """
    # Local import so this block does not depend on the file's import header.
    from os import makedirs

    def _count_data(data_to_count, vartype):
        """Return the value counts of ``data_to_count`` under column ``vartype``."""
        data_df = pd.DataFrame(data_to_count, columns=[vartype])
        return data_df[vartype].value_counts()

    output_dir = './../data/eeg_classification'
    # makedirs also creates a missing parent directory and does not fail when
    # the directory already exists (no check-then-create race).
    makedirs(output_dir, exist_ok=True)

    # Only the behavior table is needed here; connectivity data is unused.
    behavior_data, _ = pu.load_data_full_subjects()

    side_data = pu.convert_tin_to_str(
        behavior_data['tinnitus_side'].values.astype(float), 'tinnitus_side')
    side_count = _count_data(side_data, 'Side')

    type_data = pu.convert_tin_to_str(
        behavior_data['tinnitus_type'].values.astype(float), 'tinnitus_type')
    type_count = _count_data(type_data, 'Type')

    tq_data = behavior_data['distress_TQ'].values

    # right=True: a bin index below 2 means a score of 46 or less.
    high_low_thresholds = [0, 46, 84]
    binned_high_low = np.digitize(tq_data, bins=high_low_thresholds, right=True)
    tq_high_low = ['Low' if t < 2 else 'High' for t in binned_high_low]
    hl_count = _count_data(tq_high_low, 'TQ (High/Low)')

    grade_thresholds = [0, 30, 46, 59, 84]
    binned_grade = np.digitize(tq_data, bins=grade_thresholds, right=True)
    tq_grade = ['Grade_%d' % t for t in binned_grade]
    grade_count = _count_data(tq_grade, 'TQ (Grade)')

    # Any positive value in the 'sex' column is labelled 'Male'.
    gender_str = [
        'Male' if g > 0 else 'Female' for g in behavior_data['sex'].values
    ]
    gender_count = _count_data(gender_str, 'Gender')

    output = {
        'side': side_count,
        'type': type_count,
        'tq_high_low': hl_count,
        'tq_grade': grade_count,
        'gender': gender_count
    }
    pu.save_xls(output, join(output_dir, 'tin_variables_classcount.xlsx'))
def test_gridsearch():
    """Grid-search the SVM ``C`` parameter on SMOTE-resampled tinnitus-side data.

    Loads the connectivity features, oversamples with SMOTE, fits a
    scale -> feature-selection -> grid-search pipeline using stratified
    10-fold cross-validation, and prints the best parameters and best score.
    """

    def gridsearch_pipe(cv=None):
        """Build the pipeline whose final step is a GridSearchCV over an RBF SVC."""
        from sklearn.pipeline import Pipeline
        from sklearn.preprocessing import StandardScaler
        from sklearn.feature_selection import SelectFromModel
        from sklearn.ensemble import ExtraTreesClassifier
        from sklearn.model_selection import GridSearchCV
        from sklearn.svm import SVC

        # Only C is searched; the kernel is fixed to 'rbf' below.
        param_grid = {'C': [1, 10, 100]}
        return Pipeline([
            ('preprocess_data', StandardScaler()),
            ('feature_selection',
             SelectFromModel(ExtraTreesClassifier(random_state=13),
                             threshold="2*mean")),
            ('grid',
             GridSearchCV(SVC(kernel='rbf'),
                          param_grid=param_grid,
                          cv=cv,
                          scoring='balanced_accuracy'))
        ])

    print('%s: Loading data' % pu.ctime())
    behavior_data, conn_data = pu.load_data_full_subjects()
    ml_data_without_covariates = conn_data.astype(float)

    side_data = pu.convert_tin_to_str(
        behavior_data['tinnitus_side'].values.astype(float), 'tinnitus_side')

    resampler = SMOTE(sampling_strategy='not majority', random_state=seed)
    x_res, y_res = resampler.fit_resample(ml_data_without_covariates, side_data)

    n_splits = 10
    # random_state is only meaningful with shuffle=True; recent scikit-learn
    # raises ValueError when random_state is set while shuffle is False.
    skf = model_selection.StratifiedKFold(n_splits=n_splits,
                                          shuffle=True,
                                          random_state=seed)

    pipe = gridsearch_pipe(cv=skf).fit(x_res, y_res)

    # Fetch the fitted search step by the name it was registered under.
    gridsearch = pipe.named_steps['grid']
    best_params = gridsearch.best_params_
    print(best_params)
    best_score = gridsearch.best_score_
    print(best_score)

    print('%s: Finished' % pu.ctime())
def classification_main(covariates=True, n_iters=0):
    """Run every model / resampling / target combination through ``eeg_classify``.

    Args:
        covariates: When true, ``'age'`` and the dummy-coded binary columns
            are appended to the connectivity features; otherwise only the
            connectivity features (cast to float) are used.
        n_iters: When non-zero, run a permutation test instead of the plain
            classification: each combination is classified ``n_iters`` times
            on shuffled targets and the collected scores are pickled to
            ``perm_scores.pkl`` in a per-combination directory.
    """
    # Local import so this block does not depend on the file's import header.
    from os import makedirs

    output_dir = './../data/eeg_classification'
    # makedirs also creates a missing parent directory and does not fail when
    # the directory already exists (no check-then-create race).
    makedirs(output_dir, exist_ok=True)

    print('%s: Loading data' % pu.ctime())
    behavior_data, conn_data = pu.load_data_full_subjects()

    # Build only the feature matrix that will actually be used.
    if covariates:
        categorical_variables = [
            'smoking', 'deanxit_antidepressants', 'rivotril_antianxiety', 'sex'
        ]
        categorical_data = behavior_data[categorical_variables]
        dummy_coded_categorical = pu.dummy_code_binary(categorical_data)
        covariate_data = pd.concat(
            [behavior_data['age'], dummy_coded_categorical], axis=1)
        ml_data = pd.concat([conn_data, covariate_data], axis=1)
        cv_check = 'with_covariates'
    else:
        ml_data = conn_data.astype(float)
        cv_check = 'without_covariates'

    models = ['svm', 'extra_trees', 'knn']
    resample_methods = ['no_resample', 'ROS', 'SMOTE', 'RUS']

    targets = {}
    targets['tin_side'] = pu.convert_tin_to_str(
        behavior_data['tinnitus_side'].values.astype(float), 'tinnitus_side')
    targets['tin_type'] = pu.convert_tin_to_str(
        behavior_data['tinnitus_type'].values.astype(float), 'tinnitus_type')

    tq_data = behavior_data['distress_TQ'].values

    # NOTE: this target stays as the integer bin indices from np.digitize,
    # unlike the other targets, which are string labels.
    high_low_thresholds = [0, 46, 84]
    targets['TQ_high_low'] = np.digitize(
        tq_data, bins=high_low_thresholds, right=True)

    grade_thresholds = [0, 30, 46, 59, 84]
    binned_target = np.digitize(tq_data, bins=grade_thresholds, right=True)
    targets['TQ_grade'] = ['Grade_%d' % t for t in binned_target]

    # Disabled HADS target, kept for reference:
    # hads_thresholds = [8, 11, 21]  # 0-7 (normal); 8-10 (borderline); 11-21 (abnormal)
    # anx_binned = np.digitize(behavior_data['anxiety_score'].values.astype(float), bins=hads_thresholds, right=True)
    # dep_binned = np.digitize(behavior_data['depression_score'].values.astype(float), bins=hads_thresholds, right=True)
    # targets['hads_OVR'] = convert_hads_to_single_label(np.vstack((anx_binned, dep_binned)).T)

    if n_iters != 0:
        for model in models:
            for res in resample_methods:
                for target, target_data in targets.items():
                    model_outdir = join(
                        output_dir,
                        '%s %s %s %s' % (target, model, cv_check, res))
                    makedirs(model_outdir, exist_ok=True)

                    perm_scores = {}
                    for n in range(n_iters):
                        # NOTE(review): assumes shuffle returns a permuted
                        # copy rather than shuffling in place - confirm
                        # against the file's imports.
                        perm_target = shuffle(target_data)
                        scores = eeg_classify(ml_data,
                                              perm_target,
                                              target_type=target,
                                              model=model,
                                              resample=res)
                        perm_scores['Iter%05d' % n] = scores

                    with open(join(model_outdir, 'perm_scores.pkl'),
                              'wb') as file:
                        pkl.dump(perm_scores, file)
    else:
        for target, target_data in targets.items():
            for model in models:
                for res in resample_methods:
                    eeg_classify(ml_data,
                                 target_data,
                                 target_type=target,
                                 model=model,
                                 outdir=output_dir,
                                 resample=res)

    print('%s: Finished' % pu.ctime())