예제 #1
0
def load_data(behavior, covariates=True):
    """Build the feature table and the label column for one target.

    Parameters
    ----------
    behavior : str
        ``'TQ_high_low'`` and ``'TQ_Grade'`` derive labels by binning the
        ``distress_TQ`` column; any other value is used as a column name of
        the behaviour table and converted with ``pu.convert_tin_to_str``.
    covariates : bool, default True
        When true, ``age`` and the dummy-coded binary covariates are appended
        to the connectivity features. When false, only the connectivity data
        (cast to float) is returned.

    Returns
    -------
    tuple
        ``(ml_data, target_data)`` where ``target_data`` is a single-column
        DataFrame of string labels indexed like the connectivity data.
    """
    behavior_data, conn_data = pu.load_data_full_subjects()

    if behavior in ('TQ_high_low', 'TQ_Grade'):
        tq_scores = behavior_data['distress_TQ'].values
        if behavior == 'TQ_high_low':
            # right=True: bin 2 holds scores in (46, 84], everything at or
            # below 46 falls into bins 0/1 and is labelled low.
            bin_ids = np.digitize(tq_scores, bins=[0, 46, 84], right=True)
            labels = []
            for bin_id in bin_ids:
                labels.append('TQ_High' if bin_id > 1 else 'TQ_low')
        else:
            bin_ids = np.digitize(
                tq_scores, bins=[0, 30, 46, 59, 84], right=True)
            labels = ['Grade %d' % bin_id for bin_id in bin_ids]
    else:
        numeric_target = behavior_data[behavior].values.astype(float)
        labels = pu.convert_tin_to_str(numeric_target, behavior)
    target_data = pd.DataFrame(labels, index=conn_data.index)

    if covariates:
        binary_columns = behavior_data[[
            'smoking', 'deanxit_antidepressants', 'rivotril_antianxiety',
            'sex'
        ]]
        covariate_frame = pd.concat(
            [behavior_data['age'], pu.dummy_code_binary(binary_columns)],
            axis=1)
        features = pd.concat([conn_data, covariate_frame], axis=1)
    else:
        features = conn_data.astype(float)
    return features, target_data
예제 #2
0
def test_gridsearch():
    """Run a small SVC grid search on the tinnitus-side target and print it.

    Connectivity data (no covariates) is oversampled with SMOTE, then pushed
    through a pipeline of standardisation, ExtraTrees-based feature selection
    and a ``GridSearchCV`` over the SVC ``C`` parameter. The best parameters
    and the best cross-validated balanced accuracy are printed.

    Relies on the module-level names ``pu``, ``SMOTE``, ``model_selection``
    and ``seed``.
    """
    def gridsearch_pipe(cv=None):
        """Return the scaler -> feature selection -> grid-search pipeline."""
        from sklearn.pipeline import Pipeline
        from sklearn.preprocessing import StandardScaler
        from sklearn.feature_selection import SelectFromModel
        from sklearn.ensemble import ExtraTreesClassifier
        from sklearn.model_selection import GridSearchCV
        from sklearn.svm import SVC

        # Only C is searched; gamma and kernel were previously considered
        # as additional grid dimensions but are not part of the search.
        param_grid = {'C': [1, 10, 100]}

        pipe = Pipeline([
            ('preprocess_data', StandardScaler()),
            ('feature_selection',
             SelectFromModel(ExtraTreesClassifier(random_state=13),
                             threshold="2*mean")),
            ('grid',
             GridSearchCV(SVC(kernel='rbf'),
                          param_grid=param_grid,
                          cv=cv,
                          scoring='balanced_accuracy'))
        ])

        return pipe

    print('%s: Loading data' % pu.ctime())
    behavior_data, conn_data = pu.load_data_full_subjects()
    ml_data_without_covariates = conn_data.astype(float)

    side_data = pu.convert_tin_to_str(
        behavior_data['tinnitus_side'].values.astype(float), 'tinnitus_side')

    resampler = SMOTE(sampling_strategy='not majority', random_state=seed)

    x_res, y_res = resampler.fit_resample(ml_data_without_covariates,
                                          side_data)

    n_splits = 10
    # random_state only has an effect when shuffle=True; scikit-learn raises
    # a ValueError for random_state with shuffle=False in current releases
    # (and silently ignored the seed in older ones).
    skf = model_selection.StratifiedKFold(n_splits=n_splits,
                                          shuffle=True,
                                          random_state=seed)

    pipe = gridsearch_pipe(cv=skf).fit(x_res, y_res)
    gridsearch = pipe[-1]  # last pipeline step is the fitted GridSearchCV
    best_params = gridsearch.best_params_
    print(best_params)
    best_score = gridsearch.best_score_
    print(best_score)

    print('%s: Finished' % pu.ctime())
예제 #3
0
def lars():
    """Fit a cross-validated LARS regression of distress_TQ and report fit.

    Steps: load connectivity data and covariates, z-score the continuous
    columns, drop zero-variance categorical columns, select features with an
    ExtraTrees regressor (``threshold="2*mean"``), then fit ``LarsCV`` on the
    selected features. All scores are computed on the same data the model was
    fitted on (there is no held-out set).

    Returns
    -------
    dict
        ``'r2'``, ``'explained_variance'``, ``'max_error'``,
        ``'mean_absolute_error'``, ``'mean_squared_error'`` and
        ``'features'`` (names of the selected columns). The R^2 value is also
        printed, as before.
    """
    behavior_data, conn_data = pu.load_data_full_subjects()
    # astype returns a new object; the result has to be kept for the cast
    # to have any effect.
    conn_data = conn_data.astype(float)

    categorical_variables = [
        'smoking', 'deanxit_antidepressants', 'rivotril_antianxiety', 'sex'
    ]
    categorical_data = behavior_data[categorical_variables]
    dummy_coded_categorical = pu.dummy_code_binary(categorical_data)
    covariate_data = pd.concat(
        [behavior_data['age'], dummy_coded_categorical], axis=1)

    ml_data = pd.concat([conn_data, covariate_data], axis=1)
    target = behavior_data['distress_TQ'].values.astype(float)

    # Columns whose name contains 'categorical' are treated as categorical
    # (presumably the naming produced by pu.dummy_code_binary - confirm).
    feature_names = list(ml_data)
    continuous_features = [f for f in feature_names if 'categorical' not in f]
    categorical_features = [f for f in feature_names if 'categorical' in f]
    continuous_indices = [
        ml_data.columns.get_loc(name) for name in continuous_features
    ]
    categorical_indices = [
        ml_data.columns.get_loc(name) for name in categorical_features
    ]

    ml_continuous = ml_data.values[:, continuous_indices]
    ml_categorical = ml_data.values[:, categorical_indices]

    # Standardization for continuous data
    preproc = preprocessing.StandardScaler().fit(ml_continuous)
    ml_z = preproc.transform(ml_continuous)

    # Variance threshold for categorical data
    varthresh = feature_selection.VarianceThreshold(
        threshold=0).fit(ml_categorical)
    ml_v = varthresh.transform(ml_categorical)

    ml_preprocessed = np.hstack((ml_z, ml_v))
    # Column order of ml_preprocessed: all continuous columns first, then the
    # categorical columns that survived the variance filter. Names must be
    # tracked in that order, not in the original feature_names order.
    kept_categorical = [
        name
        for name, keep in zip(categorical_features, varthresh.get_support())
        if keep
    ]
    preprocessed_names = continuous_features + kept_categorical

    # Feature selection with extra trees
    clf = ensemble.ExtraTreesRegressor()
    model = feature_selection.SelectFromModel(clf, threshold="2*mean")
    ml_cleaned = model.fit_transform(ml_preprocessed, target)
    feature_indices = model.get_support(indices=True)
    cleaned_features = [preprocessed_names[i] for i in feature_indices]

    # NOTE(review): the `normalize` argument was removed from LarsCV in
    # scikit-learn 1.2; kept here to match the library version this file
    # targets - confirm before upgrading.
    lars_model = linear_model.LarsCV(
        cv=3, normalize=False, fit_intercept=False)

    lars_model.fit(ml_cleaned, target)
    predicted = lars_model.predict(ml_cleaned)

    r2 = lars_model.score(ml_cleaned, target)
    results = {
        'r2': r2,
        'explained_variance':
            metrics.explained_variance_score(target, predicted),
        'max_error': metrics.max_error(target, predicted),
        'mean_absolute_error': metrics.mean_absolute_error(target, predicted),
        'mean_squared_error': metrics.mean_squared_error(target, predicted),
        'features': cleaned_features,
    }
    print(r2)
    return results
def get_variable_data():
    """Count class memberships of the categorical targets and save them.

    Produces value counts for tinnitus side, tinnitus type, TQ high/low,
    TQ grade and gender, and hands them to ``pu.save_xls`` as one dict,
    written to ``tin_variables_classcount.xlsx`` in the classification
    output directory (created if missing).
    """
    def _count_data(data_to_count, vartype):
        # One named column -> occurrences of each distinct value.
        frame = pd.DataFrame(data_to_count, columns=[vartype])
        return frame[vartype].value_counts()

    output_dir = './../data/eeg_classification'
    if not isdir(output_dir):
        mkdir(output_dir)
    behavior_data, conn_data = pu.load_data_full_subjects()

    counts = {}

    # Tinnitus side / type share the same conversion path.
    tinnitus_specs = (
        ('side', 'tinnitus_side', 'Side'),
        ('type', 'tinnitus_type', 'Type'),
    )
    for key, column, title in tinnitus_specs:
        as_str = pu.convert_tin_to_str(
            behavior_data[column].values.astype(float), column)
        counts[key] = _count_data(as_str, title)

    tq_scores = behavior_data['distress_TQ'].values

    # Bin index 2 (scores above 46) is "High", everything below is "Low".
    high_low_bins = np.digitize(tq_scores, bins=[0, 46, 84], right=True)
    high_low_labels = []
    for bin_id in high_low_bins:
        high_low_labels.append('Low' if bin_id < 2 else 'High')
    counts['tq_high_low'] = _count_data(high_low_labels, 'TQ (High/Low)')

    grade_bins = np.digitize(
        tq_scores, bins=[0, 30, 46, 59, 84], right=True)
    grade_labels = ['Grade_%d' % bin_id for bin_id in grade_bins]
    counts['tq_grade'] = _count_data(grade_labels, 'TQ (Grade)')

    # Positive codes are reported as male, everything else as female.
    gender_labels = []
    for code in behavior_data['sex'].values:
        gender_labels.append('Male' if code > 0 else 'Female')
    counts['gender'] = _count_data(gender_labels, 'Gender')

    workbook_path = join(output_dir, 'tin_variables_classcount.xlsx')
    pu.save_xls(counts, workbook_path)
def plot_age_historgram(output_dir=None):
    """Draw a histogram of subject age.

    Parameters
    ----------
    output_dir : str or None, default None
        When None the figure is shown interactively; otherwise it is saved
        as ``age_hist.png`` inside this directory.
    """
    behavior_data, _ = pu.load_data_full_subjects()
    ages = behavior_data['age'].values

    sns.set_style('darkgrid')
    fig, ax = plt.subplots(figsize=(8, 6))
    bar_style = {"alpha": .75, "color": 'b'}
    # kde=False: plain counts only, no density curve.
    sns.distplot(ages, kde=False, ax=ax, hist_kws=bar_style)
    ax.set_xlabel('Age')
    ax.set_ylabel('Frequency')

    if output_dir is not None:
        fig.savefig(join(output_dir, 'age_hist.png'))
        return
    plt.show()
예제 #6
0
    if outdir is not None:
        score_df.to_excel(
            os.path.join(outdir, '%s_performance_measures.xlsx' % target_type))
        # coef_df.to_excel(os.path.join(outdir, '%s_feature_coefficients.xlsx' % target_type))


if __name__ == "__main__":
    import logging
    logging.basicConfig(level=logging.INFO)

    output_dir = './../data/eeg_regression/extra_trees/'
    if not os.path.isdir(output_dir):
        os.mkdir(output_dir)

    behavior_data, conn_data = pu.load_data_full_subjects()
    conn_data.astype(float)

    categorical_variables = [
        'smoking', 'deanxit_antidepressants', 'rivotril_antianxiety', 'sex'
    ]
    categorical_data = behavior_data[categorical_variables]
    dummy_coded_categorical = pu.dummy_code_binary(categorical_data)
    covariate_data = pd.concat([behavior_data['age'], dummy_coded_categorical],
                               axis=1)

    ml_data = pd.concat([conn_data, covariate_data], axis=1)
    target = behavior_data['distress_TQ'].values.astype(float)

    targets = [
        'loudness_VAS', 'distress_TQ', 'distress_VAS', 'anxiety_score',
예제 #7
0
def classification_main(covariates=True, n_iters=0):
    """Run every model / resampling / target combination of the classifier.

    Parameters
    ----------
    covariates : bool, default True
        Use connectivity data plus age and dummy-coded binary covariates
        when true, connectivity data only (cast to float) when false.
    n_iters : int, default 0
        ``0`` runs ``eeg_classify`` once per combination on the true labels
        and lets it write into the output directory. Any other value runs a
        permutation test instead: for each combination the labels are
        shuffled ``n_iters`` times and the collected scores are pickled to
        ``perm_scores.pkl`` in a per-combination sub-directory.
    """
    output_dir = './../data/eeg_classification'
    if not isdir(output_dir):
        mkdir(output_dir)

    print('%s: Loading data' % pu.ctime())
    behavior_data, conn_data = pu.load_data_full_subjects()
    features_plain = conn_data.astype(float)

    binary_columns = behavior_data[[
        'smoking', 'deanxit_antidepressants', 'rivotril_antianxiety', 'sex'
    ]]
    covariate_frame = pd.concat(
        [behavior_data['age'], pu.dummy_code_binary(binary_columns)], axis=1)
    features_with_covariates = pd.concat([conn_data, covariate_frame], axis=1)

    models = ['svm', 'extra_trees', 'knn']
    resample_methods = ['no_resample', 'ROS', 'SMOTE', 'RUS']

    # Target name -> label sequence. Insertion order drives the loop order.
    targets = {}
    for key, column in (('tin_side', 'tinnitus_side'),
                        ('tin_type', 'tinnitus_type')):
        targets[key] = pu.convert_tin_to_str(
            behavior_data[column].values.astype(float), column)

    tq_scores = behavior_data['distress_TQ'].values
    # High/low target keeps the raw bin indices (not string labels).
    targets['TQ_high_low'] = np.digitize(
        tq_scores, bins=[0, 46, 84], right=True)
    grade_bins = np.digitize(tq_scores, bins=[0, 30, 46, 59, 84], right=True)
    targets['TQ_grade'] = ['Grade_%d' % bin_id for bin_id in grade_bins]
    # A HADS-based (anxiety/depression) target is currently disabled.

    ml_data, cv_check = (
        (features_with_covariates, 'with_covariates') if covariates
        else (features_plain, 'without_covariates'))

    if n_iters == 0:
        # Single pass on the real labels; eeg_classify writes its own output.
        for target in targets:
            true_labels = targets[target]
            for model in models:
                for res in resample_methods:
                    eeg_classify(ml_data,
                                 true_labels,
                                 target_type=target,
                                 model=model,
                                 outdir=output_dir,
                                 resample=res)
    else:
        # Permutation test: shuffled labels, scores collected per iteration.
        for model in models:
            for res in resample_methods:
                for target in targets:
                    true_labels = targets[target]

                    combo_name = '%s %s %s %s' % (target, model, cv_check,
                                                  res)
                    model_outdir = join(output_dir, combo_name)
                    if not isdir(model_outdir):
                        mkdir(model_outdir)

                    perm_scores = {}
                    for iteration in range(n_iters):
                        shuffled_labels = shuffle(true_labels)
                        perm_scores['Iter%05d' % iteration] = eeg_classify(
                            ml_data,
                            shuffled_labels,
                            target_type=target,
                            model=model,
                            resample=res)

                    pickle_path = join(model_outdir, 'perm_scores.pkl')
                    with open(pickle_path, 'wb') as handle:
                        pkl.dump(perm_scores, handle)

    print('%s: Finished' % pu.ctime())