Example #1
def feature_selection_pipeline_from_file():
    # get data
    dataset = refactor_labels(get_data(path, 'Sheet1'), group_column)

    # all the visualizations
    auto_visualize_features(dataset.drop(subject_number_column, axis=1))

    # remove columns with too many missing values
    non_missing_values_threshold = len(dataset.index) * 0.99
    dataset.dropna(axis=1, thresh=non_missing_values_threshold, inplace=True)

    # impute remaining missing values with the column mean
    dataset.fillna(dataset.mean(), inplace=True)

    # set X, Y
    X = dataset.drop([group_column, subject_number_column], axis=1)
    sbj = dataset[subject_number_column]
    Y = dataset[group_column]
    names = list(X)

    # scale X to the [0, 1] range, keeping the column names
    X = pd.DataFrame(preprocessing.MinMaxScaler().fit_transform(X))
    X.columns = names
    print("p0", X.shape)

    # cut off low-variance features; the transform's result must be kept,
    # otherwise the filter has no effect on X
    variance_threshold = 0.05
    variance_cutoff = VarianceThreshold(threshold=variance_threshold)
    X = X.loc[:, variance_cutoff.fit(X).get_support()]

    print("p1", X.shape)

    # cut off one feature from each highly correlated pair; the strict upper
    # triangle ensures each pair is checked only once
    corr_matrix = X.corr().abs()
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

    to_drop = [column for column in upper.columns if any(upper[column] > 0.7)]

    X.drop(to_drop, axis=1, inplace=True)
    print("p2", X.shape)


    # random forest feature importance
    k_best_features = 42
    feature_importance = random_forest_selection.get_feature_importance(X, Y)
    random_forest_selection.save_feature_importance(feature_importance_txt_path, feature_importance, list(X))
    processed_dataframe, X = random_forest_selection.get_k_most_important_features(X, Y, k_best_features, feature_importance)
    print("p3", processed_dataframe.shape)
    processed_dataframe.to_csv(processed_dataframe_path)


    # PCA
    pca = PCA_Obj(X)
    pca.explained_variance_graph(pca_explained_variance_graph_path)
    pca.print_components()
    n_components = 12
    X = pca.create_pca(n_components)
    pca.save_pca_data(features_after_pca, Y=Y)
    print("p4", X.shape)
Example #2
    def __init__(self, data=None, path=None, remove_missing_values=1, group_column='group', subject_number_column='Subject_Number'):
        if path:
            self.dataset = refactor_labels(get_data(path, 'Sheet1'), group_column)
            features_df = self.dataset.drop([group_column, subject_number_column], axis=1)
        if data is not None:
            self.dataset = features_df = data


        if remove_missing_values:
            features_df = features_df.dropna(axis=1)

        self.X = features_df.values
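
This __init__ accepts either a ready DataFrame or a path to an Excel workbook. The enclosing class is not shown in the example, so the name below is hypothetical; usage might look like:

# Hypothetical usage; FeatureData stands in for the unseen enclosing class.
fd = FeatureData(path=r"data\features.xlsx")   # load and label from Excel
fd = FeatureData(data=existing_dataframe)      # or wrap a DataFrame directly
print(fd.X.shape)                              # feature matrix as an ndarray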
Example #3
def run_stuff():
    dataset = refactor_labels(get_data(r"C:\Users\user\PycharmProjects\AnxietyClassifier(2)\Alls_data_NO_specific_vars_corr.xlsx", "Sheet1"), "group")
    dataset = imputing_avarage(dataset)
    features_df = dataset.drop(['Age', 'group', 'PHQ9', 'Subject_Number'], axis=1)
    X = features_df.values
    X = StandardScaler().fit_transform(X)

    # X = array[:,3:116]
    # RandomizedPCA was removed from scikit-learn; use PCA with the randomized solver
    pca = PCA(n_components=50, svd_solver='randomized')
    pca.fit(X)
    plt.plot(np.cumsum(pca.explained_variance_ratio_))
    plt.xlabel('number of components')
    plt.ylabel('cumulative explained variance')
    plt.show()
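
The cumulative-variance curve plotted above is usually read off to pick the smallest number of components that retains most of the variance. A short sketch of doing the same programmatically (the 0.95 cutoff is an arbitrary illustrative choice):

# Smallest number of components reaching 95% cumulative explained variance.
cumulative = np.cumsum(pca.explained_variance_ratio_)
n_components = int(np.argmax(cumulative >= 0.95)) + 1
print(n_components)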
Example #4
def feature_selection_pipeline_from_file():
    # get data
    dataset = refactor_labels(get_data(path, 'Sheet1'), group_column)

    # all the visualizations
    #auto_visualize_features(dataset.drop([subject_number_column], 1))

    # remove columns with too many missing values
    non_missing_values_threshold = len(dataset.index) * 0.99
    dataset.dropna(axis=1, thresh=non_missing_values_threshold, inplace=True)

    # impute remaining missing values with the column mean
    dataset.fillna(dataset.mean(), inplace=True)

    # set X, Y
    X = dataset.drop([group_column, subject_number_column], axis=1)
    sbj = dataset[subject_number_column]
    Y = dataset[group_column]

    # scale X to the [0, 1] range, keeping the column names so the
    # saved dataframe and the correlation cutoff stay readable
    names = list(X)
    X = pd.DataFrame(preprocessing.MinMaxScaler().fit_transform(X))
    X.columns = names

    # cut off low-variance features; the transform's result must be kept,
    # otherwise the filter has no effect on X
    variance_threshold = 0.03
    variance_cutoff = VarianceThreshold(threshold=variance_threshold)
    X = X.loc[:, variance_cutoff.fit(X).get_support()]

    print("p1", X.shape)

    # cut off one feature from each highly correlated pair
    corr_matrix = X.corr().abs()
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    to_drop = [column for column in upper.columns if any(upper[column] > 0.8)]
    X.drop(columns=to_drop, inplace=True)

    print("p2", X.shape)

    # save the filtered dataframe
    processed_dataframe = pd.concat([X, Y, sbj], axis=1)
    processed_dataframe.to_csv(processed_dataframe_path)

    # random forest feature importance; random_forest is presumably a
    # module-level flag defined elsewhere
    if random_forest:
        k_best_features = 31
        feature_importance = random_forest_selection.get_feature_importance(X, Y)
        random_forest_selection.save_feature_importance(feature_importance_txt_path, feature_importance)
        processed_dataframe, X = random_forest_selection.get_k_most_important_features(X, Y, k_best_features, feature_importance)
        processed_dataframe.to_csv(processed_dataframe_path)
    print("p4", processed_dataframe.shape)
Example #5
def visualize_features_interactivly():
    files = [file_a, file_b, file_c, file_d]
    file_num = input(
        "which file's features do you wanna see?\n {}\n".format(files))
    if file_num == 'a':
        visualization_object = DataVisualizationObj(get_data(file_a, 'Sheet1'))

    elif file_num == 'b':
        visualization_object = DataVisualizationObj(get_data(file_b, 'Sheet1'))

    elif file_num == 'c':
        visualization_object = DataVisualizationObj(get_data(file_c, 'Sheet1'))

    elif file_num == 'd':
        visualization_object = DataVisualizationObj(get_data(file_d, 'Sheet1'))

    else:
        return

    while True:
        func_dict = {
            1: visualization_object.create_binary_hist,
            2: visualization_object.print_data,
            3: visualization_object.plot_data,
            4: visualization_object.plot_corr,
            5: visualization_object.plot_correlation_matrix
        }
        try:
            vis_type = int(
                input(
                    "which visualization func do you wanna use?\n {}\n".format(
                        func_dict)))
        except ValueError:
            return

        # guard against numbers outside the menu instead of raising KeyError
        if vis_type not in func_dict:
            return
        func_dict[vis_type]()
Example #6
    def __init__(self,
                 data=None,
                 path=None,
                 remove_missing_values=1,
                 group_column='group',
                 subject_number_column='Subject_Number'):
        if path:
            self.dataset = refactor_labels(get_data(path, 'Sheet1'),
                                           group_column)
            features_df = self.dataset.drop(
                [group_column, subject_number_column], axis=1)
        if data is not None:
            self.dataset = features_df = data

        if remove_missing_values:
            features_df = features_df.dropna(axis=1)

        self.X = features_df.values
Example #7
def auto_visualize_features(data=None,
                            saving_path="subject_features_before_selection",
                            create_plots=1):
    file = r"C:\‏‏PycharmProjects\AnxietyClassifier\ExtractedFeaturesFiles\extracted_features_subjects_set_Updated,with_outlier_subjects_False_with_9029,9014,2018-10-29.xlsx"

    if data is not None:
        visualization_object = DataVisualizationObj(data)
    else:
        visualization_object = DataVisualizationObj(get_data(file, 'Sheet1'))
    path = r"C:\‏‏PycharmProjects\AnxietyClassifier\visualizations\high_low_plots_4,11"
    #visualization_object.detect_outliers()
    #visualization_object.print_missing_values()
    #visualization_object.describe()
    #visualization_object.print_variance(path=path)
    #    visualization_object.detect_outliers(path=path)
    #visualization_object.plot_correlation_matrix(path=path)
    if create_plots:
        visualization_object.create_binary_hist(path=path)
        visualization_object.create_two_hists_by_group(path=path)
Example #8
from DataImporting.ImportData import get_data
from DataImporting.ImportData import refactor_labels
import numpy as np
from scipy import stats

file_g = r"C:\‏‏PycharmProjects\AnxietyClassifier\ExtractedFeatures\data_features_for_each_matrix.xlsx"
group_column = "group"
df = refactor_labels(get_data(file_g, 'Sheet1'), group_column)
print("df.shape\n",df.shape)
df = df.dropna(axis=1)
z = np.abs(stats.zscore(df))
threshold = 3
print(np.where(z > threshold))
#
# print("names\n", list(df))
# print("count missing\n", df.isnull().sum())
# print("\ndf.info()\n",df.info())
# print("\ndf.describe()\n", df.describe())
# print("\ndf groupby count\n", df.groupby(group_column)[group_column].count())
# print("\ndf groupby describe\n", df.groupby(group_column).describe())
Example #9
def meow():
    dataset = refactor_labels(get_data(r"C:\Users\user\PycharmProjects\AnxietyClassifier(2)\Alls_data_NO_specific_vars_corr.xlsx", "Sheet1"), "group")
    return PCA_transforme(dataset, 6)