def feature_selection_pipeline_from_file():
    # get data
    dataset = refactor_labels(get_data(path, 'Sheet1'), group_column)

    # all the visualizations
    auto_visualize_features(dataset.drop(subject_number_column, axis=1))

    # remove columns with too many missing values
    non_missing_values_threshold = len(dataset.index) * 0.99
    dataset.dropna(axis=1, thresh=non_missing_values_threshold, inplace=True)

    # impute remaining missing values with the column mean
    dataset.fillna(dataset.mean(), inplace=True)

    # set X
    X = dataset.drop([group_column, subject_number_column], axis=1)
    sbj = dataset[subject_number_column]
    Y = dataset[group_column]
    names = list(X)

    # standardize X to [0, 1], keeping the original column names
    X = pd.DataFrame(preprocessing.MinMaxScaler().fit_transform(X))
    X.columns = names
    print("p0", X.shape)

    # cut off low-variance features; assign the result back (fit_transform
    # alone does not modify X) and keep column names via the support mask
    variance_threshold = 0.05
    variance_cutoff = VarianceThreshold(threshold=variance_threshold)
    X = X.loc[:, variance_cutoff.fit(X).get_support()]
    print("p1", X.shape)

    # cut off highly correlated features: scan the upper triangle of the
    # correlation matrix and drop the later column of each correlated pair
    corr_matrix = X.corr().abs()
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    to_drop = [column for column in upper.columns if any(upper[column] > 0.7)]
    X.drop(to_drop, axis=1, inplace=True)
    print("p2", X.shape)

    # random forest: rank features by importance and keep the k best
    k_best_features = 42
    feature_importance = random_forest_selection.get_feature_importance(X, Y)
    random_forest_selection.save_feature_importance(feature_importance_txt_path, feature_importance, list(X))
    processed_dataframe, X = random_forest_selection.get_k_most_important_features(X, Y, k_best_features, feature_importance)
    print("p3", processed_dataframe.shape)
    processed_dataframe.to_csv(processed_dataframe_path)

    # PCA
    pca = PCA_Obj(X)
    pca.explained_variance_graph(pca_explained_variance_graph_path)
    pca.print_components()
    n_components = 12
    X = pca.create_pca(n_components)
    pca.save_pca_data(features_after_pca, Y=Y)
    print("p4", X.shape)
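# --- illustrative sketch, not part of the pipeline above ---
# The pipeline reads several module-level names defined elsewhere in the
# project: `path`, `group_column`, `subject_number_column`,
# `feature_importance_txt_path`, `processed_dataframe_path`,
# `pca_explained_variance_graph_path` and `features_after_pca`.
# A minimal sketch of that configuration; every value below is an assumption
# for illustration, not the project's actual setting.
path = r"C:\PycharmProjects\AnxietyClassifier\ExtractedFeaturesFiles\features.xlsx"
group_column = "group"
subject_number_column = "Subject_Number"
feature_importance_txt_path = r"C:\PycharmProjects\AnxietyClassifier\feature_importance.txt"
processed_dataframe_path = r"C:\PycharmProjects\AnxietyClassifier\processed_dataframe.csv"
pca_explained_variance_graph_path = r"C:\PycharmProjects\AnxietyClassifier\pca_explained_variance.png"
features_after_pca = r"C:\PycharmProjects\AnxietyClassifier\features_after_pca.csv"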
def __init__(self, data=None, path=None, remove_missing_values=1,
             group_column='group', subject_number_column='Subject_Number'):
    if path:
        self.dataset = refactor_labels(get_data(path, 'Sheet1'), group_column)
        features_df = self.dataset.drop([group_column, subject_number_column], axis=1)
    if data is not None:
        # an explicit DataFrame overrides anything loaded from `path`
        self.dataset = features_df = data
    if remove_missing_values:
        features_df = features_df.dropna(axis=1)
    self.X = features_df.values
def run_stuff():
    dataset = refactor_labels(
        get_data(r"C:\Users\user\PycharmProjects\AnxietyClassifier(2)\Alls_data_NO_specific_vars_corr.xlsx", "Sheet1"),
        "group")
    dataset = imputing_avarage(dataset)
    features_df = dataset.drop(['Age', 'group', 'PHQ9', 'Subject_Number'], axis=1)
    X = features_df.values
    X = StandardScaler().fit_transform(X)
    #X = array[:,3:116]
    # RandomizedPCA was removed from scikit-learn; PCA with the randomized
    # solver is the modern equivalent
    pca = PCA(n_components=50, svd_solver='randomized')
    pca.fit(X)
    # plot cumulative explained variance against the number of components
    plt.plot(np.cumsum(pca.explained_variance_ratio_))
    plt.xlabel('number of components')
    plt.ylabel('cumulative explained variance')
    plt.show()
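# A minimal sketch of how the cumulative-variance curve plotted above can be
# used to pick a component count programmatically: take the smallest number
# of components reaching a target ratio. The 0.95 target is an illustrative
# assumption, not a value from the project.
def components_for_variance(pca, target=0.95):
    cumulative = np.cumsum(pca.explained_variance_ratio_)
    return int(np.argmax(cumulative >= target)) + 1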
def feature_selection_pipeline_from_file():
    # get data
    dataset = refactor_labels(get_data(path, 'Sheet1'), group_column)

    # all the visualizations
    # auto_visualize_features(dataset.drop([subject_number_column], axis=1))

    # remove columns with too many missing values
    non_missing_values_threshold = len(dataset.index) * 0.99
    dataset.dropna(axis=1, thresh=non_missing_values_threshold, inplace=True)

    # impute remaining missing values with the column mean
    dataset.fillna(dataset.mean(), inplace=True)

    # set X
    X = dataset.drop([group_column, subject_number_column], axis=1)
    sbj = dataset[subject_number_column]
    Y = dataset[group_column]
    names = list(X)

    # standardize X to [0, 1], keeping the original column names
    X = pd.DataFrame(preprocessing.MinMaxScaler().fit_transform(X))
    X.columns = names

    # cut off low-variance features; assign the result back (fit_transform
    # alone does not modify X) and keep column names via the support mask
    variance_threshold = 0.03
    variance_cutoff = VarianceThreshold(threshold=variance_threshold)
    X = X.loc[:, variance_cutoff.fit(X).get_support()]
    print("p1", X.shape)

    # cut off highly correlated features; `to_drop` holds column labels,
    # so drop them directly rather than indexing X.columns with them
    corr_matrix = X.corr().abs()
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    to_drop = [column for column in upper.columns if any(upper[column] > 0.8)]
    X.drop(to_drop, axis=1, inplace=True)
    print("p2", X.shape)

    # save new df
    processed_dataframe = pd.concat([X, Y, sbj], axis=1)
    processed_dataframe.to_csv(processed_dataframe_path)

    # random forest
    if random_forest:
        k_best_features = 31
        feature_importance = random_forest_selection.get_feature_importance(X, Y)
        random_forest_selection.save_feature_importance(feature_importance_txt_path, feature_importance)
        processed_dataframe, X = random_forest_selection.get_k_most_important_features(X, Y, k_best_features, feature_importance)
        processed_dataframe.to_csv(processed_dataframe_path)
        print("p4", processed_dataframe.shape)
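# Toy demonstration of the upper-triangle correlation filter used in both
# pipeline variants above (a standalone sketch with its own imports; the data
# is made up). Column "b" is nearly a copy of "a", so it exceeds the 0.8
# cutoff and is dropped, while "c" survives. Masking with the upper triangle
# (k=1) ensures each pair is checked once and only the later column is dropped.
import numpy as np
import pandas as pd

toy = pd.DataFrame({
    "a": [1.0, 2.0, 3.0, 4.0],
    "b": [1.1, 2.0, 3.1, 4.2],  # highly correlated with "a"
    "c": [4.0, 1.0, 3.0, 2.0],  # roughly independent of "a"
})
corr = toy.corr().abs()
upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
print([col for col in upper.columns if any(upper[col] > 0.8)])  # ['b']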
def visualize_features_interactivly():
    files = [file_a, file_b, file_c, file_d]
    file_num = input("which file's features do you wanna see?\n {}\n".format(files))
    file_map = {'a': file_a, 'b': file_b, 'c': file_c, 'd': file_d}
    if file_num not in file_map:
        return
    visualization_object = DataVisualizationObj(get_data(file_map[file_num], 'Sheet1'))

    while True:
        func_dict = {
            1: visualization_object.create_binary_hist,
            2: visualization_object.print_data,
            3: visualization_object.plot_data,
            4: visualization_object.plot_corr,
            5: visualization_object.plot_correlation_matrix
        }
        try:
            vis_type = int(input("which visualization func do you wanna use?\n {}\n".format(func_dict)))
        except ValueError:
            return
        func_dict[vis_type]()
def auto_visualize_features(data=None, saving_path="subject_features_before_selection", create_plots=1):
    file = r"C:\PycharmProjects\AnxietyClassifier\ExtractedFeaturesFiles\extracted_features_subjects_set_Updated,with_outlier_subjects_False_with_9029,9014,2018-10-29.xlsx"
    if data is not None:
        visualization_object = DataVisualizationObj(data)
    else:
        visualization_object = DataVisualizationObj(get_data(file, 'Sheet1'))
    path = r"C:\PycharmProjects\AnxietyClassifier\visualizations\high_low_plots_4,11"
    # visualization_object.detect_outliers()
    # visualization_object.print_missing_values()
    # visualization_object.describe()
    # visualization_object.print_variance(path=path)
    # visualization_object.detect_outliers(path=path)
    # visualization_object.plot_correlation_matrix(path=path)
    if create_plots:
        visualization_object.create_binary_hist(path=path)
        visualization_object.create_two_hists_by_group(path=path)
from DataImporting.ImportData import get_data
from DataImporting.ImportData import refactor_labels
import numpy as np
from scipy import stats

file_g = r"C:\PycharmProjects\AnxietyClassifier\ExtractedFeatures\data_features_for_each_matrix.xlsx"
group_column = "group"

df = refactor_labels(get_data(file_g, 'Sheet1'), group_column)
print("df.shape\n", df.shape)
df = df.dropna(axis=1)

# flag cells more than `threshold` standard deviations from their column mean
z = np.abs(stats.zscore(df))
threshold = 3
print(np.where(z > threshold))

# print("names\n", list(df))
# print("count missing\n", df.isnull().sum())
# print("\ndf.info()\n", df.info())
# print("\ndf.describe()\n", df.describe())
# print("\ndf groupby count\n", df.groupby(group_column)[group_column].count())
# print("\ndf groupby describe\n", df.groupby(group_column).describe())
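# The script above only prints the indices of outlying cells. A minimal
# sketch of how such z-scores are typically used to filter rows: keep only
# rows whose every feature lies within `threshold` standard deviations of
# its column mean (assumes all remaining columns of df are numeric).
df_no_outliers = df[(z < threshold).all(axis=1)]
print("rows before/after outlier removal:", len(df), len(df_no_outliers))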
def meow():
    dataset = refactor_labels(
        get_data(r"C:\Users\user\PycharmProjects\AnxietyClassifier(2)\Alls_data_NO_specific_vars_corr.xlsx", "Sheet1"),
        "group")
    return PCA_transforme(dataset, 6)