import pandas as pd from imblearn.over_sampling import SMOTE from sklearn.model_selection import StratifiedKFold from classification.decision_trees.decision_trees_functions import * from utils import load_pd data: pd.DataFrame y: np.ndarray X: np.ndarray data, X, y = load_pd('../../../datasets/pd_speech_features.csv', merge_observations=True) # SMOTE balancing RANDOM_STATE = 42 smote = SMOTE(ratio='minority', random_state=RANDOM_STATE) X, y = smote.fit_sample(X, y) skf = StratifiedKFold(5) score_names = ['accuracy', 'recall', 'precision', 'roc-auc'] score = { 'tree_depths': [], 'tree_leafs': [], 'accuracy': [], 'recall': [], 'precision': [], 'roc-auc': [] } for train_index, test_index in skf.split(X, y):
"""
Single-variable exploration of the Parkinson's speech data set: dataset
summary printouts, class-balance check, feature selection, and per-class
boxplot preparation.
"""
from numpy.ma import arange
from data_exploration.singular_variable_analysis.singular_variable_analysis_functions import *
from utils import load_pd
from vis_functions import variables_boxplot

# Load the full frame including the target column (pop_class=False keeps 'class').
# NOTE(review): `pd` and `plt` are assumed to come from the wildcard import — confirm.
data: pd.DataFrame = load_pd("../../../datasets/pd_speech_features.csv", pop_class=False)

# Dataset overview: dimensions, dtypes, missing values.
print_shape(data)
print_variable_types(data)
print_missing_values(data)

# Class-balance report for the binary target.
class_values = [0, 1]
class_balance(data, 'class', class_values)

# Feature selection; presumably drops highly correlated columns and keeps the
# k best features — verify against the helper's definition.
data, X, y = remove_corr_and_select_k_best(data)

variables_boxplot(data)
plt.show()

# Re-attach the target, split into per-class frames, then drop the target
# column from each split so downstream plots see features only.
data['class'] = y
data_0 = data.loc[data['class'] == 0]
data_1 = data.loc[data['class'] == 1]
data_0.pop('class')
data_1.pop('class')

# Bindings consumed by later (unseen) lines of this script.
columns = data.columns
datas = [data_0, data_1]
"""
K-Means results for the Parkinson's Disease data set.

Tested pre-processing: normalization (StandardScaler), outlier removal with
DBSCAN, and PCA for data reduction/transformation.
"""
from sklearn.preprocessing import StandardScaler
from clustering.clustering_functions import *
from utils import dbscan_outliers_analysis_plot, pca_cumulative_variance_plot, load_pd

# load data
data, X, y = load_pd("../../datasets/pd_speech_features.csv")

# pre-process data: normalize, then drop DBSCAN outliers (label == -1).
# NOTE(review): `DBSCAN` and `np` are assumed to come from the wildcard import — confirm.
normalized_data = StandardScaler().fit_transform(X)
# Diagnostic plot used to pick eps; eps=35 below matches the largest value tried.
dbscan_outliers_analysis_plot(normalized_data, eps_list=[15, 20, 25, 30, 35], min_samples=3)
non_outliers_indexes = DBSCAN(eps=35, min_samples=3).fit(normalized_data).labels_ != -1
data_without_out = normalized_data[non_outliers_indexes, :]
new_target = y[non_outliers_indexes]

# PCA: keep the first 115 components (approx. 90% cumulative variance ratio)
# and project the outlier-free data onto them.
pca_obj = pca_cumulative_variance_plot(data_without_out)
first_components = pca_obj.components_[:115]
reduced_data = np.dot(data_without_out, first_components.T)

# parameter tuning: evaluate k = 2, 4, ..., 18
k_analysis(reduced_data, list(range(2, 20, 2)))

# fixed kmeans evaluation
"""
Association rules for the Parkinson's Disease data set: parameter setup for
feature selection, discretization, and frequent-pattern mining.
"""
from sklearn.feature_selection import f_classif
from pattern_mining.pattern_mining_functions import *
from utils import print_return_variable, load_pd

# load data; correlated features above the 0.8 threshold are removed up front.
# NOTE(review): `pd` is assumed to come from the wildcard import above — confirm.
data, X, y = load_pd("../../datasets/pd_speech_features.csv", remove_corr=True, corr_threshold=.8)

# pattern mining parameters — each print_return_variable call echoes the
# parameter to stdout and returns it unchanged.
print("\n### Pattern mining parameters")
k_features = print_return_variable("Number of features to select: ", 25)
selection_measure = print_return_variable("Feature selection function: ", f_classif)
discretize_function = print_return_variable("Discretize function: ", pd.cut)
bins = print_return_variable("Number of bins: ", 7)
# Columns excluded from discretization (already categorical/binary).
disc_needless_cols = print_return_variable(
    "Variables that doesnt need to be discretize: ", ['gender', 'class'])
binary_cols = print_return_variable("Binary cols: ", [])
min_supp = print_return_variable("Min support: ", 0.6)
fp_mining_args = [min_supp]
min_conf = print_return_variable("Min confidence: ", 0.9)
min_ant_items = print_return_variable("Min of items in antecedents itemset: ", 2)

# extract association rules