def clean_sick(df, encoder=None):
    """Clean the sick (thyroid) dataset: impute, scale and encode.

    Splits *df* into numeric / categorical parts, drops the mostly-empty
    'TBG' column, median-imputes the remaining numeric columns, min-max
    scales them to [0, 1], and encodes the categorical columns via
    ``prep.encode``.

    Parameters
    ----------
    df : pd.DataFrame
        Raw sick dataset as read from the ARFF file.
    encoder : optional
        Pre-fitted encoder to reuse (e.g. when cleaning a test split);
        when None, ``prep.encode`` fits a new one.

    Returns
    -------
    tuple
        ``(X_num_scaled, X_cat_encoded, y, encoder)`` — three DataFrames
        plus the (possibly newly fitted) encoder.
    """
    from sklearn.impute import SimpleImputer

    cat_features = [
        'sex', 'on_thyroxine', 'query_on_thyroxine',
        'on_antithyroid_medication', 'sick', 'pregnant', 'thyroid_surgery',
        'I131_treatment', 'query_hypothyroid', 'query_hyperthyroid',
        'lithium', 'goitre', 'tumor', 'hypopituitary', 'psych',
        'TSH_measured', 'T3_measured', 'TT4_measured', 'T4U_measured',
        'FTI_measured', 'TBG_measured', 'referral_source'
    ]
    response = 'Class'

    splits, _metadata = eda.split(df, cat_features=cat_features,
                                  response=response)

    # 'TBG' is almost entirely missing in this dataset: drop it before
    # imputation.  Drop on a copy instead of inplace=True so the caller's
    # split dict is not mutated as a side effect.
    X_num = splits['X_num'].drop(columns=['TBG'])
    X_cat = splits['X_cat']
    y = splits['y'][response].values

    # Median imputation of the remaining missing numeric values
    # (named accordingly — the original 'imp_mean' name was misleading).
    imputer = SimpleImputer(strategy='median')
    X_num_ok = pd.DataFrame(imputer.fit_transform(X_num),
                            columns=X_num.columns)

    # Min-max scaling to [0, 1]; X_num_ok is already a DataFrame, so no
    # extra pd.DataFrame(...) wrapping is needed on return.
    X_num_scaled = (X_num_ok - X_num_ok.min()) / (X_num_ok.max() - X_num_ok.min())

    # Silence SettingWithCopy warnings raised inside prep.encode.
    pd.options.mode.chained_assignment = None

    X_cat_encoded, encoder = prep.encode(X_cat, encoder)

    return (X_num_scaled, pd.DataFrame(X_cat_encoded),
            pd.DataFrame(y), encoder)
def read_train_test_files(fold_number):
    """Load one cross-validation fold of the pen-based dataset.

    Reads the ``*.train.arff`` / ``*.test.arff`` pair for the requested
    fold, splits off the 'a17' response column and min-max normalizes
    the (all numeric) features to [0, 1].

    Parameters
    ----------
    fold_number : int
        1-based fold index.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test)``.
    """
    import glob

    # glob.glob returns files in arbitrary filesystem order, so indexing
    # by fold_number was nondeterministic.  Sort so a given fold_number
    # always maps to the same fold; only the requested pair is loaded
    # (the original read and normalized every fold just to return one).
    train_files = sorted(
        glob.glob('../datasets/datasetsCBR/pen-based/*.train.arff'))
    test_files = sorted(
        glob.glob('../datasets/datasetsCBR/pen-based/*.test.arff'))

    def _load(path):
        # Read one ARFF file, separate the 'a17' labels and min-max
        # scale the numeric features.
        df = eda.read_arff(path_data=path, url_data=None)
        splits, _metadata = eda.split(df, cat_features=None, response='a17')
        X_num = splits['X_num']  # no categorical features in pen-based
        y = splits['y']['a17'].values
        X_norm = (X_num - X_num.min()) / (X_num.max() - X_num.min())
        return X_norm, y

    X_train, y_train = _load(train_files[fold_number - 1])
    X_test, y_test = _load(test_files[fold_number - 1])
    return X_train, y_train, X_test, y_test
def clean_sick2(df):
    """Clean the sick (thyroid) dataset: impute, de-outlier, scale, encode.

    Parameters
    ----------
    df : pd.DataFrame
        Raw sick dataset as read from the ARFF file.

    Returns
    -------
    tuple
        ``(X_num_scaled, X_cat_encoded, y)`` with outlier rows removed
        consistently from all three.
    """
    cat_features = [
        'sex', 'on_thyroxine', 'query_on_thyroxine',
        'on_antithyroid_medication', 'sick', 'pregnant', 'thyroid_surgery',
        'I131_treatment', 'query_hypothyroid', 'query_hyperthyroid',
        'lithium', 'goitre', 'tumor', 'hypopituitary', 'psych',
        'TSH_measured', 'T3_measured', 'TT4_measured', 'T4U_measured',
        'FTI_measured', 'TBG_measured', 'referral_source'
    ]
    response = 'Class'

    splits, _metadata = eda.split(df, cat_features=cat_features,
                                  response=response)

    # Drop the mostly-NaN 'TBG' column (on a copy, not inplace), then
    # mean-impute the remaining missing numeric values.
    X_num = splits['X_num'].drop(columns=['TBG'])
    X_num = X_num.fillna(X_num.mean())

    # Keep only rows whose every numeric feature lies within 3 standard
    # deviations; the same mask must be applied to X_num, y AND X_cat so
    # the returned pieces stay row-aligned.
    rows_to_keep = (np.abs(stats.zscore(X_num)) < 3).all(axis=1)
    X_num = X_num[rows_to_keep].copy()
    y = splits['y'][rows_to_keep][response].values

    # Min-max scaling via the shared preprocessing helper.
    X_num_scaled = prep.scale(X_num)

    # BUGFIX: the row filter on X_cat was commented out, leaving the
    # categorical frame longer than X_num/y (cf. clean_cmc, which filters).
    X_cat = splits['X_cat'][rows_to_keep].copy()

    # Silence SettingWithCopy warnings raised inside prep.encode.
    pd.options.mode.chained_assignment = None

    X_cat_encoded = prep.encode(X_cat)

    return X_num_scaled, X_cat_encoded, y
def clean_cmc(df):
    """Clean the cmc (contraceptive method choice) mixed dataset.

    Splits *df* into numeric and categorical features, removes numeric
    outliers beyond 3 standard deviations, min-max scales the numeric
    part, merges two sparse categorical levels, and encodes the
    categorical part.

    Parameters
    ----------
    df : pd.DataFrame
        Raw cmc dataset as read from the ARFF file.

    Returns
    -------
    tuple
        ``(X_num_scaled, X_cat_encoded, y)``, row-aligned.
    """
    cat_features = [
        'weducation', 'heducation', 'wreligion', 'wworking',
        'hoccupation', 'living_index', 'media_exposure'
    ]
    splits, metadata = eda.split(df, cat_features=cat_features,
                                 response='class')
    X_num = splits['X_num']
    X_cat = splits['X_cat']

    # Outlier removal: keep the rows whose every numeric z-score is
    # strictly inside 3 sigma.
    print(f'# Samples before removing outliers: {len(X_num)}')
    keep_mask = (np.abs(stats.zscore(X_num)) < 3).all(axis=1)
    X_num = X_num[keep_mask].copy()
    print(f'# Samples after removing outliers: {len(X_num)}')

    # Labels filtered with the same mask so they stay aligned with X_num.
    y = splits['y'][keep_mask]['class'].values

    # Min-max scaling via the shared preprocessing helper.
    X_num_scaled = prep.scale(X_num)

    # Apply the outlier mask to the categorical frame as well, then merge
    # two sparse levels into their neighbours before encoding.
    X_cat = X_cat[keep_mask].copy()
    pd.options.mode.chained_assignment = None
    X_cat.loc[X_cat['heducation'] == 1, 'heducation'] = 2
    X_cat.loc[X_cat['hoccupation'] == 4, 'hoccupation'] = 3

    X_cat_encoded = prep.encode(X_cat)

    return X_num_scaled, X_cat_encoded, y
# In[2]: path = '../datasets/datasetsCBR/pen-based/pen-based.fold.000000.test.arff' # Read the data set df_test = eda.read_arff(path_data=path, url_data=None) df_test.head() # In[3]: splits, metadata = eda.split(df_test, cat_features=None,response='a17') X_num = splits['X_num'] X_cat = splits['X_cat'] # No categorical features # In[4]: # True labels of all datapoints y = splits['y']['a17'].values print(y) print(len(y)) # In[5]:
import tools.preprocess as prep # url = 'https://raw.githubusercontent.com/gusseppe/master_artificial_intelligence/master/Introduction_to_Machine_Learning/deliverables/work1/iml/datasets/cmc.arff' path = 'datasets/cmc.arff' df = eda.read_arff(path_data=path) # local # df = eda(path_data='datasets/cmc.arff') # local df.head() # In[95]: cat_features = [ 'weducation', 'heducation', 'wreligion', 'wworking', 'hoccupation', 'living_index', 'media_exposure' ] splits, metadata = eda.split(df, cat_features=cat_features, response='class') X_num = splits['X_num'] X_cat = splits['X_cat'] X_num.head() # ### Metadata # # In[96]: metadata # ### Analyze and preprocess #
# In[71]: path = 'datasets/breast-w.arff' # Read the data set df = eda.read_arff(path_data=path, url_data=None) df.head() # ### Split data into numerical features and true label values (class) # In[72]: #Split data in Numerical and Categorical splits, metadata = eda.split(df, cat_features=None, response='Class') X_num = splits['X_num'] X_cat = splits['X_cat'] # No categorical features # In[73]: # True labels of all datapoints y = splits['y']['Class'].values X_num.head() # In[74]: print(f'# instances: {len(X_num)} | # features: {len(X_num.columns)}') print(f'# num_features: {len(X_num.columns)}') # In[75]: