def main_deprecated():
    """Deprecated data-cleaning entry point; kept for reference only — do not use."""
    print("This is main, alhumdulliah")

    # ----- data cleaning -----
    na_tokens = ["n/a", "na", "--", "?"]
    frame = pd.read_csv('../dataset_diabetes/diabetic_data.csv',
                        delimiter=',', na_values=na_tokens)
    # Exploratory helpers from the original (kept as notes):
    #   frame.head() / frame.describe() / frame.shape -> (101766, 50)
    #   frame["weight"].isnull().sum() -> number of nulls in "weight"
    #   frame["weight"].shape[0]       -> number of rows in "weight"

    cleaner = DataCleaning()
    # Drop columns whose missing-value ratio exceeds 20%, then impute the rest.
    frame = cleaner.clean_columns(frame, missing_bound=.2)
    cols_with_na = cleaner.get_cols_having_missing_values(frame, False)
    frame = cleaner.fill_missing_values(frame, cols_with_na)
    # Identifier-style columns carry no predictive signal — remove them.
    frame = cleaner.just_remove_columns(
        frame,
        columns=["encounter_id", "patient_nbr", "admission_type_id",
                 "discharge_disposition_id", "admission_source_id",
                 "num_procedures"])

    Util().save_df(frame, "../only_calculated_datasets/cleaned_df.pkl")
    print("Filled the missing values either by the mode or mean value")
def clean():
    """Load the diabetic-data CSV, clean it, and persist it as a pickle.

    Non-deprecated twin of ``main_deprecated``: same pipeline without the
    debug prints.
    """
    na_tokens = ["n/a", "na", "--", "?"]
    frame = pd.read_csv('../dataset_diabetes/diabetic_data.csv',
                        delimiter=',', na_values=na_tokens)

    cleaner = DataCleaning()
    # Remove columns with more than 20% missing values.
    frame = cleaner.clean_columns(frame, missing_bound=.2)
    cols_with_na = cleaner.get_cols_having_missing_values(frame, False)
    frame = cleaner.fill_missing_values(frame, cols_with_na)
    # Identifier / low-value columns are not useful as model features.
    frame = cleaner.just_remove_columns(
        frame,
        columns=["encounter_id", "patient_nbr", "admission_type_id",
                 "discharge_disposition_id", "admission_source_id",
                 "num_procedures"])

    Util().save_df(frame, "../only_calculated_datasets/cleaned_df.pkl")
# Script: load the diabetic dataset, clean it, and dump the raw numpy array.
#
# Fixes vs. original:
#  * `pd` was used below but pandas was never imported — added the import.
#  * Removed a dead, commented-out block (triple-quoted code collecting
#    unique values per string column) that was never executed.
import pandas as pd

from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn import tree
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

from visualization import Visualization
from data_cleaning import DataCleaning

# ----- Load and clean the raw dataset -----
missing_values = ["n/a", "na", "--", "?"]
data = pd.read_csv('../dataset_diabetes/diabetic_data.csv',
                   delimiter=',', na_values=missing_values)

data_cleaning = DataCleaning()
# Drop columns with more than 20% missing entries, then impute the rest.
data = data_cleaning.clean_columns(data, missing_bound=0.2)
colsMissingValues = data_cleaning.get_cols_having_missing_values(data, False)
data = data_cleaning.fill_missing_values(data, colsMissingValues)

data = data.to_numpy()
print(data)
def main():
    """Train and evaluate a decision-tree classifier on the diabetic dataset.

    Pipeline: load and clean the CSV, label-encode string columns, tune
    hyper-parameters with a 5-fold cross-validated grid search, then report
    accuracy of the best tree on a held-out test split and via k-fold
    cross validation.
    """
    # ----- Data cleaning -----
    missing_values = ["n/a", "na", "--", "?"]
    data = pd.read_csv('../dataset_diabetes/diabetic_data.csv',
                       delimiter=',', na_values=missing_values)

    data_cleaning = DataCleaning()
    data = data_cleaning.clean_columns(data, missing_bound=0.2)
    colsMissingValues = data_cleaning.get_cols_having_missing_values(
        data, False)
    data = data_cleaning.fill_missing_values(data, colsMissingValues)
    # ----- Data cleaning done -----

    data = data.to_numpy()

    # Label-encode every string-valued column in place.
    # NOTE(review): assumes the cleaned array still has 50 columns and the
    # dtype of row 0 is representative of each column — confirm upstream.
    le = LabelEncoder()
    for i in range(50):
        if isinstance(data[0][i], str):
            data[:, i] = le.fit_transform(data[:, i])
    print(data)
    print(data.shape)

    # Fixed 80000 / 21766 train/test split; column 49 is the target label.
    X_train, X_test = data[0:80000, 0:49], data[80000:101766, 0:49]
    Y_train, Y_test = data[0:80000, 49:50], data[80000:101766, 49:50]
    Y_train, Y_test = Y_train.astype('int'), Y_test.astype('int')
    print(X_train)
    print(X_train.shape)
    print(Y_train)
    print(Y_train.shape)

    grid_params = {
        'criterion': ['gini', 'entropy'],
        'splitter': ['best', 'random'],
        'max_depth': [2, 4, 6],
        'min_samples_leaf': [0.02, 0.04],
        'min_samples_split': [0.2, 0.5, 0.8]
    }

    dt = DecisionTreeClassifier(random_state=50)

    # Builds a model for every combination of the hyper-parameter values
    # using 5-fold cross validation, scored by accuracy; n_jobs=-1 uses all
    # available cores.
    grid_object = GridSearchCV(estimator=dt, param_grid=grid_params,
                               scoring='accuracy', cv=5, n_jobs=-1)

    # BUG FIX: the original used Python 2 `print` statements here — a syntax
    # error under Python 3, which the rest of this file targets (it uses
    # print() calls and DataFrame.to_numpy()).
    print("\nHyper Parameter Tuning Begins\n")
    # Fit the grid object to the training data.
    grid_object.fit(X_train, Y_train)
    print("\n\nBest Param Values \t\t\n\n")
    print(grid_object.best_params_)
    # ---- Hyper-parameter tuning ends ----

    # Rebuild a decision tree with the best parameters learned above and
    # report accuracy on the held-out test set and via k-fold validation.
    best_params = grid_object.best_params_
    dt = DecisionTreeClassifier(
        criterion=best_params['criterion'],
        splitter=best_params['splitter'],
        max_depth=best_params['max_depth'],
        min_samples_leaf=best_params['min_samples_leaf'],
        min_samples_split=best_params['min_samples_split'],
        random_state=50)
    dt.fit(X_train, Y_train)
    Y_pred = dt.predict(X_test)
    print("Accuracy score Test = ", accuracy_score(Y_test, Y_pred) * 100)
    print("Accuracy score 5-Fold = ", kFoldVal(X_train, Y_train, dt, 5))