def main_deprecated():
    """Deprecated entry point for the data-cleaning step; call ``clean()`` instead.

    Kept only for backward compatibility. It prints a start banner, runs the
    same cleaning pipeline as ``clean()`` (read the raw CSV, clean it, and
    save the result as a pickle), then prints a completion message.

    Emits a ``DeprecationWarning`` attributed to the caller.
    """
    # Local import so this block does not depend on the file's import header.
    import warnings

    warnings.warn(
        "main_deprecated() is deprecated; call clean() instead.",
        DeprecationWarning,
        stacklevel=2,  # point the warning at the caller, not at this function
    )

    print("This is main, alhumdulliah")

    # The pipeline that used to be duplicated here is exactly what clean()
    # does with its default arguments, so delegate rather than keep two
    # copies in sync.
    clean()

    print("Filled the missing values either by the mode or mean value")
def clean(
    source_csv: str = '../dataset_diabetes/diabetic_data.csv',
    output_path: str = "../only_calculated_datasets/cleaned_df.pkl",
    missing_bound: float = .2,
) -> None:
    """Read the raw diabetes CSV, clean it, and save the cleaned DataFrame.

    Steps, in order:
      1. Load ``source_csv`` with pandas, treating "n/a", "na", "--" and "?"
         as missing values.
      2. Run ``DataCleaning.clean_columns`` with ``missing_bound``.
      3. Collect the columns that still have missing values and pass them to
         ``DataCleaning.fill_missing_values``.
      4. Remove a fixed set of identifier/administrative columns.
      5. Save the result to ``output_path`` via ``Util.save_df``.

    Args:
        source_csv: Path of the comma-delimited input file.
        output_path: Destination path handed to ``Util.save_df``.
        missing_bound: Value forwarded as ``missing_bound`` to
            ``DataCleaning.clean_columns``.

    All defaults reproduce the previously hard-coded values, so calling
    ``clean()`` with no arguments behaves as before.
    """
    # Tokens that the raw file uses in place of real values.
    missing_values = ["n/a", "na", "--", "?"]
    raw_data = pd.read_csv(source_csv, delimiter=',', na_values=missing_values)

    data_cleaning = DataCleaning()

    # NOTE(review): presumably drops columns whose share of missing values
    # exceeds missing_bound -- confirm against DataCleaning.clean_columns.
    raw_data = data_cleaning.clean_columns(raw_data, missing_bound=missing_bound)

    # NOTE(review): the meaning of the positional False flag is defined by
    # DataCleaning.get_cols_having_missing_values -- confirm there.
    cols_having_missing_values = data_cleaning.get_cols_having_missing_values(
        raw_data, False)
    raw_data = data_cleaning.fill_missing_values(
        raw_data, cols_having_missing_values)

    raw_data = data_cleaning.just_remove_columns(raw_data, columns=[
        "encounter_id",
        "patient_nbr",
        "admission_type_id",
        "discharge_disposition_id",
        "admission_source_id",
        "num_procedures",
    ])

    Util().save_df(raw_data, output_path)