예제 #1
0
def main_deprecated():
    """Deprecated driver for the data-cleaning pipeline; do not use in new code.

    Loads the raw diabetes CSV, passes it through the ``DataCleaning``
    steps in order (``clean_columns``, ``get_cols_having_missing_values``,
    ``fill_missing_values``, ``just_remove_columns``) and persists the
    resulting frame via ``Util.save_df``. Progress messages are printed at
    the start and at the end. Returns nothing.
    """
    print("This is main, alhumdulliah")

    # ----- data cleaning -----
    # Placeholder strings that pandas should parse as NaN when reading.
    na_tokens = ["n/a", "na", "--", "?"]
    frame = pd.read_csv(
        '../dataset_diabetes/diabetic_data.csv',
        delimiter=',',
        na_values=na_tokens,
    )

    cleaner = DataCleaning()
    frame = cleaner.clean_columns(frame, missing_bound=.2)

    # Columns that still contain missing values after clean_columns;
    # they are handed to fill_missing_values below.
    incomplete_cols = cleaner.get_cols_having_missing_values(frame, False)
    frame = cleaner.fill_missing_values(frame, incomplete_cols)

    # Fixed set of columns removed from the frame before saving.
    columns_to_drop = [
        "encounter_id",
        "patient_nbr",
        "admission_type_id",
        "discharge_disposition_id",
        "admission_source_id",
        "num_procedures",
    ]
    frame = cleaner.just_remove_columns(frame, columns=columns_to_drop)

    Util().save_df(frame, "../only_calculated_datasets/cleaned_df.pkl")
    print("Filled the missing values either by the mode or mean value")
예제 #2
0
def clean():
    """Run the data-cleaning pipeline on the raw diabetes CSV and save it.

    Reads ``../dataset_diabetes/diabetic_data.csv``, applies the
    ``DataCleaning`` steps in sequence, removes a fixed list of columns
    and writes the result to
    ``../only_calculated_datasets/cleaned_df.pkl`` through ``Util.save_df``.
    Returns nothing.
    """
    source_csv = '../dataset_diabetes/diabetic_data.csv'
    # Strings in the CSV that should be read as NaN.
    null_markers = ["n/a", "na", "--", "?"]
    data = pd.read_csv(source_csv, delimiter=',', na_values=null_markers)

    pipeline = DataCleaning()
    data = pipeline.clean_columns(data, missing_bound=.2)

    # Columns still reporting missing values are passed on for filling.
    cols_with_gaps = pipeline.get_cols_having_missing_values(data, False)
    data = pipeline.fill_missing_values(data, cols_with_gaps)

    data = pipeline.just_remove_columns(
        data,
        columns=[
            "encounter_id",
            "patient_nbr",
            "admission_type_id",
            "discharge_disposition_id",
            "admission_source_id",
            "num_procedures",
        ],
    )

    saver = Util()
    saver.save_df(data, "../only_calculated_datasets/cleaned_df.pkl")