# Load the survey export (comma separated, Windows-1252 encoded).
data1 = pd.read_csv("Data_analysis_Data_IER.csv", sep=",", encoding='cp1252')

# Sort the data by year so each cohort's rows are contiguous.
data1.sort_values(by=['year'], inplace=True)

# Split the cohorts on the value of `year` instead of hard-coded row
# positions.  The earlier `iloc[:94]` / `iloc[95:]` split left the row at
# position 94 in neither sample and would break as soon as the number of rows
# per year changed.
# NOTE(review): assumes `year` holds the integers 2019 and 2020 -- confirm
# against the CSV.
df_1_t = data1[data1['year'] == 2019]  # all the samples of 2019
df_2 = data1[data1['year'] == 2020]  # all the samples of 2020

# The 2020 sample is larger than the 2019 one; equalising the sizes by
# dropping random 2020 rows was tried earlier and is intentionally not done.

# Drop the sample with missing data (row label 57), then order both cohorts by
# living situation.  `sort_values` is used without `inplace=True` because both
# frames are slices of `data1`; sorting a slice in place raises
# SettingWithCopyWarning.
df_1 = df_1_t.drop(57).sort_values(by=['living'])
df_2 = df_2.sort_values(by=['living'])

# Make table 1 over the full data set, print it and save it next to the script.
columns = ['year', 'gender', 'bmi', 'living']
mytable = TableOne(data1, columns=columns, pval=False)
print(mytable.tabulate(tablefmt="fancy_grid"))
mytable.to_csv('mytable.csv')

# 2019: split into "moved out" and "living with parents" and take the mean of
# `attitu_2` for each group.
grouped1 = df_1.groupby('living')
Moved_out_2019 = grouped1.get_group("Moved_out")
mean1 = Moved_out_2019["attitu_2"].mean()
Parents_2019 = grouped1.get_group("Living_with_parents")
mean2 = Parents_2019["attitu_2"].mean()

# 2020: same split and the same per-group mean of `attitu_2`.
grouped2 = df_2.groupby('living')
Moved_out_2020 = grouped2.get_group("Moved_out")
mean3 = Moved_out_2020["attitu_2"].mean()
Parents_2020 = grouped2.get_group("Living_with_parents")
mean4 = Parents_2020["attitu_2"].mean()
!pip install tableone from tableone import TableOne df_day1['diabetes_flag'].fillna(value=0,inplace=True) categorical = ['gender','ethnicity','diabetes_flag', 'surgery'] groupby = 'hospitaldischargestatus' columns = ['gender', 'age','ethnicity', 'diabetes_flag', 'apachescore','Glucose_mean','delta_glucose_mean', 'surgery'] glu_table = TableOne(df_day1, groupby = groupby, columns = columns, categorical = categorical, pval= True) print(glu_table.tabulate(tablefmt="github")) display(HTML('<H4>Brain problem counts<H4>')) print(f'AIS: {sum(df_day1.brain_problem=="AIS")}') print(f'HEM: {sum(df_day1["brain_problem"]=="HEM")}') print(f'Other: {sum(df_day1["brain_problem"]=="Other")}') print(f'TBI: {sum(df_day1["brain_problem"]=="TBI")}') print(f'SZ: {sum(df_day1["brain_problem"]=="SZ")}') print(f'''Total: {sum(df_day1.brain_problem=="AIS")+ sum(df_day1["brain_problem"]=="HEM")+ sum(df_day1["brain_problem"]=="Other")+ sum(df_day1["brain_problem"]=="TBI")+ sum(df_day1["brain_problem"]=="SZ")}''') """## Modeling"""
# Split the participants by study year, then by gender within each year.
participants_2019 = data[data['year'] == 2019]
participants_2019_male = participants_2019[participants_2019['gender'] == 'Male']
participants_2019_female = participants_2019[participants_2019['gender'] == 'Female']

participants_2020 = data[data['year'] == 2020]
participants_2020_male = participants_2020[participants_2020['gender'] == 'Male']
participants_2020_female = participants_2020[participants_2020['gender'] == 'Female']

# CREATING TABLE ONE
# One table per year, stratified by gender, printed as LaTeX.
columns = ['gender', 'bmi', 'living']
categorical = ['living']
groupby = 'gender'
mytable = TableOne(participants_2019, columns=columns, categorical=categorical, groupby=groupby)
print(mytable.tabulate(tablefmt="latex"))
mytable_2 = TableOne(participants_2020, columns=columns, categorical=categorical, groupby=groupby)
print(mytable_2.tabulate(tablefmt="latex"))

# FILTERING NECESSARY DATA AND CALCULATING WEEKEND/WEEKDAY AVERAGES
# All seven app step-count columns; a participant is kept only when none of
# them is missing.
_APP_STEP_COLUMNS = [f'stap_app_{day}_aantal' for day in range(1, 8)]
# Columns averaged for the "weekdays" figure, in the original column order.
_WEEKDAY_APP_COLUMNS = [
    'stap_app_1_aantal',
    'stap_app_2_aantal',
    'stap_app_6_aantal',
    'stap_app_7_aantal',
    'stap_app_5_aantal',
]

# Overall 2019 weekdays app
# `isin(['nan'])` only matches the literal string 'nan' and never a real NaN,
# so missing values are detected with `isna()`; the literal check is kept so
# rows that were excluded before are still excluded.
steps_2019 = participants_2019[_APP_STEP_COLUMNS]
temp = participants_2019[~(steps_2019.isna() | steps_2019.isin(['nan'])).any(axis=1)]
# Work on an explicit copy so adding the `avg` column does not write into a
# slice of `temp` (SettingWithCopyWarning).
participants_2019_weekdays_app = temp[_WEEKDAY_APP_COLUMNS].copy()
temp_avg = participants_2019_weekdays_app.mean(axis=1)
participants_2019_weekdays_app['avg'] = temp_avg

# Overall 2020 weekdays app
steps_2020 = participants_2020[_APP_STEP_COLUMNS]
temp = participants_2020[~(steps_2020.isna() | steps_2020.isin(['nan'])).any(axis=1)]
participants_2020_weekdays_app = temp[_WEEKDAY_APP_COLUMNS].copy()
temp_avg = participants_2020_weekdays_app.mean(axis=1)
participants_2020_weekdays_app['avg'] = temp_avg