'Rajasthan' ] # Practice Use Case on small dataset #target_dir = (r'C:\Users\theism\Documents\Dimagi\Data\person_phone_aadhar-ap-anantapur2') #output_dir = (r'C:\Users\theism\Documents\Dimagi\Data\person_phone_aadhar-ap-anantapur2\test') #case_data_regex = re.compile(r'cases_\d\d\d.csv') # ------------- don't edit below here ----------------------------- gen_func.start_logging(output_dir) logging.info('Starting scripts to analyze aadhar data...') # combine all csv into one dataframe case_df = gen_func.csv_files_to_df(target_dir, case_data_regex, case_date_cols, cols_to_use) # clean case data and start to get age distribution information output_dict = {} case_clean_df, output_dict = case_func.clean_case_data(case_df, output_dict) case_clean_df = case_func.add_age_info(case_clean_df) location_column_names = ['doc_id', 'district_name'] case_clean_df = gen_func.add_locations(case_clean_df, 'owner_id', location_column_names) case_clean_df = case_clean_df.loc[( case_clean_df['state_name'].isin(real_state_list))] logging.info(case_clean_df['sex'].value_counts()) logging.info(case_clean_df['age_bracket'].value_counts()) clean_case_age_dist = case_clean_df.groupby(['age_bracket', 'sex']).count()['caseid']
'form.counseling.is_family_planning', 'form.counseling.is_sanitation',
'form.training.show_gmp', 'form.training.show_hvs', 'form.training.show_thr',
'form.training.show_daily_feeding', 'form.training.show_due_list',
'form.training.show_hh_reg' ]  # closes a column list opened above this chunk

## ------------- don't edit below here -----------------------------
# start logging
gf.start_logging(output_dir)
# date columns present in the video-form exports (used by later loads;
# NOTE(review): not passed to the BP load below — confirm intentional)
vid_date_cols = ['received_on', 'completed_time', 'started_time']
os.chdir(output_dir)

# BP form info — load all Birth Preparedness form exports into one dataframe
bp_df = gf.csv_files_to_df(bp_dir, data_regex, date_cols=['completed_time'])
bp_dir = os.path.join(target_dir, '[DA] Birth Preparedness - min_video')
logging.info('raw BP forms: %i' % bp_df.shape[0])
# attach location columns (state/district/awc) keyed on submitting username
bp_df = gf.add_locations_by_username(bp_df)
logging.info('raw BP forms after add locations: %i' % bp_df.shape[0])
# keep only submissions from real (non-test) states
bp_df = bp_df.loc[(bp_df['state_name'].isin(real_state_list))]
num_bp_forms = bp_df.shape[0]
logging.info('Num Birth Prep forms in real locations: %i' % num_bp_forms)
# per-user submission stats (awc_name is used as the user key here)
logging.info('%i different users submitted this form'
             % bp_df['awc_name'].nunique())
logging.info('%.2f average forms per user'
             % bp_df['awc_name'].value_counts().mean())

# try to also coordinate with birth phase
# compute per-form duration in minutes from start/complete timestamps
bp_df['completed_time'] = pd.to_datetime(bp_df['completed_time'])
bp_df['started_time'] = pd.to_datetime(bp_df['started_time'])
bp_df['form_duration'] = (bp_df['completed_time']
                          - bp_df['started_time']) / np.timedelta64(1, 'm')
logging.info('Starting scripts to analyze case data...')
# iterate over each location folder of exported case csvs
for folder in folder_list:
    # initialize dataframe for output errors
    bad_df = pd.DataFrame()
    bad_df = bad_df.fillna('')
    if os.path.isdir(os.path.join(target_dir, folder)):
        # folder name encodes the location it was exported for
        location_name = gen_func.folder_name_to_location(folder)
        logging.info('-------------------------------------------')
        logging.info('Going through data for: %s' % location_name)
        logging.info('-------------------------------------------')
        output_dict = {'location': location_name}
        logging.info(time.strftime('%X %x'))
        # combine all csv into one dataframe
        case_df = gen_func.csv_files_to_df(os.path.join(target_dir, folder),
                                           case_data_regex, case_date_cols)
        os.chdir(output_dir)
        # clean cases-closed, orphan, test states/names, awc owner, blank names
        case_clean_df, output_dict = case_func.clean_case_data(
            case_df, output_dict)
        # add age distribution information to dataframe
        # (ages bucketed into coarse brackets relative to case opened_date)
        case_clean_df = case_func.add_age_info(case_clean_df,
                                               col_name='big_age_bracket',
                                               bin_type='brackets',
                                               relative_date='opened_date')
        # check for blank / skipped dob
        # NOTE(review): despite the name, good_df appears to hold rows with a
        # blank 'dob' — confirm value_is_blank's return convention
        good_df = case_func.value_is_blank(case_clean_df, 'dob')
# iterate over each form-type folder and attach user locations
for folder in folder_list:
    if os.path.isdir(os.path.join(target_dir, folder)):
        logging.info('Going through data for: %s' % folder)
        # some form types carry extra columns beyond the shared set
        if folder == 'Additional Growth Monitoring':
            col_names_to_use = col_names + ['form.measure_identify | none']
        elif folder == 'Migration':
            col_names_to_use = col_names + [
                'form.migrate_out.confirm_migrated_out',
                'form.migrate_in.confirm_migrated_in'
            ]
        else:
            col_names_to_use = col_names

        # combine all csv into one dataframe
        input_df = gf.csv_files_to_df(os.path.join(target_dir, folder),
                                      data_regex, date_cols, col_names_to_use)
        # site codes drop one leading zero from usernames; normalize so the
        # merge key matches awc_site_code.
        # FIX: startswith() instead of x[0] — the original raised IndexError
        # on an empty username string.
        input_df['username2'] = input_df['username'].apply(
            lambda x: x[1:] if x.startswith('0') else x)
        # add location information for each user
        forms_df = pd.merge(input_df, location_df, left_on='username2',
                            right_on='awc_site_code', how='left')
        forms_df['received_on'] = pd.to_datetime(forms_df['received_on'])
        # filter to real states
        # logging.info('%i users unmatched to location so far' % forms_df['awc_name'].isnull().sum())
        logging.info('only getting users from real states...')
        forms_df = forms_df.loc[(forms_df['state_name'].isin(real_state_list))]
# accumulators: per-case-type stats and open/closed counts
stats_df = pd.DataFrame()
stats_df = stats_df.fillna('')
closed_df = pd.DataFrame()
closed_df = closed_df.fillna('')

# get loads of each case type
# cycle through case types and create case_df
for case_type in case_types:
    logging.info('-------------------------------------------')
    logging.info('Going through data for: %s' % case_type)
    logging.info('-------------------------------------------')
    # get case dataset
    # FIX: was `input_df = case_df.fillna('')` — `case_df` is never assigned
    # before this point, and the sibling script initializes with
    # `input_df.fillna('')`; the value is overwritten by the load below anyway.
    input_df = pd.DataFrame()
    input_df = input_df.fillna('')
    input_df = gf.csv_files_to_df(os.path.join(data_dir, case_type),
                                  case_data_regex, date_cols, cols_to_use)
    # input_df[case_type + '_open'] = input_df['closed'] == False
    # input_df[case_type + '_closed'] = input_df['closed'] != False
    # get all cases, open or closed. closed matter for load testing since only
    # removed from phone if parent case closed
    input_df[case_type] = 1

    # show some open/closed info
    open_closed = input_df['closed'].value_counts()
    logging.info(open_closed)
    # FIX: .get(False, 0) — indexing open_closed[False] raised KeyError when
    # a dataset contained no open cases.
    logging.info('Pct of %s open: %0.1f'
                 % (case_type,
                    (open_closed.get(False, 0) * 100. / open_closed.sum())))
    closed_df.loc[:, case_type] = open_closed
# which exports to read and which columns to keep
case_data_regex = re.compile(r'Cases_\d\d\d.csv')
date_cols = []
cols_to_use = ['owner_id', 'closed',
               'hh_bpl_apl', 'hh_caste', 'hh_minority', 'hh_religion']
data_cols = ['hh_bpl_apl', 'hh_caste', 'hh_minority', 'hh_religion']
location_columns = ['doc_id', 'block_name', 'district_name', 'state_name']
real_state_list = ['Madhya Pradesh', 'Chhattisgarh', 'Andhra Pradesh',
                   'Bihar', 'Jharkhand',
                   'Rajasthan']  # , 'Uttar Pradesh', 'Maharashtra']

# start logging
gf.start_logging(output_dir)

# initialize dfs
input_df = pd.DataFrame()
input_df = input_df.fillna('')
input_df = gf.csv_files_to_df(data_dir, case_data_regex,
                              date_cols, cols_to_use)

# only keep open cases
case_df = input_df[input_df['closed'] == False]

# get latest location fixture and add location data
if refresh_locations:
    gf.refresh_locations()
case_df = gf.add_locations(case_df, 'owner_id', location_columns)
case_df = case_df.loc[(case_df['state_name'].isin(real_state_list))]

# get caste percentages to df for output
# FIX: copy the list — `drop_cols = cols_to_use` aliased it, so the appends
# below silently mutated cols_to_use as well.
drop_cols = list(cols_to_use)
drop_cols.append('state_name')
drop_cols.append('district_name')
blocks = case_df['block_name'].unique().tolist()