# Exemplo n.º 1
    'Rajasthan'
]

# Practice Use Case on small dataset
#target_dir = (r'C:\Users\theism\Documents\Dimagi\Data\person_phone_aadhar-ap-anantapur2')
#output_dir = (r'C:\Users\theism\Documents\Dimagi\Data\person_phone_aadhar-ap-anantapur2\test')
#case_data_regex = re.compile(r'cases_\d\d\d.csv')

# ------------- don't edit below here -----------------------------

# Configure logging so everything below is captured alongside the outputs.
gen_func.start_logging(output_dir)

logging.info('Starting scripts to analyze aadhar data...')

# combine all csv into one dataframe
# (files in target_dir matching case_data_regex; case_date_cols parsed as
# dates, only cols_to_use loaded)
case_df = gen_func.csv_files_to_df(target_dir, case_data_regex, case_date_cols,
                                   cols_to_use)

# clean case data and start to get age distribution information
# clean_case_data returns the cleaned frame plus summary stats in output_dict
output_dict = {}
case_clean_df, output_dict = case_func.clean_case_data(case_df, output_dict)
case_clean_df = case_func.add_age_info(case_clean_df)
# join district-level location info keyed on the case's owner_id
location_column_names = ['doc_id', 'district_name']
case_clean_df = gen_func.add_locations(case_clean_df, 'owner_id',
                                       location_column_names)
# keep only cases in production states
# (assumes add_locations/clean_case_data supplies 'state_name' — TODO confirm)
case_clean_df = case_clean_df.loc[(
    case_clean_df['state_name'].isin(real_state_list))]

# log sex and age-bracket distributions, then count cases per
# (age_bracket, sex) pair using the caseid column
logging.info(case_clean_df['sex'].value_counts())
logging.info(case_clean_df['age_bracket'].value_counts())
clean_case_age_dist = case_clean_df.groupby(['age_bracket',
                                             'sex']).count()['caseid']
    'form.counseling.is_family_planning', 'form.counseling.is_sanitation',
    'form.training.show_gmp', 'form.training.show_hvs',
    'form.training.show_thr', 'form.training.show_daily_feeding',
    'form.training.show_due_list', 'form.training.show_hh_reg'
]

## ------------- don't edit below here -----------------------------

# start logging
gf.start_logging(output_dir)
# form timestamp columns to parse as dates when loading
vid_date_cols = ['received_on', 'completed_time', 'started_time']
os.chdir(output_dir)

# BP form info: load all Birth Preparedness form exports into one frame
bp_dir = os.path.join(target_dir, '[DA] Birth Preparedness - min_video')
bp_df = gf.csv_files_to_df(bp_dir, data_regex, date_cols=['completed_time'])
logging.info('raw BP forms: %i' % bp_df.shape[0])
# attach location columns (state/awc etc.) by submitting username
bp_df = gf.add_locations_by_username(bp_df)
logging.info('raw BP forms after add locations: %i' % bp_df.shape[0])
# keep only forms from production states
bp_df = bp_df.loc[(bp_df['state_name'].isin(real_state_list))]
num_bp_forms = bp_df.shape[0]
logging.info('Num Birth Prep forms in real locations: %i' % num_bp_forms)
# per-user submission stats, keyed on awc_name
logging.info('%i different users submitted this form' %
             bp_df['awc_name'].nunique())
logging.info('%.2f average forms per user' %
             bp_df['awc_name'].value_counts().mean())
# try to also coordinate with birth phase
# form_duration = completion minus start time, in minutes
bp_df['completed_time'] = pd.to_datetime(bp_df['completed_time'])
bp_df['started_time'] = pd.to_datetime(bp_df['started_time'])
bp_df['form_duration'] = (bp_df['completed_time'] -
                          bp_df['started_time']) / np.timedelta64(1, 'm')
# Exemplo n.º 3
logging.info('Starting scripts to analyze case data...')
# Process each per-location folder of case exports independently.
for folder in folder_list:
    # initialize dataframe for output errors
    bad_df = pd.DataFrame()
    bad_df = bad_df.fillna('')

    # skip non-directory entries in target_dir
    if os.path.isdir(os.path.join(target_dir, folder)):
        location_name = gen_func.folder_name_to_location(folder)
        logging.info('-------------------------------------------')
        logging.info('Going through data for: %s' % location_name)
        logging.info('-------------------------------------------')
        # per-location summary stats accumulate in output_dict
        output_dict = {'location': location_name}
        logging.info(time.strftime('%X %x'))

        # combine all csv into one dataframe
        case_df = gen_func.csv_files_to_df(os.path.join(target_dir, folder),
                                           case_data_regex, case_date_cols)
        # write any outputs into output_dir from here on
        os.chdir(output_dir)

        # clean cases-closed, orphan, test states/names, awc owner, blank names
        case_clean_df, output_dict = case_func.clean_case_data(
            case_df, output_dict)

        # add age distribution information to dataframe
        # (bracketed ages computed relative to the case's opened_date)
        case_clean_df = case_func.add_age_info(case_clean_df,
                                               col_name='big_age_bracket',
                                               bin_type='brackets',
                                               relative_date='opened_date')

        # check for blank / skipped dob
        # NOTE(review): despite the name, value_is_blank presumably returns
        # rows flagged on blank 'dob' — confirm against case_func
        good_df = case_func.value_is_blank(case_clean_df, 'dob')
# Load each form-type folder, normalize usernames, and join location info.
for folder in folder_list:
    if os.path.isdir(os.path.join(target_dir, folder)):
        logging.info('Going through data for: %s' % folder)

        # some form types export extra columns beyond the shared set
        if folder == 'Additional Growth Monitoring':
            col_names_to_use = col_names + ['form.measure_identify | none']
        elif folder == 'Migration':
            col_names_to_use = col_names + [
                'form.migrate_out.confirm_migrated_out',
                'form.migrate_in.confirm_migrated_in'
            ]
        else:
            col_names_to_use = col_names

        # combine all csv into one dataframe
        input_df = gf.csv_files_to_df(os.path.join(target_dir, folder),
                                      data_regex, date_cols, col_names_to_use)
        # strip a single leading zero so usernames match awc_site_code.
        # str.startswith is safe on an empty string, whereas the original
        # x[0] raised IndexError when a username was ''.
        input_df['username2'] = input_df['username'].apply(
            lambda x: x[1:] if x.startswith('0') else x)

        # add location information for each user (left join keeps forms
        # whose user has no location match)
        forms_df = pd.merge(input_df,
                            location_df,
                            left_on='username2',
                            right_on='awc_site_code',
                            how='left')
        forms_df['received_on'] = pd.to_datetime(forms_df['received_on'])

        # filter to real states
        # logging.info('%i users unmatched to location so far' % forms_df['awc_name'].isnull().sum())
        logging.info('only getting users from real states...')
        forms_df = forms_df.loc[(forms_df['state_name'].isin(real_state_list))]
# Exemplo n.º 5
# Accumulator frames: per-case-type stats and open/closed counts.
# fillna('') on a freshly created empty frame is a no-op, kept chained to
# preserve the original blank-string convention in one expression each.
stats_df = pd.DataFrame().fillna('')
closed_df = pd.DataFrame().fillna('')

# get loads of each case type
# cycle through case types and create case_df
# get loads of each case type
# cycle through case types and create case_df
for case_type in case_types:
    logging.info('-------------------------------------------')
    logging.info('Going through data for: %s' % case_type)
    logging.info('-------------------------------------------')

    # get case dataset. The original pre-assigned input_df twice (once to an
    # empty frame, once to case_df.fillna('')) before overwriting it here;
    # both were dead stores — and the second could NameError if case_df was
    # not yet defined — so they are removed.
    input_df = gf.csv_files_to_df(os.path.join(data_dir, case_type),
                                  case_data_regex, date_cols, cols_to_use)

    # get all cases, open or closed.  closed matter for load testing since only
    # removed from phone if parent case closed
    input_df[case_type] = 1

    # show some open/closed info
    open_closed = input_df['closed'].value_counts()
    logging.info(open_closed)
    # .get(False, 0) avoids a KeyError when every case happens to be closed
    logging.info('Pct of %s open: %0.1f' %
                 (case_type,
                  (open_closed.get(False, 0) * 100. / open_closed.sum())))
    # record this case type's open/closed counts as a column
    closed_df.loc[:, case_type] = open_closed
# input file layout and filters
case_data_regex = re.compile(r'Cases_\d\d\d.csv')
date_cols = []
cols_to_use = ['owner_id', 'closed', 'hh_bpl_apl', 'hh_caste', 'hh_minority',
               'hh_religion']
data_cols = ['hh_bpl_apl', 'hh_caste', 'hh_minority', 'hh_religion']
location_columns = ['doc_id', 'block_name', 'district_name', 'state_name']
real_state_list = ['Madhya Pradesh', 'Chhattisgarh', 'Andhra Pradesh', 'Bihar',
                   'Jharkhand', 'Rajasthan']
# , 'Uttar Pradesh', 'Maharashtra']

# start logging
gf.start_logging(output_dir)

# load all case csvs into one dataframe. The original pre-assigned input_df
# to an empty frame and fillna('')-ed it before overwriting it here; both
# were dead stores and are removed.
input_df = gf.csv_files_to_df(data_dir, case_data_regex, date_cols, cols_to_use)

# only keep open cases
case_df = input_df[input_df['closed'] == False]

# get latest location fixture and add location data
if refresh_locations:
    gf.refresh_locations()
case_df = gf.add_locations(case_df, 'owner_id', location_columns)
# restrict to production states
case_df = case_df.loc[(case_df['state_name'].isin(real_state_list))]

# get caste percentages to df for output.
# Copy the list: the original `drop_cols = cols_to_use` aliased it, so the
# appends below silently grew cols_to_use as well.
drop_cols = list(cols_to_use)
drop_cols.append('state_name')
drop_cols.append('district_name')
blocks = case_df['block_name'].unique().tolist()