def _main_():
    """Build and persist the question-metadata shelve.

    Reads ``qs_metadata.xlsx`` (Sheet1) from the current working directory,
    collects the distinct values of every metadata column, maps each distinct
    value to the list of question ids carrying it, and writes the resulting
    nested mapping to a shelve named ``qs_metadata``.
    """
    # Widen pandas console output for interactive inspection.
    pd.set_option('display.width', 320)

    # Load the first sheet of the metadata workbook into a DataFrame.
    workbook = CM.xl_to_dfs(os.getcwd(), '/qs_metadata.xlsx')
    meta_df = workbook['Sheet1']

    # column_name -> list of distinct values found in that column.
    distinct_by_column = {
        column: CM.distinct_from_df(meta_df, column)
        for column in list(meta_df)
    }

    # column_name -> {distinct_value -> [question ids matching that value]},
    # skipping columns with no distinct values at all.
    shelve_payload = {}
    for column, values in distinct_by_column.items():
        if len(values) > 0:
            shelve_payload[column] = {
                value: ids_to_list(meta_df, 'id', str(column), value)
                for value in values
            }

    # Persist the nested mapping to disk as the 'qs_metadata' shelve.
    create_write_shelve(shelve_payload, 'qs_metadata')
def _main_():
    """Generate one Excel results datasheet per RIC from the annual survey.

    Pipeline:
      1. Map each RIC to its question-id list via the ``qs_metadata`` shelve.
      2. Build per-RIC data dictionaries from ``qs_metadata.xlsx``.
      3. Pull questions/options and answers from the DB, join, pivot to a
         wide respondent-by-question datasheet, drop non-consenting rows,
         and save each RIC's workbook (CII gets extra tabs).

    On a pivot ValueError (duplicate answers for one respondent/column pair)
    the conflicting rows are dumped to a ``__dupies`` workbook instead.
    """
    # FIX: this name was previously undefined in this function's scope
    # (it only existed inside the metadata-builder function), which made
    # shelve.open() raise a NameError. It must match the shelve name the
    # builder writes.
    q_meta_name = 'qs_metadata'

    # RIC display name -> DB name and datasource code.
    # NOTE: for CII & OSVP the code is a -1 placeholder, not a datasource id.
    rics = {
        'MaRS Discovery District': {'db_name': 'MaRS Discovery District', 'code': 7},
        'RIC Centre': {'db_name': 'RIC Centre', 'code': 9},
        'Innovation Factory': {'db_name': 'Innovation Factory', 'code': 12},
        'NWOIC': {'db_name': 'NWO Innovation Centre', 'code': 14},
        'Invest Ottawa': {'db_name': 'Invest Ottawa', 'code': 16},
        'IION': {'db_name': 'IION', 'code': 5},
        'CII': {'db_name': 'MaRS Centre for Impact Investing', 'code': -1},
        'OSVP': {'db_name': 'Ontario Scale-Up Voucher Program', 'code': -1},
        'Innovation Guelph': {'db_name': 'Innovation Guelph', 'code': 15},
        'WEtech': {'db_name': 'WEtech', 'code': 2},
        'SSMIC': {'db_name': 'SSMIC', 'code': 3},
        'TechAlliance': {'db_name': 'TechAlliance', 'code': 6},
        'Haltech': {'db_name': 'Haltech', 'code': 8},
        'Spark Centre': {'db_name': 'Spark Centre', 'code': 10},
        'NORCAT': {'db_name': 'NORCAT', 'code': 1},
        'VentureLAB': {'db_name': 'ventureLAB', 'code': 11},
        'Innovate Niagara': {'db_name': 'Innovate Niagara', 'code': 17},
        'Launch Lab': {'db_name': 'Launch Lab', 'code': 13}
        # ,'Communitech': {'db_name': 'Communitech', 'code': 4}
    }

    # 1. RIC -> question-id list. RICs with their own 'addedby' questions get
    # the include_list() result; everyone else gets the core question set.
    with shelve.open(q_meta_name, 'r') as qs_metadata:
        print("Creating ric_qs dict")
        ric_qs = {}
        for ric in rics:
            if ric in list(qs_metadata['addedby'].keys()):
                ric_qs[ric] = include_list(ric)
            # elif ric.lower() == 'communitech':
            #     ric_qs[ric] = qs_metadata['which_survey']['COMMUNITECH']
            else:
                ric_qs[ric] = qs_metadata['core/noncore']['core']

    # 2. Rebuild the metadata DataFrame for the data dictionaries.
    print("Reading qs_metadata.xlsx to df")
    cwd = os.getcwd()
    user_path = os.path.expanduser("~")
    filename = '/qs_metadata.xlsx'
    meta_dfs = CM.xl_to_dfs(cwd, filename)
    sheetname = 'Sheet1'
    meta_df = meta_dfs[sheetname]

    # Master data dict: qid -> concatted name (<survey_section - readable_name>).
    print("Creating master data dict")
    meta_df = meta_df.sort_values(by=['q_num'], ascending=[True])
    meta_df['col_title'] = meta_df['survey_section'].astype(str) + ' - ' + meta_df['readable_name']
    data_dict = meta_df[['id', 'col_title', 'title', 'q_num']]

    # Split the master data dict into one per RIC (inner join on that RIC's qids).
    print("Splitting master data dict into 1 per RIC")
    ric_data_dicts = {}
    for ric in ric_qs.keys():
        qids_df = pd.DataFrame(ric_qs[ric], columns=['id'])
        ric_data_dict = pd.merge(qids_df, data_dict, how='inner', on=['id'])
        ric_data_dict.sort_values(by='q_num', inplace=True)
        ric_data_dicts[ric] = ric_data_dict

    # 3a. Questions and options from the DB.
    print("Reading questions and options from DB into qsos df")
    qsos_sql = CM.get_config("config_sql.ini", "ann_survey_18", "all_qsos")
    qsos = DB.pandas_read(qsos_sql)

    # Attach col_title / q_num to each question row, then drop the join key.
    qsos = pd.merge(qsos, meta_df[['id', 'col_title', 'q_num']],
                    how='left', left_on='qid', right_on='id')
    qsos.drop('id', inplace=True, axis=1)

    print("Transforming qsos df")
    # Flag 'ESSAY', 'TABLE', 'TEXTBOX', 'MENU', 'RADIO' question types so
    # their col_title is left untouched by opt_col_title below.
    qsos['multi_options'] = qsos.q_type.apply(multi_options)
    # For option rows, col_title becomes col_title + "Option: " + [o_label].
    qsos['col_title'] = qsos.apply(opt_col_title, axis=1)
    qsos = qsos[qsos['q_num'] > 0]

    # col_title -> q_num, used later to restore column order after the pivot.
    col_title_order = pd.Series(qsos.q_num.values, index=qsos.col_title).to_dict()

    # 3b. Answers from the DB.
    print("Reading answers from DB into ans df")
    ans_sql = CM.get_config("config_sql.ini", "ann_survey_18", "sel_ann_survey_res")
    ans = DB.pandas_read(ans_sql)

    # Communitech shared ventures are fetched separately and concatenated in.
    print("Reading Communitech shared clients")
    comm_sql = CM.get_config("config_sql.ini", "ann_survey_18", "sel_communitech_shared")
    comm_ans = DB.pandas_read(comm_sql)
    ans = pd.concat([ans, comm_ans])

    # Clean answers: drop empties, apply text replacements, blank out NaN pipes.
    print("Cleaning ans df")
    ans.dropna(subset=['Answer'], inplace=True)
    ans['Answer'] = ans.apply(replacements, axis=1)
    ans['page_pipe'] = ans['page_pipe'].fillna('')

    # 3c. Per-RIC datasheet creation.
    print("\nPer RIC df datasheet creation:")
    for ric in ric_qs:
        print("\nRIC: {}".format(ric))
        print("Creating df of questions for {}".format(ric))
        qs_df = pd.DataFrame(ric_qs[ric], columns=['qid'])
        qs_df['ric'] = rics[ric]['db_name']
        qs_df = pd.merge(qs_df, qsos, how='left', on='qid')

        print("Left join qs with ans")
        ric_survey_results = pd.merge(
            qs_df, ans, how='left',
            left_on=['qid', 'oid', 'ric'],
            right_on=['QuestionID', 'OptionID', 'RIC_Program'])

        # Drop empty answers and sort by question number.
        print("Clean ans")
        ric_survey_results = ric_survey_results[pd.notnull(ric_survey_results['Answer'])]
        ric_survey_results.sort_values(by='q_num', inplace=True)

        print("Pivot into datasheet for {}".format(ric))
        ric_datasheet = ric_survey_results[
            ['resp_id', 'CompanyID', 'col_title', 'Answer', 'page_pipe']
        ].drop_duplicates()
        # Suffix the page pipe onto the column title so piped pages stay distinct.
        ric_datasheet['col_title'] = (ric_datasheet['col_title'] + ' '
                                      + ric_datasheet['page_pipe'].astype(str))
        # Composite respondent key: "<resp_id as float>-<CompanyID>".
        ric_datasheet['rid_cid'] = (ric_datasheet['resp_id'].astype(float).astype(str)
                                    + '-' + ric_datasheet['CompanyID'].astype(str))
        ric_datasheet = ric_datasheet[['rid_cid', 'col_title', 'Answer']]

        try:
            # Wide form: one row per respondent, one column per question/option.
            # Raises ValueError if any (rid_cid, col_title) pair is duplicated.
            ric_datasheet = ric_datasheet.pivot(index='rid_cid',
                                                columns='col_title',
                                                values='Answer')
            ric_datasheet.reset_index(inplace=True)
            # NOTE(review): .str.split(...).str tuple-unpack is deprecated in
            # newer pandas — works on the pandas version this was written for.
            ric_datasheet['resp_id'], ric_datasheet['CompanyID'] = \
                ric_datasheet['rid_cid'].str.split('-', 1).str
            ric_datasheet.drop('rid_cid', axis=1, inplace=True)
            ric_datasheet = ric_datasheet.apply(pd.to_numeric, errors='ignore')

            # Remove non-consenting responses (first column whose name
            # contains 'consent'); normalize curly apostrophes first so the
            # literal comparison matches.
            for val in list(ric_datasheet):
                if 'consent' in str(val.lower()):
                    consent_col = val
                    ric_datasheet[consent_col] = \
                        ric_datasheet[consent_col].str.replace(u"\u2019", "'")
                    ric_datasheet = ric_datasheet[
                        ric_datasheet[consent_col] != "I don't give consent"]
                    consent_col = ''
                    break

            # Re-order columns to reflect q_num ordering; the last two columns
            # are resp_id/CompanyID and stay in front.
            cols = list(ric_datasheet)
            rid_cid = cols[-2:]
            q_cols = cols[:-2]
            ordered_q_cols = []
            for q in q_cols:
                if q[-2:] == '.0':
                    # Piped column: strip the appended pipe suffix before the
                    # order lookup (suffix assumed to be 8 chars — TODO confirm).
                    ordered_q_cols.append([col_title_order[q[:-8]], q])
                else:
                    ordered_q_cols.append([col_title_order[q.strip()], q])
            ordered_q_cols.sort()
            for i in range(len(ordered_q_cols)):
                ordered_q_cols[i] = ordered_q_cols[i][1]
            cols = rid_cid + ordered_q_cols
            ric_datasheet = ric_datasheet[cols]

            save_path = path_xl(
                user_path=user_path,
                path_extension="Box Sync/Workbench/BAP/Annual Survey FY2018/DEV - Results to RICs/",
                filename=ric + '.xlsx')

            # Pull out social-impact companies for reuse in the CII workbook.
            # NOTE(review): relies on 'MaRS Discovery District' preceding 'CII'
            # in the rics dict (insertion order) so soc_imp_df is bound first.
            if ric == 'MaRS Discovery District':
                soc_imp_df = ric_datasheet[
                    ric_datasheet['social_impact - Motives '] == 'Yes']

            if ric != 'CII':
                # Standard workbook: data + data dictionary.
                results_sheets = [ric_datasheet, ric_data_dicts[ric]]
                sheetnames = ['SurveyData', 'DataDictionary']
                save_xls(results_sheets, save_path, sheetnames)
                print("Wrote to {}".format(save_path))
            else:
                # CII also gets the social-impact data and the MaRS dictionary.
                print('Add extra tabs to {} datasheet'.format(ric))
                results_sheets = [
                    ric_datasheet, soc_imp_df,
                    ric_data_dicts[ric],
                    ric_data_dicts['MaRS Discovery District']
                ]
                sheetnames = [
                    'CII_SurveyData', 'All_RICs_SocialImpact_SurveyData',
                    'CII_DataDict', 'MaRS_DataDict'
                ]
                save_xls(results_sheets, save_path, sheetnames)
                print("Wrote to {}".format(save_path))
        except ValueError as ex:
            # Pivot failed: dump the conflicting (rid_cid, col_title) rows so
            # the duplicates can be inspected, then move on to the next RIC.
            print("!\nERROR FOR {}: {}\n!\n".format(ric, ex))
            save_path = path_xl(
                user_path=user_path,
                path_extension="Box Sync/Workbench/BAP/Annual Survey FY2018/DEV - Results to RICs/__dupies/",
                filename=ric + '_dupies' + '.xlsx')
            save_xls([
                ric_datasheet[ric_datasheet.duplicated(
                    ['rid_cid', 'col_title'], keep=False)]
            ], save_path, ['dupies'])
            continue