def delete_old_ans():
    """Archive old pipe answers to a spreadsheet, then delete them from the DB."""
    # Snapshot the rows that are about to be removed so they can be recovered later.
    archive_sql = CM.get_config('config.ini', 'secondary_etl', 'old_ans')
    archive_df = DB.pandas_read(archive_sql)
    DBInteractions.store_df(archive_df, '_OLD_PIPE_ANS')
    # Now remove the archived rows from the database.
    delete_sql = CM.get_config('config.ini', 'secondary_etl', 'del_old_ans')
    DB.execute(delete_sql)
def _main_():
    """Pull annual-survey results, pivot them per RIC, and write one Excel file per RIC."""
    print("Getting SQL query")
    sql = CM.get_config("config_sql.ini", "ann_survey_18", "caprevjob_by_ric")
    print("SQL: {}".format(sql))
    print("Executing SQL to get dataframe of results")
    all_results = DB.pandas_read(sql)
    print("Creating column names")
    # Combine the category and question text into a single column label.
    all_results['ConcatQ'] = all_results[['Cap/Rev/Emp', 'Question']].apply(lambda x: ' - '.join(x), axis=1)
    print("Splitting dataframe into one per RIC")
    split_frames = partition_by(all_results, "RIC_Program")
    print("Getting write path")
    user_path = os.path.expanduser("~")
    path = user_path + "/Box Sync/Workbench/BAP/Annual Survey FY2018/Results by RIC/"
    print("Path: {}".format(path))
    print("Writing files to disc:")
    for ric in split_frames.keys():
        x = split_frames[ric]
        # Key each row by respondent id + company id so spread() can pivot on it.
        x['rid_cid'] = x['resp_id'].astype(str) + '_' + x['Company_ID'].astype(str)
        x = spread(x, 'rid_cid', 'ConcatQ', 'Answer')
        x['rid_cid'] = x.index
        # Recover the two id columns from the composite key.
        # NOTE(review): `Series.str.split(...).str` tuple-unpacking relies on
        # older pandas behavior — confirm against the pinned pandas version.
        x['_resp_id'], x['_Company_ID'] = x['rid_cid'].str.split('_', 1).str
        x = x.apply(pd.to_numeric, errors='ignore')
        cols = x.columns.tolist()
        # Move the two id columns (currently the last two) to the front.
        cols = cols[-2:] + cols[:-2]
        x = x[cols]
        # Strip the leading underscore used to avoid column-name collisions above.
        for i in range(len(cols)):
            if str(cols[i])[0] == '_':
                cols[i] = cols[i][1:]
        x.columns = cols
        x = x.drop('rid_cid', axis=1)
        filename = "{} Survey Results".format(ric)
        write_to_xl(x, filename, path, 'Results')
        print("Wrote {} to path: {}".format(filename, path))
class Json:
    """Wraps a survey JSON response list and filters/reshapes it into answer records."""

    # Comma-separated question ids to retain, read once at class-definition time.
    keep_qids = CM.get_config('config.ini', 'secondary_etl', 'sg_del_qids')

    def __init__(self, json, surveyid):
        # json: list of response dicts; surveyid: id of the survey they belong to.
        self.json = json
        self.surveyid = surveyid

    def filter_out(self):
        """Return the responses, keeping only non-empty values of the wanted question keys."""
        keeps = self.get_full_keys('question')
        filtered_dicts = []
        for dic in self.json:
            filtered_dic = {}
            for key in keeps:
                if dic[key] != '':
                    filtered_dic[key] = dic[key]
            filtered_dicts.append(filtered_dic)
        return filtered_dicts

    @staticmethod
    def extract_id(string):
        """Return the text between the first '(' and the first ')' of *string*."""
        x = string.find("(") + 1
        y = string.find(")")
        return string[x:y]

    def get_full_keys(self, key_str):
        """Return the metadata keys plus the question keys whose id is listed in keep_qids."""
        d = self.json[0]
        keeps = self.keep_qids.split(',')
        full_keys = []
        keys = list(d.keys())
        # First 11 keys are response metadata and are always kept.
        # NOTE(review): keys[11] is skipped entirely (slices are [:11] then
        # [12:]) — confirm that is intentional and not an off-by-one.
        full_keys.extend(keys[:11])
        for key in keys[12:]:
            small_key = key[:18]
            if Json.extract_id(small_key) in keeps and key_str in small_key:
                full_keys.append(key)
        return full_keys

    def to_df(self):
        """Flatten the filtered responses into a DataFrame of Answer records."""
        data = self.filter_out()
        all_ans = []
        for resp in data:
            srid = resp['id']
            for key in list(resp.keys())[11:]:
                # The question id is encoded in the first 18 chars of the key;
                # the page-pipe id in the parenthesis following char 15.
                qid = Json.extract_id(key[:18])
                page_pipe = Json.extract_id(key[15:])
                answer_str = str(resp[key])
                ans = Answer(qid=qid, srid=srid, answer=answer_str, surveyid=self.surveyid, page_pipe=page_pipe)
                answer = ans.record()
                all_ans.append(answer)
        all_ans = pd.DataFrame(all_ans, columns=Answer.cols())
        return all_ans
def check_qs_exist(self, survey_id):
    """Return True if questions for *survey_id* already exist in the DB.

    Runs the configured existence-check query with the survey id substituted
    in and interprets the first cell of the result as a boolean.
    """
    sql = CM.get_config("config.ini", "sql_queries", "check_questions_exist")
    sql = sql.replace("WHAT_SURVEY_ID", str(survey_id))
    check = DB.pandas_read(sql)
    # Guard against a missing/empty result set (the original would raise
    # IndexError), and collapse the redundant if/else into one bool() call.
    if check is None or check.empty:
        return False
    return bool(check.iloc[0][0])
def connect(dev=False):
    """Open a pyodbc connection using the configured connection string.

    dev=True selects the 'devconn' config entry instead of 'conn'.
    Returns the connection object, or None when the connection fails.
    """
    config_key = 'devconn' if dev else 'conn'
    try:
        connection_string = Common.get_config('config.ini', 'db_connect', config_key)
        return pyodbc.connect(connection_string)
    except Exception as ex:
        print('DB Server Connection Exception: {}'.format(ex))
        return None
def __init__(self): self._path1 = Common.get_config('config.ini', 'box_file_path', 'path_validI') self._path2 = Common.get_config('config.ini', 'box_file_path', 'path_validII') self._path3 = Common.get_config('config.ini', 'box_file_path', 'path_validIII') self.pathQ1 = os.path.join(os.path.expanduser(self._path1)) self.pathQ2 = os.path.join(os.path.expanduser(self._path2)) self.pathQ3 = os.path.join(os.path.expanduser(self._path3)) self._path_quarter_one = Common.get_config('config.ini', 'box_file_path', 'path_bap_validation_quarter_one') self._path_quarter_two = Common.get_config('config.ini', 'box_file_path', 'path_bap_validation_quarter_two') self._path_quarter_three = Common.get_config('config.ini', 'box_file_path', 'path_bap_validation_quarter_three') self.path_quarter_one = os.path.join(os.path.expanduser(self._path_quarter_one)) self.path_quarter_two = os.path.join(os.path.expanduser(self._path_quarter_two)) self.path_quarter_three = os.path.join(os.path.expanduser(self._path_quarter_three)) self.year = 2018 self.Q1 = '\'Q1\'' self.Q2 = '\'Q2\'' self.Q3 = '\'Q3\'' self.Q4 = '\'Q4\'' self.Q1CompanyData_sheet = None self.Q2CompanyData_sheet = None self.Q3CompanyData_sheet = None self.Q1CompanyData = None self.Q2CompanyData = None self.Q3CompanyData = None self.Q1CompanyData_dc = None self.Q2CompanyData_dc = None self.Q3CompanyData_dc = None self.Q1CompanyData_fact_ric = None self.Q2CompanyData_fact_ric = None self.Q3CompanyData_fact_ric = None self.Q1CompanyData_rollup = None self.Q2CompanyData_rollup = None self.Q3CompanyData_rollup = None self.source_file = None self.quarter_one_files = [] self.quarter_two_files = [] self.quarter_three_files = [] self.dict_list = [] self.rics = ['alliance', 'communitech', 'haltech', 'guelph', 'iion', 'innovationfactory', 'launchlab', 'mars', 'niagara', 'noic', 'norcat', 'ottawa', 'ric', 'spark', 'ssmic', 'venturelab', 'wetec'] self.batch = '''SELECT * FROM Config.ImportBatch WHERE Year = {} AND Quarter = {} AND DataSourceID 
= {} AND SourceSystemId = {} AND ImportStatusID = 5''' self.select = 'SELECT * FROM {} WHERE BatchID = {}' self.selectQ1 = '''
def get_campaigns(self, api_token, survey_id, session_variables, surveys_df):
    """Fetch campaigns for a survey from the API, insert any not yet in the DB, and return them."""
    # 'w' is the sentinel meaning "prompt the user for a survey id".
    if survey_id == 'w':
        while type(survey_id) != int:
            try:
                survey_id = int(
                    input(
                        "Enter ID of survey that you would like to retrieve campaign data for: "
                    ))
                if self.return_to_main(survey_id) == 1:
                    return
                survey_id = self.validate_survey_id(
                    survey_id, session_variables, api_token, surveys_df)
                # NOTE(review): validate_survey_id is called twice in a row —
                # confirm the repeat is intentional and not a copy/paste slip.
                survey_id = self.validate_survey_id(
                    survey_id, session_variables, api_token, surveys_df)
            except ValueError:
                continue
    campaigns_df = sg_campaign.sg_campaigns_df(survey_id, api_token)
    print(campaigns_df)
    campaigns_df["id"] = campaigns_df["id"].apply(pd.to_numeric, errors='ignore')
    # remove campaigns from df that are already in DB
    c_sql = CM.get_config("config.ini", "sql_queries", "campaigns_for_survey")
    c_sql = c_sql.replace("WHAT_SURVEY_ID", str(survey_id))
    db_cmpgns = DB.pandas_read(c_sql)
    if db_cmpgns is not None:
        db_cmpgns = db_cmpgns.apply(pd.to_numeric, errors='ignore')
        # 'left_only' rows after the merge are campaigns the API has but the DB lacks.
        cmpgns_not_in_db = pd.merge(campaigns_df, db_cmpgns, how='left', indicator=True, on="id")
        cmpgns_not_in_db2 = cmpgns_not_in_db[cmpgns_not_in_db['_merge'] == 'left_only'].drop("_merge", axis=1)
        # cmpgns_not_in_db2 = cmpgns_not_in_db2.apply(pd.to_numeric, errors='ignore')
        # insert campaigns into DB
        if len(cmpgns_not_in_db2) > 0:
            insert_cmpgns_sql = "insert_campaigns"
            self.df_to_db(cmpgns_not_in_db2, insert_cmpgns_sql, remove_single_quotes=False, clean_numeric_cols=True)
    return campaigns_df
def partition_by(df, col_name):
    """Split *df* into a dict of sub-frames keyed by the distinct RIC values from the DB.

    # df, str -> dict
    """
    ric_sql = CM.get_config("config_sql.ini", "ann_survey_18", "distinct_RICs")
    ric_names = DB.pandas_read(ric_sql)['RIC_Program'].tolist()
    # One filtered frame per distinct RIC value.
    return {
        ric: df.query('{} == \"{}\"'.format(str(col_name), str(ric)))
        for ric in ric_names
    }
def del_survey_components(self, survey_id):
    """Delete every component of *survey_id* from the DB, then report remaining counts."""
    delete_sql = CM.get_config("config.ini", "sql_queries", "del_all_for_survey")
    delete_sql = delete_sql.replace("WHAT_SURVEY", str(survey_id))
    DB.execute(delete_sql)
    print("\nDeletion attempt was made. Survey components check:")
    component_queries = {
        "questions": "select_questions",
        "options": "select_options",
        "answers": "select_answers",
        "responses": "select_responses",
        "emails": "select_emails",
        "campaigns": "select_campaigns"
    }
    # Verify the deletion by counting what remains of each component type.
    for label, config_key in component_queries.items():
        count_sql = CM.get_config("config.ini", "sql_queries", config_key).replace("WHAT_SURVEY", str(survey_id))
        remaining = DB.pandas_read(count_sql)
        print("\nCount of {}: {}".format(label, len(remaining)))
    return
def save_as_excel(dfs, file_name, path_key):
    """Write each DataFrame in *dfs* to its own sheet of a single Excel workbook.

    The workbook is created as *file_name* under the Box folder configured at
    *path_key* (section 'box_file_path' of config.ini). Sheets are named
    'SHEET 1', 'SHEET 2', ... in order. Exceptions are caught and printed.
    """
    print(os.getcwd())
    print(len(dfs))
    path = Common.get_config('config.ini', 'box_file_path', path_key)
    box_path = os.path.join(os.path.expanduser("~"), path)
    os.chdir(box_path)
    try:
        writer = pd.ExcelWriter(file_name)
        # BUG FIX: the original incremented with `j += j`, which leaves j at 0
        # forever, so every frame was written to the same sheet 'SHEET 0',
        # each overwriting the previous one.
        for j, df in enumerate(dfs, start=1):
            sheet_name = 'SHEET {}'.format(j)
            df.to_excel(writer, sheet_name, index=False)
        # Save once after all sheets are written, not once per sheet.
        writer.save()
    except Exception as ex:
        print(ex)
def __init__(self):
    # Service used to look up / create import batches.
    self.batch = BatchService()
    # Pre-load all SQL templates used by the company-dimension ETL.
    self.sql_update = CM.get_config('config_sql.ini', 'db_sql_general', 'sql_update')
    self.sql_data_by_batch = CM.get_config('config_sql.ini', 'db_sql_common', 'sql_data_by_batch')
    self.sql_dim_company = CM.get_config('config_sql.ini', 'da_sql_company', 'sql_dim_company')
    self.sql_dim_company_source = CM.get_config('config_sql.ini', 'da_sql_company', 'sql_dim_company_source')
    self.sql_dim_company_insert = CM.get_config('config_sql.ini', 'da_sql_company', 'sql_dim_company_insert')
    self.sql_dim_company_source_insert = CM.get_config('config_sql.ini', 'da_sql_company', 'sql_dim_company_source_insert')
    self.sql_dim_company_source_update = CM.get_config('config_sql.ini', 'da_sql_company', 'sql_dim_company_source_update')
    # Ids of the most recently touched dimension rows (0 = none yet).
    self.dim_company_id = 0
    self.dim_company_source_id = 0
    # Box folder used for company-matching spreadsheets.
    self.path = CM.get_config('config.ini', 'box_file_path', 'path_bap_company_matching')
    self.file = FileService(self.path)
def df_to_db(self, df, sql_config_header, remove_single_quotes=True, return_vals=False, clean_numeric_cols=False):
    """Bulk-insert *df* into the DB using the SQL template named by *sql_config_header*.

    remove_single_quotes: passed through to get_sql_params when extracting values.
    return_vals: when True, return the value rows that were inserted.
    clean_numeric_cols: when True, normalize NaN / zero-date / empty-string
    sentinels to None and coerce int64 values to plain int before inserting.
    """
    df_headers, df_qmarks, df_vals = self.get_sql_params(
        df, remove_single_quotes=remove_single_quotes)
    df_header_str = self.get_header_str(df_headers)
    df_sql = CM.get_config("config.ini", "sql_queries", sql_config_header)
    # Template placeholders are replaced with the actual column list and '?' marks.
    df_sql = df_sql.replace("WHAT_HEADERS", df_header_str).replace("WHAT_VALUES", df_qmarks)
    if clean_numeric_cols:
        for lst in df_vals:
            for i in range(len(lst)):
                element = lst[i]
                try:
                    # Normalize DB-unfriendly sentinel values to NULL.
                    if str(element).lower() == "nan" or str(
                            element) == "0000-00-00 00:00:00" or str(
                                element) == '':
                        new_val = None
                        # Tuples are immutable, so the row must be rebuilt.
                        if type(lst) == tuple:
                            lst = self.replace_tuple_val_at_index(
                                lst, i, new_val)
                    # NOTE(review): np.dtype(element) raises TypeError for most
                    # non-dtype values, which is silently swallowed below —
                    # confirm this int64-coercion path triggers as intended.
                    if np.dtype(element) == 'int64':
                        new_val = int(str(lst[i]))
                        if type(lst) == tuple:
                            lst = self.replace_tuple_val_at_index(
                                lst, i, new_val)
                except AttributeError:
                    continue
                except TypeError:
                    continue
                except ValueError:
                    continue
        # A single tuple row was rebuilt into the local `lst`; write it back.
        if len(df_vals) == 1 and type(df_vals[0]) == tuple:
            df_vals[0] = lst
    DB.bulk_insert(df_sql, df_vals)
    if return_vals:
        return df_vals
def write_survey_entries(self, api_token):
    """Insert API surveys that are missing from the DB, creating an import batch for each."""
    year, quarter = CM.fiscal_year_quarter()
    api_surveys_df = self.get_surveys(api_token, prin=False)
    api_surveys_df = api_surveys_df.apply(pd.to_numeric, errors='ignore')
    db_surveys_sql = CM.get_config("config.ini", "sql_queries", "surveys")
    db_surveys_df = DB.pandas_read(db_surveys_sql)
    db_surveys_df = db_surveys_df.apply(pd.to_numeric, errors='ignore')
    # 'left_only' rows of the outer merge are surveys the API has but the DB lacks.
    surveys_not_in_db = pd.merge(api_surveys_df, db_surveys_df[['id']], how='outer', indicator=True, on="id")
    surveys_not_in_db2 = surveys_not_in_db[surveys_not_in_db['_merge'] == 'left_only'].drop("_merge", axis=1)
    # write surveys_not_in_db2 to db, one at a time so BatchService can be executed for each one
    for index in range(len(surveys_not_in_db2)):
        row = surveys_not_in_db2.iloc[index][:]
        df = pd.DataFrame([list(row.values)], columns=list(surveys_not_in_db2))
        batch = BatchService()
        x = batch.create_new_batch(datasource=-1, systemsource=50, year=year, quarter=quarter)
        # The new batch id is in the first column of the last returned row.
        batch_id = x.iloc[-1][0]
        # add batchID to end of df
        df['BatchID'] = int(batch_id)
        self.df_to_db(df, "insert_survey_entry")
    pass
def load(self):
    """Store the staged answers to a spreadsheet, then bulk-insert them into the DB."""
    df = self.data
    # Keep a spreadsheet copy of what is about to be inserted.
    DBInteractions.store_df(df, '_NEW_PIPE_ANS')
    sql = CM.get_config('config.ini', 'sql_queries', 'insert_as')
    sql = sql.replace(
        'WHAT_HEADERS',
        'id, question_id, option_id, survey_response_id, answer, page_pipe'
    )
    sql = sql.replace('WHAT_VALUES', '?,?,?,?,?,?')
    insert_vals = []
    for index, row in df.iterrows():
        vals = []
        for header in df.columns:
            vals.append(row[header])
        # NOTE(review): a single-row frame is appended as a tuple while
        # multi-row frames use lists — presumably what DB.bulk_insert
        # expects for the one-row case; confirm.
        if len(df) == 1:
            t = tuple(vals)
            insert_vals.append(t)
        else:
            insert_vals.append(vals)
    DB.bulk_insert(sql, insert_vals, dev=False)
def entities_script(self):
    """Load every Crunchbase entity INSERT statement from config into a same-named attribute."""
    entity_keys = [
        'sql_acquired_insert',
        'sql_acquiree_insert',
        'sql_acquisition_insert',
        'sql_category_insert',
        'sql_org_category_insert',
        'sql_founders_insert',
        'sql_funding_rounds_insert',
        'sql_funds_insert',
        'sql_image_insert',
        'sql_investments_insert',
        'sql_investors_insert',
        'sql_ipo_insert',
        'sql_job_insert',
        'sql_news_insert',
        'sql_offices_insert',
        'sql_partners_insert',
        'sql_sub_organization_insert',
        'sql_team_insert',
        'sql_websites_insert',
        'sql_person_insert',
        'sql_invested_in_insert',
    ]
    # Each attribute (e.g. self.sql_ipo_insert) receives the SQL stored under
    # the identically named key of the db_sql_crunchbase section.
    for key in entity_keys:
        setattr(self, key, CM.get_config('config_sql.ini', 'db_sql_crunchbase', key))
def load_resps_ans_contacts__lists(self, survey_id, api_token):
    """Sync responses, answers, contacts and contact-list links for *survey_id* from the API into the DB.

    Performs, in order: contact-list sync, contact sync, Contacts__Lists link
    sync, response update/insert, answer delete/insert, and finally an
    insert-count check with rollback if not all answers landed.
    """
    # get api resps
    print("\nGetting API responses (respondents)")
    api_ans, api_resps = self.get_ans(survey_id, api_token)
    print("\nGetting contact lists on account")
    contact_lists = self.get_contact_lists(survey_id, api_token)
    contact_lists = contact_lists.apply(pd.to_numeric, errors='ignore')
    print("\nGetting contact lists from DB")
    lists_in_db_sql = CM.get_config("config.ini", "sql_queries", "contact_lists")
    lists_in_db = DB.pandas_read(lists_in_db_sql)
    print(
        "\nChecking for diffs b/t API contact lists and DB contact lists")
    # 'left_only' rows are lists present on the account but missing from the DB.
    lists_not_in_db = pd.merge(contact_lists, lists_in_db, how='outer', indicator=True, on="id")
    lists_not_in_db2 = lists_not_in_db[lists_not_in_db['_merge'] == 'left_only'].drop("_merge", axis=1)
    if len(lists_not_in_db2) > 0:
        print(
            "\nOne or more new contact lists detected on acct. Loading into DB now"
        )
        insert_lists_sql = "insert_contactlists"
        lists_not_in_db2 = lists_not_in_db2.drop_duplicates()
        self.df_to_db(lists_not_in_db2, insert_lists_sql, remove_single_quotes=False)
    print(
        "\nGathering all contacts from all lists on acct into single dataframe"
    )
    all_contacts = []
    for list_id in contact_lists["id"]:
        contact_list = self.get_contacts(api_token, list_id)
        all_contacts.append(contact_list)
    # gather all contacts from current survey
    all_campaigns = self.get_campaigns(api_token, survey_id, 0, 0)
    for campaign_id in all_campaigns['id']:
        campaign_contacts = self.get_contacts(api_token, list_id=0, survey_id=survey_id, campaign_id=campaign_id)
        # An int return signals "no contacts" for this campaign.
        if type(campaign_contacts) == int:
            continue
        else:
            all_contacts.append(campaign_contacts)
    all_contacts = pd.concat(all_contacts)
    all_contacts = all_contacts.apply(pd.to_numeric, errors='ignore')
    # Emails are compared case-insensitively throughout.
    all_contacts['email_address'] = all_contacts[
        'email_address'].str.lower()
    print("\nGathering all contacts from DB")
    all_contacts_sql = CM.get_config("config.ini", "sql_queries",
                                     "all_contacts")
    all_db_contacts = DB.pandas_read(all_contacts_sql)
    all_db_contacts = all_db_contacts.apply(pd.to_numeric, errors='ignore')
    all_db_contacts['email_address'] = all_db_contacts[
        'email_address'].str.lower()
    contact_merge = pd.merge(all_contacts[[
        "id", "mdc_contact_id", "contact_list_id", "email_address",
        "firstname", "lastname"
    ]], all_db_contacts, how='left', on='email_address', indicator=True)
    # Contacts whose email is not in the DB yet.
    new_contacts = contact_merge[[
        "id_x", "email_address", "firstname_x", "lastname_x"
    ]][contact_merge['_merge'] == 'left_only']
    new_contacts.columns = ["id", "email_address", "firstname", "lastname"]
    if len(new_contacts) > 0:
        print("Writing new contacts to DB.")
        insert_cs_sql = "insert_contacts"
        new_contacts = new_contacts.drop_duplicates()
        self.df_to_db(new_contacts, insert_cs_sql, clean_numeric_cols=True)
    else:
        print("\nNo new contacts to write to DB.")
    # Re-read contacts so freshly inserted rows get their DB ids.
    updated_db_contacts = DB.pandas_read(all_contacts_sql)
    updated_db_contacts = updated_db_contacts.apply(pd.to_numeric, errors='ignore')
    updated_db_contacts['email_address'] = updated_db_contacts[
        'email_address'].str.lower()
    updated_contact_merge = pd.merge(all_contacts[[
        "id", "mdc_contact_id", "contact_list_id", "email_address",
        "firstname", "lastname"
    ]], updated_db_contacts, how='left', on='email_address', indicator=True)
    api_contacts_lists_df = updated_contact_merge[[
        "id_x", "id_y", "contact_list_id"
    ]]
    api_contacts_lists_df = api_contacts_lists_df.apply(pd.to_numeric, errors='ignore')
    api_contacts_lists_df.columns = [
        "sg_cid", "mdc_contact_id", "contact_list_id"
    ]
    print("\nGetting Contacts__Lists table from DB.")
    db_cl_sql = CM.get_config("config.ini", "sql_queries", "all_contacts__lists")
    db_contacts_lists_df = DB.pandas_read(db_cl_sql)
    db_contacts_lists_df = db_contacts_lists_df.apply(pd.to_numeric, errors='ignore')
    cl_merge = pd.merge(api_contacts_lists_df, db_contacts_lists_df, how='left', indicator=True, on=["sg_cid", "mdc_contact_id", "contact_list_id"])
    new_cl = cl_merge[["sg_cid", "mdc_contact_id", "contact_list_id"
                       ]][cl_merge["_merge"] == 'left_only']
    new_cl = new_cl.apply(pd.to_numeric, errors='ignore')
    # get api answers where response_id = resps.id
    # get db resps where resps.survey_id = survey_id
    print("\nGetting all responses for this survey from DB.")
    r_sql = CM.get_config("config.ini", "sql_queries", "all_resps_for_survey")
    r_sql = r_sql.replace("WHAT_SURVEY_ID", str(survey_id))
    db_resps = DB.pandas_read(r_sql)
    db_resps["date_submitted"] = db_resps["date_submitted"].astype(str)
    print(
        "\nDetecting responses that have changed (looking for discrepancy between DB date_submitted and API date_submitted)"
    )
    # changed_resps = []
    i = 0
    # 'right_only' rows: the API has a date_submitted the DB does not — the
    # respondent updated their response after it was imported.
    changed_resps = pd.merge(db_resps[["id", "date_submitted"]],
                             api_resps[["id", "date_submitted"]],
                             how='outer',
                             indicator=True,
                             on=["id", "date_submitted"])
    changed_resps = changed_resps[[
        "id"
    ]][changed_resps["_merge"] == 'right_only']
    changed_resps = changed_resps["id"].tolist()
    print("{} responses changed".format(len(changed_resps)))
    print("\nDetecting responses in API that are not in DB at all.")
    resps_not_in_db = pd.merge(api_resps, db_resps[["id"]], how='outer', indicator=True, on="id")
    resps_not_in_db2 = resps_not_in_db[resps_not_in_db['_merge'] == 'left_only'].drop("_merge", axis=1)
    inserted_resps = []
    # SECOND INSERT OF contacts__lists
    new_cl = pd.merge(new_cl, db_contacts_lists_df, how='left', indicator=True, on=["sg_cid"])
    new_cl = new_cl[new_cl["_merge"] == 'left_only']
    new_cl = new_cl[["sg_cid", "mdc_contact_id_x", "contact_list_id_x"]]
    new_cl.columns = ["sg_cid", "mdc_contact_id", "contact_list_id"]
    if len(new_cl) > 0:
        print("Writing new entries to Contacts__Lists")
        insert_cl_sql = "insert_contacts_lists"
        new_cl = new_cl.drop_duplicates()
        self.df_to_db(new_cl, insert_cl_sql, clean_numeric_cols=True)
    else:
        print("\nNo new Contacts__Lists entries to write to DB.")
    # update Survey_Responses where date_submitted has changed for existing response
    if len(changed_resps) > 0:
        print(
            "\nUpdating DB respondent entries that have changed (have diff date_submitted)"
        )
        resp_headers, resp_qmarks, resp_vals = self.get_sql_params(
            api_resps)
        resp_header_str = self.get_header_str(resp_headers)
        update_r_sql = CM.get_config("config.ini", "sql_queries", "update_rs")
        for id in changed_resps:
            j = changed_resps.index(id)
            where_sql = "WHERE id = " + str(id)
            set_strs = ""
            # Build "[col] = 'val', " pairs, skipping the first two headers.
            # NOTE(review): values are interpolated into the SQL string rather
            # than parameterized — quoting/injection risk; confirm inputs.
            for i in range(2, len(resp_headers)):
                header = resp_headers[i]
                val = resp_vals[j][i]
                set_str = "[" + header + "]" + " = '" + str(val) + "', "
                set_strs = set_strs + set_str
            final_update_sql = update_r_sql + set_strs[:-2] + " " + where_sql
            DB.execute(final_update_sql)
    # insert resps that aren't db at all
    if len(resps_not_in_db2) > 0:
        print("\nInserting new responses that aren't in DB at all")
        insert_resp_sql = "insert_rs"
        resps_not_in_db2 = resps_not_in_db2.drop_duplicates()
        self.df_to_db(resps_not_in_db2, insert_resp_sql, remove_single_quotes=False)
        for id in resps_not_in_db2["id"]:
            inserted_resps.append(id)
    # write to db only answers where answers.response_id is in list of response ids written to db above
    # del where id in changed_resps, then insert
    if len(changed_resps) > 0:
        print(
            "\nDeleting answers of respondents who updated their response."
        )
        update_a_sql = CM.get_config("config.ini", "sql_queries", "update_a_sql")
        changed_ans_df = api_ans[api_ans["survey_response_id"].isin(
            changed_resps)]
        ans_headers, ans_qmarks, ans_vals = self.get_sql_params(
            changed_ans_df)
        del_ans_sql = CM.get_config("config.ini", "sql_queries", "del_ans")
        for id in changed_resps:
            del_ans_sql_for_id = del_ans_sql.replace(
                "WHAT_RESP_ID", str(id))
            DB.execute(del_ans_sql_for_id)
            # Changed responses get their answers re-inserted below.
            inserted_resps.append(id)
    # insert ans where id in inserted_resps
    if len(inserted_resps) > 0:
        print(
            "\nInserting answers into DB (includes updated responses and new responses)"
        )
        ans_insert_df = api_ans[api_ans["survey_response_id"].isin(
            inserted_resps)]
        inserts_ans_sql = "insert_as"
        ans_insert_df = ans_insert_df.drop_duplicates()
        ans_vals = self.df_to_db(ans_insert_df, inserts_ans_sql, remove_single_quotes=False, return_vals=True)
    elif len(inserted_resps) == 0:
        print("\nNo new answers to insert or update.")
        return
    print("\nChecking that all answers were inserted")
    check_ans_sql = CM.get_config("config.ini", "sql_queries", "check_ans")
    inserted_resp_ids_str = ''
    for id in inserted_resps:
        inserted_resp_ids_str = inserted_resp_ids_str + str(id) + ", "
    inserted_resp_ids_str = inserted_resp_ids_str[:-2]
    check_ans_sql = check_ans_sql.replace("WHAT_RESP_IDS", inserted_resp_ids_str)
    ans_inserted_this_session = DB.pandas_read(check_ans_sql)
    if len(ans_inserted_this_session) != len(ans_vals):
        print(
            "\nNot all answers were loaded. Rolling back insert operation "
            "(deleting answers and responses inserted into DB)")
        # del ans inserted this session, if any
        del_ans_sql = CM.get_config("config.ini", "sql_queries", "del_ans_by_respids")
        del_ans_sql = del_ans_sql.replace("WHAT_RESP_IDS", inserted_resp_ids_str)
        DB.execute(del_ans_sql)
        # del resps inserted this session, if any
        del_resps_sql = CM.get_config("config.ini", "sql_queries", "del_resps_by_list")
        del_resps_sql = del_resps_sql.replace("WHAT_RESP_IDS", inserted_resp_ids_str)
        DB.execute(del_resps_sql)
    elif len(ans_inserted_this_session) == len(ans_vals):
        print(
            "All answers successfully inserted. This means that all the responses that were inserted during this "
            "session have all their respective answers in the DB now.")
    return
def __init__(self):
    # Base ETL setup for the Crunchbase data source.
    super().__init__('', '', datasource=enum.DataSourceType.CRUNCH_BASE)
    self.file = FileService(os.getcwd())
    # API credentials and URL templates ('{}' is the page number placeholder).
    self.user_key = CM.get_config('config.ini', 'crunch_base', 'user_key')
    self.api_token = '&user_key=' + self.user_key + '&page={}'
    self.api_tokens = '?user_key=' + self.user_key + '&page={}'
    self.api_org_token = '?user_key=' + self.user_key
    self.url_org = CM.get_config('config.ini', 'crunch_base', 'url_org') + self.api_token
    self.url_people = CM.get_config('config.ini', 'crunch_base', 'url_person') + self.api_token
    self.url_cat = CM.get_config('config.ini', 'crunch_base', 'url_cat') + self.api_tokens
    self.url_loc = CM.get_config('config.ini', 'crunch_base', 'url_loc') + self.api_token
    self.path = CM.get_config('config.ini', 'box_file_path', 'path_crunchbase')
    # INSERT / UPDATE statement templates.
    self.org_summary = CM.get_config('config_sql.ini', 'db_sql_crunchbase', 'sql_organizations_insert')
    self.people = CM.get_config('config_sql.ini', 'db_sql_crunchbase', 'sql_people_insert')
    self.category = CM.get_config('config_sql.ini', 'db_sql_crunchbase', 'sql_category_insert')
    self.location = CM.get_config('config_sql.ini', 'db_sql_crunchbase', 'sql_location_insert')
    self.orgs_api_url = CM.get_config('config_sql.ini', 'db_sql_crunchbase', 'sql_orgs_summary')
    self.orgs_detail_insert = CM.get_config('config_sql.ini', 'db_sql_crunchbase', 'sql_orgnization_insert')
    self.orgs_summary_update = CM.get_config('config_sql.ini', 'db_sql_crunchbase', 'sql_orgs_summary_update')
    self.orgs_detail_update = CM.get_config('config_sql.ini', 'db_sql_crunchbase', 'sql_orgs_detail_update')
    self.data = None
    # CSV dump name template: CB_<entity>_<timestamp>.csv (filled in later).
    self.file_name = 'CB_{}_{}.csv'
    # self.file = FileService(self.path)
    self.org_uuid = None
    self.fk_uuid = 'org_uuid'
    self.one_to_one = 'OneToOne'
    self.i = 0
    # Loads all entity INSERT statements into attributes.
    self.entities_script()
    # Column orders for the various target tables.
    self.col_funding = ['uuid', 'org_uuid', 'permalink', 'api_path', 'web_path', 'api_url', 'funding_type',
                        'series', 'series_qualifier', 'announced_on', 'announced_on_trust_code', 'closed_on',
                        'closed_on_trust_code', 'money_raised', 'money_raised_currency_code', 'money_raised_usd',
                        'target_money_raised', 'target_money_raised_currency_code', 'target_money_raised_usd',
                        'pre_money_valuation', 'pre_money_valuation_currency_code', 'pre_money_valuation_usd',
                        'rank', 'created_at', 'updated_at']
    self.org_columns = ['org_uuid','batch', 'company_id', 'permalink', 'permalink_aliases', 'api_path', 'web_path',
                        'api_url','name', 'BasicName', 'also_known_as', 'short_description', 'description',
                        'profile_image_url', 'primary_role', 'role_company', 'role_investor', 'role_group',
                        'role_school', 'investor_type', 'founded_on', 'founded_on_trust_code', 'is_closed',
                        'closed_on', 'closed_on_trust_code', 'num_employees_min', 'num_employees_max',
                        'stock_exchange', 'stock_symbol', 'total_funding_usd', 'number_of_investments',
                        'homepage_url', 'contact_email', 'phone_number', 'rank', 'created_at', 'updated_at', 'fetched']
    self.org_summary_col = ['uuid', 'permalink', 'api_path', 'web_path', 'api_url', 'name', 'stock_exchange',
                            'stock_symbol', 'primary_role', 'short_description', 'profile_image_url', 'domain',
                            'homepage_url', 'facebook_url', 'twitter_url', 'linkedin_url', 'city_name',
                            'region_name', 'country_code', 'created_at', 'updated_at']
    self.office_col = ['uuid', 'org_uuid', 'name', 'street_1', 'street_2', 'postal_code', 'city', 'region',
                       'country','city_web_path','region_code2','region_web_path', 'country_code2',
                       'country_code3', 'country_web_path', 'latitude','longitude', 'created_at', 'updated_at' ]
    self.category_col = ['uuid' ,'org_uuid', 'name', 'web_path', 'category_groups', 'created_at', 'updated_at']
def get_emails(self, survey_id, api_token, session_variables, surveys_df, campaign_id='w'):
    """Fetch email messages for a campaign from the API, insert new ones into the DB, and return them.

    survey_id / campaign_id may be the sentinel 'w', in which case the user
    is prompted interactively until a valid integer id is entered.
    """
    if survey_id == 'w':
        while type(survey_id) != int:
            try:
                survey_id = int(
                    input(
                        "Enter ID of survey that you would like to retrieve campaign data for: "
                    ))
                if menu_actions.return_to_main(survey_id) == 1:
                    return
                survey_id = self.validate_survey_id(
                    survey_id, session_variables, api_token, surveys_df)
            except ValueError:
                continue
    # NOTE(review): campaign prompt runs whenever campaign_id is still 'w'
    # (its default), independent of how survey_id was supplied — confirm.
    while type(campaign_id) != int:
        try:
            campaign_id = int(
                input(
                    "Enter ID of campaign that you would like to retrieve email msg data for: "
                ))
            if menu_actions.return_to_main(campaign_id) == 1:
                return
        except ValueError:
            continue
    emails_df = sg_emails.sg_emails_df(survey_id, campaign_id, api_token)
    emails_df["id"] = emails_df["id"].apply(pd.to_numeric, errors='ignore')
    print(emails_df)
    # Keep only emails not already present in the DB for this campaign.
    e_sql = CM.get_config("config.ini", "sql_queries", "emails_for_campaign")
    e_sql = e_sql.replace("WHAT_CAMPAIGN", str(campaign_id))
    db_em = DB.pandas_read(e_sql)
    em_not_in_db = pd.merge(emails_df, db_em, how='left', indicator=True, on="id")
    em_not_in_db2 = em_not_in_db[em_not_in_db['_merge'] == 'left_only'].drop("_merge", axis=1)
    # Insert the new emails, if any. (The superseded hand-rolled insert that
    # df_to_db replaced has been removed from the body.)
    if len(em_not_in_db2) > 0:
        insert_em_sql = "insert_emails"
        self.df_to_db(em_not_in_db2, insert_em_sql, remove_single_quotes=False, clean_numeric_cols=True)
    return emails_df
class BapQuarterly:
    """ETL driver for the BAP (Business Acceleration Program) quarterly data
    collection: QA of per-RIC spreadsheets, combining them, and pushing the
    combined program/company data into the warehouse tables.

    NOTE(review): all of the class attributes below execute at import time
    (config reads, service construction, pandas option mutation) — importing
    this module has side effects.
    """
    desired_width = 420
    pd.set_option('display.width', desired_width)  # widen console df printing
    # Current fiscal year/quarter; quarter is decremented because the ETL
    # processes the most recently *completed* quarter.
    year, quarter = COM.fiscal_year_quarter(datetime.datetime.utcnow())
    quarter = quarter - 1
    batch = BatchService()
    bap_path_source = COM.get_config('config.ini', 'box_file_path', 'path_bap_source')
    bap_path_etl = COM.get_config('config.ini', 'box_file_path', 'path_bap_etl')
    file = FileService(bap_path_source)
    qa = BapQA()
    season = '19_Q1'  # NOTE(review): hard-coded season label — TODO confirm it tracks year/quarter
    company = CompanyService()

    @staticmethod
    def show_bap_quarterly_template():
        # Display the expected quarterly template from the source folder.
        BapQuarterly.file.show_source_file()

    ''' Checks if all the RICs send the right template with all the columns exists. '''
    @staticmethod
    def qa_bap_spread_sheet_by_ric():
        BapQuarterly.qa.check_rics_file(fp.path_bap_etl, fp.path_bap_qa)

    @staticmethod
    def combine_rics_bap_quarterly(combine_for):
        """Combine all per-RIC workbooks into a single QA_ or ETL_ workbook.

        :param combine_for: Combine enum member — FOR_QA prefixes 'QA_',
            anything else prefixes 'ETL_'.
        """
        program, program_youth, company_quarterly, company_annually = BapQuarterly.file.read_source_file(
            FileType.SPREAD_SHEET.value, DS.BAP, combine_for, current_path=fp.path_bap_qa.value)
        # NOTE(review): year/quarter are hard-coded to '19'/'1' here; the
        # dynamic expression is left commented out.
        file_name = '{}'.format(FN.bap_combined.value.format('19','1'))#(str(BapQuarterly.year - 1)[-2:], BapQuarterly.quarter))
        if combine_for == Combine.FOR_QA:
            file_name = 'QA_' + file_name
        else:
            file_name = 'ETL_' + file_name
        print('\nSave spreadsheet file named: {}'.format(file_name))
        save_location = COM.change_working_directory(fp.path_bap_combined.value)
        print(str(save_location))
        # One sheet per dataset; the annual sheet only exists in Q3.
        writer = pd.ExcelWriter(file_name)
        program.to_excel(writer, WS.bap_program.value, index=False)
        program_youth.to_excel(writer, WS.bap_program_youth.value, index=False)
        company_quarterly.to_excel(writer, WS.bap_company.value, index=False)
        if BapQuarterly.quarter == 3:
            company_annually.to_excel(writer, WS.bap_company_annual.value, index=False)
        writer.save()
        print('rics_spreasheet_combined.')

    @staticmethod
    def qa_bap_ric_combined(combined=False):
        # QA pass over the combined workbook (combined flag changes check mode).
        BapQuarterly.qa.check_rics_file(fp.path_bap_combined, fp.path_bap_combined_dest, combined)

    @staticmethod
    def transfer_csv_program(dataframe):
        # Bulk-insert program-level rows.
        val = COM.df_list(dataframe)
        db.bulk_insert(sql.sql_bap_ric_program_insert.value, val)

    @staticmethod
    def transfer_csv_program_youth(dataframe):
        # Bulk-insert youth-program rows.
        val = COM.df_list(dataframe)
        db.bulk_insert(sql.sql_bap_ric_program_youth_insert.value, val)

    @staticmethod
    def bulk_insert_quarterly_data(dataframe):
        # Bulk-insert quarterly per-venture rows.
        val = COM.df_list(dataframe)
        db.bulk_insert(sql.sql_bap_ric_venture_quarterly_insert.value, val)

    @staticmethod
    def bulk_insert_annual_data(dataframe):
        # Bulk-insert annual per-venture rows.
        val = COM.df_list(dataframe)
        db.bulk_insert(sql.sql_bap_ric_venture_annual_insert.value, val)

    @staticmethod
    def push_bap_quarterly_to_database():
        """Load the combined ETL workbook and push its sheets into the DB."""
        COM.change_working_directory(fp.path_bap_combined.value)
        # NOTE(review): file name is hard-coded to FY19Q1.
        bap = pd.read_excel('ETL_RICS_BAP_COMBINED_FY19Q1.xlsx', sheet_name=None)
        # BapQuarterly.transfer_csv_program(bap['csv_program16'])
        # BapQuarterly.transfer_csv_program_youth(bap['csv_program16_youth'])
        BapQuarterly.bulk_insert_quarterly_data(bap['Quarterly Company Data'])
        if BapQuarterly.quarter == 3:
            BapQuarterly.bulk_insert_annual_data(bap['Annual Company data'])

    @staticmethod
    def create_bap_batch():
        """Create batch records for the distinct program/company loads of this
        fiscal year/quarter."""
        batch = BatchService()
        program = db.pandas_read(sql.sql_bap_distinct_batch.value.format(tbl.ric_program.value, BapQuarterly.year, BapQuarterly.quarter))
        program_youth = db.pandas_read(sql.sql_bap_distinct_batch.value.format(tbl.ric_program_youth.value, BapQuarterly.year, BapQuarterly.quarter))
        company = db.pandas_read(sql.sql_bap_distinct_batch.value.format(tbl.venture_data.value, BapQuarterly.year, BapQuarterly.quarter))
        # NOTE(review): 'comapny_annual' is a typo kept for fidelity.
        comapny_annual = db.pandas_read(sql.sql_annual_bap_distinct_batch.value.format(tbl.venture_annual.value, BapQuarterly.year))
        # batch.create_bap_batch(program, BapQuarterly.year, BapQuarterly.quarter, tbl.ric_program.value, WS.bap_program.value, ss.RICPD_bap.value)
        # batch.create_bap_batch(program_youth, BapQuarterly.year, BapQuarterly.quarter, tbl.ric_program_youth.value, WS.bap_program_youth.value, ss.RICPDY_bap.value)
        batch.create_bap_batch(company, BapQuarterly.year, BapQuarterly.quarter,
                               tbl.venture_data.value, WS.bap_company.value,
                               ss.RICCD_bap.value)
        if BapQuarterly.quarter == 3:
            batch.create_bap_batch(comapny_annual, BapQuarterly.year, BapQuarterly.quarter,
                                   tbl.venture_annual.value, WS.bap_company_annual.value,
                                   ss.RICACD_bap.value)

    @staticmethod
    def transfer_bap_company():
        # Delegate company dimension movement to the company service.
        cs = CompanyService()
        cs.move_company_data()

    @staticmethod
    def get_proper_values(df):
        """Normalize coded columns in place (stage level id, y/n flags).

        :returns: the same DataFrame, mutated.
        """
        df['StageLevelID'] = df.apply(lambda dfs: COM.get_stage_level(dfs.Stage), axis=1)
        df['High Potential y/n'] = df.apply(lambda dfs: COM.get_yes_no(dfs['High Potential y/n']), axis=1)
        df['Social Enterprise y/n'] = df.apply(lambda dfs: COM.get_yes_no(dfs['Social Enterprise y/n']), axis=1)
        df['Youth'] = df.apply(lambda dfs: COM.get_yes_no(dfs['Youth']), axis=1)
        # df['Funding Raised to Date $CAN'] = df.apply(lambda dfs: BapQuarterly.split_funding_range(dfs['Funding Raised to Date $CAN']), axis=1)
        return df

    @staticmethod
    def transfer_fact_ric_company_data():
        """Read the FYQ4 fact rows, normalize them, and bulk-insert into
        FactRICCompany."""
        df = db.pandas_read(sql.sql_bap_fact_ric_data_fyq4.value)
        df_frc = BapQuarterly.get_proper_values(df)
        # BapQuarterly.update_month_year(df_frc)
        # df_frc['IntakeDate'] = pd.to_datetime(df_frc['IntakeDate'])
        df_frc['Age'] = None  # Age is not computed during this load
        # df_frc['Date of Incorporation'] = pd.to_datetime(df_frc['Date of Incorporation'])
        # df_ric = df_frc.drop(columns=['ID', 'Incorporate year (YYYY)', 'Incorporation month (MM)'])
        # BapQuarterly.file.save_as_csv(df_frc, '00 FactRICCompany.xlsx', os.getcwd(), 'FactRICCompany')
        values_list = COM.df_list(df_frc)
        db.bulk_insert(sql.sql_bap_fact_ric_company_insert.value, values_list)

    @staticmethod
    def split_funding_range(funding):
        """Map a funding-range label to the lower bound of the range.

        NOTE(review): returns int 0 for unknown labels but *strings* for known
        ones — callers get mixed types. Confirm whether this is relied upon.
        """
        funding_value = 0
        if funding == '$100-149k': funding_value = '100000'
        elif funding == '$10-24k': funding_value = '10000'
        elif funding == '$150-249k': funding_value = '150000'
        elif funding == '$1M-1.9M': funding_value = '1000000'
        elif funding == '$250-499k': funding_value = '250000'
        elif funding == '$25-49k': funding_value = '25000'
        elif funding == '$2-5M': funding_value = '2000000'
        elif funding == '$2M-5M': funding_value = '2000000'
        elif funding == '$500-999k': funding_value = '500000'
        elif funding == '$50-99k': funding_value = '50000'
        elif funding == '<$10k': funding_value = '1000'
        elif funding == '>$5M': funding_value = '5000000'
        return funding_value

    @staticmethod
    def update_month_year(df):
        """Derive 'Date of Incorporation' (mid-month) from separate year/month
        columns, printing each derived date.

        NOTE(review): assigning into `row` inside iterrows() mutates a copy,
        not `df` — confirm whether this method ever persisted anything.
        """
        i = 0
        for index, row in df.iterrows():
            if row['Incorporate year (YYYY)'] is not None and row['Incorporation month (MM)'] is not None:
                row['Date of Incorporation'] = parser.parse('{}-{}-15'.format(row['Incorporate year (YYYY)'], row['Incorporation month (MM)']))
                i+=1
                print('{}. {}'.format(i, row['Date of Incorporation']))
        # NOTE(review): several exploratory blocks that printed UPDATE
        # statements for malformed year/month values were left here commented
        # out; summarized away for readability.
        print('')

    @staticmethod
    def transfer_fact_ric_aggregation():
        """Unpivot program / program-youth aggregate columns into per-metric
        rows and bulk-insert them into FactRICAggregation.

        Columns 7..19 of the program frame map positionally onto metric_prg;
        columns 7..8 of the youth frame map onto metric_prg_youth.
        """
        date_id = COM.get_dateid(datevalue=None)
        metric_prg = [130, 132, 133, 129, 134, 63, 77, 60, 68, 67, 135, 136, 137]
        metric_prg_youth = [134, 138]
        # NOTE(review): FY/quarter hard-coded to (2018, 4); dynamic args commented out.
        df_program = db.pandas_read(sql.sql_company_aggregate_program.value.format(2018, 4))#(BapQuarterly.year, BapQuarterly.quarter))
        df_program_youth = db.pandas_read(sql.sql_company_aggregate_program_youth.value.format(2018, 4))#(BapQuarterly.year, BapQuarterly.quarter))
        values = []
        for _, row in df_program.iterrows():
            i = 7
            while i < 20:
                m = i - 7  # index into metric_prg for this column
                val = []
                val.append(int(row['DataSource']))              # DataSource
                val.append(int(date_id))                        # RICDateID
                val.append(int(metric_prg[m]))                  # MetricID
                val.append(int(row['BatchID']))                 # BatchID
                # Sentinel -1.0 for any "no data" style cell.
                if str(row[i]) in ['no data', 'n\\a', '-', 'n/a', 'nan']:
                    val.append(-1.0)
                    print(row[i])
                else:
                    val.append(round(float(row[i]), 2))         # AggregateNumber
                val.append(str(datetime.datetime.today())[:23]) # ModifiedDate
                val.append(str(datetime.datetime.today())[:23]) # CreatedDate
                val.append(row['Youth'])                        # Youth
                values.append(val)
                i = i + 1
                # db.execute(sql.sql_bap_fra_insert.value.format(tuple(val)))
        for _, row in df_program_youth.iterrows():
            j = 7
            while j < 9:
                m = j - 7  # index into metric_prg_youth
                val = []
                val.append(int(row['DataSource']))              # DataSource
                val.append(int(date_id))                        # RICDateID
                val.append(int(metric_prg_youth[m]))            # MetricID
                val.append(int(row['BatchID']))                 # BatchID
                if str(row[j]) in ['no data', 'n\\a', '-', 'n/a', 'nan']:
                    val.append(-1.0)
                    print(row[j])
                else:
                    val.append(round(float(row[j]), 2))         # AggregateNumber
                val.append(str(datetime.datetime.today())[:23]) # ModifiedDate
                val.append(str(datetime.datetime.today())[:23]) # CreatedDate
                val.append(row['Youth'])                        # Youth
                values.append(val)
                j = j + 1
                # db.execute(sql.sql_bap_fra_insert.value.format(tuple(val)))
        for val in range(len(values)):
            print('{}. {}'.format(val,values[val]))
            # print('{}. {}'.format(val,values[val][1]))
        db.bulk_insert(sql.sql_bap_fact_ric_aggregation_insert.value, values)

    @staticmethod
    def generate_bap_rolled_up():
        """Roll fact rows up to one row per company/data-source/quarter and
        save the result as a spreadsheet.

        For the current quarter the YTD sums use every quarter's rows; for a
        prior quarter the current quarter's contribution is subtracted back
        out. Failures for individual companies are counted and collected
        rather than aborting the run.
        """
        company = []   # company ids that raised during rollup
        i = 0
        df_frcd = db.pandas_read(sql.sql_bap_fact_ric_company.value.format(BapQuarterly.year))
        print('Number of record to process {} '.format(len(df_frcd)))
        df_fact_ds_quarter = db.pandas_read(sql.sql_bap_report_company_ds_quarter.value.format(BapQuarterly.year))
        df_FactRICRolledUp = pd.DataFrame(columns=clm.clmn_fact_ric_rolled_up.value)
        df_industry = db.pandas_read(sql.sql_industry_list_table.value)
        cq = BapQuarterly.quarter
        total = 0  # count of companies that failed
        if not df_frcd.empty:
            for _, row in df_fact_ds_quarter.iterrows():
                company_id = row['CompanyID']
                data_source_id = row['DataSourceID']
                i = i + 1
                print('{}. {}'.format(i, company_id))
                # ['Q1', 'Q2'] — quarters with data for this company/source
                ls_q = []
                ls_quarters = \
                    df_fact_ds_quarter.query('CompanyID == {} & DataSourceID == {}'.format(company_id, data_source_id))[
                        'MinFQ'].tolist()
                ls = df_frcd.query('CompanyID == {}'.format(company_id))['FiscalQuarter'].tolist()
                for itm in ls_quarters:
                    ls_q.append(itm[-1:])  # keep just the digit of 'Q<n>'
                # if str(cq) not in ls_q:
                #     ls_q.append(cq)
                df_agg = df_frcd.query('CompanyID == {} & DataSourceID == {}'.format(company_id, data_source_id))
                print(ls_q)
                for quarter in ls_q:
                    # Build the query strings selecting this (and previous)
                    # quarter's rows. NOTE(review): previous_quarter is only
                    # assigned in the first branch but read in later fallbacks
                    # — relies on the except handler to absorb NameError-ish
                    # failures. Confirm intended.
                    if int(quarter) == cq and len(df_agg) > 1:
                        current_quarter = 'FiscalQuarter == \'Q{}\''.format(quarter)
                        previous_quarter = 'FiscalQuarter == \'Q{}\''.format(int(quarter) - 1)
                    elif int(quarter) == cq and len(df_agg) == 1:
                        current_quarter = 'FiscalQuarter == \'Q{}\''.format(quarter)
                    elif int(quarter) < cq:
                        current_quarter = 'FiscalQuarter == \'Q{}\''.format(quarter)
                    try:
                        batch_id = df_agg.query(current_quarter)['BatchID'].values[0] if not df_agg.query(
                            current_quarter).empty else None
                        min_date = -1
                        current_date = -1
                        # This-quarter hours (None when no current-quarter row).
                        vhs = df_agg.query(current_quarter)['VolunteerMentorHours'].values[0] if not df_agg.query(
                            current_quarter).empty else None
                        adv = df_agg.query(current_quarter)['AdvisoryServicesHours'].values[0] if not df_agg.query(
                            current_quarter).empty else None
                        if int(quarter) == cq:
                            # Current quarter: plain YTD sums.
                            vhs_agg = df_agg['VolunteerMentorHours'].sum()
                            adv_agg = df_agg['AdvisoryServicesHours'].sum()
                            funding_agg = df_agg['FundingCurrentQuarter'].sum()
                        else:
                            # Earlier quarter: subtract the current quarter's
                            # value back out of the total (falling back to the
                            # previous quarter's value when cq has no row).
                            vhs_agg = float(df_agg['VolunteerMentorHours'].sum()) - float(
                                df_agg.query('FiscalQuarter == \'Q{}\''.format(cq))['VolunteerMentorHours'].values[
                                    0]) if not df_agg.query('FiscalQuarter == \'Q{}\''.format(cq)).empty else float(
                                df_agg.query('FiscalQuarter == \'Q{}\''.format(cq - 1))['VolunteerMentorHours'].values[
                                    0])
                            adv_agg = float(df_agg['AdvisoryServicesHours'].sum()) - float(
                                df_agg.query('FiscalQuarter == \'Q{}\''.format(cq))['AdvisoryServicesHours'].values[
                                    0]) if not df_agg.query('FiscalQuarter == \'Q{}\''.format(cq)).empty else float(
                                df_agg.query('FiscalQuarter == \'Q{}\''.format(cq - 1))['AdvisoryServicesHours'].values[
                                    0])
                            funding_agg = float(df_agg['FundingCurrentQuarter'].sum()) - float(
                                df_agg.query('FiscalQuarter == \'Q{}\''.format(cq))['FundingCurrentQuarter'].values[
                                    0]) if not df_agg.query('FiscalQuarter == \'Q{}\''.format(cq)).empty else float(
                                df_agg.query('FiscalQuarter == \'Q{}\''.format(cq - 1))['FundingCurrentQuarter'].values[
                                    0])
                        modified_date = datetime.datetime.utcnow().__str__()[:23]
                        # Descriptive attributes: take the current quarter's
                        # value, else fall back to the previous quarter's.
                        stage = df_agg.query(current_quarter)['Stage'].values[0] if not df_agg.query(
                            current_quarter).empty else df_agg.query(previous_quarter)['Stage'].values[0]
                        industry_sector = df_agg.query(current_quarter)['IndustrySector'].values[0] if not df_agg.query(
                            current_quarter).empty else df_agg.query(previous_quarter)['IndustrySector'].values[0]
                        socialEnterprise = df_agg.query(current_quarter)['SocialEnterprise'].values[
                            0] if not df_agg.query(current_quarter).empty else \
                            df_agg.query(previous_quarter)['SocialEnterprise'].values[0]
                        highPotential = df_agg.query(current_quarter)['HighPotential'].values[0] if not df_agg.query(
                            current_quarter).empty else df_agg.query(previous_quarter)['HighPotential'].values[0]
                        youth = df_agg.query(current_quarter)['Youth'].values[0] if not df_agg.query(
                            current_quarter).empty else df_agg.query(previous_quarter)['Youth'].values[0]
                        dateOfIncorporation = df_agg.query(current_quarter)['DateOfIncorporation'].values[
                            0] if not df_agg.query(current_quarter).empty else \
                            df_agg.query(previous_quarter)['DateOfIncorporation'].values[0]
                        # Quarter-specific measures: None when no current row.
                        annual_revenue = df_agg.query(current_quarter)['AnnualRevenue'].values[0] if not df_agg.query(
                            current_quarter).empty else None
                        funding_current_quarter = df_agg.query(current_quarter)['FundingCurrentQuarter'].values[
                            0] if not df_agg.query(current_quarter).empty else None
                        number_of_employees = df_agg.query(current_quarter)['NumberEmployees'].values[
                            0] if not df_agg.query(current_quarter).empty else None
                        intake_date = df_agg.query(current_quarter)['IntakeDate'].values[0] if not df_agg.query(
                            current_quarter).empty else None
                        lvl2_industry_name = df_industry.query('Industry_Sector == \'{}\''.format(industry_sector))[
                            'Lvl2IndustryName'].values[0] if not df_industry.query(
                            'Industry_Sector == \'{}\''.format(industry_sector)).empty else None
                        dd = {'DataSourceID': data_source_id, 'CompanyID': company_id, 'MinDate': min_date,
                              'CurrentDate': current_date, 'VolunteerYTD': vhs_agg, 'AdvisoryHoursYTD': adv_agg,
                              'VolunteerThisQuarter': vhs, 'AdvisoryThisQuarter': adv, 'FiscalQuarter': quarter,
                              'BatchID': batch_id, 'ModifiedDate': modified_date,
                              'SocialEnterprise': socialEnterprise, 'Stage': stage, 'HighPotential': highPotential,
                              'Lvl2IndustryName': lvl2_industry_name, 'FiscalYear': BapQuarterly.year,
                              'Youth': youth, 'DateOfIncorporation': dateOfIncorporation,
                              'AnnualRevenue': annual_revenue, 'NumberEmployees': number_of_employees,
                              'FundingToDate': funding_current_quarter, 'IndustrySector': industry_sector,
                              'IntakeDate': intake_date, 'FundingCurrentQuarter': funding_agg
                              }
                        print(dd.values())
                        df = pd.DataFrame([dd], columns=clm.clmn_fact_ric_rolled_up.value)
                        df_FactRICRolledUp = pd.concat([df_FactRICRolledUp, df])
                    except Exception as ex:
                        # Best-effort: count and remember the failing company.
                        total = total + 1
                        company.append(company_id)
                        print(ex)
        df_FactRICRolledUp = df_FactRICRolledUp[clm.clmn_fact_ric_rolled_up.value]
        BapQuarterly.file.save_as_csv(df_FactRICRolledUp, 'BAP_Rolled_UP_{}.xlsx'.format(str(datetime.datetime.today())),
                                      '/Users/mnadew/Box Sync/mnadew/IE/data/ETL/BAP')
        print(company)
        print('{} + {} = {}/ 6236 '.format(len(df_FactRICRolledUp), total, len(df_FactRICRolledUp) + total))

    # @staticmethod
    # def generate_bap_report():
    #     pass

    @staticmethod
    def create_postal_code_list():
        """Fetch postal codes within a radius of 'T3Z' and collect them into a
        DataFrame. NOTE(review): the final bare `dfs` expression discards the
        result — nothing is returned or saved; confirm intent."""
        pcdb = PostalCodeDatabase()
        results = pcdb.get_postalcodes_around_radius('T3Z', 2500)
        print(type(results))
        cl = ['postalcode', 'city', 'province', 'longitude', 'latitude', 'timezone', 'dst']
        dfs = pd.DataFrame(columns=cl)
        for r in results:
            df = pd.DataFrame([r.__dict__], columns=cl)
            dfs = pd.concat([dfs, df])
        dfs

    @staticmethod
    def read_postal_code():
        """Load postal codes from CSV and insert a 1000-row slice into the DB.

        NOTE(review): calls BapQuarterly.insert(...), but the only insert
        helper defined on this class is bap_insert — this raises
        AttributeError as written; confirm the intended target.
        """
        path = '/Users/mnadew/Box Sync/mnadew/PRD_DB_REVIEW'
        print(os.getcwd())
        os.chdir(path)
        print(os.getcwd())
        columns = ['FSALDU', 'LATITUDE', 'LONGITUDE', 'COMMNAME', 'CSDNAMEE', 'CSDNAMEF', 'CSDTYPENE', 'PRABB']
        df = pd.read_csv('postal_code_utf8.csv')
        df = df[columns]
        print(len(df))
        # Window [846000, 847000) only — manual resume point for a prior run.
        i = 846000
        j = 847000
        while j < 847001:
            print('From {} to {}'.format(i, j))
            df_ins = df.iloc[i:j]
            BapQuarterly.insert(df_ins)
            print(len(df_ins))
            i, j = i + 1000, j + 1000
        print('From {} to {}'.format(i, j))

    @staticmethod
    def bap_insert(df):
        # Bulk-insert postal-code rows.
        values_list = COM.df_list(df)
        db.bulk_insert(sql.sql_postal_code_insert.value, values_list)

    @staticmethod
    def main():
        """Console menu loop for the quarterly ETL. Options 3-12 are stubs."""
        while True:
            # NOTE(review): fiscal_year_quarter() is called without a date
            # here but with utcnow() at class scope — confirm both work.
            fy, fq = COM.fiscal_year_quarter()
            print('_'*100)
            print('| WELCOME TO BAP QUARTERLY ETL\n| FISCAL YEAR: {}\n| FISCAL QUARTER: {}'.format(fy, fq - 1))
            print('_' * 100)
            menu = '''
            1: Show Source File for BAP quarterly FY18-Q3
            1a: CHECK Columns Completeness
            2: QA spreadsheet by RIC
            3: Combine RICs BQ spreadsheet
            4: QA RICs BQ combined spreadsheet
            5: Push RICs data ro the database
            6: Generate Batch for RICs FY18 -Q3
            7: Match Company name
            8: Push Company data to DIM COMPANY and DIM COMPANY SOURCE
            9: Push quarterly company data to FACT RIC COMPANY DATA
            10: Push Annual company data to FACT RIC COMPANY DATA
            11: push Program and Program youth data to FACT RIC Aggregation
            '''
            print(menu)
            option = input('\nChoose your option:\t')
            if str(option) == '1':
                BapQuarterly.show_bap_quarterly_template()
            if str(option) == '1a':
                BapQuarterly.qa.check_columns_completeness()
            if str(option) == '2':
                BapQuarterly.qa.check_rics_file()
            if str(option) == '3':
                pass
            if str(option) == '4':
                pass
            if str(option) == '5':
                pass
            if str(option) == '6':
                pass
            if str(option) == '7':
                pass
            if str(option) == '8':
                pass
            if str(option) == '9':
                pass
            if str(option) == '10':
                pass
            if str(option) == '11':
                pass
            if str(option) == '12':
                pass

    @staticmethod
    def tech_alliance_intake_date_TEMP():
        """One-off (TEMP) helper: reformat TechAlliance intake dates from a
        spreadsheet and print SELECT statements for review."""
        # update = 'UPDATE BAP.QuarterlyCompanyData SET [Date of Intake] = \'{}\' WHERE [Company Name] = \'{}\' AND DataSource = 6'
        update = ' SELECT * FROM BAP.QuarterlyCompanyData WHERE [Company Name] = \'{}\' AND DataSource = 6 UNION'
        # NOTE(review): because the second argument is absolute, os.path.join
        # discards the home prefix and returns the absolute path unchanged.
        current_path = os.path.join(os.path.expanduser("~"),
                                    '/Users/mnadew/Box Sync/Workbench/BAP/BAP_FY18/FY18_Q3/for ETL/Missing data Reports')
        os.chdir(current_path)
        df = pd.read_excel('01 TechAlliance_BAP_qtrly_perCompany_MISSING DATA(2).xlsx', 'Quarterly Company data')
        # df['BasicName'] = df.apply(lambda dfs: COM.update_cb_basic_name(dfs['Company Name']), axis=1)
        i = 0
        for i, r in df.iterrows():
            if r[2] is not None or r[2] == 'nan':
                # print(r[2])
                # Slice dd-mm-yyyy text into parts.
                year = r[2][-4:]
                month = r[2][3:5]
                date = r[2][:2]
                i = i + 1
                # print('{}. {} ---> {}-{}-{}'.format(i, r[2], year, month, date))
                d = '{}-{}-{}'.format(year, month, date)  # NOTE(review): unused — the UPDATE that consumed it is commented out
                # print(update.format(d, r[0]))
                print(update.format(r[0]))

    @staticmethod
    def combine_missing_data():
        """Combine per-RIC 'missing data' workbooks, add BasicName, and save a
        trimmed combined spreadsheet."""
        quarterly_missing = BapQuarterly.file.combine_bap_missing_source_file(
            current_path=fp.path_missing_bap_etl.value)
        quarterly_missing = quarterly_missing.where(pd.notnull(quarterly_missing), None)  # NaN -> None
        quarterly_missing['BasicName'] = quarterly_missing.apply(lambda dfs: COM.get_basic_name(dfs.CompanyName),
                                                                 axis=1)
        df = quarterly_missing.where(pd.notnull(quarterly_missing), None)
        print(df.columns)
        dfs = df[['CompanyName', 'BasicName', 'Website', 'AnnualRevenue', 'NumberOfEmployees', 'FundingToDate',
                  'DataSource']]
        BapQuarterly.file.save_as_csv(dfs, '00 BAP Missing data Combined.xlsx', os.getcwd(), 'BAP Missing data')
        print(dfs.head())

    @staticmethod
    def push_bap_missing_data_to_temp_table():
        """Load the combined missing-data workbook and bulk-insert it into the
        BAP_FY18Q3_Missing_Data staging table."""
        current_path = os.path.join(os.path.expanduser("~"),
                                    '/Users/mnadew/Box Sync/Workbench/BAP/BAP_FY18/FY18_Q3/for ETL/Missing data Reports')
        os.chdir(current_path)
        df = pd.read_excel('00 BAP Missing data Combined.xlsx', 'BAP Missing data')
        df['CompanyID'] = 0  # placeholder; matched later
        new_col = ['CompanyID','CompanyName','BasicName','Website','AnnualRevenue','NumberOfEmployees','FundingToDate','DataSource']
        dfs = df[new_col]
        # NOTE(review): local `sql` shadows the module-level `sql` enum used
        # elsewhere in this class.
        sql = 'INSERT INTO BAP.BAP_FY18Q3_Missing_Data VALUES (?, ?, ?, ?, ?, ?, ?, ?)'
        values = COM.df_list(dfs)
        db.bulk_insert(sql, values)

    @staticmethod
    def bap_company_basic_name():
        # Refresh BasicName for BAP companies via the shared DB helper.
        db.update_basic_name(sql.sql_bap_basic_name.value, 'ID', 'CompanyName', sql.sql_bap_basic_name_update.value)
# run sql to delete old ans del_old_ans_sql = CM.get_config('config.ini', 'secondary_etl', 'del_old_ans') DB.execute(del_old_ans_sql) def etl(self): # clean self.clean_df() # delete old ans # DBInteractions.delete_old_ans() # load self.load() if __name__ == '__main__': select_qs = CM.get_config("config_sql.ini", "ann_survey_18", "select_ans_by_qids") domain = 'restapica' v = '4' survey = 'survey' surveyid = '50021327' resp = 'surveyresponse' params = { 'resultsperpage': 200, "filter[field][0]=status&filter[operator][0]=!=&filter[value][0]": 'deleted', 'page': 1 } api = API(API_TOKEN, domain, v, survey, surveyid, resp, params) print('Fetching data from API') data = api.get_data(test=False) j = Json(data, surveyid)
def _main_():
    ''' PELASE INSTALL CERTIFICATE AND REMOVE THIS, WHERE EVER THE CERTIFICATE IS '''
    # urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
    """Menu-selected actions for SGizmo API

    Interactive console loop: the user picks a survey, then repeatedly
    selects numbered menu actions (download surveys/campaigns/emails/
    contacts/responses/questions/answers, write to DB, etc.). 99 quits;
    session_variables records which actions already ran this session.
    """
    menu_actions.write_survey_entries(API_TOKEN)
    menu = menu_actions.construct_menu()
    # 'w' is the "not chosen yet" sentinel used throughout this module.
    survey_id = 'w'
    selection = 0
    campaign_id = 'w'
    table_dict = 'w'
    surveys_df = None
    session_variables = []
    print("\nYou must enter a survey ID for this session.")
    # sleep(1)
    print("Downloading list of surveys now.")
    # sleep(1)
    surveys_df = menu_actions.get_surveys(API_TOKEN, with_stats=True)
    if surveys_df is not None:
        session_variables.append(1)  # action 1 (get surveys) is done
    # while type(survey_id) != int:
    #     try:
    survey_id = input("Enter a survey ID from above to use for this session: ")
    survey_id = menu_actions.validate_survey_id(survey_id, session_variables, API_TOKEN,
                                                surveys_df, mandatory=True)
    # except ValueError:
    #     continue
    menu_title = "\nMenu\n Quit: 99\n Back to main menu: -1\n============================================"
    # write all surveys from SG to DB, including BatchID
    while selection == 0:
        # main menu
        print(menu_title)
        # get survey title from survey_df
        survey_title = surveys_df.loc[surveys_df["id"] == str(survey_id), ["title"]]
        try:
            survey_title = survey_title.values[0][0]
        except IndexError:
            survey_title = None
        for key in menu:
            if survey_id != 'w' and key == 17:
                # specify current session survey
                print(str(key) + ".\t" + str(menu[key]) + " (Set as " + str(survey_id) + " : " + str(survey_title) + ")")
            elif key in session_variables:
                # strike through if action has been completed this session
                print('\u0336'.join(str(key) + ". " + menu[key]) + '\u0336' + " DONE")
            else:
                print(str(key) + ".\t" + menu[key])
        # Prompt until a valid option number (or 99 to quit) is entered.
        while type(selection) != int or selection not in range(1, len(menu) + 1):
            try:
                selection = int(input("\nEnter a valid option number: "))
                if selection == 99:
                    print("You entered 99.\nI can't believe you've done this.")
                    print('Farewell...')
                    break
            except ValueError:
                continue
        # get surveys
        if selection == 1:
            if 1 not in session_variables:
                surveys_df = menu_actions.get_surveys(API_TOKEN)
                if surveys_df is not None:
                    session_variables.append(1)
            elif 1 in session_variables:
                print(surveys_df)
                print("Surveys already downloaded from SGizmo API.")
                sleep(0.75)
                print("Returning to main menu.")
                sleep(1)
        # get campaigns
        elif selection == 3:
            campaigns_df = menu_actions.get_campaigns(API_TOKEN, survey_id, session_variables, surveys_df)
            session_variables.append(3)
        # get email msgs
        elif selection == 5:
            if 3 not in session_variables:
                print("You must download campaign data first.")
                get_campaigns = input("Get campaign data now? (y/n): ")
                if get_campaigns.lower() == "y":
                    campaigns_df = menu_actions.get_campaigns(
                        API_TOKEN, survey_id, session_variables, surveys_df)
                    if campaigns_df is not None:
                        session_variables.append(5)
                else:
                    print("Returning to main menu.")
                    sleep(1)
            # NOTE(review): checks 2 in session_variables although campaigns
            # record action 3 — emails are unreachable unless 2 was run;
            # confirm intended gate.
            elif 2 in session_variables:
                emails_df = menu_actions.get_emails(survey_id, API_TOKEN, session_variables, surveys_df)
                if emails_df is not None:
                    session_variables.append(5)
        # get contact lists
        elif selection == 9:
            if 9 not in session_variables:
                contact_list_df = menu_actions.get_contact_lists(survey_id, API_TOKEN)
                # contact_list_df = sg_contact_lists.sg_contactlists_df(API_TOKEN)
                print(contact_list_df)
                if contact_list_df is not None:
                    session_variables.append(9)
            elif 9 in session_variables:
                print(contact_list_df)
                print("Contact lists already downloaded. Returning to main menu.")
                sleep(1.5)
        # get contacts on list
        elif selection == 10:
            list_id = 'w'
            while type(list_id) != int:
                try:
                    list_id = int(input("Enter ID of contact list that you would like to retrieve: "))
                except ValueError:
                    continue
            if list_id != -1:
                contacts_df = menu_actions.get_contacts(API_TOKEN, list_id)
        # get respondents
        elif selection == 11:
            if 11 not in session_variables:
                resps_df = menu_actions.get_resps(survey_id, API_TOKEN)
                if resps_df is not None:
                    session_variables.append(11)
            elif 11 in session_variables:
                print("Already downloaded responses. Returning to main menu")
                sleep(1.5)
        # get questions or options
        elif selection in [12, 13]:
            if 12 not in session_variables and 13 not in session_variables:
                qs_df, os_df = menu_actions.get_qsos(survey_id, API_TOKEN)
                if qs_df is not None:
                    session_variables.append(12)
                if os_df is not None:
                    session_variables.append(13)
            if selection == 12 or (selection == 12 and 12 in session_variables):
                print(qs_df)
            elif selection == 13 or (selection == 13 and 13 in session_variables):
                print(os_df)
        # get answers
        elif selection == 14:
            if 14 not in session_variables:
                answers_df, resps_df = menu_actions.get_ans(survey_id, API_TOKEN)
                if answers_df is not None:
                    session_variables.append(14)
            elif 14 in session_variables:
                print("Already downloaded answers. Returning to main menu.")
                sleep(1.5)
        # get response statuses
        elif selection == 6:
            try:
                reports_df, status_df = menu_actions.get_resp_stats(survey_id, API_TOKEN)
            except TypeError:
                selection = 0
            # Survey-specific export destinations, keyed off the title.
            if "JLAB" in survey_title:
                path = "/Users/gcree/Box Sync/MaRS DataCatalyst 2017 CONFIDENTIAL/JLABS Toronto Annual Survey 2017/Response_Status_Reports/"
                misc.write_to_xl(status_df.drop("invite_link", axis=1), "ResponseStatuses",
                                 out_path=path, sheetname="response_statuses")
            elif "annual" in survey_title.lower() and "2018" in survey_title:
                path = CM.get_config("config.ini", "paths", "survey2018_response_stats")
                misc.write_to_xl(status_df, 'ResponseStatuses' + survey_title,
                                 out_path=path, sheetname="response_statuses")
        # get resp stats for all campaigns
        elif selection == 19:
            campaigns_df = menu_actions.get_campaigns(API_TOKEN, survey_id, session_variables, surveys_df)
            campaigns_df = campaigns_df[[
                'id', 'campaign_name', 'link_type', 'campaign_status'
            ]]
            reports_list = []
            status_list = []
            # Skip deleted campaigns and plain-link campaigns.
            campaigns_df = campaigns_df[campaigns_df['campaign_status'] != 'Deleted']
            campaigns_df = campaigns_df[campaigns_df['link_type'] != 'link']
            for cid in campaigns_df["id"]:
                print('Downloading data for campaign id: {}'.format(cid))
                reports_df, status_df = menu_actions.get_resp_stats(
                    survey_id, API_TOKEN, campaign_id=int(cid))
                if len(reports_df) > 0:
                    reports_list.append(reports_df)
                    status_list.append(status_df)
            # concat all reports dfs and concat all status dfs
            if len(reports_list) == 0:
                pass
            elif len(reports_list) == 1:
                reports_df = reports_list[0]
                status_df = status_list[0]
            else:
                reports_df = pd.concat(reports_list)
                status_df = pd.concat(status_list)
            # left join campaigns <- reports <- statuses dfs
            df1 = pd.merge(campaigns_df, reports_df, how='inner', left_on=["id"], right_on=["campaign_id"])
            all_resp_stats = pd.merge(df1, status_df, how='left', left_on='id_y', right_on='report_id')
            all_resp_stats = all_resp_stats.drop('id_y', axis=1).drop(
                'campaign_id', axis=1).drop('report_id', axis=1).drop(
                'primary_RIC', axis=1).drop('venture_id', axis=1)
            all_resp_stats = all_resp_stats.rename(columns={'id_x': "campaign_id"})
            path_ini = CM.get_config("config.ini", "paths", "sandbox")
            path = CM.change_working_directory(path_ini)
            print(path)
            misc.write_to_xl(all_resp_stats, 'ResponseStatuses - {}'.format(survey_title),
                             out_path=path, sheetname="response_statuses")
            # NOTE(review): a large commented-out block that truncated
            # MDCReport.Fact_Response_Status and re-inserted all_resp_stats
            # row by row was left here; summarized away for readability.
        # set survey ID
        elif selection == 17:
            # survey_id_choice = 1
            r_u_sure = 0
            # if type(survey_id) == int:
            while str(r_u_sure).lower() not in ['n', 'y']:
                try:
                    r_u_sure = input("""
    Warning: changing the surveyID for this session will clear the data downloaded for the previous survey during this session.
    Do you still want to change the survey ID? (y/n): """)
                    if str(r_u_sure).lower() == 'y':
                        # Keep only session flags that survive a survey switch.
                        session_variables[:] = [
                            y for y in session_variables if y in [1, 15]
                        ]
                        survey_id_choice = input("Survey ID has been reset. Enter new ID: ")
                        survey_id_choice = menu_actions.validate_survey_id(
                            survey_id_choice, session_variables, API_TOKEN, surveys_df)
                        if survey_id_choice is not None:
                            survey_id = survey_id_choice
                    elif str(r_u_sure).lower() in ['n', str(-1)]:
                        print('Returning to main menu')
                        sleep(0.75)
                        selection = 0
                        break
                except ValueError:
                    continue
        # get all tables from schema into dfs
        elif selection == 15:
            # NOTE(review): entire schema-loading implementation (get_db_tables,
            # get_dependencies, get_load_order) is commented out — stub only.
            pass
        # test get dependencies
        elif selection == 16:
            schema = str(input("Enter name of schema you would like to get dependencies for: "))
            skipped = False
            if menu_actions.return_to_main(schema) == 1:
                print("skipped")
                skipped = True
                sleep(0.5)
            if not skipped:
                dependencies = sg_get_tables.get_dependencies(schema)
                print("\n", dependencies)
                # Build {foreign-key table: [referenced tables]}.
                dependency_dict = {}
                for i in range(0, len(dependencies)):
                    fkt = dependencies.iloc[:, 0][i]
                    reft = dependencies.iloc[:, 1][i]
                    if fkt not in dependency_dict.keys():
                        dependency_dict[fkt] = []
                    dependency_dict[fkt].append(reft)
                print("\nDependency dict: \n")
                for key in dependency_dict.keys():
                    print(key, ":", dependency_dict[key])
                load_order = sg_get_tables.get_load_order(schema)
                print("\nLOAD ORDER:\n", load_order)
        # load survey into DB
        elif selection == 2:
            # NOTE(review): a commented-out pre-check (verify survey id not
            # already present in the Surveys table) was left here.
            print("\nLoading survey entry into DB")
            menu_actions.load_survey_entry(surveys_df, survey_id)
            print("\nLoading survey questions & options into DB")
            menu_actions.load_qsos(survey_id, API_TOKEN)
            session_variables.append(2)
        # load responses, answers, contacts, contact lists, and contacts__lists entries
        elif selection == 4:
            exist = menu_actions.check_qs_exist(survey_id)
            if exist:
                print("At least one question for this survey exists in DB. Proceeding to load answers into DB")
                print("Loading Responses first...")
                menu_actions.load_resps_ans_contacts__lists(survey_id, API_TOKEN)
            else:
                print("No questions for this survey exist in DB. Load questions before loading answers.")
        # write all current survey to DB
        elif selection == 7:
            menu_actions.write_all_survey_components_to_db(session_variables, surveys_df, survey_id, API_TOKEN)
            session_variables.append(7)
        # write all components of all surveys to DB
        elif selection == 8:
            menu_actions.do_everything_for_all_surveys(session_variables, surveys_df, API_TOKEN)
            session_variables.append(8)
        elif selection == 18:
            sure = input("\nAre you sure you wish to delete all components of current survey from the database? (y/n): ")
            if str(sure).lower() == "y":
                menu_actions.del_survey_components(survey_id)
        # quit program
        elif selection == 99:
            break
        # Reset so the outer loop redraws the menu.
        selection = 0
def _main_():
    """Build one Excel results workbook per RIC from the FY2018 annual survey.

    Pipeline:
      1. Map each RIC name to its DB display name and a numeric code.
      2. Choose the question-id list for each RIC from the question-metadata
         shelf (custom list when the shelf's 'addedby' table names the RIC,
         otherwise the shared core question set).
      3. Load question metadata from qs_metadata.xlsx and build a per-RIC
         data dictionary (qid -> "<survey_section> - <readable_name>").
      4. Pull questions/options and answers from the DB, merge them, pivot
         into one wide datasheet per RIC, and write each to a Box Sync
         folder; when the pivot fails on duplicate (rid_cid, col_title)
         pairs, the conflicting rows are dumped to a "__dupies" workbook.

    NOTE(review): CM, DB, include_list, multi_options, opt_col_title,
    replacements, path_xl, save_xls and q_meta_name are module-level helpers
    not visible in this chunk; their semantics below are inferred from usage
    and should be confirmed against their definitions.
    """
    # RIC name -> DB display name + datasource code.
    # Per the original author's note: for CII & OSVP the number is NOT a real
    # datasourceid (sentinel -1).
    rics = {
        'MaRS Discovery District': {'db_name': 'MaRS Discovery District', 'code': 7},
        'RIC Centre': {'db_name': 'RIC Centre', 'code': 9},
        'Innovation Factory': {'db_name': 'Innovation Factory', 'code': 12},
        'NWOIC': {'db_name': 'NWO Innovation Centre', 'code': 14},
        'Invest Ottawa': {'db_name': 'Invest Ottawa', 'code': 16},
        'IION': {'db_name': 'IION', 'code': 5},
        'CII': {'db_name': 'MaRS Centre for Impact Investing', 'code': -1},
        'OSVP': {'db_name': 'Ontario Scale-Up Voucher Program', 'code': -1},
        'Innovation Guelph': {'db_name': 'Innovation Guelph', 'code': 15},
        'WEtech': {'db_name': 'WEtech', 'code': 2},
        'SSMIC': {'db_name': 'SSMIC', 'code': 3},
        'TechAlliance': {'db_name': 'TechAlliance', 'code': 6},
        'Haltech': {'db_name': 'Haltech', 'code': 8},
        'Spark Centre': {'db_name': 'Spark Centre', 'code': 10},
        'NORCAT': {'db_name': 'NORCAT', 'code': 1},
        'VentureLAB': {'db_name': 'ventureLAB', 'code': 11},
        'Innovate Niagara': {'db_name': 'Innovate Niagara', 'code': 17},
        'Launch Lab': {'db_name': 'Launch Lab', 'code': 13}
        # ,'Communitech': {'db_name': 'Communitech', 'code': 4}
    }
    # Read-only shelf of question metadata; q_meta_name is module-level
    # (presumably the shelf filename — TODO confirm).
    with shelve.open(q_meta_name, 'r') as qs_metadata:
        print("Creating ric_qs dict")
        # ric_qs: RIC name -> list of question ids that RIC's export includes.
        ric_qs = {}
        for ric in rics:
            if ric in list(qs_metadata['addedby'].keys()):
                # RIC has custom/added questions: delegate to include_list
                # (module-level helper, not shown here).
                ric_qids = include_list(ric)
                ric_qs[ric] = ric_qids
            # elif ric.lower() == 'communitech':
            #     ric_qs[ric] = qs_metadata['which_survey']['COMMUNITECH']
            else:
                # Default: the shared core question set from the shelf.
                ric_qs[ric] = qs_metadata['core/noncore']['core']
    print("Reading qs_metadata.xlsx to df")
    cwd = os.getcwd()
    user_path = os.path.expanduser("~")
    filename = '/qs_metadata.xlsx'
    # CM.xl_to_dfs presumably returns {sheetname: DataFrame} — TODO confirm.
    meta_dfs = CM.xl_to_dfs(cwd, filename)
    sheetname = 'Sheet1'
    meta_df = meta_dfs[sheetname]
    # Create master data dict with qid -> concatenated column title,
    # i.e. "<survey_section> - <readable_name>".
    print("Creating master data dict")
    meta_df = meta_df.sort_values(by=['q_num'], ascending=[True])
    meta_df['col_title'] = meta_df['survey_section'].astype(str) + ' - ' + meta_df['readable_name']
    data_dict = meta_df[['id', 'col_title', 'title', 'q_num']]
    # Split the master data dict into one per RIC (inner-join on that RIC's
    # question ids, ordered by question number).
    print("Splitting master data dict into 1 per RIC")
    ric_data_dicts = {}
    for ric in ric_qs.keys():
        qids_df = pd.DataFrame(ric_qs[ric], columns=['id'])
        ric_data_dict = pd.merge(qids_df, data_dict, how='inner', on=['id'])
        ric_data_dict.sort_values(by='q_num', inplace=True)
        ric_data_dicts[ric] = ric_data_dict
    # Read questions and options from DB.
    print("Reading questions and options from DB into qsos df")
    qsos_sql = CM.get_config("config_sql.ini", "ann_survey_18", "all_qsos")
    qsos = DB.pandas_read(qsos_sql)
    # Attach the col_title column from the metadata sheet (left join on qid),
    # then drop the duplicated join key column.
    qsos = pd.merge(qsos, meta_df[['id', 'col_title', 'q_num']], how='left', left_on='qid', right_on='id')
    qsos.drop('id', inplace=True, axis=1)
    print("Transforming qsos df")
    # Flag 'ESSAY', 'TABLE', 'TEXTBOX', 'MENU', 'RADIO' question types so
    # their col_title does not change in the next step (multi_options is a
    # module-level predicate over q_type).
    qsos['multi_options'] = qsos.q_type.apply(multi_options)
    # For option rows, make col_title = col_title + "Option: " + [o_label]
    # (opt_col_title is a module-level row function).
    qsos['col_title'] = qsos.apply(opt_col_title, axis=1)
    # Keep only real questions; q_num <= 0 rows are dropped.
    qsos = qsos[qsos['q_num'] > 0]
    # Capture the correct column order (col_title -> q_num) for use later
    # when re-ordering the pivoted datasheets.
    col_title_order = pd.Series(qsos.q_num.values, index=qsos.col_title).to_dict()
    # Read answers from DB.
    print("Reading answers from DB into ans df")
    ans_sql = CM.get_config("config_sql.ini", "ann_survey_18", "sel_ann_survey_res")
    ans = DB.pandas_read(ans_sql)
    # Separate process for Communitech shared ventures:
    # 1. get the Communitech shared client answers...
    print("Reading Communitech shared clients")
    comm_sql = CM.get_config("config_sql.ini", "ann_survey_18", "sel_communitech_shared")
    comm_ans = DB.pandas_read(comm_sql)
    # 2. ...and concatenate them with the rest of the answers.
    ans = pd.concat([ans, comm_ans])
    # Clean ans: drop answerless rows, apply per-row text replacements
    # (module-level helper), and blank out missing page_pipe values so the
    # later string concatenation never sees NaN.
    print("Cleaning ans df")
    ans.dropna(subset=['Answer'], inplace=True)
    ans['Answer'] = ans.apply(replacements, axis=1)
    ans['page_pipe'] = ans['page_pipe'].fillna('')
    # Per-RIC datasheet creation. NOTE(review): the CII branch below reads
    # soc_imp_df, which is only assigned on the MaRS iteration — this relies
    # on 'MaRS Discovery District' preceding 'CII' in rics' insertion order.
    print("\nPer RIC df datasheet creation:")
    for ric in ric_qs:
        # if ric == 'MaRS Discovery District':
        # Turn this RIC's qid list into a df tagged with its DB display name.
        print("\nRIC: {}".format(ric))
        print("Creating df of questions for {}".format(ric))
        qs_df = pd.DataFrame(ric_qs[ric], columns=['qid'])
        qs_df['ric'] = rics[ric]['db_name']
        # Left join that df with the qsos df on qid.
        qs_df = pd.merge(qs_df, qsos, how='left', on='qid')
        # Left join the result with the ans df on (question, option, RIC).
        print("Left join qs with ans")
        ric_survey_results = pd.merge(qs_df, ans, how='left', left_on=['qid', 'oid', 'ric'], right_on=['QuestionID', 'OptionID', 'RIC_Program'])
        # Drop empty answers and sort by question number.
        print("Clean ans")
        ric_survey_results = ric_survey_results[pd.notnull(ric_survey_results['Answer'])]
        ric_survey_results.sort_values(by='q_num', inplace=True)
        # ric_survey_results.dropna(subset=['Answer'])
        print("Pivot into datasheet for {}".format(ric))
        # Build the long-format frame to pivot: one row per
        # (response, company, column title, answer), with the page_pipe
        # suffix folded into the column title and a composite
        # "<resp_id>-<CompanyID>" row key.
        ric_datasheet = ric_survey_results[['resp_id', 'CompanyID', 'col_title', 'Answer', 'page_pipe']].drop_duplicates()
        ric_datasheet['col_title'] = ric_datasheet['col_title'] + ' ' + ric_datasheet['page_pipe'].astype(str)
        ric_datasheet['rid_cid'] = ric_datasheet['resp_id'].astype(float).astype(str) + '-' + ric_datasheet['CompanyID'].astype(str)
        ric_datasheet = ric_datasheet[['rid_cid', 'col_title', 'Answer']]
        try:
            # DataFrame.pivot raises ValueError on duplicate
            # (rid_cid, col_title) pairs — handled in the except below,
            # where ric_datasheet still holds this pre-pivot long frame.
            ric_datasheet = ric_datasheet.pivot(index='rid_cid', columns='col_title', values='Answer')
            # ric_datasheet = pd.pivot_table(ric_datasheet, values='Answer', columns='col_title', index='rid_cid')
            ric_datasheet.reset_index(inplace=True)
            # Split the composite key back into its two id columns (these
            # land as the LAST two columns), then coerce numerics.
            ric_datasheet['resp_id'], ric_datasheet['CompanyID'] = ric_datasheet['rid_cid'].str.split('-', 1).str
            ric_datasheet.drop('rid_cid', axis=1, inplace=True)
            ric_datasheet = ric_datasheet.apply(pd.to_numeric, errors='ignore')
            # Remove non-consenting responses: find the first column whose
            # name contains 'consent', normalize curly apostrophes, and drop
            # rows answering "I don't give consent".
            for val in list(ric_datasheet):
                if 'consent' in str(val.lower()):
                    consent_col = val
                    ric_datasheet[consent_col] = ric_datasheet[consent_col].str.replace(u"\u2019", "'")
                    ric_datasheet = ric_datasheet[ric_datasheet[consent_col] != "I don't give consent"]
                    consent_col = ''
                    break
            # Re-order columns to reflect q_num ordering: the last two
            # columns (resp_id/CompanyID) go first, the question columns are
            # sorted by their q_num from col_title_order.
            cols = list(ric_datasheet)
            rid_cid = cols[-2:]
            q_cols = cols[:-2]
            ordered_q_cols = []
            for q in q_cols:
                if q[-2:] == '.0':
                    # Strip the trailing 8 chars — presumably the appended
                    # " <page_pipe>.0" suffix — to recover the lookup key.
                    # TODO(review): confirm the suffix is always 8 chars.
                    ordered_q_cols.append([col_title_order[q[:-8]], q])
                else:
                    ordered_q_cols.append([col_title_order[q.strip()], q])
            ordered_q_cols.sort()
            for i in range(len(ordered_q_cols)):
                ordered_q_cols[i] = ordered_q_cols[i][1]
            cols = rid_cid + ordered_q_cols
            ric_datasheet = ric_datasheet[cols]
            save_path = path_xl(user_path=user_path, path_extension="Box Sync/Workbench/BAP/Annual Survey FY2018/DEV - Results to RICs/", filename=ric + '.xlsx')
            # Pull out social-impact companies separately for use later in
            # the CII datasheet (see NOTE above about iteration order).
            if ric == 'MaRS Discovery District':
                soc_imp_df = ric_datasheet[ric_datasheet['social_impact - Motives '] == 'Yes']
            if ric != 'CII':
                # Save datasheet + data dictionary to disc.
                results_sheets = [ric_datasheet, ric_data_dicts[ric]]
                sheetnames = ['SurveyData', 'DataDictionary']
                save_xls(results_sheets, save_path, sheetnames)
                print("Wrote to {}".format(save_path))
            else:
                # CII gets extra tabs: the MaRS social-impact subset and the
                # MaRS data dictionary alongside its own.
                print('Add extra tabs to {} datasheet'.format(ric))
                results_sheets = [ric_datasheet, soc_imp_df, ric_data_dicts[ric], ric_data_dicts['MaRS Discovery District']]
                sheetnames = ['CII_SurveyData', 'All_RICs_SocialImpact_SurveyData', 'CII_DataDict', 'MaRS_DataDict']
                save_xls(results_sheets, save_path, sheetnames)
                print("Wrote to {}".format(save_path))
        except ValueError as ex:
            # Pivot failed on duplicate keys: save the conflicting answer
            # rows (still in the pre-pivot long frame) for manual review.
            print("!\nERROR FOR {}: {}\n!\n".format(ric, ex))
            save_path = path_xl(user_path=user_path, path_extension="Box Sync/Workbench/BAP/Annual Survey FY2018/DEV - Results to RICs/__dupies/", filename=ric + '_dupies' + '.xlsx')
            save_xls([ric_datasheet[ric_datasheet.duplicated(['rid_cid', 'col_title'], keep=False)]], save_path, ['dupies'])
            continue
    pass