def pick_record_from_file_system(storage_dir, table, known_info_d=None):
    """Look for a <table> record stored in the file system.

    Returns a pair (idx, record), where <record> is a file-style dict
    (enumerations as plaintext). If no record is found, <idx> is None;
    otherwise the value of <idx> is irrelevant to the caller.
    """
    known_info_d = known_info_d or {}
    candidates = None  # set only when the storage file exists

    name_field = dbr.get_name_field(table)

    # ensure the directory for storing individual records exists
    os.makedirs(storage_dir, exist_ok=True)

    # read any info from <table>'s file within that directory
    table_path = os.path.join(storage_dir, f'{table}.txt')
    idx = None
    if os.path.isfile(table_path):
        stored = pd.read_csv(table_path, sep='\t')
        if stored.empty:
            candidates = stored
        else:
            # keep only rows matching every key/value pair in known_info_d
            row_matches = (
                stored[list(known_info_d)] == pd.Series(known_info_d)
            ).all(axis=1)
            candidates = stored.loc[row_matches]
        print(f'Pick a record from {table} list in file system:')
        idx, _ = pick_one(candidates, name_field)

    # the record returned by pick_one is discarded; rebuild it from the
    # filtered frame so the caller gets the full row as a dict
    record = dict(candidates.loc[idx]) if idx is not None else None
    return idx, record
def get_ids_for_foreign_keys(session, df1, element, foreign_key, refs,
                             load_refs, error):
    """Attach db Ids for <foreign_key> to a copy of <df1> and return it.

    <foreign_key> is a column name ending in '_Id'; <refs> lists the db
    tables the key may refer to. If some referenced names are not found:
    raise ForeignKeyException when <load_refs> is true, otherwise record
    the problem in <error> and return the frame with nulls.
    """
    df = df1.copy()
    # append the Id corresponding to the foreign key from the db
    foreign_elt = f'{foreign_key[:-3]}'
    interim = f'{foreign_elt}_Name'

    # build a lookup frame (Id, name) covering every referenced table
    target_list = []
    for r in refs:
        ref_name_field = db_routines.get_name_field(r)
        r_target = pd.read_sql_table(r, session.bind)[['Id', ref_name_field]]
        r_target.rename(columns={
            'Id': foreign_key,
            ref_name_field: interim
        }, inplace=True)
        if element == 'ExternalIdentifier':
            # add column for cdf_table of referent
            r_target.loc[:, 'cdf_element'] = r
        target_list.append(r_target)
    target = pd.concat(target_list)

    if element == 'ExternalIdentifier':
        # join on cdf_element name as well
        df = df.merge(target,
                      how='left',
                      left_on=['cdf_element', 'internal_name'],
                      right_on=['cdf_element', interim])
        # rename 'Foreign_Id' to 'Foreign' for consistency in definition of missing
        # TODO why is ExternalIdentifier special in this regard?
        #  Is it that ExternalIdentifier doesn't have a name field?
        df.rename(columns={foreign_key: foreign_elt}, inplace=True)
    else:
        df = df.merge(target, how='left', left_on=foreign_elt, right_on=interim)

    # rows that name a referent that was not found in the db
    missing = df[(df[foreign_elt].notnull()) & (df[interim].isnull())]
    if missing.empty:
        # BUG FIX: original called df.drop(...) without assigning the result,
        # so the interim name column was never actually removed.
        df = df.drop([interim], axis=1)
    else:
        if load_refs:
            # Always try to handle/fill in the missing IDs
            raise ForeignKeyException(
                f'For some {element} records, {foreign_elt} was not found')
        else:
            if element not in error:
                error[element] = {}
            error[element]["foreign_key"] = \
                f"For some {element} records, {foreign_elt} was not found"
    return df
def check_dependencies(juris_dir, element):
    """Check that every dependent column in <juris_dir>/<element>.txt is
    listed in the corresponding jurisdiction file.

    Returns (changed_elements, unmatched_error): the set of element names
    that need attention, and a list of hard-error strings.
    Note: <juris_dir> assumed to exist.
    """
    d = juris_dependency_dictionary()
    f_path = os.path.join(juris_dir, f'{element}.txt')
    assert os.path.isdir(juris_dir)
    element_df = pd.read_csv(f_path,
                             sep='\t',
                             index_col=None,
                             encoding='iso-8859-1')
    unmatched_error = []

    # Find all dependent columns
    dependent = [c for c in element_df if c in d.keys()]
    changed_elements = set()
    report = [f'In {element}.txt:']
    for c in dependent:
        target = d[c]
        # PERF FIX: reuse the frame already read above instead of re-reading
        # <element>.txt from disk for every dependent column.
        ed = element_df.fillna('').loc[:, c].unique()
        # create list of elements, removing any nulls
        ru = list(
            pd.read_csv(os.path.join(juris_dir, f'{target}.txt'),
                        sep='\t',
                        encoding='iso-8859-1').fillna(
                            '').loc[:, db_routines.get_name_field(target)])
        try:
            ru.remove(np.nan)
        except ValueError:
            pass
        # set membership is O(1) per lookup; values are strings after fillna('')
        ru_set = set(ru)
        missing = [x for x in ed if x not in ru_set]
        if len(missing) == 0:
            report.append(f'Every {c} in {element}.txt is a {target}.')
        elif len(missing) == 1 and missing == [
                ''
        ]:  # if the only missing is null or blank
            # TODO some dependencies are ok with null (eg. PrimaryParty) and some are not
            report.append(
                f'Some {c} are null, and every non-null {c} is a {target}.')
        else:
            changed_elements.add(element)
            changed_elements.add(target)
            unmatched_error.append(
                f'Every {c} must be a {target}. This is not optional!!')
    # if dependent:
    # 	print('\n\t'.join(report))
    return changed_elements, unmatched_error
def raw_elements_to_cdf(session, project_root, juris, mu, raw, count_cols, ids=None):
    """load data from <raw> into the database. Note that columns to be munged
    (e.g. County_xxx) have mu.field_rename_suffix (e.g., _xxx) added already

    Pipeline (order matters):
      1. add ids for elements sourced outside the raw data (interactive
         unless <ids> = (_datafile_Id, Election_Id) is passed);
      2. munge/melt the raw frame;
      3. resolve contest, element and selection ids against the db;
      4. upload VoteCount rows and the join tables.
    Side effects: commits the session; temporarily adds integer columns to
    the VoteCount table and drops them again at the end.
    Returns None.
    """
    working = raw.copy()
    # enter elements from sources outside raw data, including creating id column(s)
    # TODO what if contest_type (BallotMeasure or Candidate) has source 'other'?
    if not ids:
        # interactive path: ask the user to pick/create each 'other'-sourced record
        for t, r in mu.cdf_elements[mu.cdf_elements.source == 'other'].iterrows():
            # add column for element id
            # TODO allow record to be passed as a parameter
            idx, db_record, enum_d, fk_d = ui.pick_or_create_record(
                session, project_root, t)
            working = add_constant_column(working, f'{t}_Id', idx)
    else:
        # non-interactive path: ids = (_datafile_Id, Election_Id) — note the order
        working = add_constant_column(working, 'Election_Id', ids[1])
        working = add_constant_column(working, '_datafile_Id', ids[0])

    working = munge_and_melt(mu, working, count_cols)

    # append ids for BallotMeasureContests and CandidateContests
    working = add_constant_column(working, 'contest_type', 'unknown')
    for c_type in ['BallotMeasure', 'Candidate']:
        df_contest = pd.read_sql_table(f'{c_type}Contest', session.bind)
        working = replace_raw_with_internal_ids(
            working,
            juris,
            df_contest,
            f'{c_type}Contest',
            dbr.get_name_field(f'{c_type}Contest'),
            mu.path_to_munger_dir,
            drop_unmatched=False)
        # set contest_type where id was found
        working.loc[working[f'{c_type}Contest_Id'].notnull(),
                    'contest_type'] = c_type
        # drop column with munged name
        working.drop(f'{c_type}Contest', axis=1, inplace=True)

    # drop rows with unmatched contests
    to_be_dropped = working[working['contest_type'] == 'unknown']
    working_temp = working[working['contest_type'] != 'unknown']
    if working_temp.empty:
        raise MungeError(
            'No contests in database matched. No results will be loaded to database.'
        )
    elif not to_be_dropped.empty:
        print(f'Warning: Results for {to_be_dropped.shape[0]} rows '
              f'with unmatched contests will not be loaded to database.')
    working = working_temp

    # get ids for remaining info sourced from rows and columns
    # (skip Contest/Selection elements: those are handled separately below)
    element_list = [
        t for t in mu.cdf_elements[mu.cdf_elements.source != 'other'].index
        if (t[-7:] != 'Contest' and t[-9:] != 'Selection')
    ]
    for t in element_list:
        # capture id from db in new column and erase any now-redundant cols
        df = pd.read_sql_table(t, session.bind)
        name_field = dbr.get_name_field(t)
        # set drop_unmatched = True for fields necessary to BallotMeasure rows,
        # drop_unmatched = False otherwise to prevent losing BallotMeasureContests for BM-inessential fields
        if t == 'ReportingUnit' or t == 'CountItemType':
            drop = True
        else:
            drop = False
        working = replace_raw_with_internal_ids(working,
                                                juris,
                                                df,
                                                t,
                                                name_field,
                                                mu.path_to_munger_dir,
                                                drop_unmatched=drop)
        working.drop(t, axis=1, inplace=True)
        # working = add_non_id_cols_from_id(working,df,t)

    # append BallotMeasureSelection_Id, drop BallotMeasureSelection
    df_selection = pd.read_sql_table(f'BallotMeasureSelection', session.bind)
    working = replace_raw_with_internal_ids(
        working,
        juris,
        df_selection,
        'BallotMeasureSelection',
        dbr.get_name_field('BallotMeasureSelection'),
        mu.path_to_munger_dir,
        drop_unmatched=False,
        mode=mu.cdf_elements.loc['BallotMeasureSelection', 'source'])
    # drop records with a BMC_Id but no BMS_Id (i.e., keep if BMC_Id is null or BMS_Id is not null)
    working = working[(working['BallotMeasureContest_Id'].isnull())
                      | (working['BallotMeasureSelection_Id']).notnull()]
    working.drop('BallotMeasureSelection', axis=1, inplace=True)

    # append CandidateSelection_Id
    # First must load CandidateSelection table (not directly munged, not exactly a join either)
    # Note left join, as not every record in working has a Candidate_Id
    # TODO maybe introduce Selection and Contest tables, have C an BM types refer to them?
    c_df = pd.read_sql_table('Candidate', session.bind)
    c_df.rename(columns={'Id': 'Candidate_Id'}, inplace=True)
    cs_df, err = dbr.dframe_to_sql(c_df,
                                   session,
                                   'CandidateSelection',
                                   return_records='original')
    # add CandidateSelection_Id column, merging on Candidate_Id
    working = working.merge(cs_df[['Candidate_Id', 'Id']],
                            how='left',
                            left_on='Candidate_Id',
                            right_on='Candidate_Id')
    working.rename(columns={'Id': 'CandidateSelection_Id'}, inplace=True)
    # drop records with a CC_Id but no CS_Id (i.e., keep if CC_Id is null or CS_Id is not null)
    working = working[(working['CandidateContest_Id'].isnull())
                      | (working['CandidateSelection_Id']).notnull()]

    # TODO: warn user if contest is munged but candidates are not
    # TODO warn user if BallotMeasureSelections not recognized in dictionary.txt
    for j in [
            'BallotMeasureContestSelectionJoin', 'CandidateContestSelectionJoin',
            'ElectionContestJoin'
    ]:
        working = append_join_id(project_root, session, working, j)

    # Fill VoteCount and ElectionContestSelectionVoteCountJoin
    # To get 'VoteCount_Id' attached to the correct row, temporarily add columns to VoteCount
    # add ElectionContestSelectionVoteCountJoin columns to VoteCount

    # Define ContestSelectionJoin_Id field needed in ElectionContestSelectionVoteCountJoin
    ref_d = {
        'ContestSelectionJoin_Id': [
            'BallotMeasureContestSelectionJoin_Id',
            'CandidateContestSelectionJoin_Id'
        ]
    }
    working = append_multi_foreign_key(working, ref_d)

    # add extra columns to VoteCount table temporarily to allow proper join
    extra_cols = [
        'ElectionContestJoin_Id', 'ContestSelectionJoin_Id', '_datafile_Id'
    ]
    dbr.add_integer_cols(session, 'VoteCount', extra_cols)

    # upload to VoteCount table, pull Ids
    working_fat, err = dbr.dframe_to_sql(working,
                                         session,
                                         'VoteCount',
                                         raw_to_votecount=True)
    working_fat.rename(columns={'Id': 'VoteCount_Id'}, inplace=True)
    session.commit()

    # TODO check that all candidates in munged contests (including write ins!) are munged

    # upload to ElectionContestSelectionVoteCountJoin
    data, err = dbr.dframe_to_sql(working_fat, session,
                                  'ElectionContestSelectionVoteCountJoin')

    # drop extra columns
    dbr.drop_cols(session, 'VoteCount', extra_cols)
    return
def get_record_info_from_user(sess, element, known_info_d=None, mode='database'):
    """Collect new record info from user, with chance to confirm.

    For each enumeration, translate the user's plaintext input into
    id/othertext. Return the corresponding record (id/othertext only) and
    an enumeration-value dictionary. Depending on <mode> ('database',
    'filesystem' or 'database_and_filesystem'), returns enum plaintext,
    or enum id/othertext pairs, or both; unknown modes return
    (None, None, None).
    """
    # BUG FIX: mutable default argument ({}) replaced with None sentinel
    if known_info_d is None:
        known_info_d = {}

    # read existing info from db
    all_from_db = pd.read_sql_table(element, sess.bind, index_col='Id')

    # initialize <show_user_cols>
    db_cols = list(all_from_db.columns)  # note: does not include 'Id'
    show_user_cols = db_cols.copy()

    # initialize value dictionaries to be returned
    # BUG FIX: originally `enum_val = fk_val = new = {}` aliased one dict
    enum_val = {}
    fk_val = {}
    new = {}

    enum_list = dbr.get_enumerations(sess, element)
    fk_df = dbr.get_foreign_key_df(sess, element)

    # get enumeration tables from db, keyed by enumeration (plaintext) name
    e_df = {}
    for e in enum_list:
        e_df[e] = pd.read_sql_table(e, sess.bind, index_col='Id')

    # add cols to all_from_db for showing user and update show_user_cols
    for e in enum_list:
        all_from_db = mr.enum_col_from_id_othertext(all_from_db,
                                                    e,
                                                    e_df[e],
                                                    drop_old=False)
        show_user_cols.append(e)
        show_user_cols.remove(f'{e}_Id')
        show_user_cols.remove(f'Other{e}')
    for i, r in fk_df.iterrows():
        # exclude foreign ids pointing to enumerations
        if i[:-3] not in enum_list:
            all_from_db = dbr.add_foreign_key_name_col(
                sess,
                all_from_db,
                r['foreign_column_name'],
                r['foreign_table_name'],
                drop_old=False)
            show_user_cols.append(i[:-3])
            show_user_cols.remove(i)

    # collect and confirm info from user
    unconfirmed = True
    while unconfirmed:
        # solicit info from user and store values for db insertion
        new = {}
        print(f'Enter info for new {element} record.')
        for c in db_cols:
            # define new[c] if value is known
            if c in known_info_d.keys():
                new[c] = known_info_d[c]

            # if c is an enumeration Id
            if c[-3:] == '_Id' and c[:-3] in enum_list:
                c_plain = c[:-3]
                # if plaintext of enumeration is known
                if c_plain in new.keys():
                    # BUG FIX: e_df is keyed by enumeration name, not by the
                    # '_Id' column name, so e_df[c] was a guaranteed KeyError
                    new[c], new[
                        f'Other{c_plain}'] = mr.enum_value_to_id_othertext(
                            e_df[c_plain], new[c_plain])
                # if id/othertext of enumeration is known
                # BUG FIX: original tested f'{c}_Id'/f'Other{c}' (i.e. keys
                # like 'X_Id_Id'), which can never be present, and wrote the
                # plaintext back into the Id column. Corrected to the real
                # column names. NOTE(review): confirm intended semantics.
                elif c in new.keys() and f'Other{c_plain}' in new.keys():
                    new[c_plain] = mr.enum_value_from_id_othertext(
                        e_df[c_plain], new[c], new[f'Other{c_plain}'])
                # otherwise ask the user to pick
                else:
                    new[c], new[f'Other{c_plain}'], new[c_plain] = pick_enum(
                        sess, c_plain)
            # if c is an Other<enumeration>, new value was defined in loop through enum_list
            elif c[:5] == 'Other' and c[5:] in enum_list:
                pass
            # if c is a foreign key (and not an enumeration)
            elif c in fk_df.index:
                # if foreign key id is known
                c_plain = c[:-3]
                if c in new.keys():
                    new[c_plain] = dbr.name_from_id(
                        sess, fk_df.loc[c, 'foreign_table_name'], new[c])
                # if foreign key plaintext is known
                elif c_plain in new.keys():
                    new[c] = dbr.name_to_id(sess,
                                            fk_df.loc[c, 'foreign_table_name'],
                                            new[c_plain])
                # otherwise ask the user to pick the referenced record
                else:
                    print(
                        f'Specify the {fk_df.loc[c,"foreign_table_name"]} for this {element}'
                    )
                    idx, db_record = pick_record_from_db(
                        sess, fk_df.loc[c, 'foreign_table_name'], required=True)
                    new[c_plain] = db_record[dbr.get_name_field(
                        fk_df.loc[c, 'foreign_table_name'])]
                    # TODO pull from DB info about whether the foreign key is required
                    new[c] = dbr.name_to_id(sess,
                                            fk_df.loc[c, 'foreign_table_name'],
                                            new[c_plain])
            else:
                new[c] = enter_and_check_datatype(f'Enter the {c}',
                                                  get_datatype(all_from_db, c))

        # present to user for confirmation
        entry = '\n\t'.join([f'{k}:\t{new[k]}' for k in show_user_cols])
        confirm = input(f'Confirm entry:\n\t{entry}\nIs this correct (y/n)?\n')
        if confirm == 'y':
            unconfirmed = False

    # get db_record, enum_val, fk_val
    db_record = {k: new[k] for k in db_cols}
    enum_val = {e: new[e] for e in enum_list}
    fk_val = {k[:-3]: new[k[:-3]] for k in fk_df.index}
    show_user = {k: new[k] for k in show_user_cols}

    if mode == 'database':
        return db_record, enum_val, fk_val
    elif mode == 'filesystem':
        return show_user, enum_val, fk_val
    elif mode == 'database_and_filesystem':
        return {**db_record, **show_user}, enum_val, fk_val
    else:
        print(f'Mode {mode} not recognized.')
        return None, None, None
def pick_record_from_db(sess, element, known_info_d=None, required=False, db_idx=None):
    """Get id and info from database, if it exists.
    If <db_idx> is passed, return that index and a dictionary with the rest of the record

    Interactive: filters <element> rows by <known_info_d>, asks the user to
    pick one via pick_one(). If <required> is True and nothing was picked,
    offers to filter by each remaining enumeration and recurses.
    Returns (element_idx, record_dict) — (None, None) when the table is
    empty or nothing was chosen.
    """
    if not known_info_d:
        known_info_d = {}

    element_df = pd.read_sql_table(element, sess.bind, index_col='Id')
    if element_df.empty:
        return None, None
    elif db_idx:
        # caller already knows the row: short-circuit without any prompting
        return db_idx, element_df.loc[db_idx].to_dict()

    # add columns for plaintext of any enumerations
    # FIXME also add columns for foreign key plaintext
    enums = dbr.read_enums_from_db_table(sess, element)
    element_enhanced_df = element_df.copy()
    for e in enums:
        e_df = pd.read_sql_table(e, sess.bind, index_col='Id')
        element_enhanced_df = mr.enum_col_from_id_othertext(
            element_enhanced_df, e, e_df, drop_old=False)

    # filter by known_info_d (silently ignoring keys that are not columns)
    d = {
        k: v
        for k, v in known_info_d.items() if k in element_enhanced_df.columns
    }
    filtered = element_enhanced_df.loc[(
        element_enhanced_df[list(d)] == pd.Series(d)).all(axis=1)]
    # TODO if filtered is empty, offer all
    if filtered.empty:
        print(
            'Nothing meets the filter criteria. Unfiltered options will be offered.'
        )
        filtered = element_enhanced_df

    print(f'Pick the {element} record from the database:')
    name_field = db_routines.get_name_field(element)
    element_idx, values = pick_one(filtered, name_field, element)
    # rebuild the record from the raw (un-enhanced) table so enum plaintext
    # columns added above are not returned to the caller
    if element_idx in element_df.index:
        d = dict(element_df.loc[element_idx])
    else:
        d = None

    if required and element_idx is None:
        # offer to filter by available enumerations not already constrained
        enum_list = [
            x for x in dbr.get_enumerations(sess, element)
            if x not in known_info_d
        ]
        if len(enum_list) == 0:
            print('No more filters available. You must choose from this list')
            # NOTE(review): this recursive call does not pass required=True,
            # so it can still return (None, None) — confirm that is intended
            element_idx, d = pick_record_from_db(sess,
                                                 element,
                                                 known_info_d=known_info_d)
        else:
            # try each enumeration filter in turn until something is picked
            while element_idx is None and len(enum_list) > 0:
                e = enum_list[0]
                e_filter = input(f'Filter by {e} (y/n)?\n')
                if e_filter == 'y':
                    # NOTE(review): mutates the caller-supplied known_info_d
                    known_info_d[f'{e}_Id'], known_info_d[
                        f'Other{e}'], known_info_d[e] = pick_enum(sess, e)
                    element_idx, d = pick_record_from_db(
                        sess,
                        element,
                        known_info_d=known_info_d,
                        required=True)
                enum_list.remove(e)
    return element_idx, d