def context_sources_to_dfs(excel_dirpath, context_sources=CONTEXT_SOURCES):
    """Collect the context-source sheets found in a directory of Excel files.

    Every Excel file listed for ``excel_dirpath`` is read, and each sheet
    whose name is a key of ``context_sources`` is stored in the result.

    :param excel_dirpath: Directory passed to ``list_excel_files``.
    :param context_sources: Mapping whose keys name the sheets to collect.
        Only the keys are used here.
    :return: Dict keyed by source key. If the same key occurs in more than
        one file, the sheet from the file listed last is kept.
    """
    found = {}
    for excel_filepath in list_excel_files(excel_dirpath):
        sheets = read_excel_to_dataframes(excel_filepath)
        for source_key in context_sources:
            if source_key not in sheets:
                continue
            # A later file overwrites an earlier file's sheet of the same key.
            found[source_key] = sheets[source_key]
    return found
def prepare_media_links_df(excel_dirpath, project_uuid, all_contexts_df):
    """Build the media link dataframe from the 'Media' Excel export(s).

    :param excel_dirpath: Directory passed to ``list_excel_files``.
    :param project_uuid: Forwarded to ``prepare_media_links_from_dfs``.
    :param all_contexts_df: Forwarded to ``prepare_media_links_from_dfs``.
    :return: The result of ``prepare_media_links_from_dfs`` for the last
        listed file whose path contains ``'Media'``, or ``None`` when no
        such file exists.
    """
    media_links = None
    for excel_filepath in list_excel_files(excel_dirpath):
        # NOTE: the substring test is applied to the whole path, not just
        # the file name.
        if 'Media' in excel_filepath:
            sheets = read_excel_to_dataframes(excel_filepath)
            # Each matching file replaces the previous result.
            media_links = prepare_media_links_from_dfs(
                project_uuid,
                sheets,
                all_contexts_df,
            )
    return media_links
def prepare_catalog(project_uuid, excel_dirpath):
    """Load the 'Catalog' Excel export and clean its attributes sheet.

    For every listed file whose path contains ``'Catalog'``, all sheets are
    read and the ``CATALOG_ATTRIBUTES_SHEET`` sheet is replaced by a version
    passed through ``drop_empty_cols``, ``update_multivalue_columns`` and
    ``clean_up_multivalue_cols`` (in that order).

    :param project_uuid: Not used by this function; kept for the callers'
        existing signature.
    :param excel_dirpath: Directory passed to ``list_excel_files``.
    :return: The sheet-name -> dataframe mapping of the last matching file,
        or ``None`` when no file path contains ``'Catalog'``.
    """
    catalog_dfs = None
    for excel_filepath in list_excel_files(excel_dirpath):
        # NOTE: substring test against the full path, not only the file name.
        if 'Catalog' in excel_filepath:
            catalog_dfs = read_excel_to_dataframes(excel_filepath)
            attributes = drop_empty_cols(catalog_dfs[CATALOG_ATTRIBUTES_SHEET])
            attributes = update_multivalue_columns(attributes)
            catalog_dfs[CATALOG_ATTRIBUTES_SHEET] = clean_up_multivalue_cols(
                attributes
            )
    return catalog_dfs
def prep_field_tables(excels_filepath, project_uuid, year, field_data_preps=None):
    """Prepare the field-created data tables found in the Excel exports.

    Each Excel file is read, and every sheet named by a key of
    ``field_data_preps`` is cleaned and then adjusted according to the
    optional keys of that sheet's config:

    * ``child_context_cols``: passed to ``prepare_trench_contexts``.
    * ``tb_new_title``: passed to ``add_make_new_trench_book_title_column``.
    * ``tb_doc_type``: a ``(column, value)`` pair; the column is set to the
      value on every row.
    * ``tb_entry_year``: name of a column that is set to ``year``.
    * ``tb_doc_type_root``: when present, ``subject_uuid_source`` is set to
      ``UUID_SOURCE_KOBOTOOLBOX`` and ``add_trench_book_parents`` is applied.

    :param excels_filepath: Directory passed to ``list_excel_files``.
    :param project_uuid: Forwarded to ``add_trench_book_parents``.
    :param year: Value written to the entry-year column and forwarded to
        the trench helpers.
    :param field_data_preps: Mapping of sheet name -> config dict. Defaults
        to ``FIELD_DATA_PREPS``.
    :return: Dict of sheet name -> config. Each returned config is the same
        object as in ``field_data_preps`` and is mutated in place: its
        ``'dfs'`` key holds all sheets of the file the sheet came from.
    """
    preps = FIELD_DATA_PREPS if field_data_preps is None else field_data_preps
    prepared = {}
    for excel_filepath in list_excel_files(excels_filepath):
        sheets = read_excel_to_dataframes(excel_filepath)
        for sheet_name, config in preps.items():
            if sheet_name not in sheets:
                # This file does not have the sheet, so nothing to do.
                continue

            # Generic clean-up applied to every configured sheet.
            df_sheet = drop_empty_cols(sheets[sheet_name])
            df_sheet = update_multivalue_columns(df_sheet)
            df_sheet = clean_up_multivalue_cols(
                df_sheet,
                skip_cols=SKIP_MULTI_VALUE_REDACTIONS,
            )

            if 'child_context_cols' in config:
                df_sheet = prepare_trench_contexts(
                    df_sheet,
                    year,
                    child_context_cols=config['child_context_cols'],
                )

            new_title = config.get('tb_new_title')
            if new_title is not None:
                # Trench book specific: make a new title column.
                df_sheet = add_make_new_trench_book_title_column(
                    df_sheet,
                    new_title,
                )

            doc_type_config = config.get('tb_doc_type')
            if doc_type_config is not None:
                # Every row of the sheet gets the same configured doc type.
                doc_type_col, doc_type = doc_type_config
                df_sheet[doc_type_col] = doc_type

            entry_year_col = config.get('tb_entry_year')
            if entry_year_col is not None:
                # Add the Trench Book entry year.
                df_sheet[entry_year_col] = year

            if config.get('tb_doc_type_root') is not None:
                df_sheet['subject_uuid_source'] = UUID_SOURCE_KOBOTOOLBOX
                df_sheet = add_trench_book_parents(
                    df_sheet,
                    project_uuid,
                    year,
                    config,
                )

            # Store the prepared sheet back and attach the file's sheets to
            # the (shared) config object before returning it.
            sheets[sheet_name] = df_sheet
            config['dfs'] = sheets
            prepared[sheet_name] = config
    return prepared
def make_all_export_media_df(excels_dirpath, media_cols_endswith=None, new_file_prefixes=None):
    """Make a single dataframe of the media listed in all export files.

    For each Excel file, ``make_dfs_media_df`` builds a media dataframe.
    It gets a ``source_file`` column (the Excel file's base name) and a
    ``new_filename`` column (``revise_filename`` applied to ``filename``).
    For every key of ``new_file_prefixes`` that the Excel file name starts
    with, the mapped prefix is prepended to ``new_filename``. If
    ``MEDIA_SOURCE_COMPOSITION_TYPES`` has a truthy value for that key, it
    is written to the ``Type of Composition Subject`` column.

    :param excels_dirpath: Directory passed to ``list_excel_files``.
    :param media_cols_endswith: Forwarded to ``make_dfs_media_df``.
    :param new_file_prefixes: Mapping of Excel file-name start -> filename
        prefix. Defaults to ``MEDIA_SOURCE_FILE_PREFIXS``.
    :return: The concatenated dataframe with null and duplicate
        ``new_filename`` rows removed, or ``None`` when no media were found.
    :raises RuntimeError: If ``filename`` or ``new_filename`` is not unique
        per row after de-duplication.
    """
    prefixes = (
        MEDIA_SOURCE_FILE_PREFIXS
        if new_file_prefixes is None
        else new_file_prefixes
    )

    media_frames = []
    for excel_filepath in list_excel_files(excels_dirpath):
        excel_file = os.path.basename(excel_filepath)
        df_media = make_dfs_media_df(
            read_excel_to_dataframes(excel_filepath),
            media_cols_endswith=media_cols_endswith,
        )
        if df_media is None:
            continue
        df_media['source_file'] = excel_file
        df_media['new_filename'] = df_media['filename'].apply(revise_filename)
        for file_start, prefix in prefixes.items():
            if excel_file.startswith(file_start):
                df_media['new_filename'] = prefix + df_media['new_filename']
                composition_type = MEDIA_SOURCE_COMPOSITION_TYPES.get(file_start)
                if composition_type:
                    df_media['Type of Composition Subject'] = composition_type
        media_frames.append(df_media)

    if not media_frames:
        return None
    df_all_media = pd.concat(media_frames)
    if df_all_media.empty:
        return None

    # Keep only rows with a new filename, one row per new filename.
    has_new_name = df_all_media['new_filename'].notnull()
    df_all_media = df_all_media[has_new_name].drop_duplicates(
        subset=['new_filename']
    )

    # Sanity check: both the original and the new filenames must now be
    # unique, i.e. one distinct value per remaining row.
    expected_len = len(df_all_media.index)
    filename_count = len(df_all_media['filename'].unique())
    new_filename_count = len(df_all_media['new_filename'].unique())
    if new_filename_count != expected_len or filename_count != expected_len:
        raise RuntimeError(
            'Expected {}, but have {} filenames, and {} new-filenames'.format(
                expected_len,
                filename_count,
                new_filename_count,
            )
        )
    return df_all_media