def metadata_folder_to_database(
    folder_path,
    delete_db=True,
    db_suffix=None,
    explicit_database_name=None,
    explicit_database_location=None,
):
    """
    Take a metadata folder and build the database and all tables.

    Args:
        delete_db (bool): Delete the database before starting.
        db_suffix: If provided, the metadata will be modified so that the
            database name and s3 data locations include the suffix.
        explicit_database_name: If not None, the database name in glue will
            be set to the string specified in explicit_database_name.
        explicit_database_location: If not None, the database location in
            glue will be set using the string specified in
            explicit_database_location. If it starts with s3://, the function
            assumes the string is the full path to the database folder.
            Otherwise, the function appends it to the current location in the
            original json.

    If explicit_database_name or explicit_database_location are not None,
    it is advised to leave db_suffix as None, and vice versa.
    """
    files = os.listdir(folder_path)
    files = {f for f in files if re.match(r".+\.json$", f)}

    if "database.json" not in files:
        raise ValueError("database.json not found in metadata folder")

    db_metadata = read_json(os.path.join(folder_path, "database.json"))

    if db_suffix:
        str_to_add = "_" + db_suffix
        if db_metadata["location"][-1] == "/":
            db_metadata["location"] = db_metadata["location"][:-1]
        db_metadata["location"] = db_metadata["location"] + str_to_add
        db_metadata["name"] = db_metadata["name"] + str_to_add

    # Allow the user to set the name and location explicitly (e.g. to create
    # near-duplicate databases for different users)
    if explicit_database_name is not None:
        db_metadata["name"] = explicit_database_name

    if explicit_database_location is not None:
        if db_metadata["location"][-1] == "/":
            db_metadata["location"] = db_metadata["location"][:-1]
        if explicit_database_location.startswith("s3://"):
            db_metadata["location"] = explicit_database_location
        else:
            db_metadata["location"] = (
                db_metadata["location"] + explicit_database_location
            )

    database_name = db_metadata["name"]

    if delete_db:
        try:
            glue_client.delete_database(Name=database_name)
        except glue_client.exceptions.EntityNotFoundException:
            pass

    overwrite_or_create_database(database_name, db_metadata["description"])

    table_paths = files.difference({"database.json"})
    for table_path in table_paths:
        table_path = os.path.join(folder_path, table_path)
        table_metadata = read_json(table_path)
        populate_glue_catalogue_from_metadata(
            table_metadata, db_metadata, check_existence=False
        )
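# Example usage (a sketch: the suffix, user name and bucket path below are
# hypothetical, and assume glue_client and the helpers above are configured):
#
#     # Build a "dev" copy of the database, with the name and s3 data
#     # locations suffixed "_dev":
#     metadata_folder_to_database("glue/meta_data/occupeye_db", db_suffix="dev")
#
#     # Or point a near-duplicate database at an explicit full s3 path:
#     metadata_folder_to_database(
#         "glue/meta_data/occupeye_db",
#         explicit_database_name="occupeye_db_alice",
#         explicit_database_location="s3://my-bucket/alice/occupeye_db",
#     )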
def get_locations_df(locations):
    locations_df = pd.json_normalize(locations)

    # Select and rename columns to conform to metadata
    renames = read_json("metadata/locations_renames.json")
    locations_df = locations_df[renames.keys()].rename(columns=renames)

    # Impose metadata - i.e. ensure all expected columns are present
    # and in correct order
    locations_metadata = read_json("metadata/locations.json")
    locations_df = impose_exact_conformance_on_pd_df(
        locations_df, locations_metadata
    )
    return locations_df
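# `impose_exact_conformance_on_pd_df` is defined elsewhere in the repo. A
# minimal sketch of the behaviour the functions here rely on (reindex the
# frame to exactly the columns listed in the table metadata, in order) might
# look like the following; this assumes the metadata json holds a "columns"
# list of {"name": ...} entries, and the real helper may also coerce types:
#
#     def impose_exact_conformance_on_pd_df(df, table_metadata):
#         expected_columns = [c["name"] for c in table_metadata["columns"]]
#         return df.reindex(columns=expected_columns)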
def get_survey_fact_df(survey_fact):
    survey_fact_long = survey_fact_to_long_format(survey_fact)
    sensor_observations_metadata = read_json(
        "glue/meta_data/occupeye_db/sensor_observations.json"
    )

    # Rename columns to conform to metadata
    renames = read_json("column_renames/sensor_observations_renames.json")
    sensor_observations_df = pd.DataFrame(survey_fact_long).rename(
        columns=renames
    )

    # Impose metadata column order
    sensor_observations_df = impose_metadata_column_order_on_pd_df(
        sensor_observations_df, sensor_observations_metadata
    )
    return sensor_observations_df
def get_surveys_df(surveys):
    surveys = strip_commas_from_api_response(surveys)
    surveys_df = pd.DataFrame(surveys)

    # Rename columns to conform to metadata
    renames = read_json("column_renames/surveys_renames.json")
    surveys_df = surveys_df.rename(columns=renames)

    # Impose metadata - i.e. ensure all expected columns are present
    # and in correct order
    surveys_metadata = read_json("glue/meta_data/occupeye_db/surveys.json")
    surveys_df = impose_exact_conformance_on_pd_df(
        surveys_df, surveys_metadata
    )
    return surveys_df
def get_bookings_df(bookings):
    bookings_df = pd.json_normalize(bookings)
    renames = read_json("metadata/bookings_renames.json")
    bookings_metadata = read_json("metadata/bookings.json")

    if len(bookings_df) > 0:
        # Reindex so every expected source column exists, then rename
        bookings_df = bookings_df.reindex(columns=renames.keys())
        bookings_df = bookings_df.rename(columns=renames)
    else:
        # No bookings returned: build an empty frame with the target columns
        bookings_df = pd.DataFrame(columns=renames.values())

    bookings_df = impose_exact_conformance_on_pd_df(
        bookings_df, bookings_metadata
    )
    return bookings_df
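# Why the empty-frame branch above is needed: when the API returns no
# bookings, pd.json_normalize([]) yields an empty frame with no columns, so
# reindexing/selecting by renames.keys() would produce nothing useful.
# For example:
#
#     pd.json_normalize([]).columns            # -> empty index, no columns
#     pd.DataFrame(columns=["booking_id"])     # -> empty frame, columns intact
#       # ("booking_id" is a hypothetical renamed column for illustration)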
def delete_all_target_data_from_database(database_metadata_path):
    files = os.listdir(database_metadata_path)
    files = {f for f in files if re.match(r".+\.json$", f)}

    if "database.json" not in files:
        raise ValueError("database.json not found in metadata folder")

    # Delete the s3 data folder for every table in the metadata
    table_paths = files.difference({"database.json"})
    for table_path in table_paths:
        table_path = os.path.join(database_metadata_path, table_path)
        table_metadata = read_json(table_path)
        location = table_metadata["location"]
        bucket, bucket_folder = s3_path_to_bucket_key(location)
        delete_folder_from_bucket(bucket, bucket_folder)
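# `s3_path_to_bucket_key` is assumed to split an s3 path into its bucket and
# key prefix, e.g. (hypothetical values):
#
#     s3_path_to_bucket_key("s3://my-bucket/occupeye_db/bookings/")
#     # -> ("my-bucket", "occupeye_db/bookings/")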
def get_sensor_dimension_df(sensor_dimension):
    sensor_dimension = strip_commas_from_api_response(sensor_dimension)
    sensors_df = pd.DataFrame(sensor_dimension)

    # SurveyID is a partition column, so no need to duplicate it in the data
    del sensors_df["SurveyID"]

    # Rename columns to conform to metadata
    renames = read_json("column_renames/sensors_renames.json")
    sensors_df = sensors_df.rename(columns=renames)

    # Impose metadata - i.e. ensure all expected columns are present
    # and in correct order
    sensors_metadata = read_json("glue/meta_data/occupeye_db/sensors.json")
    sensors_df = impose_exact_conformance_on_pd_df(
        sensors_df, sensors_metadata
    )
    return sensors_df
def __init__(self, filepath):
    self.meta = read_json(filepath)
    self.__update_column_names()