def save_website_record(dynamo_table_name: str,
                        web_site_name: str,
                        save_only_new_records: bool = True):
    """ Build a website record from web_site_name and save it to dynamo.

    The site name is used as both the 'id' and the 'title' of the record.
    The record is passed through add_website_keys before being written to
    dynamo_table_name; save_only_new_records is forwarded unchanged to
    SaveJsonToDynamo.save_json_to_dynamo. """
    website_record = add_website_keys({
        'id': web_site_name,
        'title': web_site_name
    })
    # These module-level helpers always write to the real table (never local mode).
    dynamo_writer = SaveJsonToDynamo({'local': False}, dynamo_table_name)
    dynamo_writer.save_json_to_dynamo(website_record, save_only_new_records)
def __init__(self, config: dict, time_to_break: int = None):
    """ Keep the configuration and, unless running locally, load dynamo-backed state.

    config is read for 'website-metadata-tablename' and 'local' (treated as
    True when absent). When not local, the parent override records are loaded
    into related_ids and a boto3 Table handle is created. """
    run_locally = config.get('local', True)

    self.config = config
    self.table_name = config.get('website-metadata-tablename')
    self.local = run_locally
    self.related_ids = {}
    # NOTE(review): annotated as int here - confirm what callers actually pass.
    self.time_to_break = time_to_break

    if not run_locally:
        self.related_ids = get_all_parent_override_records(self.table_name)
        dynamodb = boto3.resource('dynamodb')
        self.table = dynamodb.Table(self.table_name)

    # Subscript (not .get) on purpose: a missing table name raises KeyError here.
    self.save_json_to_dynamo_class = SaveJsonToDynamo(
        config, config['website-metadata-tablename'])
def save_file_to_process_record(dynamo_table_name: str,
                                json_record: dict,
                                save_only_new_records: bool = True):
    """ Save a fileToProcess record to dynamo.

    json_record is passed through add_file_to_process_keys and the result is
    written to dynamo_table_name.
    Examples of storageSystem include: S3 and Google and Curate
    Examples of typeOfData include: Museum and 'RBSC website bucket' and Curate """
    keyed_record = add_file_to_process_keys(json_record)
    dynamo_writer = SaveJsonToDynamo({'local': False}, dynamo_table_name)
    dynamo_writer.save_json_to_dynamo(keyed_record, save_only_new_records)
def save_source_system_record(dynamo_table_name: str,
                              source_system_name: str,
                              save_only_new_records: bool = True):
    """ Save a source system record, identified by source_system_name, to dynamo.

    dateAddedToDynamo is stamped on the record only when save_only_new_records
    is falsy. The record is passed through add_source_system_keys before being
    written. Returns None. """
    source_system_record = {'sourceSystem': source_system_name}
    overwrite_allowed = not save_only_new_records
    if overwrite_allowed:
        source_system_record['dateAddedToDynamo'] = get_iso_date_as_string()
    source_system_record = add_source_system_keys(source_system_record)

    dynamo_writer = SaveJsonToDynamo({'local': False}, dynamo_table_name)
    dynamo_writer.save_json_to_dynamo(source_system_record,
                                      save_only_new_records)
def save_parent_override_record(dynamo_table_name: str,
                                item_id: str,
                                parent_id: str,
                                sequence: int = 0,
                                save_only_new_records: bool = True):
    """ Save a parent override record (item id, parent id, sequence) to dynamo.

    The record is passed through add_parent_override_keys before being written
    to dynamo_table_name. """
    override_record = add_parent_override_keys({
        'id': item_id,
        'parentId': parent_id,
        'sequence': sequence
    })
    dynamo_writer = SaveJsonToDynamo({'local': False}, dynamo_table_name)
    dynamo_writer.save_json_to_dynamo(override_record, save_only_new_records)
def save_file_system_record(dynamo_table_name: str,
                            storage_system: str,
                            type_of_data: str,
                            save_only_new_records: bool = True):
    """ Save a file system record (storageSystem + typeOfData) to dynamo.

    dateAddedToDynamo is stamped on the record only when save_only_new_records
    is falsy. The record is passed through add_file_systems_keys before being
    written. Returns None. """
    file_system_record = {
        'storageSystem': storage_system,
        'typeOfData': type_of_data
    }
    overwrite_allowed = not save_only_new_records
    if overwrite_allowed:
        file_system_record['dateAddedToDynamo'] = get_iso_date_as_string()
    file_system_record = add_file_systems_keys(file_system_record)

    dynamo_writer = SaveJsonToDynamo({'local': False}, dynamo_table_name)
    dynamo_writer.save_json_to_dynamo(file_system_record,
                                      save_only_new_records)
def save_new_subject_term_authority_record(dynamo_table_name: str,
                                           authority: str,
                                           source_system: str,
                                           id: str,
                                           save_only_new_records: bool = True):
    """ Save a NewSubjectTermAuthority record to dynamo.

    Nothing is written unless both authority and source_system are truthy.
    The record is passed through add_new_subject_term_authority_keys before
    being written to dynamo_table_name. """
    if not (authority and source_system):
        return

    authority_record = add_new_subject_term_authority_keys({
        'authority': authority,
        'id': id,
        'sourceSystem': source_system
    })
    dynamo_writer = SaveJsonToDynamo({'local': False}, dynamo_table_name)
    dynamo_writer.save_json_to_dynamo(authority_record, save_only_new_records)
def save_harvest_ids(config: dict,
                     source_system: str,
                     string_list_to_save: list,
                     dynamo_table_name: str,
                     save_only_new_records: bool = True):
    """ Save one item-to-harvest record per id in string_list_to_save.

    A single SaveJsonToDynamo writer (built from the caller's config) is
    reused for every id. Each record carries sourceSystem and harvestItemId,
    gets dateAddedToDynamo only when save_only_new_records is falsy, and is
    passed through add_item_to_harvest_keys before being written. """
    dynamo_writer = SaveJsonToDynamo(config, dynamo_table_name)
    stamp_date_added = not save_only_new_records

    for item_id in string_list_to_save:
        harvest_record = {
            'sourceSystem': source_system,
            'harvestItemId': item_id
        }
        if stamp_date_added:
            harvest_record['dateAddedToDynamo'] = get_iso_date_as_string()
        keyed_record = add_item_to_harvest_keys(harvest_record)
        dynamo_writer.save_json_to_dynamo(keyed_record, save_only_new_records)
def save_media_group_record(dynamo_table_name: str,
                            media_group_id: str,
                            storage_system: str,
                            type_of_data: str,
                            save_only_new_records: bool = True):
    """ Save a media group record (mediaGroupId, storageSystem, typeOfData) to dynamo.

    dateAddedToDynamo is stamped on the record only when save_only_new_records
    is falsy. The record is passed through add_media_group_keys before being
    written. Returns None. """
    media_group_record = {
        'mediaGroupId': media_group_id,
        'storageSystem': storage_system,
        'typeOfData': type_of_data
    }
    overwrite_allowed = not save_only_new_records
    if overwrite_allowed:
        media_group_record['dateAddedToDynamo'] = get_iso_date_as_string()
    media_group_record = add_media_group_keys(media_group_record)

    dynamo_writer = SaveJsonToDynamo({'local': False}, dynamo_table_name)
    dynamo_writer.save_json_to_dynamo(media_group_record,
                                      save_only_new_records)
def save_subject_term_record(dynamo_table_name: str,
                             json_record: dict,
                             saving_expanded_record_flag: bool = False,
                             save_only_new_records: bool = False) -> dict:
    """ Save a SubjectTerm record to dynamo and return the prior attributes.

    json_record must include uri and authority; if either is missing, or no
    table name is given, nothing is written and {} is returned. The write
    requests "ALL_OLD" from dynamo; when the response contains 'Attributes'
    that value is returned, otherwise {}. """
    has_required_fields = (dynamo_table_name and json_record.get('uri')
                           and json_record.get('authority'))
    if not has_required_fields:
        return {}

    keyed_record = add_subject_term_keys(json_record,
                                         saving_expanded_record_flag)
    dynamo_writer = SaveJsonToDynamo({'local': False}, dynamo_table_name)
    dynamo_results = dynamo_writer.save_json_to_dynamo_returning_results(
        keyed_record, "ALL_OLD", save_only_new_records)

    if 'Attributes' in dynamo_results:
        return dynamo_results.get('Attributes')
    return {}
def __init__(self, config, event, time_to_break):
    """ Keep config/event, note the start time and build the dynamo writer.

    config is read for 'local' (treated as True when absent), 'curate-token'
    (only when not local) and 'website-metadata-tablename'. event is read for
    'local', which controls save_curate_json_locally. """
    self.config = config
    self.event = event

    running_locally = self.config.get('local', True)
    if not running_locally:
        # The token is only looked up when not local, so local runs need no token.
        self.curate_header = {"X-Api-Token": self.config["curate-token"]}

    self.start_time = time.time()
    self.time_to_break = time_to_break
    self.save_curate_json_locally = event.get("local", False)

    module_dir = os.path.dirname(os.path.realpath(__file__))
    self.local_folder = module_dir + "/"
    self.attempting_huge_export_with_resumption_flag = False

    table_name = self.config.get('website-metadata-tablename', '')
    self.save_json_to_dynamo_class = SaveJsonToDynamo(config, table_name)
def save_unharvested_subject_term_record(dynamo_table_name: str,
                                         authority: str,
                                         source_system: str,
                                         term: str,
                                         id: str,
                                         uri: str,
                                         save_only_new_records: bool = True):
    """ Save an unharvested subject term record to dynamo.

    Nothing is written unless both authority and term are truthy. uri and
    sourceSystem are included in the record only when truthy. The record is
    passed through add_unharvested_subject_term_keys before being written. """
    if not (authority and term):
        return

    term_record = {'id': id, 'authority': authority, 'term': term}
    optional_fields = (('uri', uri), ('sourceSystem', source_system))
    for field_name, field_value in optional_fields:
        if field_value:
            term_record[field_name] = field_value
    term_record = add_unharvested_subject_term_keys(term_record)

    dynamo_writer = SaveJsonToDynamo({'local': False}, dynamo_table_name)
    dynamo_writer.save_json_to_dynamo(term_record, save_only_new_records)
def __init__(self, config: dict, event: dict, time_to_break: datetime):
    """ Keep config/event, set file defaults and load dynamo-backed state when not local.

    config is read for 'website-metadata-tablename' and 'local' (treated as
    True when absent). The modification time of
    dependencies/pipelineutilities/validate_json.py (relative to this module)
    is recorded as an ISO string. When not local, the fileToProcess records
    for storage system 'Google' / type 'Museum' are loaded and a boto3 Table
    handle is created. """
    self.config = config
    self.event = event
    self.time_to_break = time_to_break

    # Defaults for the local composite-metadata file.
    self.folder_name = "/tmp"
    self.file_name = 'web_kiosk_composite_metadata.json'
    self.save_local_copy = False
    self.delete_local_copy = False

    self.start_time = time.time()
    self.table_name = self.config.get('website-metadata-tablename', '')
    self.save_json_to_dynamo_class = SaveJsonToDynamo(config, self.table_name)

    module_dir = os.path.dirname(os.path.realpath(__file__))
    validator_path = os.path.join(module_dir, 'dependencies',
                                  'pipelineutilities', 'validate_json.py')
    validator_mtime = os.path.getmtime(validator_path)
    self.validate_json_modified_date = datetime.fromtimestamp(
        validator_mtime).isoformat()

    self.file_to_process_records_in_dynamo = {}
    running_locally = self.config.get('local', True)
    if not running_locally:
        self.file_to_process_records_in_dynamo = get_all_file_to_process_records_by_storage_system(
            self.table_name, 'Google', 'Museum')
        dynamodb = boto3.resource('dynamodb')
        self.table = dynamodb.Table(self.table_name)
def __init__(self, config, event, time_to_break):
    """ Keep config/event and build the helper objects and file timestamps used later.

    config is read for 'local' (treated as True when absent), 'curate-token'
    (only when not local) and 'website-metadata-tablename'. event is read for
    'local' and 'recreateStandardJsonLocallyButDoNotSaveToDynamo'. The
    modification times of validate_json.py and of the local
    curate_to_json_translation_control_file.json are recorded as ISO strings.
    When not local, the fileToProcess records for 'Curate' / 'Curate' are
    loaded and a boto3 Table handle is created. """
    self.config = config
    self.event = event
    self.start_time = time.time()
    if not self.config.get('local', True):
        # The token is only looked up when not local, so local runs need no token.
        self.curate_header = {"X-Api-Token": self.config["curate-token"]}
    # start_time is assigned a second time here; this value replaces the one set above.
    self.start_time = time.time()
    self.time_to_break = time_to_break
    self.table_name = self.config.get('website-metadata-tablename', '')
    self.translate_curate_json_node_class = TranslateCurateJsonNode(config)
    self.save_standard_json_locally = event.get(
        "local", False
    )  # To generate standard_json locally, set "local" to false, and temporarily set save_standard_json_locally to True
    # If the event did not ask for local saving, the recreate flag can still switch it on.
    if not self.save_standard_json_locally and event.get(
            "recreateStandardJsonLocallyButDoNotSaveToDynamo", False):
        self.save_standard_json_locally = event.get(
            "recreateStandardJsonLocallyButDoNotSaveToDynamo")
    self.create_standard_json_class = CreateStandardJson(config)
    self.local_folder = os.path.dirname(os.path.realpath(__file__)) + "/"
    self.save_json_to_dynamo_class = SaveJsonToDynamo(
        config, self.table_name)
    self.get_curate_metadata_class = GetCurateMetadata(
        config, event, time_to_break)
    # Record when validate_json.py was last modified on disk.
    validate_json_path = os.path.join(
        os.path.dirname(os.path.realpath(__file__)), 'dependencies',
        'pipelineutilities', 'validate_json.py')
    self.validate_json_modified_date = datetime.fromtimestamp(
        os.path.getmtime(validate_json_path)).isoformat()
    # Record when the local translation control file was last modified on disk.
    local_control_file_path = os.path.join(
        os.path.dirname(os.path.realpath(__file__)),
        'curate_to_json_translation_control_file.json')
    self.local_control_file_modified_date = datetime.fromtimestamp(
        os.path.getmtime(local_control_file_path)).isoformat()
    self.file_to_process_records_in_dynamo = {}
    if not self.config.get('local', True):
        self.file_to_process_records_in_dynamo = get_all_file_to_process_records_by_storage_system(
            self.table_name, 'Curate', 'Curate')
        self.table = boto3.resource('dynamodb').Table(self.table_name)
class SaveStandardJsonToDynamo():
    """ Save Standard Json to Dynamo """

    def __init__(self, config: dict, time_to_break: datetime = None):
        """ Keep the configuration and, unless running locally, load dynamo-backed state.

        config is read for 'website-metadata-tablename' and 'local' (treated as
        True when absent). time_to_break, when given, is compared against
        datetime.now() to stop saving early. When not local, the parent
        override records are loaded into related_ids and a boto3 Table handle
        is created. """
        self.config = config
        self.table_name = self.config.get('website-metadata-tablename')
        self.local = config.get('local', True)
        self.related_ids = {}
        self.time_to_break = time_to_break
        if not self.local:
            self.related_ids = get_all_parent_override_records(self.table_name)
            self.table = boto3.resource('dynamodb').Table(self.table_name)
        # Subscript (not .get) on purpose: a missing table name raises KeyError here.
        self.save_json_to_dynamo_class = SaveJsonToDynamo(
            config, self.config['website-metadata-tablename'])

    def save_standard_json(self,
                           standard_json: dict,
                           save_only_new_records: bool = False) -> bool:
        """ Validate standard_json and, when not running locally, save it to dynamo.

        Returns True only when the record validated, has an "id", and it and
        all of its nested items were saved; otherwise False. """
        success_flag = False
        if validate_standard_json(standard_json) and not self.local:
            if "id" in standard_json:
                standard_json = add_item_keys(standard_json)
                success_flag = self._save_json_to_dynamo(
                    standard_json, save_only_new_records)
        return success_flag

    def _save_json_to_dynamo(
            self,
            standard_json: dict,
            save_only_new_records: bool = False) -> bool:  # noqa: C901
        """ Save each item recursively to dynamo then save root.

        Non-file children are saved first (while time_to_break has not passed);
        file children stored under 'Uri' get their special file records. The
        record itself is then written without its 'items'. Returns False if
        time_to_break has passed, if writing this record raised ClientError,
        or if saving any nested item failed. """
        success_flag = True
        if "items" in standard_json:
            for item in standard_json['items']:
                if item.get("level") != "file" and (
                        not self.time_to_break
                        or datetime.now() < self.time_to_break):
                    # A failed child save must not be reported as success.
                    if not self._save_json_to_dynamo(item,
                                                     save_only_new_records):
                        success_flag = False
                if item.get("level") == 'file' and item.get(
                        'storageSystem', '') == 'Uri':
                    self._save_special_file_record(item)
        if "childIds" in standard_json:
            self._append_related_ids(standard_json)
        standard_json = self._optionally_update_parent_id(standard_json)
        new_dict = {i: standard_json[i] for i in standard_json if i != 'items'}
        # A missing config entry means no source system requires an expire time.
        expiring_source_systems = self.config.get(
            "source-systems-requiring-metadata-expire-time") or ()
        if standard_json.get("sourceSystem", "") in expiring_source_systems:
            time_to_live = timedelta(
                days=int(self.config.get('metadata-time-to-live-days', 3)))
            new_dict['expireTime'] = int(
                datetime.timestamp(datetime.now() + time_to_live))
        new_dict = add_item_keys(new_dict)
        if self.time_to_break and datetime.now() > self.time_to_break:
            return False
        try:
            with self.table.batch_writer() as batch:
                batch.put_item(Item=new_dict)
                if new_dict.get(
                        'parentId'
                ) == 'root':  # add WebsiteItem record to dynamo for all root items
                    website_item_record = {
                        'id': new_dict['id'],
                        'websiteId': 'Marble'
                    }
                    website_item_record = add_website_item_keys(
                        website_item_record)
                    batch.put_item(Item=website_item_record)
        except ClientError as ce:
            success_flag = False
            capture_exception(ce)
            print(
                f"Error saving to {self.table_name} table - {ce.response['Error']['Code']} - {ce.response['Error']['Message']}"
            )
        return success_flag

    def _append_related_ids(self, standard_json) -> dict:
        """ Save a parent override record to dynamo for each entry in childIds.

        Nothing is saved when running locally or when childIds is missing or
        None. Returns self.related_ids, which this method does not modify. """
        if not self.local:
            parent_id = standard_json.get("id")
            for related_id in standard_json.get("childIds") or []:
                save_parent_override_record(
                    self.table_name, related_id.get("id"), parent_id,
                    related_id.get("sequence"))  # added to save to dynamo
        return self.related_ids

    def _optionally_update_parent_id(self, standard_json: dict) -> dict:
        """ Apply the parentId and sequence override recorded for this id, if any.

        standard_json is updated in place and also returned. """
        item_id = standard_json["id"]
        if item_id in self.related_ids:
            override = self.related_ids[item_id]
            standard_json["parentId"] = override.get("parentId")
            standard_json["sequence"] = override.get("sequence")
        return standard_json

    def _save_special_file_record(self, standard_json: dict):
        """ Files are automatically stored elsewhere for Curate, Museum, and S3, but not for Uri.

        Does nothing when running locally or when the record is not a
        file-level record with storageSystem 'Uri'. Otherwise, for each of
        objectFileGroupId / imageGroupId / mediaGroupId that is present, the
        matching keys are added, the record is saved, and the corresponding
        group record is saved (image records also get a fileToProcess record). """
        if not self.local and standard_json.get(
                'level') == 'file' and standard_json.get(
                    'storageSystem') == 'Uri':
            table_name = self.config['website-metadata-tablename']
            if standard_json.get(
                    'objectFileGroupId'
            ):  # TODO Remove this once imageGroupId has been adopted
                standard_json = add_file_keys(
                    standard_json,
                    self.config.get('image-server-base-url', ''))
                self.save_json_to_dynamo_class.save_json_to_dynamo(
                    standard_json)
                # No fileToProcess record is saved here; those are now added as image records.
                save_file_group_record(
                    table_name, standard_json.get('objectFileGroupId'),
                    standard_json.get('storageSystem'),
                    standard_json.get('typeOfData'))
            if standard_json.get('imageGroupId'):
                standard_json = add_image_keys(
                    standard_json,
                    self.config.get('image-server-base-url', ''))
                self.save_json_to_dynamo_class.save_json_to_dynamo(
                    standard_json)
                save_file_to_process_record(table_name, standard_json)
                save_image_group_record(
                    table_name, standard_json.get('imageGroupId'),
                    standard_json.get('storageSystem'),
                    standard_json.get('typeOfData'))
            if standard_json.get('mediaGroupId'):
                standard_json = add_media_keys(
                    standard_json,
                    self.config.get('media-server-base-url', ''))
                self.save_json_to_dynamo_class.save_json_to_dynamo(
                    standard_json)
                save_media_group_record(
                    table_name, standard_json.get('mediaGroupId'),
                    standard_json.get('storageSystem'),
                    standard_json.get('typeOfData'))