Пример #1
0
def save_website_record(dynamo_table_name: str,
                        web_site_name: str,
                        save_only_new_records: bool = True):
    """ Build a website record for web_site_name and write it to dynamo.

        web_site_name is used as both the 'id' and the 'title' of the record.
        save_only_new_records is forwarded unchanged to
        SaveJsonToDynamo.save_json_to_dynamo. """
    website_record = add_website_keys({
        'id': web_site_name,
        'title': web_site_name
    })
    # This helper always builds its writer with 'local' set to False.
    dynamo_writer = SaveJsonToDynamo({'local': False}, dynamo_table_name)
    dynamo_writer.save_json_to_dynamo(website_record, save_only_new_records)
Пример #2
0
 def __init__(self, config: dict, time_to_break: int = None):
     """ Store configuration and, unless running locally, set up dynamo access.

         'local' is read from config and defaults to True when absent.  Only
         when it is falsy are the parent override records loaded and the
         dynamo table resource and SaveJsonToDynamo helper created. """
     self.config = config
     self.time_to_break = time_to_break
     self.related_ids = {}
     self.local = config.get('local', True)
     self.table_name = self.config.get('website-metadata-tablename')
     if self.local:
         # Nothing else to prepare when running locally.
         return
     self.related_ids = get_all_parent_override_records(self.table_name)
     self.table = boto3.resource('dynamodb').Table(self.table_name)
     self.save_json_to_dynamo_class = SaveJsonToDynamo(
         config, self.config['website-metadata-tablename'])
Пример #3
0
def save_file_to_process_record(dynamo_table_name: str,
                                json_record: dict,
                                save_only_new_records: bool = True):
    """ Add fileToProcess keys to json_record and write it to dynamo.
        storageSystem examples: S3, Google, Curate
        typeOfData examples: Museum, 'RBSC website bucket', Curate """
    keyed_record = add_file_to_process_keys(json_record)
    # This helper always builds its writer with 'local' set to False.
    dynamo_writer = SaveJsonToDynamo({'local': False}, dynamo_table_name)
    dynamo_writer.save_json_to_dynamo(keyed_record, save_only_new_records)
Пример #4
0
def save_source_system_record(dynamo_table_name: str,
                              source_system_name: str,
                              save_only_new_records: bool = True):
    """ Write a sourceSystem record for source_system_name to dynamo.

        'dateAddedToDynamo' is stamped on the record here only when
        save_only_new_records is False. """
    source_system_record = {'sourceSystem': source_system_name}
    overwrite_allowed = not save_only_new_records
    if overwrite_allowed:
        source_system_record.update(
            dateAddedToDynamo=get_iso_date_as_string())
    source_system_record = add_source_system_keys(source_system_record)
    dynamo_writer = SaveJsonToDynamo({'local': False}, dynamo_table_name)
    dynamo_writer.save_json_to_dynamo(source_system_record,
                                      save_only_new_records)
Пример #5
0
def save_parent_override_record(dynamo_table_name: str,
                                item_id: str,
                                parent_id: str,
                                sequence: int = 0,
                                save_only_new_records: bool = True):
    """ Write a parent override record linking item_id to parent_id.

        The record carries 'id', 'parentId' and 'sequence' before the
        parent override keys are added. """
    override_record = add_parent_override_keys({
        'id': item_id,
        'parentId': parent_id,
        'sequence': sequence
    })
    dynamo_writer = SaveJsonToDynamo({'local': False}, dynamo_table_name)
    dynamo_writer.save_json_to_dynamo(override_record, save_only_new_records)
Пример #6
0
def save_file_system_record(dynamo_table_name: str,
                            storage_system: str,
                            type_of_data: str,
                            save_only_new_records: bool = True):
    """ Write a file system record (storageSystem + typeOfData) to dynamo.

        'dateAddedToDynamo' is stamped on the record here only when
        save_only_new_records is False. """
    file_system_record = {
        'storageSystem': storage_system,
        'typeOfData': type_of_data
    }
    overwrite_allowed = not save_only_new_records
    if overwrite_allowed:
        file_system_record.update(dateAddedToDynamo=get_iso_date_as_string())
    file_system_record = add_file_systems_keys(file_system_record)
    dynamo_writer = SaveJsonToDynamo({'local': False}, dynamo_table_name)
    dynamo_writer.save_json_to_dynamo(file_system_record,
                                      save_only_new_records)
Пример #7
0
def save_new_subject_term_authority_record(dynamo_table_name: str,
                                           authority: str,
                                           source_system: str,
                                           id: str,
                                           save_only_new_records: bool = True):
    """ Write a NewSubjectTermAuthority record to dynamo.

        Nothing is written unless both authority and source_system are
        truthy. """
    if not (authority and source_system):
        return
    authority_record = add_new_subject_term_authority_keys({
        'authority': authority,
        'id': id,
        'sourceSystem': source_system
    })
    dynamo_writer = SaveJsonToDynamo({'local': False}, dynamo_table_name)
    dynamo_writer.save_json_to_dynamo(authority_record, save_only_new_records)
Пример #8
0
def save_harvest_ids(config: dict,
                     source_system: str,
                     string_list_to_save: list,
                     dynamo_table_name: str,
                     save_only_new_records: bool = True):
    """ Write one item-to-harvest record per id in string_list_to_save.

        A single SaveJsonToDynamo writer is created up front and reused for
        every id.  'dateAddedToDynamo' is stamped on each record here only
        when save_only_new_records is False. """
    dynamo_writer = SaveJsonToDynamo(config, dynamo_table_name)
    overwrite_allowed = not save_only_new_records
    for item_id in string_list_to_save:
        harvest_record = {
            'sourceSystem': source_system,
            'harvestItemId': item_id
        }
        if overwrite_allowed:
            harvest_record['dateAddedToDynamo'] = get_iso_date_as_string()
        keyed_record = add_item_to_harvest_keys(harvest_record)
        dynamo_writer.save_json_to_dynamo(keyed_record, save_only_new_records)
Пример #9
0
def save_media_group_record(dynamo_table_name: str,
                            media_group_id: str,
                            storage_system: str,
                            type_of_data: str,
                            save_only_new_records: bool = True):
    """ Write a media group record to dynamo.

        The record carries 'mediaGroupId', 'storageSystem' and 'typeOfData'.
        'dateAddedToDynamo' is stamped on the record here only when
        save_only_new_records is False. """
    media_group_record = {
        'mediaGroupId': media_group_id,
        'storageSystem': storage_system,
        'typeOfData': type_of_data
    }
    overwrite_allowed = not save_only_new_records
    if overwrite_allowed:
        media_group_record.update(dateAddedToDynamo=get_iso_date_as_string())
    media_group_record = add_media_group_keys(media_group_record)
    dynamo_writer = SaveJsonToDynamo({'local': False}, dynamo_table_name)
    dynamo_writer.save_json_to_dynamo(media_group_record,
                                      save_only_new_records)
Пример #10
0
def save_subject_term_record(dynamo_table_name: str,
                             json_record: dict,
                             saving_expanded_record_flag: bool = False,
                             save_only_new_records: bool = False) -> dict:
    """ Write a SubjectTerm record to dynamo and return its previous attributes.

        json_record must include uri and authority; if either is missing, or
        dynamo_table_name is falsy, nothing is written and {} is returned.
        Otherwise the 'Attributes' entry of the dynamo response is returned
        when present (the save requests "ALL_OLD"), else {}. """
    if not (dynamo_table_name and json_record.get('uri')
            and json_record.get('authority')):
        return {}
    keyed_record = add_subject_term_keys(json_record,
                                         saving_expanded_record_flag)
    dynamo_writer = SaveJsonToDynamo({'local': False}, dynamo_table_name)
    response = dynamo_writer.save_json_to_dynamo_returning_results(
        keyed_record, "ALL_OLD", save_only_new_records)
    if 'Attributes' in response:
        return response.get('Attributes')
    return {}
 def __init__(self, config, event, time_to_break):
     """ Store config/event and prepare state for retrieving Curate metadata.

         The Curate API header is only built when config 'local' is falsy
         ('local' defaults to True when absent).  event 'local' controls
         whether Curate json is saved locally. """
     self.config = config
     self.event = event
     running_locally = config.get('local', True)
     if not running_locally:
         self.curate_header = {"X-Api-Token": config["curate-token"]}
     self.start_time = time.time()
     self.time_to_break = time_to_break
     self.save_curate_json_locally = event.get("local", False)
     # Folder containing this module, with a trailing slash.
     module_folder = os.path.dirname(os.path.realpath(__file__))
     self.local_folder = module_folder + "/"
     self.attempting_huge_export_with_resumption_flag = False
     metadata_table_name = config.get('website-metadata-tablename', '')
     self.save_json_to_dynamo_class = SaveJsonToDynamo(
         config, metadata_table_name)
Пример #12
0
def save_unharvested_subject_term_record(dynamo_table_name: str,
                                         authority: str,
                                         source_system: str,
                                         term: str,
                                         id: str,
                                         uri: str,
                                         save_only_new_records: bool = True):
    """ Write an unharvested subject term record to dynamo.

        Nothing is written unless both authority and term are truthy.
        'uri' and 'sourceSystem' are only included when truthy. """
    if not (authority and term):
        return
    term_record = {'id': id, 'authority': authority, 'term': term}
    optional_fields = (('uri', uri), ('sourceSystem', source_system))
    for field_name, field_value in optional_fields:
        if field_value:
            term_record[field_name] = field_value
    term_record = add_unharvested_subject_term_keys(term_record)
    dynamo_writer = SaveJsonToDynamo({'local': False}, dynamo_table_name)
    dynamo_writer.save_json_to_dynamo(term_record, save_only_new_records)
 def __init__(self, config: dict, event: dict, time_to_break: datetime):
     """ Store config/event and prepare state for processing web kiosk metadata.

         Records the modification time of dependencies/pipelineutilities/
         validate_json.py (relative to this module) as an ISO string.  When
         config 'local' is falsy ('local' defaults to True), the existing
         fileToProcess records for storage system 'Google' / type 'Museum'
         are loaded and a dynamo table resource is created. """
     self.config = config
     self.event = event
     self.time_to_break = time_to_break
     # Local working-copy settings.
     self.folder_name = "/tmp"
     self.file_name = 'web_kiosk_composite_metadata.json'
     self.save_local_copy = False
     self.delete_local_copy = False
     self.start_time = time.time()
     self.table_name = config.get('website-metadata-tablename', '')
     self.save_json_to_dynamo_class = SaveJsonToDynamo(
         config, self.table_name)
     module_folder = os.path.dirname(os.path.realpath(__file__))
     validator_path = os.path.join(module_folder, 'dependencies',
                                   'pipelineutilities', 'validate_json.py')
     validator_mtime = os.path.getmtime(validator_path)
     self.validate_json_modified_date = datetime.fromtimestamp(
         validator_mtime).isoformat()
     self.file_to_process_records_in_dynamo = {}
     running_locally = config.get('local', True)
     if not running_locally:
         self.file_to_process_records_in_dynamo = get_all_file_to_process_records_by_storage_system(
             self.table_name, 'Google', 'Museum')
         self.table = boto3.resource('dynamodb').Table(self.table_name)
Пример #14
0
 def __init__(self, config, event, time_to_break):
     """ Store config/event and build the helpers used to process Curate records.

         config: 'local' (default True) controls whether the Curate API header
             is built and whether dynamo is touched; 'website-metadata-tablename'
             names the dynamo table (default '').
         event: 'local' or 'recreateStandardJsonLocallyButDoNotSaveToDynamo'
             turns on saving standard json locally.
         time_to_break: stored and passed on to GetCurateMetadata.

         Also records, as ISO strings, the modification times of
         dependencies/pipelineutilities/validate_json.py and
         curate_to_json_translation_control_file.json (both relative to this
         module). """
     self.config = config
     self.event = event
     if not self.config.get('local', True):
         self.curate_header = {"X-Api-Token": self.config["curate-token"]}
     # Assigned once; the original set this twice in a row to the same effect.
     self.start_time = time.time()
     self.time_to_break = time_to_break
     self.table_name = self.config.get('website-metadata-tablename', '')
     self.translate_curate_json_node_class = TranslateCurateJsonNode(config)
     # To generate standard_json locally, set "local" to false, and temporarily
     # set save_standard_json_locally to True.
     self.save_standard_json_locally = event.get("local", False)
     recreate_locally = event.get(
         "recreateStandardJsonLocallyButDoNotSaveToDynamo", False)
     if recreate_locally and not self.save_standard_json_locally:
         self.save_standard_json_locally = recreate_locally
     self.create_standard_json_class = CreateStandardJson(config)
     # Resolve this module's folder once; it anchors every path below.
     module_folder = os.path.dirname(os.path.realpath(__file__))
     self.local_folder = module_folder + "/"
     self.save_json_to_dynamo_class = SaveJsonToDynamo(
         config, self.table_name)
     self.get_curate_metadata_class = GetCurateMetadata(
         config, event, time_to_break)
     validate_json_path = os.path.join(module_folder, 'dependencies',
                                       'pipelineutilities',
                                       'validate_json.py')
     self.validate_json_modified_date = datetime.fromtimestamp(
         os.path.getmtime(validate_json_path)).isoformat()
     local_control_file_path = os.path.join(
         module_folder, 'curate_to_json_translation_control_file.json')
     self.local_control_file_modified_date = datetime.fromtimestamp(
         os.path.getmtime(local_control_file_path)).isoformat()
     self.file_to_process_records_in_dynamo = {}
     if not self.config.get('local', True):
         self.file_to_process_records_in_dynamo = get_all_file_to_process_records_by_storage_system(
             self.table_name, 'Curate', 'Curate')
         self.table = boto3.resource('dynamodb').Table(self.table_name)
Пример #15
0
class SaveStandardJsonToDynamo():
    """ Save Standard Json to Dynamo """
    def __init__(self, config: dict, time_to_break: datetime = None):
        """ Store configuration and, unless running locally, prepare dynamo access.

            config: 'website-metadata-tablename' names the dynamo table;
                'local' defaults to True when absent.
            time_to_break: optional datetime compared against datetime.now();
                once it has passed, _save_json_to_dynamo stops writing. """
        self.config = config
        self.table_name = self.config.get('website-metadata-tablename')
        self.local = config.get('local', True)
        self.related_ids = {}
        self.time_to_break = time_to_break
        # Always define these so a local instance never raises AttributeError.
        self.table = None
        self.save_json_to_dynamo_class = None
        if not self.local:
            self.related_ids = get_all_parent_override_records(self.table_name)
            self.table = boto3.resource('dynamodb').Table(self.table_name)
            self.save_json_to_dynamo_class = SaveJsonToDynamo(
                config, self.config['website-metadata-tablename'])

    def save_standard_json(self,
                           standard_json: dict,
                           save_only_new_records: bool = False) -> bool:
        """ Validate standard_json, add item keys, then save it and its items.

            Returns False without saving when validation fails, when running
            locally, or when standard_json has no "id"; otherwise returns the
            result of _save_json_to_dynamo. """
        # Validation runs first, even in local mode, as it always has.
        if not validate_standard_json(standard_json) or self.local:
            return False
        if "id" not in standard_json:
            return False
        standard_json = add_item_keys(standard_json)
        return self._save_json_to_dynamo(standard_json, save_only_new_records)

    def _is_past_time_to_break(self) -> bool:
        """ Return True when a time_to_break is set and has already passed. """
        return bool(self.time_to_break) and datetime.now() > self.time_to_break

    def _save_json_to_dynamo(
            self,
            standard_json: dict,
            save_only_new_records: bool = False) -> bool:  # noqa: C901
        """ Save each item recursively to dynamo then save root

            Returns False if time_to_break has passed, if this record fails to
            save with a ClientError, or if any child item reported failure.
            save_only_new_records is passed down the recursion but is not
            consulted by the batch write below, which always puts the item. """
        success_flag = True
        for item in standard_json.get('items') or []:
            if item.get("level") != "file":
                # Skip remaining children once we are out of time.
                if not self._is_past_time_to_break():
                    if not self._save_json_to_dynamo(item,
                                                     save_only_new_records):
                        success_flag = False
            elif item.get('storageSystem', '') == 'Uri':
                self._save_special_file_record(item)
        if "childIds" in standard_json:
            self._append_related_ids(standard_json)
        standard_json = self._optionally_update_parent_id(standard_json)
        # The record written for this node excludes its nested 'items'.
        new_dict = {
            key: value
            for key, value in standard_json.items() if key != 'items'
        }
        # A missing or None config entry means no source system expires.
        expiring_source_systems = self.config.get(
            "source-systems-requiring-metadata-expire-time") or []
        if standard_json.get("sourceSystem", "") in expiring_source_systems:
            days_to_live = int(self.config.get('metadata-time-to-live-days', 3))
            expire_at = datetime.now() + timedelta(days=days_to_live)
            new_dict['expireTime'] = int(expire_at.timestamp())
        new_dict = add_item_keys(new_dict)
        if self._is_past_time_to_break():
            return False
        try:
            with self.table.batch_writer() as batch:
                batch.put_item(Item=new_dict)
                # add WebsiteItem record to dynamo for all root items
                if new_dict.get('parentId') == 'root':
                    website_item_record = add_website_item_keys({
                        'id': new_dict['id'],
                        'websiteId': 'Marble'
                    })
                    batch.put_item(Item=website_item_record)
        except ClientError as ce:
            success_flag = False
            capture_exception(ce)
            print(
                f"Error saving to {self.table_name} table - {ce.response['Error']['Code']} - {ce.response['Error']['Message']}"
            )
        return success_flag

    def _append_related_ids(self, standard_json) -> dict:
        """ Save a parent override record to dynamo for every entry in childIds.

            Nothing is saved when running locally.  self.related_ids is
            returned as-is; this method does not modify it. """
        if not self.local:
            parent_id = standard_json.get("id")
            for related_id in standard_json.get("childIds", []):
                save_parent_override_record(self.table_name,
                                            related_id.get("id"), parent_id,
                                            related_id.get("sequence"))
        return self.related_ids

    def _optionally_update_parent_id(self, standard_json: dict) -> dict:
        """ Apply the parentId/sequence override recorded for this item, if any.

            Items without an "id", or with no entry in self.related_ids, are
            returned unchanged. """
        item_id = standard_json.get("id")
        if item_id in self.related_ids:
            override = self.related_ids[item_id]
            standard_json["parentId"] = override.get("parentId")
            standard_json["sequence"] = override.get("sequence")
        return standard_json

    def _save_special_file_record(self, standard_json: dict):
        """ Files are automatically stored elsewhere for Curate, Museum, and S3, but not for Uri

            Does nothing when running locally or when the record is not a
            file with storageSystem 'Uri'.  Each of objectFileGroupId,
            imageGroupId and mediaGroupId that is present is handled in turn,
            and the keys added by one branch carry over into the next. """
        if self.local or standard_json.get(
                'level') != 'file' or standard_json.get(
                    'storageSystem') != 'Uri':
            return
        table_name = self.config['website-metadata-tablename']
        # TODO Remove this branch once imageGroupId has been adopted
        if standard_json.get('objectFileGroupId'):
            standard_json = add_file_keys(
                standard_json, self.config.get('image-server-base-url', ''))
            self.save_json_to_dynamo_class.save_json_to_dynamo(standard_json)
            # No fileToProcess record is saved in this branch; per the
            # original note, those are now added as image records.
            save_file_group_record(table_name,
                                   standard_json.get('objectFileGroupId'),
                                   standard_json.get('storageSystem'),
                                   standard_json.get('typeOfData'))
        if standard_json.get('imageGroupId'):
            standard_json = add_image_keys(
                standard_json, self.config.get('image-server-base-url', ''))
            self.save_json_to_dynamo_class.save_json_to_dynamo(standard_json)
            save_file_to_process_record(table_name, standard_json)
            save_image_group_record(table_name,
                                    standard_json.get('imageGroupId'),
                                    standard_json.get('storageSystem'),
                                    standard_json.get('typeOfData'))
        if standard_json.get('mediaGroupId'):
            standard_json = add_media_keys(
                standard_json, self.config.get('media-server-base-url', ''))
            self.save_json_to_dynamo_class.save_json_to_dynamo(standard_json)
            save_media_group_record(table_name,
                                    standard_json.get('mediaGroupId'),
                                    standard_json.get('storageSystem'),
                                    standard_json.get('typeOfData'))