Example No. 1
 def save_files_details(self):
     """ This will crawl available files, then loop through the file listing, saving each to dynamo """
     if self.event['objectFilesApi_execution_count'] == 1:
         marble_files = self._crawl_available_files_from_s3_or_cache(self.config['marble-content-bucket'], True)
         # rbsc_files = self._crawl_available_files_from_s3_or_cache(self.config['rbsc-image-bucket'], True)  # save in case we need to crawl the RBSC bucket ever again
         # all_files_listing = {**rbsc_files, **marble_files}
         all_files_listing = {**marble_files}
     else:
         all_files_listing = self._resume_execution()
     file_objects = []
     processing_complete = True
     for key, value in all_files_listing.items():
         if not value.get('recordProcessedFlag', False):
             file_objects.extend(self._save_file_objects_per_collection(value))
             value['recordProcessedFlag'] = True
             print("saved", len(value.get('files', [])), "files for collection: ", key, int(time.time() - self.start_time), 'seconds.')
         if datetime.now() >= self.time_to_break:
             self._save_progress(all_files_listing)
             processing_complete = False
             break
     if processing_complete:
         self._clean_up_when_done()
         self.event['objectFilesApiComplete'] = True
     if self.event['local']:
         self._cache_s3_call(os.path.join(self.directory, "file_objects.json"), file_objects)
     else:
         write_s3_json(self.config['manifest-server-bucket'], 'objectFiles/all/index.json', file_objects)
     return file_objects
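Each example on this page ultimately calls write_s3_json(bucket, key, dict), a helper that is not itself shown in the listing. A minimal sketch of what such a helper could look like, assuming boto3 and plain JSON serialization (an illustration, not the pipeline's actual implementation):

import json

import boto3


def write_s3_json(bucket_name, s3_key, json_content):
    """ Hypothetical helper: serialize a dict and store it at s3://bucket_name/s3_key. """
    boto3.resource('s3').Object(bucket_name, s3_key).put(
        Body=json.dumps(json_content, default=str),
        ContentType='application/json')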
Example No. 2
 def _save_progress(self, all_files_listing: dict):
     """ This is used to save progress in order to resume execution later """
     if self.event['local']:
         cache_file_name = os.path.join(self.directory, self.resumption_filename)
         self._cache_s3_call(cache_file_name, all_files_listing)
     else:
         s3_key = os.path.join(self.config['pipeline-control-folder'], self.resumption_filename)
         write_s3_json(self.config['process-bucket'], s3_key, all_files_listing)
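The matching _resume_execution called in Example No. 1 is not part of this listing. Under the same assumptions about self.directory, self.resumption_filename and the S3 helpers as _save_progress, it would presumably just read that saved listing back; a rough sketch:

def _resume_execution(self) -> dict:
    """ Hypothetical counterpart to _save_progress: reload the saved file listing. """
    if self.event['local']:
        cache_file_name = os.path.join(self.directory, self.resumption_filename)
        with open(cache_file_name, 'r') as json_file:
            return json.load(json_file)
    s3_key = os.path.join(self.config['pipeline-control-folder'], self.resumption_filename)
    return read_s3_json(self.config['process-bucket'], s3_key)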
Example No. 3
def cache_pipeline_config(config, event):
    if event.get('local', False):
        return

    test_required_fields(event)
    s3Path = "pipeline_runs/" + event['config-file']
    s3Bucket = event['process-bucket']
    write_s3_json(s3Bucket, s3Path, config)
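For context, cache_pipeline_config is a no-op for local runs and otherwise writes the run's configuration to pipeline_runs/<config-file> in the process bucket. A hypothetical invocation, with placeholder names and assuming these keys satisfy test_required_fields:

config = {'process-bucket': 'example-process-bucket'}  # placeholder config contents
event = {
    'local': False,
    'config-file': 'example-run.json',           # placeholder run identifier
    'process-bucket': 'example-process-bucket',  # placeholder bucket name
}
cache_pipeline_config(config, event)  # stores config at pipeline_runs/example-run.json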
Example No. 4
 def _save_curate_json(self, curate_json: dict):
     """ Once we retrieve curate_json, save it so we can process more easily next time. """
     item_id = curate_json.get('id', '')
     if self.config.get('local', True):
         filename = os.path.join(self.local_folder, "save",
                                 item_id + "_curate.json")
         with open(filename, 'w') as f:
             json.dump(curate_json, f, indent=2)
     else:
         key = os.path.join('save', item_id + '_curate.json')
         write_s3_json(self.config['process-bucket'], key, curate_json)
Example No. 5
 def _save_standard_json_for_future_processing(self, standard_json: dict):
     """ Once we get standard_json, save it so we can process more easily next time. """
     item_id = standard_json.get('id', '')
     if self.config.get('local', True) or self.save_standard_json_locally:
         filename = os.path.join(self.local_folder, "save",
                                 item_id + "_standard.json")
         with open(filename, 'w') as f:
             json.dump(standard_json, f, indent=2, sort_keys=True)
     else:
         key = os.path.join('save', item_id + '_standard.json')
         write_s3_json(self.config['process-bucket'], key, standard_json)
Example No. 6
def _save_seed_files_to_s3(bucket_name, folder_name):
    local_folder = os.path.dirname(os.path.realpath(__file__)) + "/"
    for file_name in os.listdir(folder_name):
        local_file_name = os.path.join(local_folder, folder_name, file_name)
        if os.path.isfile(local_file_name):
            try:
                with io.open(local_file_name, 'r',
                             encoding='utf-8') as json_file:
                    json_to_save = json.load(json_file)
                s3_key = os.path.join(folder_name, file_name)
                _delete_multipart_s3_file_if_necessary(bucket_name, s3_key)
                print('saving filename to s3 = ', file_name)
                write_s3_json(bucket_name, s3_key, json_to_save)
            except:  # noqa E722 - intentionally ignore warning about bare except
                pass
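A hypothetical call, with placeholder names, that pushes every parseable JSON file from a ./seed_files/ folder next to this module into the matching folder of the target bucket (files that fail to load or upload are skipped by the bare except):

_save_seed_files_to_s3('example-manifest-bucket', 'seed_files')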
Example No. 7
def run(event, _context):
    """ run the process to retrieve and process web kiosk metadata """
    _suplement_event(event)
    config = setup_pipeline_config(event)
    google_config = load_config_ssm(config['google_keys_ssm_base'])
    config.update(google_config)
    museum_config = load_config_ssm(config['museum_keys_ssm_base'])
    config.update(museum_config)
    time_to_break = datetime.now() + timedelta(
        seconds=config['seconds-to-allow-for-processing'])
    print("Will break after ", time_to_break)

    mode = event.get("mode", "full")
    if mode not in ["full", "incremental", "ids"]:
        mode = "full"
    json_web_kiosk_class = ProcessWebKioskJsonMetadata(config, event,
                                                       time_to_break)
    if event["museumExecutionCount"] == 1:
        if not event.get('local'):
            save_file_system_record(config.get('website-metadata-tablename'),
                                    'Google', 'Museum')
            save_source_system_record(config.get('website-metadata-tablename'),
                                      'EmbARK')
        composite_json = json_web_kiosk_class.get_composite_json_metadata(mode)
        museum_image_metadata = json_web_kiosk_class.find_images_for_composite_json_metadata(
            composite_json)
        composite_json = CleanUpCompositeJson(
            composite_json).cleaned_up_content
        event['countToProcess'] = len(composite_json.get('objects'))
        write_s3_json(config['process-bucket'],
                      'museum_composite_metadata.json', composite_json)
        write_s3_json(config['process-bucket'], 'museum_image_metadata.json',
                      museum_image_metadata)
    else:
        composite_json = read_s3_json(config['process-bucket'],
                                      'museum_composite_metadata.json')
        museum_image_metadata = read_s3_json(config['process-bucket'],
                                             'museum_image_metadata.json')

    if composite_json:
        objects_processed = json_web_kiosk_class.process_composite_json_metadata(
            composite_json, museum_image_metadata)
        event['museumHarvestComplete'] = _done_processing(composite_json)
    else:
        print('No JSON to process')

    if event["museumExecutionCount"] >= event["maximumMuseumExecutions"]:
        event['museumHarvestComplete'] = True
    if event['museumHarvestComplete']:
        if s3_file_exists(config['process-bucket'],
                          'museum_composite_metadata.json'):
            delete_s3_key(config['process-bucket'],
                          'museum_composite_metadata.json')
        if s3_file_exists(config['process-bucket'],
                          'museum_image_metadata.json'):
            delete_s3_key(config['process-bucket'],
                          'museum_image_metadata.json')
    elif composite_json:
        write_s3_json(config['process-bucket'],
                      'museum_composite_metadata.json', composite_json)
        key = 'countHarvestedLoop' + str(event["museumExecutionCount"])
        event[key] = objects_processed
    event['countRemaining'] = len(composite_json.get('objects', [])) if composite_json else 0
    return event
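Example No. 7 also leans on read_s3_json, s3_file_exists and delete_s3_key, which this page does not show. Minimal boto3-based sketches of what they might look like (illustrative assumptions, not the pipeline's actual code):

import json

import boto3
from botocore.exceptions import ClientError


def read_s3_json(bucket_name, s3_key):
    """ Hypothetical helper: load s3://bucket_name/s3_key into a dict. """
    content = boto3.resource('s3').Object(bucket_name, s3_key).get()['Body'].read()
    return json.loads(content)


def s3_file_exists(bucket_name, s3_key):
    """ Hypothetical helper: True if the key exists in the bucket. """
    try:
        boto3.client('s3').head_object(Bucket=bucket_name, Key=s3_key)
        return True
    except ClientError:
        return False


def delete_s3_key(bucket_name, s3_key):
    """ Hypothetical helper: remove the object at s3://bucket_name/s3_key. """
    boto3.resource('s3').Object(bucket_name, s3_key).delete()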