def get_hdf5_from_s3(galaxy, directory):
    """
    Fetch the HDF5 file for a galaxy from S3, or start a Glacier restore.

    If the object is live in S3 it is downloaded into *directory*; if it is
    archived in Glacier a restore is requested (or, if one is already in
    flight, we simply log and wait).

    :param galaxy: a GALAXY row (provides name, run_id and galaxy_id)
    :param directory: local directory to write the ``.hdf5`` file into
    :return: None
    """
    bucket_name = get_saved_files_bucket()
    key = get_key_hdf5(galaxy[GALAXY.c.name],
                       galaxy[GALAXY.c.run_id],
                       galaxy[GALAXY.c.galaxy_id])
    s3_helper = S3Helper()

    if not s3_helper.file_exists(bucket_name, key):
        LOG.info('The key {0} in bucket {1} does not exist'.format(key, bucket_name))
        return

    if s3_helper.file_archived(bucket_name, key):
        # Object is in Glacier: either wait on an in-flight restore or request one.
        if s3_helper.file_restoring(bucket_name, key):
            LOG.info('Galaxy {0} ({1}) is still restoring from glacier'.format(
                galaxy[GALAXY.c.name], galaxy[GALAXY.c.run_id]))
        else:
            LOG.info('Making request for archived galaxy {0} ({1})'.format(
                galaxy[GALAXY.c.name], galaxy[GALAXY.c.run_id]))
            s3_helper.restore_archived_file(bucket_name, key, days=10)
        return

    # Object is immediately readable: download it.
    LOG.info('Galaxy {0} ({1}) is available in s3'.format(
        galaxy[GALAXY.c.name], galaxy[GALAXY.c.run_id]))
    base_name = get_galaxy_file_name(galaxy[GALAXY.c.name],
                                     galaxy[GALAXY.c.run_id],
                                     galaxy[GALAXY.c.galaxy_id])
    filename = os.path.join(directory, base_name) + '.hdf5'
    s3_helper.get_file_from_bucket(bucket_name=bucket_name,
                                   key_name=key,
                                   file_name=filename)
def get_hdf5_from_s3(galaxy, directory, days=10):
    """
    Fetch the HDF5 file for a galaxy from S3, or start a Glacier restore.

    If the object is live in S3 it is downloaded into *directory*; if it is
    archived in Glacier a restore is requested (or, if one is already in
    flight, we simply log and wait).

    :param galaxy: a GALAXY row (provides name, run_id and galaxy_id)
    :param directory: local directory to write the ``.hdf5`` file into
    :param days: how long a Glacier-restored copy stays available
        (previously hard-coded to 10)
    :return: None
    """
    bucket_name = get_saved_files_bucket()
    key = get_key_hdf5(galaxy[GALAXY.c.name],
                       galaxy[GALAXY.c.run_id],
                       galaxy[GALAXY.c.galaxy_id])
    s3_helper = S3Helper()
    if s3_helper.file_exists(bucket_name, key):
        if s3_helper.file_archived(bucket_name, key):
            # file is archived
            if s3_helper.file_restoring(bucket_name, key):
                # if file is restoring, just need to wait for it
                LOG.info('Galaxy {0} ({1}) is still restoring from glacier'.format(
                    galaxy[GALAXY.c.name], galaxy[GALAXY.c.run_id]))
            else:
                # if file is not restoring, need to request.
                LOG.info('Making request for archived galaxy {0} ({1})'.format(
                    galaxy[GALAXY.c.name], galaxy[GALAXY.c.run_id]))
                s3_helper.restore_archived_file(bucket_name, key, days=days)
        else:
            # file is not archived
            LOG.info('Galaxy {0} ({1}) is available in s3'.format(
                galaxy[GALAXY.c.name], galaxy[GALAXY.c.run_id]))
            filename = os.path.join(
                directory,
                get_galaxy_file_name(galaxy[GALAXY.c.name],
                                     galaxy[GALAXY.c.run_id],
                                     galaxy[GALAXY.c.galaxy_id])) + '.hdf5'
            s3_helper.get_file_from_bucket(bucket_name=bucket_name,
                                           key_name=key,
                                           file_name=filename)
    else:
        LOG.info('The key {0} in bucket {1} does not exist'.format(key, bucket_name))
def regenerated_original_images(galaxy_name, run_id, galaxy_id, s3_helper, connection):
    """
    Regenerate the original images for a galaxy from its FITS file in S3.

    :param galaxy_name: the name of the galaxy
    :param run_id: the run id
    :param galaxy_id: the galaxy id
    :param s3_helper: the S3 helper used to access the buckets
    :param connection: the database connection passed to FitsImage
    :return: True if the images were regenerated successfully
    """
    all_ok = False

    # Get the fits file
    bucket = s3_helper.get_bucket(get_saved_files_bucket())
    galaxy_file_name = get_galaxy_file_name(galaxy_name, run_id, galaxy_id)
    # NOTE(review): the key uses the bare galaxy name, not galaxy_file_name —
    # presumably matching the upload layout; confirm against the writer side.
    key_name = '{0}/{0}.fits'.format(galaxy_name)
    key = bucket.get_key(key_name)
    if key is None:
        # Fixed grammar of the error message ("exists" -> "exist").
        LOG.error('The fits file does not seem to exist')
        return all_ok

    path_name = get_temp_file('fits')
    key.get_contents_to_filename(path_name)

    # Now regenerate
    try:
        image = FitsImage(connection)
        image.build_image(path_name, galaxy_file_name, galaxy_id, get_galaxy_image_bucket())
        all_ok = True
    except Exception:
        LOG.exception('Major error')
        all_ok = False
    finally:
        # Always remove the downloaded FITS file, even on failure.
        os.remove(path_name)

    return all_ok
def get_hdf5_file(s3_helper, output_dir, galaxy_name, run_id, galaxy_id):
    """
    Download the galaxy's HDF5 file into a temporary file.

    :param s3_helper: The S3 helper
    :param output_dir: where to write the file
    :param galaxy_name: the name of the galaxy
    :param run_id: the run id
    :param galaxy_id: the galaxy id
    :return: the path of the temporary file holding the HDF5 data
    """
    bucket = get_saved_files_bucket()
    source_key = get_key_hdf5(galaxy_name, run_id, galaxy_id)
    destination = get_temp_file('.hdf5', 'pogs', output_dir)
    s3_helper.get_file_from_bucket(
        bucket_name=bucket,
        key_name=source_key,
        file_name=destination)
    return destination
def store_files(connection, modulus, remainder):
    """
    Scan a directory for HDF5 files and send them to the archive.

    :param connection: the database connection used to mark galaxies STORED
    :param modulus: if not None, only process galaxies where
        ``galaxy_id % modulus == remainder``
    :param remainder: the remainder to match against (see modulus)
    :return: the number of files stored
    :raises SystemExit: if a shutdown has been requested
    """
    LOG.info('Directory: %s', HDF5_OUTPUT_DIRECTORY)
    to_store_dir = os.path.join(HDF5_OUTPUT_DIRECTORY, 'to_store')
    files = os.path.join(to_store_dir, '*.hdf5')

    file_count = 0
    s3helper = S3Helper()
    bucket_name = get_saved_files_bucket()
    for file_name in glob.glob(files):
        galaxy_id, galaxy_name = get_galaxy_id_and_name(file_name)
        if galaxy_id >= 0:
            if modulus is None or galaxy_id % modulus == remainder:
                size = os.path.getsize(file_name)
                key = '{0}/{0}.hdf5'.format(galaxy_name)
                LOG.info('File name: %s', file_name)
                LOG.info('File size: %d', size)
                LOG.info('Bucket: %s', bucket_name)
                LOG.info('Key: %s', key)

                s3helper.add_file_to_bucket(bucket_name, key, file_name)
                file_count += 1
                # Only delete the local copy after the upload succeeded.
                os.remove(file_name)

                connection.execute(
                    GALAXY.update()
                    .where(GALAXY.c.galaxy_id == galaxy_id)
                    .values(status_id=STORED, status_time=datetime.datetime.now()))
        else:
            LOG.error('File name: %s', file_name)
            LOG.error('Could not get the galaxy id')

        # Fixed anti-pattern: truthiness test instead of "is True" identity check.
        if shutdown():
            raise SystemExit

    return file_count
def store_files(connection, modulus, remainder):
    """
    Scan a directory for files and send them to the archive
    """
    LOG.info('Directory: %s', HDF5_OUTPUT_DIRECTORY)
    staging_dir = os.path.join(HDF5_OUTPUT_DIRECTORY, 'to_store')
    pattern = os.path.join(staging_dir, '*.hdf5')

    stored_count = 0
    s3helper = S3Helper()
    bucket_name = get_saved_files_bucket()
    for path in glob.glob(pattern):
        galaxy_id, galaxy_name = get_galaxy_id_and_name(path)
        if galaxy_id < 0:
            LOG.error('File name: %s', path)
            LOG.error('Could not get the galaxy id')
        elif modulus is None or galaxy_id % modulus == remainder:
            byte_size = os.path.getsize(path)
            key = '{0}/{0}.hdf5'.format(galaxy_name)
            LOG.info('File name: %s', path)
            LOG.info('File size: %d', byte_size)
            LOG.info('Bucket: %s', bucket_name)
            LOG.info('Key: %s', key)

            # Upload first, then remove the local copy and mark it STORED.
            s3helper.add_file_to_bucket(bucket_name, key, path)
            stored_count += 1
            os.remove(path)

            connection.execute(
                GALAXY.update()
                .where(GALAXY.c.galaxy_id == galaxy_id)
                .values(status_id=STORED, status_time=datetime.datetime.now()))

        # Bail out between files if a shutdown was requested.
        if shutdown() is True:
            raise SystemExit

    return stored_count
def generate_files(connection, hdf5_request_galaxy_ids, email, features, layers, pixel_types):
    """
    Get the FITS files for this request.

    For each requested galaxy: skip it if its request is already in flight,
    start a Glacier restore if it is archived, or mark it available. When
    enough galaxies are available (GALAXY_EMAIL_THRESHOLD), extract the
    requested layers, zip and upload them, and email the links.

    :type connection: The database connection
    :param pixel_types:
    :param hdf5_request_galaxy_ids: the galaxy id
    :param email:
    :param features:
    :param layers:
    :return:
    """
    uuid_string = str(uuid.uuid4())
    results = []
    available_galaxies = []
    s3_helper = S3Helper()
    bucket_name = get_saved_files_bucket()

    # Check whether all the requested galaxies are available or not.
    for hdf5_request_galaxy in hdf5_request_galaxy_ids:
        galaxy = connection.execute(
            select([GALAXY]).where(
                GALAXY.c.galaxy_id == hdf5_request_galaxy.galaxy_id)).first()
        hdf5_request_galaxy = connection.execute(
            select([HDF5_REQUEST_GALAXY]).where(
                HDF5_REQUEST_GALAXY.c.hdf5_request_galaxy_id ==
                hdf5_request_galaxy.hdf5_request_galaxy_id)).first()
        state = hdf5_request_galaxy.state

        # Fixed bug: "state is not 0" compared identity, not value, which is
        # implementation-dependent for ints; use a value comparison.
        if state != 0:
            LOG.info('Skipping {0}, state is {1}'.format(galaxy[GALAXY.c.name], state))
            continue  # Skip

        key = get_key_hdf5(galaxy[GALAXY.c.name],
                           galaxy[GALAXY.c.run_id],
                           galaxy[GALAXY.c.galaxy_id])
        if s3_helper.file_exists(bucket_name, key):
            if s3_helper.file_archived(bucket_name, key):
                # file is archived
                if s3_helper.file_restoring(bucket_name, key):
                    # if file is restoring, just need to wait for it
                    LOG.info('Galaxy {0} is still restoring from glacier'.format(
                        galaxy[GALAXY.c.name]))
                else:
                    # if file is not restoring, need to request.
                    file_size = s3_helper.file_size(bucket_name, key)
                    if restore_file_size_check(connection, bucket_name, file_size):
                        # We're good to restore
                        LOG.info('Making request for archived galaxy {0}'.format(
                            galaxy[GALAXY.c.name]))
                        s3_helper.restore_archived_file(bucket_name, key)
                        connection.execute(
                            HDF5_REQUEST_GALAXY_SIZE.insert(),
                            hdf5_request_galaxy_id=hdf5_request_galaxy['hdf5_request_galaxy_id'],
                            size=file_size,
                            request_time=seconds_since_epoch(datetime.now()))
                    else:
                        # Don't restore or we risk spending a lot of money
                        LOG.info('Daily galaxy restore size hit. Cannot request archived galaxy.')
            else:
                # file is not archived
                LOG.info('Galaxy {0} is available in s3'.format(galaxy[GALAXY.c.name]))
                available_galaxies.append(hdf5_request_galaxy)
        else:
            LOG.error('Galaxy {0} does not exist on s3 or glacier!'.format(
                galaxy[GALAXY.c.name]))

    total_request_galaxies = len(hdf5_request_galaxy_ids)
    LOG.info('Need to have {0} galaxies available ({1} currently available)'.format(
        total_request_galaxies * GALAXY_EMAIL_THRESHOLD, len(available_galaxies)))
    if len(available_galaxies) >= total_request_galaxies * GALAXY_EMAIL_THRESHOLD:
        # Only proceed if more than the threshold of galaxies are available
        LOG.info('{0}/{1} (> {2}%) galaxies are available. Email will be sent'.format(
            len(available_galaxies), total_request_galaxies,
            GALAXY_EMAIL_THRESHOLD * 100))
        remaining_galaxies = total_request_galaxies - len(available_galaxies)

        for hdf5_request_galaxy in available_galaxies:
            result = HDF5ToFitsResult()
            results.append(result)
            connection.execute(
                HDF5_REQUEST_GALAXY.update().where(
                    HDF5_REQUEST_GALAXY.c.hdf5_request_galaxy_id ==
                    hdf5_request_galaxy.hdf5_request_galaxy_id).values(state=1))
            # noinspection PyBroadException
            try:
                galaxy = connection.execute(
                    select([GALAXY]).where(
                        GALAXY.c.galaxy_id == hdf5_request_galaxy.galaxy_id)).first()
                result.galaxy_name = galaxy[GALAXY.c.name]
                LOG.info('Processing {0} ({1}) for {2}'.format(
                    galaxy[GALAXY.c.name], galaxy[GALAXY.c.galaxy_id], email))

                # make sure the galaxy is available
                if galaxy[GALAXY.c.status_id] == STORED or galaxy[GALAXY.c.status_id] == DELETED:
                    output_dir = tempfile.mkdtemp()
                    try:
                        s3_helper = S3Helper()
                        LOG.info('Getting HDF5 file to {0}'.format(output_dir))
                        tmp_file = get_hdf5_file(s3_helper, output_dir,
                                                 galaxy[GALAXY.c.name],
                                                 galaxy[GALAXY.c.run_id],
                                                 galaxy[GALAXY.c.galaxy_id])
                        LOG.info('File stored in {0}'.format(tmp_file))

                        # We have the file
                        if os.path.isfile(tmp_file):
                            int_flux_output = os.path.join(output_dir, 'intflux')
                            rad_output = os.path.join(output_dir, 'rad')

                            if not os.path.exists(int_flux_output):
                                os.mkdir(int_flux_output)
                            if not os.path.exists(rad_output):
                                os.mkdir(rad_output)

                            file_names = process_hdf5_file(
                                tmp_file,
                                galaxy[GALAXY.c.name],
                                galaxy[GALAXY.c.galaxy_id],
                                pixel_types,
                                features,
                                result,
                                layers,
                                output_dir,
                                rad_output,
                                int_flux_output,
                            )

                            url = zip_files(
                                s3_helper,
                                get_galaxy_file_name(galaxy[GALAXY.c.name],
                                                     galaxy[GALAXY.c.run_id],
                                                     galaxy[GALAXY.c.galaxy_id]),
                                uuid_string,
                                file_names,
                                output_dir)

                            connection.execute(
                                HDF5_REQUEST_GALAXY.update().where(
                                    HDF5_REQUEST_GALAXY.c.hdf5_request_galaxy_id ==
                                    hdf5_request_galaxy.hdf5_request_galaxy_id).values(
                                    state=2,
                                    link=url,
                                    link_expires_at=datetime.now() + timedelta(days=10)))
                            result.error = None
                            result.link = url
                    except S3ResponseError as e:
                        # Handling for a strange s3 error
                        LOG.error('Error retrieving galaxy {0} from s3. Retrying next run'.format(
                            galaxy[GALAXY.c.name]))
                        LOG.error('{0}'.format(str(e)))
                        key = get_key_hdf5(galaxy[GALAXY.c.name],
                                           galaxy[GALAXY.c.run_id],
                                           galaxy[GALAXY.c.galaxy_id])
                        LOG.info('Key: {0}'.format(key))
                        LOG.info('Exists: {0}'.format(s3_helper.file_exists(bucket_name, key)))
                        result.error = traceback.format_exc()
                        remaining_galaxies += 1
                    finally:
                        # Delete the temp files now we're done
                        shutil.rmtree(output_dir)
                else:
                    connection.execute(
                        HDF5_REQUEST_GALAXY.update().where(
                            HDF5_REQUEST_GALAXY.c.hdf5_request_galaxy_id ==
                            hdf5_request_galaxy.hdf5_request_galaxy_id).values(state=3))
                    result.error = 'Cannot process {0} ({1}) as the HDF5 file has not been generated'.format(
                        galaxy[GALAXY.c.name], galaxy[GALAXY.c.galaxy_id])
                    LOG.info(result.error)
            except Exception:
                # Fixed: bare "except:" also swallowed SystemExit/KeyboardInterrupt.
                LOG.error('Major error')
                result.error = traceback.format_exc()
                connection.execute(
                    HDF5_REQUEST_GALAXY.update().where(
                        HDF5_REQUEST_GALAXY.c.hdf5_request_galaxy_id ==
                        hdf5_request_galaxy.hdf5_request_galaxy_id).values(state=3))

        send_email(email, results, features, layers, pixel_types, remaining_galaxies)
def generate_files(connection, hdf5_request_galaxy_ids, email, features, layers, pixel_types):
    """
    Get the FITS files for this request.

    For each requested galaxy: skip it if its request is already in flight,
    start a Glacier restore if it is archived, or mark it available. When
    enough galaxies are available (GALAXY_EMAIL_THRESHOLD), extract the
    requested layers, zip and upload them, and email the links.

    :type connection: The database connection
    :param pixel_types:
    :param hdf5_request_galaxy_ids: the galaxy id
    :param email:
    :param features:
    :param layers:
    :return:
    """
    uuid_string = str(uuid.uuid4())
    results = []
    available_galaxies = []
    s3_helper = S3Helper()
    bucket_name = get_saved_files_bucket()

    # Check whether all the requested galaxies are available or not.
    for hdf5_request_galaxy in hdf5_request_galaxy_ids:
        galaxy = connection.execute(
            select([GALAXY]).where(
                GALAXY.c.galaxy_id == hdf5_request_galaxy.galaxy_id)).first()
        hdf5_request_galaxy = connection.execute(
            select([HDF5_REQUEST_GALAXY]).where(
                HDF5_REQUEST_GALAXY.c.hdf5_request_galaxy_id ==
                hdf5_request_galaxy.hdf5_request_galaxy_id)).first()
        state = hdf5_request_galaxy.state

        # Fixed bug: "state is not 0" compared identity, not value, which is
        # implementation-dependent for ints; use a value comparison.
        if state != 0:
            LOG.info('Skipping {0}, state is {1}'.format(galaxy[GALAXY.c.name], state))
            continue  # Skip

        key = get_key_hdf5(galaxy[GALAXY.c.name],
                           galaxy[GALAXY.c.run_id],
                           galaxy[GALAXY.c.galaxy_id])
        if s3_helper.file_exists(bucket_name, key):
            if s3_helper.file_archived(bucket_name, key):
                # file is archived
                if s3_helper.file_restoring(bucket_name, key):
                    # if file is restoring, just need to wait for it
                    LOG.info('Galaxy {0} is still restoring from glacier'.format(
                        galaxy[GALAXY.c.name]))
                else:
                    # if file is not restoring, need to request.
                    file_size = s3_helper.file_size(bucket_name, key)
                    if restore_file_size_check(connection, bucket_name, file_size):
                        # We're good to restore
                        LOG.info('Making request for archived galaxy {0}'.format(
                            galaxy[GALAXY.c.name]))
                        s3_helper.restore_archived_file(bucket_name, key)
                        connection.execute(
                            HDF5_REQUEST_GALAXY_SIZE.insert(),
                            hdf5_request_galaxy_id=hdf5_request_galaxy['hdf5_request_galaxy_id'],
                            size=file_size,
                            request_time=seconds_since_epoch(datetime.now()))
                    else:
                        # Don't restore or we risk spending a lot of money
                        LOG.info('Daily galaxy restore size hit. Cannot request archived galaxy.')
            else:
                # file is not archived
                LOG.info('Galaxy {0} is available in s3'.format(galaxy[GALAXY.c.name]))
                available_galaxies.append(hdf5_request_galaxy)
        else:
            LOG.error('Galaxy {0} does not exist on s3 or glacier!'.format(
                galaxy[GALAXY.c.name]))

    total_request_galaxies = len(hdf5_request_galaxy_ids)
    LOG.info('Need to have {0} galaxies available ({1} currently available)'.format(
        total_request_galaxies * GALAXY_EMAIL_THRESHOLD, len(available_galaxies)))
    if len(available_galaxies) >= total_request_galaxies * GALAXY_EMAIL_THRESHOLD:
        # Only proceed if more than the threshold of galaxies are available
        LOG.info('{0}/{1} (> {2}%) galaxies are available. Email will be sent'.format(
            len(available_galaxies), total_request_galaxies,
            GALAXY_EMAIL_THRESHOLD * 100))
        remaining_galaxies = total_request_galaxies - len(available_galaxies)

        for hdf5_request_galaxy in available_galaxies:
            result = HDF5ToFitsResult()
            results.append(result)
            connection.execute(
                HDF5_REQUEST_GALAXY.update().where(
                    HDF5_REQUEST_GALAXY.c.hdf5_request_galaxy_id ==
                    hdf5_request_galaxy.hdf5_request_galaxy_id).values(state=1))
            # noinspection PyBroadException
            try:
                galaxy = connection.execute(
                    select([GALAXY]).where(
                        GALAXY.c.galaxy_id == hdf5_request_galaxy.galaxy_id)).first()
                result.galaxy_name = galaxy[GALAXY.c.name]
                LOG.info('Processing {0} ({1}) for {2}'.format(
                    galaxy[GALAXY.c.name], galaxy[GALAXY.c.galaxy_id], email))

                # make sure the galaxy is available
                if galaxy[GALAXY.c.status_id] == STORED or galaxy[GALAXY.c.status_id] == DELETED:
                    output_dir = tempfile.mkdtemp()
                    try:
                        s3_helper = S3Helper()
                        LOG.info('Getting HDF5 file to {0}'.format(output_dir))
                        tmp_file = get_hdf5_file(s3_helper, output_dir,
                                                 galaxy[GALAXY.c.name],
                                                 galaxy[GALAXY.c.run_id],
                                                 galaxy[GALAXY.c.galaxy_id])
                        LOG.info('File stored in {0}'.format(tmp_file))

                        # We have the file
                        if os.path.isfile(tmp_file):
                            int_flux_output = os.path.join(output_dir, 'intflux')
                            rad_output = os.path.join(output_dir, 'rad')

                            if not os.path.exists(int_flux_output):
                                os.mkdir(int_flux_output)
                            if not os.path.exists(rad_output):
                                os.mkdir(rad_output)

                            file_names = process_hdf5_file(
                                tmp_file,
                                galaxy[GALAXY.c.name],
                                galaxy[GALAXY.c.galaxy_id],
                                pixel_types,
                                features,
                                result,
                                layers,
                                output_dir,
                                rad_output,
                                int_flux_output,
                            )

                            url = zip_files(
                                s3_helper,
                                get_galaxy_file_name(galaxy[GALAXY.c.name],
                                                     galaxy[GALAXY.c.run_id],
                                                     galaxy[GALAXY.c.galaxy_id]),
                                uuid_string,
                                file_names,
                                output_dir)

                            connection.execute(
                                HDF5_REQUEST_GALAXY.update().where(
                                    HDF5_REQUEST_GALAXY.c.hdf5_request_galaxy_id ==
                                    hdf5_request_galaxy.hdf5_request_galaxy_id).values(
                                    state=2,
                                    link=url,
                                    link_expires_at=datetime.now() + timedelta(days=10)))
                            result.error = None
                            result.link = url
                    except S3ResponseError as e:
                        # Handling for a strange s3 error
                        LOG.error('Error retrieving galaxy {0} from s3. Retrying next run'.format(
                            galaxy[GALAXY.c.name]))
                        LOG.error('{0}'.format(str(e)))
                        key = get_key_hdf5(galaxy[GALAXY.c.name],
                                           galaxy[GALAXY.c.run_id],
                                           galaxy[GALAXY.c.galaxy_id])
                        LOG.info('Key: {0}'.format(key))
                        LOG.info('Exists: {0}'.format(s3_helper.file_exists(bucket_name, key)))
                        result.error = traceback.format_exc()
                        remaining_galaxies += 1
                    finally:
                        # Delete the temp files now we're done
                        shutil.rmtree(output_dir)
                else:
                    connection.execute(
                        HDF5_REQUEST_GALAXY.update().where(
                            HDF5_REQUEST_GALAXY.c.hdf5_request_galaxy_id ==
                            hdf5_request_galaxy.hdf5_request_galaxy_id).values(state=3))
                    result.error = 'Cannot process {0} ({1}) as the HDF5 file has not been generated'.format(
                        galaxy[GALAXY.c.name], galaxy[GALAXY.c.galaxy_id])
                    LOG.info(result.error)
            except Exception:
                # Fixed: bare "except:" also swallowed SystemExit/KeyboardInterrupt.
                LOG.error('Major error')
                result.error = traceback.format_exc()
                connection.execute(
                    HDF5_REQUEST_GALAXY.update().where(
                        HDF5_REQUEST_GALAXY.c.hdf5_request_galaxy_id ==
                        hdf5_request_galaxy.hdf5_request_galaxy_id).values(state=3))

        send_email(email, results, features, layers, pixel_types, remaining_galaxies)