def get_hdf5_from_s3(galaxy, directory):
    """
    Fetch the HDF5 file for a galaxy from S3 into the given directory.

    If the file is archived in Glacier a restore is requested (or, if a
    restore is already running, we just log and wait). If the key does not
    exist at all, that is logged and nothing else happens.

    :param galaxy: a GALAXY row (provides name, run_id and galaxy_id)
    :param directory: directory into which the .hdf5 file is written
    """
    galaxy_name = galaxy[GALAXY.c.name]
    run_id = galaxy[GALAXY.c.run_id]
    galaxy_id = galaxy[GALAXY.c.galaxy_id]

    bucket_name = get_saved_files_bucket()
    key = get_key_hdf5(galaxy_name, run_id, galaxy_id)
    s3_helper = S3Helper()

    if not s3_helper.file_exists(bucket_name, key):
        LOG.info('The key {0} in bucket {1} does not exist'.format(key, bucket_name))
        return

    if s3_helper.file_archived(bucket_name, key):
        # file is archived in Glacier
        if s3_helper.file_restoring(bucket_name, key):
            # a restore is already in flight - nothing to do but wait
            LOG.info('Galaxy {0} ({1}) is still restoring from glacier'.format(galaxy_name, run_id))
        else:
            # kick off a restore request
            LOG.info('Making request for archived galaxy {0} ({1})'.format(galaxy_name, run_id))
            s3_helper.restore_archived_file(bucket_name, key, days=10)
    else:
        # file is directly available in s3 - download it
        LOG.info('Galaxy {0} ({1}) is available in s3'.format(galaxy_name, run_id))
        filename = os.path.join(directory, get_galaxy_file_name(galaxy_name, run_id, galaxy_id)) + '.hdf5'
        s3_helper.get_file_from_bucket(bucket_name=bucket_name, key_name=key, file_name=filename)
def migrate_hdf5_files(connection, file_bucket_name, s3helper):
    """
    Copy every galaxy's HDF5 file from the NGAS server into an S3 bucket.

    For each GALAXY row the NGAS file name is derived (versions > 1 carry a
    _V<n> suffix), the file is fetched with wget into a temp file, validated
    with check_results, and uploaded via add_file_to_bucket1.

    :param connection: database connection used to list the galaxies
    :param file_bucket_name: target S3 bucket name
    :param s3helper: S3 helper used for the upload
    :raises subprocess.CalledProcessError: if wget exits non-zero
    """
    for galaxy in connection.execute(select([GALAXY])):
        # NGAS stores versioned files as <name>_V<n>.hdf5; version 1 has no suffix
        if galaxy[GALAXY.c.version_number] > 1:
            ngas_file_name = '{0}_V{1}.hdf5'.format(galaxy[GALAXY.c.name], galaxy[GALAXY.c.version_number])
        else:
            ngas_file_name = '{0}.hdf5'.format(galaxy[GALAXY.c.name])

        path_name = get_temp_file('hdf5')
        command_string = 'wget -O {0} http://cortex.ivec.org:7780/RETRIEVE?file_id={1}&processing=ngamsMWACortexStageDppi'.format(path_name, urllib.quote(ngas_file_name, ''))
        print(command_string)
        try:
            output = subprocess.check_output(shlex.split(command_string), stderr=subprocess.STDOUT)
            if check_results(output, path_name):
                add_file_to_bucket1(
                    file_bucket_name,
                    get_key_hdf5(galaxy[GALAXY.c.name], galaxy[GALAXY.c.run_id], galaxy[GALAXY.c.galaxy_id]),
                    path_name,
                    s3helper)
            else:
                LOG.error('Big error with {0}'.format(ngas_file_name))
        except subprocess.CalledProcessError:
            LOG.exception('Big error')
            raise
        finally:
            # Bug fix: the temp file was never removed, leaking one file per galaxy
            if os.path.exists(path_name):
                os.remove(path_name)
def migrate_hdf5_files(bad_galaxies, connection, file_bucket_name, s3helper):
    """
    Re-migrate the HDF5 files for a list of known-bad galaxies.

    For each bad galaxy name, the galaxy row is looked up, the original NGAS
    file name is reconstructed from MAP_OLD_GALAXY_DATA (versions > 1 carry a
    _V<n> suffix), the file is fetched with wget, validated, and re-uploaded
    to S3 under the current key.

    :param bad_galaxies: iterable of galaxy names that need re-migration
    :param connection: database connection for galaxy lookups
    :param file_bucket_name: target S3 bucket name
    :param s3helper: S3 helper used for the upload
    :raises Exception: if wget succeeded but the download failed validation
    :raises subprocess.CalledProcessError: if wget exits non-zero
    """
    for bad_galaxy_name in bad_galaxies:
        LOG.info('Migrating {0}'.format(bad_galaxy_name))

        # extract the galaxy
        galaxy_number = get_galaxy_number(bad_galaxy_name)
        galaxy = connection.execute(select([GALAXY]).where(GALAXY.c.galaxy_id == galaxy_number)).first()

        # Get the hdf5 file using the galaxy's *old* name from the map
        old_galaxy = MAP_OLD_GALAXY_DATA[galaxy_number]
        if galaxy[GALAXY.c.version_number] > 1:
            ngas_file_name = '{0}_V{1}.hdf5'.format(old_galaxy[NAME], galaxy[GALAXY.c.version_number])
        else:
            ngas_file_name = '{0}.hdf5'.format(old_galaxy[NAME])

        path_name = get_temp_file('hdf5')
        command_string = 'wget -O {0} http://cortex.ivec.org:7780/RETRIEVE?file_id={1}&processing=ngamsMWACortexStageDppi'.format(path_name, urllib.quote(ngas_file_name, ''))
        print(command_string)
        try:
            output = subprocess.check_output(shlex.split(command_string), stderr=subprocess.STDOUT)
            if check_results(output, path_name):
                add_file_to_bucket1(
                    file_bucket_name,
                    get_key_hdf5(galaxy[GALAXY.c.name], galaxy[GALAXY.c.run_id], galaxy[GALAXY.c.galaxy_id]),
                    path_name,
                    s3helper)
            else:
                LOG.error('Big error with {0}'.format(ngas_file_name))
                raise Exception('wget failed')
        except subprocess.CalledProcessError:
            LOG.exception('Big error')
            raise
        finally:
            # Bug fix: os.remove only ran on the success path, so failed
            # downloads leaked the temp file. Always clean up.
            if os.path.exists(path_name):
                os.remove(path_name)
def get_hdf5_from_s3(galaxy, directory):
    """
    Retrieve a galaxy's HDF5 file from S3, requesting a Glacier restore if needed.

    :param galaxy: a GALAXY row (provides name, run_id and galaxy_id)
    :param directory: directory into which the .hdf5 file is written
    """
    name = galaxy[GALAXY.c.name]
    run_id = galaxy[GALAXY.c.run_id]
    galaxy_id = galaxy[GALAXY.c.galaxy_id]

    bucket_name = get_saved_files_bucket()
    key = get_key_hdf5(name, run_id, galaxy_id)
    s3_helper = S3Helper()

    if not s3_helper.file_exists(bucket_name, key):
        LOG.info('The key {0} in bucket {1} does not exist'.format(key, bucket_name))
        return

    archived = s3_helper.file_archived(bucket_name, key)
    if not archived:
        # file is not archived - pull it straight down
        LOG.info('Galaxy {0} ({1}) is available in s3'.format(name, run_id))
        base_name = get_galaxy_file_name(name, run_id, galaxy_id)
        filename = os.path.join(directory, base_name) + '.hdf5'
        s3_helper.get_file_from_bucket(bucket_name=bucket_name, key_name=key, file_name=filename)
    elif s3_helper.file_restoring(bucket_name, key):
        # restore already in progress - just need to wait for it
        LOG.info('Galaxy {0} ({1}) is still restoring from glacier'.format(name, run_id))
    else:
        # archived and not restoring - ask Glacier to bring it back
        LOG.info('Making request for archived galaxy {0} ({1})'.format(name, run_id))
        s3_helper.restore_archived_file(bucket_name, key, days=10)
def get_hdf5_file(s3Helper, output_dir, galaxy_name, run_id, galaxy_id):
    """
    Download the HDF5 file for a galaxy into a temporary file.

    :param s3Helper: S3 helper used to fetch the file
    :param output_dir: directory in which the temporary file is created
    :param galaxy_name: the name of the galaxy
    :param run_id: the run id
    :param galaxy_id: the galaxy id
    :return: the path of the downloaded temporary .hdf5 file
    """
    key = get_key_hdf5(galaxy_name, run_id, galaxy_id)
    tmp_file = get_temp_file('.hdf5', 'pogs', output_dir)
    s3Helper.get_file_from_bucket(
        bucket_name=get_files_bucket(),
        key_name=key,
        file_name=tmp_file)
    return tmp_file
def get_hdf5_file(s3_helper, output_dir, galaxy_name, run_id, galaxy_id):
    """
    Download the HDF5 file for a galaxy from the saved-files bucket.

    :param s3_helper: The S3 helper
    :param output_dir: where to write the file
    :param galaxy_name: the name of the galaxy
    :param run_id: the run id
    :param galaxy_id: the galaxy id
    :return: the path of the downloaded temporary .hdf5 file
    """
    tmp_file = get_temp_file('.hdf5', 'pogs', output_dir)
    s3_helper.get_file_from_bucket(
        bucket_name=get_saved_files_bucket(),
        key_name=get_key_hdf5(galaxy_name, run_id, galaxy_id),
        file_name=tmp_file)
    return tmp_file
def migrate_hdf5_files(connection, file_bucket_name, s3helper):
    """
    Copy every galaxy's HDF5 file from the NGAS server into an S3 bucket.

    :param connection: database connection used to list the galaxies
    :param file_bucket_name: target S3 bucket name
    :param s3helper: S3 helper used for the upload
    :raises subprocess.CalledProcessError: if wget exits non-zero
    """
    for galaxy in connection.execute(select([GALAXY])):
        # Get the hdf5 file; versions > 1 are stored with a _V<n> suffix in NGAS
        if galaxy[GALAXY.c.version_number] > 1:
            ngas_file_name = '{0}_V{1}.hdf5'.format(galaxy[GALAXY.c.name], galaxy[GALAXY.c.version_number])
        else:
            ngas_file_name = '{0}.hdf5'.format(galaxy[GALAXY.c.name])

        path_name = get_temp_file('hdf5')
        command_string = 'wget -O {0} http://cortex.ivec.org:7780/RETRIEVE?file_id={1}&processing=ngamsMWACortexStageDppi'.format(path_name, urllib.quote(ngas_file_name, ''))
        print(command_string)
        try:
            output = subprocess.check_output(shlex.split(command_string), stderr=subprocess.STDOUT)
            if check_results(output, path_name):
                add_file_to_bucket1(
                    file_bucket_name,
                    get_key_hdf5(galaxy[GALAXY.c.name], galaxy[GALAXY.c.run_id], galaxy[GALAXY.c.galaxy_id]),
                    path_name,
                    s3helper)
            else:
                LOG.error('Big error with {0}'.format(ngas_file_name))
        except subprocess.CalledProcessError:
            LOG.exception('Big error')
            raise
        finally:
            # Bug fix: the downloaded temp file was never deleted,
            # leaking one file per galaxy migrated
            if os.path.exists(path_name):
                os.remove(path_name)
def migrate_hdf5_files(bad_galaxies, connection, file_bucket_name, s3helper):
    """
    Re-migrate the HDF5 files for a list of known-bad galaxies.

    :param bad_galaxies: iterable of galaxy names that need re-migration
    :param connection: database connection for galaxy lookups
    :param file_bucket_name: target S3 bucket name
    :param s3helper: S3 helper used for the upload
    :raises Exception: if wget succeeded but the download failed validation
    :raises subprocess.CalledProcessError: if wget exits non-zero
    """
    for bad_galaxy_name in bad_galaxies:
        LOG.info('Migrating {0}'.format(bad_galaxy_name))

        # extract the galaxy
        galaxy_number = get_galaxy_number(bad_galaxy_name)
        galaxy = connection.execute(select([GALAXY]).where(GALAXY.c.galaxy_id == galaxy_number)).first()

        # Get the hdf5 file; NGAS knows the galaxy under its *old* name
        old_galaxy = MAP_OLD_GALAXY_DATA[galaxy_number]
        if galaxy[GALAXY.c.version_number] > 1:
            ngas_file_name = '{0}_V{1}.hdf5'.format(old_galaxy[NAME], galaxy[GALAXY.c.version_number])
        else:
            ngas_file_name = '{0}.hdf5'.format(old_galaxy[NAME])

        path_name = get_temp_file('hdf5')
        command_string = 'wget -O {0} http://cortex.ivec.org:7780/RETRIEVE?file_id={1}&processing=ngamsMWACortexStageDppi'.format(path_name, urllib.quote(ngas_file_name, ''))
        print(command_string)
        try:
            output = subprocess.check_output(shlex.split(command_string), stderr=subprocess.STDOUT)
            if check_results(output, path_name):
                add_file_to_bucket1(
                    file_bucket_name,
                    get_key_hdf5(galaxy[GALAXY.c.name], galaxy[GALAXY.c.run_id], galaxy[GALAXY.c.galaxy_id]),
                    path_name,
                    s3helper)
            else:
                LOG.error('Big error with {0}'.format(ngas_file_name))
                raise Exception('wget failed')
        except subprocess.CalledProcessError:
            LOG.exception('Big error')
            raise
        finally:
            # Bug fix: cleanup previously only ran after a successful upload;
            # error paths leaked the temp file. Always remove it.
            if os.path.exists(path_name):
                os.remove(path_name)
def generate_files(connection, hdf5_request_galaxy_ids, email, features, layers, pixel_types):
    """
    Get the FITS files for this request.

    Checks which requested galaxies are available in S3 (requesting Glacier
    restores for archived ones, subject to the daily restore-size budget),
    and once enough are available, converts each HDF5 file to FITS, zips the
    outputs, records a download link, and emails the requester.

    :param connection: the database connection
    :param hdf5_request_galaxy_ids: the HDF5_REQUEST_GALAXY rows for this request
    :param email: the address to notify when the files are ready
    :param features: the features to extract
    :param layers: the layers to extract
    :param pixel_types: the pixel types to extract
    :return:
    """
    uuid_string = str(uuid.uuid4())
    results = []
    available_galaxies = []
    s3_helper = S3Helper()
    bucket_name = get_saved_files_bucket()

    # Check whether all the requested galaxies are available or not.
    for hdf5_request_galaxy in hdf5_request_galaxy_ids:
        galaxy = connection.execute(
            select([GALAXY]).where(GALAXY.c.galaxy_id == hdf5_request_galaxy.galaxy_id)).first()
        hdf5_request_galaxy = connection.execute(
            select([HDF5_REQUEST_GALAXY]).where(
                HDF5_REQUEST_GALAXY.c.hdf5_request_galaxy_id == hdf5_request_galaxy.hdf5_request_galaxy_id)).first()
        state = hdf5_request_galaxy.state

        # Bug fix: was 'state is not 0' - identity comparison with an int
        # literal, which is implementation-dependent and a SyntaxWarning
        # on Python >= 3.8. Equality is what is meant.
        if state != 0:
            LOG.info('Skipping {0}, state is {1}'.format(galaxy[GALAXY.c.name], state))
            continue  # Skip

        key = get_key_hdf5(galaxy[GALAXY.c.name], galaxy[GALAXY.c.run_id], galaxy[GALAXY.c.galaxy_id])

        if s3_helper.file_exists(bucket_name, key):
            if s3_helper.file_archived(bucket_name, key):
                # file is archived
                if s3_helper.file_restoring(bucket_name, key):
                    # if file is restoring, just need to wait for it
                    LOG.info('Galaxy {0} is still restoring from glacier'.format(galaxy[GALAXY.c.name]))
                else:
                    # if file is not restoring, need to request.
                    file_size = s3_helper.file_size(bucket_name, key)

                    if restore_file_size_check(connection, bucket_name, file_size):
                        # We're good to restore
                        LOG.info('Making request for archived galaxy {0}'.format(galaxy[GALAXY.c.name]))
                        s3_helper.restore_archived_file(bucket_name, key)

                        connection.execute(
                            HDF5_REQUEST_GALAXY_SIZE.insert(),
                            hdf5_request_galaxy_id=hdf5_request_galaxy['hdf5_request_galaxy_id'],
                            size=file_size,
                            request_time=seconds_since_epoch(datetime.now()))
                    else:
                        # Don't restore or we risk spending a lot of money
                        LOG.info('Daily galaxy restore size hit. Cannot request archived galaxy.')
            else:
                # file is not archived
                LOG.info('Galaxy {0} is available in s3'.format(galaxy[GALAXY.c.name]))
                available_galaxies.append(hdf5_request_galaxy)
        else:
            LOG.error('Galaxy {0} does not exist on s3 or glacier!'.format(galaxy[GALAXY.c.name]))

    total_request_galaxies = len(hdf5_request_galaxy_ids)
    LOG.info('Need to have {0} galaxies available ({1} currently available)'.format(
        total_request_galaxies * GALAXY_EMAIL_THRESHOLD, len(available_galaxies)))
    if len(available_galaxies) >= total_request_galaxies * GALAXY_EMAIL_THRESHOLD:
        # Only proceed if more than the threshold of galaxies are available
        LOG.info('{0}/{1} (> {2}%) galaxies are available. Email will be sent'.format(
            len(available_galaxies), total_request_galaxies, GALAXY_EMAIL_THRESHOLD * 100))
        remaining_galaxies = total_request_galaxies - len(available_galaxies)

        for hdf5_request_galaxy in available_galaxies:
            result = HDF5ToFitsResult()
            results.append(result)
            connection.execute(HDF5_REQUEST_GALAXY.update().where(
                HDF5_REQUEST_GALAXY.c.hdf5_request_galaxy_id == hdf5_request_galaxy.hdf5_request_galaxy_id).values(state=1))
            try:
                galaxy = connection.execute(
                    select([GALAXY]).where(GALAXY.c.galaxy_id == hdf5_request_galaxy.galaxy_id)).first()
                result.galaxy_name = galaxy[GALAXY.c.name]
                LOG.info('Processing {0} ({1}) for {2}'.format(
                    galaxy[GALAXY.c.name], galaxy[GALAXY.c.galaxy_id], email))

                # make sure the galaxy is available
                if galaxy[GALAXY.c.status_id] == STORED or galaxy[GALAXY.c.status_id] == DELETED:
                    output_dir = tempfile.mkdtemp()
                    try:
                        s3_helper = S3Helper()
                        LOG.info('Getting HDF5 file to {0}'.format(output_dir))
                        tmp_file = get_hdf5_file(
                            s3_helper, output_dir,
                            galaxy[GALAXY.c.name], galaxy[GALAXY.c.run_id], galaxy[GALAXY.c.galaxy_id])
                        LOG.info('File stored in {0}'.format(tmp_file))

                        # We have the file
                        if os.path.isfile(tmp_file):
                            int_flux_output = os.path.join(output_dir, 'intflux')
                            rad_output = os.path.join(output_dir, 'rad')

                            if not os.path.exists(int_flux_output):
                                os.mkdir(int_flux_output)
                            if not os.path.exists(rad_output):
                                os.mkdir(rad_output)

                            file_names = process_hdf5_file(
                                tmp_file,
                                galaxy[GALAXY.c.name],
                                galaxy[GALAXY.c.galaxy_id],
                                pixel_types,
                                features,
                                result,
                                layers,
                                output_dir,
                                rad_output,
                                int_flux_output,
                            )

                            url = zip_files(
                                s3_helper,
                                get_galaxy_file_name(
                                    galaxy[GALAXY.c.name], galaxy[GALAXY.c.run_id], galaxy[GALAXY.c.galaxy_id]),
                                uuid_string,
                                file_names,
                                output_dir)

                            connection.execute(
                                HDF5_REQUEST_GALAXY.update().where(
                                    HDF5_REQUEST_GALAXY.c.hdf5_request_galaxy_id == hdf5_request_galaxy.hdf5_request_galaxy_id).values(
                                    state=2,
                                    link=url,
                                    link_expires_at=datetime.now() + timedelta(days=10)))
                            result.error = None
                            result.link = url
                    except S3ResponseError as e:
                        # Handling for a strange s3 error
                        LOG.error('Error retrieving galaxy {0} from s3. Retrying next run'.format(galaxy[GALAXY.c.name]))
                        LOG.error('{0}'.format(str(e)))
                        key = get_key_hdf5(galaxy[GALAXY.c.name], galaxy[GALAXY.c.run_id], galaxy[GALAXY.c.galaxy_id])
                        LOG.info('Key: {0}'.format(key))
                        LOG.info('Exists: {0}'.format(s3_helper.file_exists(bucket_name, key)))
                        result.error = traceback.format_exc()
                        remaining_galaxies += 1
                    finally:
                        # Delete the temp files now we're done
                        shutil.rmtree(output_dir)
                else:
                    connection.execute(HDF5_REQUEST_GALAXY.update().where(
                        HDF5_REQUEST_GALAXY.c.hdf5_request_galaxy_id == hdf5_request_galaxy.hdf5_request_galaxy_id).values(state=3))
                    result.error = 'Cannot process {0} ({1}) as the HDF5 file has not been generated'.format(
                        galaxy[GALAXY.c.name], galaxy[GALAXY.c.galaxy_id])
                    LOG.info(result.error)
            except Exception:
                # Bug fix: was a bare 'except:', which also swallowed
                # SystemExit/KeyboardInterrupt. Still deliberately broad so one
                # bad galaxy cannot abort the whole request.
                LOG.error('Major error')
                result.error = traceback.format_exc()
                connection.execute(HDF5_REQUEST_GALAXY.update().where(
                    HDF5_REQUEST_GALAXY.c.hdf5_request_galaxy_id == hdf5_request_galaxy.hdf5_request_galaxy_id).values(state=3))

        send_email(email, results, features, layers, pixel_types, remaining_galaxies)
def generate_files(connection, hdf5_request_galaxy_ids, email, features, layers, pixel_types):
    """
    Get the FITS files for this request.

    Determines which requested galaxies are available in S3 (requesting
    Glacier restores for archived ones within the daily size budget), then,
    once enough are available, converts each HDF5 file to FITS, uploads a zip,
    stores the download link, and emails the requester.

    :param connection: the database connection
    :param hdf5_request_galaxy_ids: the HDF5_REQUEST_GALAXY rows for this request
    :param email: the address to notify when the files are ready
    :param features: the features to extract
    :param layers: the layers to extract
    :param pixel_types: the pixel types to extract
    :return:
    """
    uuid_string = str(uuid.uuid4())
    results = []
    available_galaxies = []
    s3_helper = S3Helper()
    bucket_name = get_saved_files_bucket()

    # Check whether all the requested galaxies are available or not.
    for hdf5_request_galaxy in hdf5_request_galaxy_ids:
        galaxy = connection.execute(
            select([GALAXY]).where(GALAXY.c.galaxy_id == hdf5_request_galaxy.galaxy_id)).first()
        hdf5_request_galaxy = connection.execute(
            select([HDF5_REQUEST_GALAXY]).where(
                HDF5_REQUEST_GALAXY.c.hdf5_request_galaxy_id == hdf5_request_galaxy.hdf5_request_galaxy_id)).first()
        state = hdf5_request_galaxy.state

        # Bug fix: was 'state is not 0' - identity, not equality; relies on
        # CPython small-int caching and warns on Python >= 3.8.
        if state != 0:
            LOG.info('Skipping {0}, state is {1}'.format(galaxy[GALAXY.c.name], state))
            continue  # Skip

        key = get_key_hdf5(galaxy[GALAXY.c.name], galaxy[GALAXY.c.run_id], galaxy[GALAXY.c.galaxy_id])

        if s3_helper.file_exists(bucket_name, key):
            if s3_helper.file_archived(bucket_name, key):
                # file is archived
                if s3_helper.file_restoring(bucket_name, key):
                    # if file is restoring, just need to wait for it
                    LOG.info('Galaxy {0} is still restoring from glacier'.format(galaxy[GALAXY.c.name]))
                else:
                    # if file is not restoring, need to request.
                    file_size = s3_helper.file_size(bucket_name, key)

                    if restore_file_size_check(connection, bucket_name, file_size):
                        # We're good to restore
                        LOG.info('Making request for archived galaxy {0}'.format(galaxy[GALAXY.c.name]))
                        s3_helper.restore_archived_file(bucket_name, key)

                        connection.execute(
                            HDF5_REQUEST_GALAXY_SIZE.insert(),
                            hdf5_request_galaxy_id=hdf5_request_galaxy['hdf5_request_galaxy_id'],
                            size=file_size,
                            request_time=seconds_since_epoch(datetime.now()))
                    else:
                        # Don't restore or we risk spending a lot of money
                        LOG.info('Daily galaxy restore size hit. Cannot request archived galaxy.')
            else:
                # file is not archived
                LOG.info('Galaxy {0} is available in s3'.format(galaxy[GALAXY.c.name]))
                available_galaxies.append(hdf5_request_galaxy)
        else:
            LOG.error('Galaxy {0} does not exist on s3 or glacier!'.format(galaxy[GALAXY.c.name]))

    total_request_galaxies = len(hdf5_request_galaxy_ids)
    LOG.info('Need to have {0} galaxies available ({1} currently available)'.format(
        total_request_galaxies * GALAXY_EMAIL_THRESHOLD, len(available_galaxies)))
    if len(available_galaxies) >= total_request_galaxies * GALAXY_EMAIL_THRESHOLD:
        # Only proceed if more than the threshold of galaxies are available
        LOG.info('{0}/{1} (> {2}%) galaxies are available. Email will be sent'.format(
            len(available_galaxies), total_request_galaxies, GALAXY_EMAIL_THRESHOLD * 100))
        remaining_galaxies = total_request_galaxies - len(available_galaxies)

        for hdf5_request_galaxy in available_galaxies:
            result = HDF5ToFitsResult()
            results.append(result)
            connection.execute(HDF5_REQUEST_GALAXY.update().where(
                HDF5_REQUEST_GALAXY.c.hdf5_request_galaxy_id == hdf5_request_galaxy.hdf5_request_galaxy_id).values(state=1))
            try:
                galaxy = connection.execute(
                    select([GALAXY]).where(GALAXY.c.galaxy_id == hdf5_request_galaxy.galaxy_id)).first()
                result.galaxy_name = galaxy[GALAXY.c.name]
                LOG.info('Processing {0} ({1}) for {2}'.format(
                    galaxy[GALAXY.c.name], galaxy[GALAXY.c.galaxy_id], email))

                # make sure the galaxy is available
                if galaxy[GALAXY.c.status_id] == STORED or galaxy[GALAXY.c.status_id] == DELETED:
                    output_dir = tempfile.mkdtemp()
                    try:
                        s3_helper = S3Helper()
                        LOG.info('Getting HDF5 file to {0}'.format(output_dir))
                        tmp_file = get_hdf5_file(
                            s3_helper, output_dir,
                            galaxy[GALAXY.c.name], galaxy[GALAXY.c.run_id], galaxy[GALAXY.c.galaxy_id])
                        LOG.info('File stored in {0}'.format(tmp_file))

                        # We have the file
                        if os.path.isfile(tmp_file):
                            int_flux_output = os.path.join(output_dir, 'intflux')
                            rad_output = os.path.join(output_dir, 'rad')

                            if not os.path.exists(int_flux_output):
                                os.mkdir(int_flux_output)
                            if not os.path.exists(rad_output):
                                os.mkdir(rad_output)

                            file_names = process_hdf5_file(
                                tmp_file,
                                galaxy[GALAXY.c.name],
                                galaxy[GALAXY.c.galaxy_id],
                                pixel_types,
                                features,
                                result,
                                layers,
                                output_dir,
                                rad_output,
                                int_flux_output,
                            )

                            url = zip_files(
                                s3_helper,
                                get_galaxy_file_name(
                                    galaxy[GALAXY.c.name], galaxy[GALAXY.c.run_id], galaxy[GALAXY.c.galaxy_id]),
                                uuid_string,
                                file_names,
                                output_dir)

                            connection.execute(
                                HDF5_REQUEST_GALAXY.update().where(
                                    HDF5_REQUEST_GALAXY.c.hdf5_request_galaxy_id == hdf5_request_galaxy.hdf5_request_galaxy_id).values(
                                    state=2,
                                    link=url,
                                    link_expires_at=datetime.now() + timedelta(days=10)))
                            result.error = None
                            result.link = url
                    except S3ResponseError as e:
                        # Handling for a strange s3 error
                        LOG.error('Error retrieving galaxy {0} from s3. Retrying next run'.format(galaxy[GALAXY.c.name]))
                        LOG.error('{0}'.format(str(e)))
                        key = get_key_hdf5(galaxy[GALAXY.c.name], galaxy[GALAXY.c.run_id], galaxy[GALAXY.c.galaxy_id])
                        LOG.info('Key: {0}'.format(key))
                        LOG.info('Exists: {0}'.format(s3_helper.file_exists(bucket_name, key)))
                        result.error = traceback.format_exc()
                        remaining_galaxies += 1
                    finally:
                        # Delete the temp files now we're done
                        shutil.rmtree(output_dir)
                else:
                    connection.execute(HDF5_REQUEST_GALAXY.update().where(
                        HDF5_REQUEST_GALAXY.c.hdf5_request_galaxy_id == hdf5_request_galaxy.hdf5_request_galaxy_id).values(state=3))
                    result.error = 'Cannot process {0} ({1}) as the HDF5 file has not been generated'.format(
                        galaxy[GALAXY.c.name], galaxy[GALAXY.c.galaxy_id])
                    LOG.info(result.error)
            except Exception:
                # Bug fix: was a bare 'except:' (would also catch SystemExit /
                # KeyboardInterrupt). Kept broad on purpose so one bad galaxy
                # cannot abort the whole request.
                LOG.error('Major error')
                result.error = traceback.format_exc()
                connection.execute(HDF5_REQUEST_GALAXY.update().where(
                    HDF5_REQUEST_GALAXY.c.hdf5_request_galaxy_id == hdf5_request_galaxy.hdf5_request_galaxy_id).values(state=3))

        send_email(email, results, features, layers, pixel_types, remaining_galaxies)