def main(args): """ Main entry point """ base_output_dir = Settings.get_solo().base_output_dir ceda_base = '/badc/cmip6/data' for ed in ESGFDataset.objects.filter(status='PUBLISHED'): for df in ed.data_request.datafile_set.order_by('name'): ceda_dir = os.path.join(ceda_base, construct_drs_path(df)) ceda_path = os.path.join(ceda_dir, df.name) if df.directory: logger.error('Directory given {}'.format( os.path.join(df.directory, df.name))) if os.path.exists(ceda_path): df.online = True df.directory = ceda_dir df.save() else: logger.error('Not in archive {}'.format(ceda_path)) continue base_dir = os.path.join(base_output_dir, construct_drs_path(df)) base_path = os.path.join(base_dir, df.name) if os.path.exists(base_path): os.remove(base_path) logger.error('Deleted {}'.format(base_path)) if not os.path.exists(base_dir): os.makedirs(base_dir) os.symlink(ceda_path, base_path)
def main(args): """ Main entry point """ dfs = DataFile.objects.filter(climate_model__short_name='MPI-ESM1-2-XR', experiment__short_name='highres-future', version='v20190617') prim_gws = '/gws/nopw/j04/primavera5/stream1' old_dirs = [] for df in dfs: old_drs_path = construct_drs_path(df) df.version = 'v20190517' df.save() if df.online: # file itself gws = get_gws(df.directory) old_dir = df.directory new_dir = os.path.join(gws, construct_drs_path(df)) if not os.path.exists(new_dir): os.makedirs(new_dir) os.rename(os.path.join(df.directory, df.name), os.path.join(new_dir, df.name)) df.directory = new_dir df.save() if old_dir not in old_dirs: old_dirs.append(old_dir) # sym link if not is_same_gws(df.directory, prim_gws): old_sym_dir = os.path.join(prim_gws, old_drs_path) old_sym = os.path.join(old_sym_dir, df.name) # TODO next line doesn't work as this is now a broken symlink so returns false if os.path.exists(old_sym): if os.path.islink(old_sym): os.remove(old_sym) else: logger.warning(f'Not symlink as expected: {old_sym}') new_sym_dir = os.path.join(prim_gws, construct_drs_path(df)) if not os.path.exists(new_sym_dir): os.makedirs(new_sym_dir) os.symlink(os.path.join(new_dir, df.name), os.path.join(new_sym_dir, df.name)) if old_sym_dir not in old_dirs: old_dirs.append(old_sym_dir) logger.debug(f'Removing {len(old_dirs)} old dirs') for old_dir in old_dirs: delete_drs_dir(old_dir)
def main(args): """ Main entry point """ base_output_dir = Settings.get_solo().base_output_dir for data_file in DataFile.objects.filter(online=True): gws_pattern = r'^/group_workspaces/jasmin2/primavera(\d)/(\S*)' gws = re.match(gws_pattern, data_file.directory) if not gws: logger.error('No GWS match for {}'.format(data_file.name)) continue new_gws = '/gws/nopw/j04/primavera' + gws.group(1) new_dir = os.path.join(new_gws, gws.group(2)) new_path = os.path.join(new_dir, data_file.name) if not os.path.exists(new_path): logger.error('Cannot find {}'.format(new_path)) continue data_file.directory = new_dir data_file.save() if not is_same_gws(data_file.directory, base_output_dir): link_path = os.path.join(base_output_dir, construct_drs_path(data_file), data_file.name) # it's got to be a link but check anyway if os.path.islink(link_path): os.remove(link_path) os.symlink(os.path.join(data_file.directory, data_file.name), link_path) else: logger.error('Expected a link but found a file at {}'. format(link_path))
def main(): """ Main entry point """ affected_files = DataFile.objects.filter( climate_model__short_name='EC-Earth3P-HR', experiment__short_name='highresSST-present', rip_code='r1i1p1f1', variable_request__table_name='E3hr', variable_request__cmor_name__in=['clivi', 'rsdt']).distinct().order_by( 'variable_request__table_name', 'variable_request__cmor_name') num_files = affected_files.count() logger.debug(f'{num_files} affected files found') for df in affected_files: if not df.directory.startswith(ARCHIVE_BASE): logger.error(f'{df.name} not in {ARCHIVE_BASE}') continue new_dir = os.path.join(BASE_GWS, construct_drs_path(df)) new_path = os.path.join(new_dir, df.name) old_path = os.path.join(df.directory, df.name) if not os.path.exists(new_path): os.makedirs(new_path) shutil.copy(old_path, new_path) df.directory = new_dir df.save()
def main(): """ Main entry point """ affected_files = DataFile.objects.filter( climate_model__short_name='HadGEM3-GC31-HH', variable_request__table_name__in=['SImon', 'SIday', 'PrimSIday']).distinct().order_by( 'variable_request__table_name', 'variable_request__cmor_name') num_files = affected_files.count() logger.debug(f'{num_files} affected files found') for df in affected_files: if not df.directory.startswith(ARCHIVE_BASE): logger.error(f'{df.name} not in {ARCHIVE_BASE}') continue new_dir = os.path.join(BASE_GWS, construct_drs_path(df)) new_path = os.path.join(new_dir, df.name) old_path = os.path.join(df.directory, df.name) if not os.path.exists(new_dir): os.makedirs(new_dir) shutil.copy(old_path, new_path) df.directory = new_dir df.save()
def main(): """Main entry point""" data_reqs = DataRequest.objects.filter( climate_model__short_name='EC-Earth3', experiment__short_name='highresSST-present', rip_code__in=['r1i1p1f1'], variable_request__table_name='Amon', variable_request__cmor_name__in=[ 'clt', 'hus', 'pr', 'rlut', 'rlutcs', 'rsut', 'rsutcs', 'ta', 'tas', 'ts', 'ua', 'va', 'zg' ]) logger.debug('{} data requests found'.format(data_reqs.count())) for data_req in data_reqs: for data_file in data_req.datafile_set.all(): if not data_file.online: raise ValueError('{} is not online'.format(data_file.name)) src_path = os.path.join(data_file.directory, data_file.name) dest_dir = os.path.join(TEST_DATA_DIR, construct_drs_path(data_file)) if not os.path.exists(dest_dir): os.makedirs(dest_dir) dest_path = os.path.join(dest_dir, data_file.name) shutil.copyfile(src_path, dest_path) data_file.directory = dest_dir data_file.save()
def main(args): """ Main entry point """ logger.debug('Starting file structure scan.') for nc_file in ilist_files(args.directory): nc_file_name = os.path.basename(nc_file) db_files = DataFile.objects.filter(name=nc_file_name) if db_files.count() == 0: logger.error('File not found in database: {}'.format(nc_file)) elif db_files.count() > 1: logger.error('{} entries found in database for file: {}'. format(db_files.count(), nc_file)) else: db_file = db_files.first() act_size = os.path.getsize(nc_file) if act_size != db_file.size: logger.info('File %s has size %d', db_file.name, act_size) db_file.online = False db_file.directory = None db_file.save() os.remove(nc_file) if not is_same_gws(nc_file, BASE_OUTPUT_DIR): sym_link_path = os.path.join(BASE_OUTPUT_DIR, construct_drs_path(db_file), db_file.name) try: if os.path.exists(sym_link_path): os.remove(sym_link_path) except OSError: logger.error('Unable to delete sym link %s', sym_link_path)
def scan_database(): """ Start the scan of the database. """ logger.debug('Starting database scan.') for data_file in DataFile.objects.filter(online=True).iterator(): full_path = os.path.join(data_file.directory, data_file.name) if not os.path.exists(full_path): logger.warning('File cannot be found on disk, status changed to ' 'offline: {}'.format(full_path)) data_file.online = False data_file.directory = None data_file.save() continue if not is_same_gws(data_file.directory, BASE_OUTPUT_DIR): sym_link_dir = os.path.join(BASE_OUTPUT_DIR, construct_drs_path(data_file)) sym_link_path = os.path.join(sym_link_dir, data_file.name) if not os.path.exists(sym_link_path): if not os.path.exists(sym_link_dir): os.makedirs(sym_link_dir) os.symlink(full_path, sym_link_path) logger.warning('Created symlink for file {} from {}'. format(data_file.name, sym_link_path)) logger.debug('Completed database scan.')
def main(args): """ Main entry point """ affected_files = DataFile.objects.filter( climate_model__short_name='CMCC-CM2-VHR4', experiment__short_name='control-1950', variable_request__table_name=args.table_name ).distinct().order_by( 'variable_request__table_name', 'variable_request__cmor_name' ) num_files = affected_files.count() logger.debug(f'{num_files} affected files found') for df in affected_files: if not df.directory.startswith(ARCHIVE_BASE): logger.error(f'{df.name} not in {ARCHIVE_BASE}') continue new_dir = os.path.join(BASE_GWS, construct_drs_path(df)) new_path = os.path.join(new_dir, df.name) old_path = os.path.join(df.directory, df.name) if not os.path.exists(new_path): os.makedirs(new_path) shutil.copy(old_path, new_path) df.directory = new_dir df.save()
def main(args): """ Main entry point """ for ret_req in RET_REQS: rr = RetrievalRequest.objects.get(id=ret_req) logger.debug('Starting retrieval request {}'.format(ret_req)) for dr in rr.data_request.all(): logger.debug('Starting data request {}'.format(dr)) num_files_moved = 0 for df in dr.datafile_set.filter(online=True): if df.directory.startswith(INCOMING_DIR): drs_path = construct_drs_path(df) dest_dir = os.path.join(NEW_BASE_OUTPUT_DIR, drs_path) if not os.path.exists(dest_dir): os.makedirs(dest_dir) dest_path = os.path.join(dest_dir, df.name) src_path = os.path.join(df.directory, df.name) # copy the file shutil.copy(src_path, dest_path) # check its checksum checksum = adler32(dest_path) if checksum != df.checksum_set.first().checksum_value: msg = 'Checksum does not match for {}'.format(df.name) raise ValueError(msg) # construct a sym link primary_path = os.path.join(BASE_OUTPUT_DIR, drs_path) if not os.path.exists(primary_path): os.makedirs(primary_path) os.symlink(dest_path, os.path.join(primary_path, df.name)) # update the DB df.directory = dest_dir df.save() num_files_moved += 1 logger.debug('{} files moved'.format(num_files_moved))
def main(args): """ Main entry point """ copy_dir = '/gws/nopw/j04/primavera5/upload/CMCC/fluxes' dreqs = DataRequest.objects.filter( institute__short_name='CMCC', experiment__short_name__in=[ 'highres-future', 'highresSST-future', 'hist-1950' ], variable_request__cmor_name__in=['rlut', 'rlutcs', 'rsutcs'], datafile__isnull=False).distinct() num_dreqs = dreqs.count() expected_dreqs = 18 if num_dreqs != expected_dreqs: logger.error(f'Found {num_dreqs} but was expecting {expected_dreqs}.') sys.exit(1) for dreq in dreqs: logger.info(dreq) for df in dreq.datafile_set.order_by('name'): new_dir = os.path.join(copy_dir, construct_drs_path(df)) if not os.path.exists(new_dir): os.makedirs(new_dir) shutil.copyfile(os.path.join(df.directory, df.name), os.path.join(new_dir, df.name))
def main(args): """Main entry point""" base_dir = Settings.get_solo().base_output_dir for extracted_file in ilist_files(args.top_dir): found_name = os.path.basename(extracted_file) try: data_file = DataFile.objects.get(name=found_name) except django.core.exceptions.ObjectDoesNotExist: logger.warning('Cannot find DMT entry. Skipping {}'. format(extracted_file)) continue found_checksum = adler32(extracted_file) if not found_checksum == data_file.checksum_set.first().checksum_value: logger.warning("Checksum doesn't match. Skipping {}". format(found_name)) continue dest_dir = os.path.join(get_gws_any_dir(extracted_file), 'stream1', construct_drs_path(data_file)) dest_path = os.path.join(dest_dir, found_name) if os.path.exists(dest_path): logger.warning('Skipping {} as it already exists at {}'. format(found_name, dest_path)) continue # create the directory if it doesn't exist if not os.path.exists(dest_dir): os.makedirs(dest_dir) os.rename(extracted_file, dest_path) # create a link from the base dir if not is_same_gws(dest_path, base_dir): link_dir = os.path.join(base_dir, construct_drs_path(data_file)) link_path = os.path.join(link_dir, data_file.name) if not os.path.exists(link_dir): os.makedirs(link_dir) os.symlink(dest_path, link_path) data_file.online = True data_file.directory = dest_dir data_file.save()
def main(args): """ Main entry point """ dreqs = DataRequest.objects.filter( institute__short_name='MPI-M', experiment__short_name__in=['control-1950', 'hist-1950'], variable_request__cmor_name='tos', datafile__isnull=False).distinct() logger.debug(f'Found {dreqs.count()} datasets') for dreq in dreqs: if dreq.esgfdataset_set.all(): # ESGF dataset's been created... esgf = dreq.esgfdataset_set.first() if esgf.status == 'PUBLISHED': # ... and published so the data's in the CEDA archive # and symlinked from the PRIMAVERA data structure # All sym links will be in one directory set_dir = os.path.join( BASE_OUTPUT_DIR, construct_drs_path(dreq.datafile_set.first())) for df in dreq.datafile_set.all(): file_path = os.path.join(set_dir, df.name) if not os.path.islink(file_path): logger.warning(f'Expected a sym link {file_path}') continue try: os.remove(file_path) except OSError as exc: logger.error(str(exc)) df.online = False df.directory = None df.save() delete_drs_dir(set_dir) logger.debug(f'Removed files for ESGFDataset {esgf}') esgf.status = 'CREATED' esgf.save() continue # The data's not been published so delete the files and their sym links delete_files(dreq.datafile_set.all(), BASE_OUTPUT_DIR) logger.debug(f'Removed files for DataRequest {dreq}') dreq.datafile_set.update(directory=None, online=False) for dreq in dreqs: dreq.datafile_set.update(version='v20191129')
def main(args): """ Main entry point """ dreqs = DataRequest.objects.filter( climate_model__short_name='CMCC-CM2-VHR4', experiment__short_name='control-1950', datafile__isnull=False).exclude(variable_request__table_name__in=[ 'LImon', 'Lmon', 'Oday', 'Omon', 'PrimOday', 'PrimOmon', 'SIday', 'SImon' ]).distinct().order_by('variable_request__table_name', 'variable_request__cmor_name') num_dreqs = dreqs.count() logger.info(f'{num_dreqs} data requests found') for dreq in dreqs: try: df = dreq.datafile_set.get(name__contains='198207') except django.core.exceptions.ObjectDoesNotExist: logger.error(f'{dreq} no files found in DMT') continue logger.debug(f'Replacing {df.name}') file_name = df.name old_dir = df.directory old_path = os.path.join(old_dir, file_name) drs_path = construct_drs_path(df) incoming_dir = os.path.join(BASE_INCOMING_DIR, drs_path).replace(df.version, 'v20200401') incoming_path = os.path.join(incoming_dir, file_name) if not os.path.exists(incoming_path): logger.error(f'{incoming_path} not found') # Copy os.remove(old_path) shutil.copy(incoming_path, old_path) df.tape_url = 'et:21500' df.incoming_directory = incoming_dir df.save() checksum = md5(old_path) df.checksum_set.all().delete() df.tapechecksum_set.all().delete() Checksum.objects.create(data_file=df, checksum_value=checksum, checksum_type='ADLER32') TapeChecksum.objects.create(data_file=df, checksum_value=checksum, checksum_type='ADLER32')
def __init__(self, datafile, new_value, update_file_only=False):
    """
    Initialise the class

    :param pdata_app.models.DataFile datafile: the file to update
    :param str new_value: the new value to apply
    :param bool update_file_only: if True then update just the file and
        don't make any changes to the database.
    """
    self.datafile = datafile
    self.new_value = new_value
    self.old_filename = self.datafile.name
    self.old_directory = self.datafile.directory
    self.old_sym_link_dir = os.path.join(BASE_OUTPUT_DIR,
                                         construct_drs_path(self.datafile))
    self.new_filename = None
    self.new_directory = None
    self.update_file_only = update_file_only
def main(args): """ Main entry point """ datasets = ESGFDataset.objects.filter( data_request__institute__short_name='MPI-M', data_request__experiment__short_name='highresSST-present', status='PUBLISHED') logger.debug(f'Found {datasets.count()} datasets') for dataset in datasets: for datafile in dataset.data_request.datafile_set.all(): dest_dir = os.path.join(TAPE_WRITE_DIR, construct_drs_path(datafile)) if not os.path.exists(dest_dir): os.makedirs(dest_dir) shutil.copy(os.path.join(datafile.directory, datafile.name), dest_dir) logger.debug(f'Copied {dataset}')
def main(args): """ Main entry point """ data_req = DataRequest.objects.get( climate_model__short_name='EC-Earth3', experiment__short_name='highresSST-present', variable_request__table_name='day', variable_request__cmor_name='va' ) links_created = 0 for data_file in data_req.datafile_set.all(): drs_path = construct_drs_path(data_file) stream1_dir = os.path.join(BASE_OUTPUT_DIR, drs_path) stream1_file = os.path.join(stream1_dir, data_file.name) dest_file = os.path.join(NEW_BASE_OUTPUT_DIR, drs_path, data_file.name) if not os.path.exists(stream1_file): os.symlink(dest_file, stream1_file) links_created += 1 logger.debug('{} links created'.format(links_created))
def main(args): """ Main entry point """ bad_files = list_files(BAD_DIR) logger.debug(f'{len(bad_files)} files found') for bf in bad_files: df = DataFile.objects.get(name=os.path.basename(bf)) new_dir = os.path.join(BASE_OUTPUT_DIR, construct_drs_path(df)) new_path = os.path.join(new_dir, df.name) if not os.path.exists(new_dir): os.makedirs(new_dir) if os.path.exists(new_path): if os.path.islink(new_path): os.remove(new_path) else: logger.error(f'{new_path} is not a link') continue os.rename(bf, new_path) df.directory = new_dir df.save()
def _rename_file(self):
    """
    Rename the file on disk and move to its new directory. Update the
    link from the primary directory.
    """
    if not os.path.exists(self.new_directory):
        os.makedirs(self.new_directory)

    os.rename(os.path.join(self.old_directory, self.old_filename),
              os.path.join(self.new_directory, self.new_filename))

    # check for empty directory
    if not os.listdir(self.old_directory):
        delete_drs_dir(self.old_directory)

    # Update the symbolic link if required
    if not is_same_gws(self.old_directory, BASE_OUTPUT_DIR):
        old_link_path = os.path.join(self.old_sym_link_dir,
                                     self.old_filename)
        if os.path.lexists(old_link_path):
            if not os.path.islink(old_link_path):
                logger.error("{} exists and isn't a symbolic link.".format(
                    old_link_path))
                raise SymLinkIsFileError(old_link_path)
            else:
                # it is a link so remove it
                os.remove(old_link_path)
                # check for empty directory
                if not os.listdir(self.old_sym_link_dir):
                    delete_drs_dir(self.old_sym_link_dir)
        new_link_dir = os.path.join(BASE_OUTPUT_DIR,
                                    construct_drs_path(self.datafile))
        if not os.path.exists(new_link_dir):
            os.makedirs(new_link_dir)
        os.symlink(os.path.join(self.new_directory, self.new_filename),
                   os.path.join(new_link_dir, self.new_filename))
def main(args): """ Main entry point """ dreqs = DataRequest.objects.filter( climate_model__short_name='CMCC-CM2-VHR4', experiment__short_name='control-1950', datafile__isnull=False ).distinct().order_by( 'variable_request__table_name', 'variable_request__cmor_name' ) num_dreqs = dreqs.count() logger.info(f'{num_dreqs} data requests found') for dreq in dreqs: dreq.datafile_set.update(version='v20200917') for df in dreq.datafile_set.filter(online=True).order_by('name'): old_dir = df.directory old_path = os.path.join(old_dir, df.name) if not os.path.exists(old_path): logger.error(f'{old_path} not found') continue new_dir = os.path.join(get_gws(df.directory), construct_drs_path(df)) if df.directory != new_dir: if not os.path.exists(new_dir): os.makedirs(new_dir) os.rename(old_path, os.path.join(new_dir, df.name)) df.directory = new_dir df.save() # Delete original dir if it's now empty if not os.listdir(old_dir): delete_drs_dir(old_dir)
def main(args): """ Main entry point """ dreqs = DataRequest.objects.filter( climate_model__short_name = 'EC-Earth3P', experiment__short_name__in = ['primWP5-amv-pos','primWP5-amv-neg'], variable_request__cmor_name__in = ['rsus', 'rlus'] ).distinct() num_dreqs = dreqs.count() if num_dreqs != 100: logger.error(f'{num_dreqs} affected data requests found') sys.exit(1) for dreq in dreqs: for df in dreq.datafile_set.all(): new_dir = os.path.join(BASE_GWS, construct_drs_path(df)) new_path = os.path.join(new_dir, df.name) old_path = os.path.join(df.directory, df.name) if not os.path.exists(new_dir): os.makedirs(new_dir) shutil.copy(old_path, new_path) delete_files(dreq.datafile_set.all(), BASE_OUTPUT_DIR)
def main(args): """ Main entry point """ ret_req = RetrievalRequest.objects.get(id=args.retrieval_id) all_files = DataFile.objects.filter( data_request__in=ret_req.data_request.all()) time_units = all_files[0].time_units calendar = all_files[0].calendar start_float = cf_units.date2num( datetime.datetime(ret_req.start_year, 1, 1), time_units, calendar) end_float = cf_units.date2num( datetime.datetime(ret_req.end_year + 1, 1, 1), time_units, calendar) data_files = all_files.filter(start_time__gte=start_float, end_time__lt=end_float) num_files = 0 for data_file in data_files: drs_path = construct_drs_path(data_file) dest_dir = os.path.join(NEW_BASE_OUTPUT_DIR, drs_path) if dest_dir == data_file.directory: logger.warning('Skipping file as already in destination directory ' '{}'.format(data_file.name)) continue if not os.path.exists(dest_dir): os.makedirs(dest_dir) shutil.move(os.path.join(data_file.directory, data_file.name), dest_dir) os.symlink(os.path.join(dest_dir, data_file.name), os.path.join(data_file.directory, data_file.name)) data_file.directory = dest_dir data_file.save() num_files += 1 logger.debug('Moved {} files'.format(num_files))
def move_dirs(data_req, new_gws):
    """
    Move the files

    :param pdata_app.models.DataRequest data_req: the data request to move
    :param int new_gws: the number of the gws to move to
    """
    single_dir = '{}{}'.format(COMMON_GWS_NAME, new_gws)

    existing_dirs = data_req.directories()
    # ignore data that is offline
    if None in existing_dirs:
        existing_dirs.remove(None)

    use_single_dir = False
    for exist_dir in existing_dirs:
        if exist_dir.startswith(single_dir):
            use_single_dir = True
            break
    if not use_single_dir:
        # As a quick sanity check, generate an error if there is no
        # data already in the requested output directory
        logger.error('The new output directory is {} but no data from '
                     'this variable is currently in this directory.'.
                     format(single_dir))
        sys.exit(1)

    for exist_dir in existing_dirs:
        if exist_dir.startswith(single_dir):
            continue
        files_to_move = data_req.datafile_set.filter(directory=exist_dir)
        logger.debug('Moving {} files from {}'.format(
            files_to_move.count(), exist_dir))
        for file_to_move in files_to_move:
            # Build the source and destination paths
            src = os.path.join(exist_dir, file_to_move.name)
            dest_path = os.path.join(single_dir, 'stream1',
                                     construct_drs_path(file_to_move))
            if not os.path.exists(dest_path):
                os.makedirs(dest_path)
            dest = os.path.join(dest_path, file_to_move.name)
            # remove an existing link if about to write over it
            if dest.startswith(BASE_OUTPUT_DIR):
                if os.path.exists(dest):
                    if os.path.islink(dest):
                        os.remove(dest)
            # Move the file
            shutil.move(src, dest)
            # Update the file's location in the DB
            file_to_move.directory = dest_path
            file_to_move.save()
            # Check that it was safely copied
            actual_checksum = adler32(dest)
            db_checksum = file_to_move.checksum_set.first().checksum_value
            if actual_checksum != db_checksum:
                logger.error('For {}\ndatabase checksum: {}\n'
                             'actual checksum: {}'.
                             format(dest, db_checksum, actual_checksum))
                sys.exit(1)
            # Update the symlink
            if not is_same_gws(dest_path, BASE_OUTPUT_DIR):
                primary_path_dir = os.path.join(
                    BASE_OUTPUT_DIR, construct_drs_path(file_to_move))
                primary_path = os.path.join(primary_path_dir,
                                            file_to_move.name)
                if os.path.lexists(primary_path):
                    if not os.path.islink(primary_path):
                        logger.error("{} exists and isn't a symbolic "
                                     "link.".format(primary_path))
                        sys.exit(1)
                    else:
                        # it is a link so remove it
                        os.remove(primary_path)
                if not os.path.exists(primary_path_dir):
                    os.makedirs(primary_path_dir)
                os.symlink(dest, primary_path)
        delete_drs_dir(exist_dir)
def main(args): """ Main entry point """ dreqs_hr = DataRequest.objects.filter( climate_model__short_name='CMCC-CM2-HR4', experiment__short_name__in=['hist-1950', 'control-1950'], variable_request__table_name__startswith='SI', datafile__isnull=False).distinct() dreqs_vhr = DataRequest.objects.filter( climate_model__short_name='CMCC-CM2-VHR4', experiment__short_name='hist-1950', variable_request__table_name__startswith='SI', datafile__isnull=False).distinct() dreqs = dreqs_hr | dreqs_vhr logger.debug(f'Found {dreqs.count()} data requests') for dreq in dreqs: logger.debug(f'Processing {dreq}') old_directories = [] for df in dreq.datafile_set.order_by('name'): if not df.online: logger.error(f'Not online {df.name}') continue if df.version == NEW_VERSION: logger.warning(f'Already at {NEW_VERSION} {df.name}') continue # save the sym link directory before we make any changes if not is_same_gws(BASE_OUTPUT_DIR, df.directory): old_sym_link_dir = os.path.join(BASE_OUTPUT_DIR, construct_drs_path(df)) # now get back to updating the version df.version = NEW_VERSION gws = get_gws(df.directory) new_dir = os.path.join(gws, construct_drs_path(df)) old_directory = df.directory if not os.path.exists(new_dir): os.mkdir(new_dir) os.rename(os.path.join(df.directory, df.name), os.path.join(new_dir, df.name)) df.directory = new_dir df.save() if old_directory not in old_directories: old_directories.append(old_directory) # Update any sym links too if not is_same_gws(BASE_OUTPUT_DIR, df.directory): sym_link_path = os.path.join(old_sym_link_dir, df.name) if os.path.lexists(sym_link_path): if os.path.islink(sym_link_path): os.remove(sym_link_path) if old_sym_link_dir not in old_directories: old_directories.append(old_sym_link_dir) sym_link_dir = os.path.join(BASE_OUTPUT_DIR, construct_drs_path(df)) if not os.path.exists(sym_link_dir): os.makedirs(sym_link_dir) sym_link_path = os.path.join(sym_link_dir, df.name) os.symlink(os.path.join(df.directory, df.name), sym_link_path) for directory in old_directories: if not os.listdir(directory): delete_drs_dir(directory) else: logger.error(f'Not empty {directory}')
def main(args): """ Main entry point """ logger.debug('Starting incoming_to_drs.py') data_sub = _get_submission_object(os.path.normpath(args.directory)) if not args.alternative: drs_base_dir = BASE_OUTPUT_DIR else: drs_base_dir = args.alternative errors_encountered = False for data_file in data_sub.datafile_set.order_by('name'): # make full path of existing file existing_path = os.path.join(data_file.directory, data_file.name) # make full path of where it will live drs_sub_path = construct_drs_path(data_file) drs_dir = os.path.join(drs_base_dir, drs_sub_path) drs_path = os.path.join(drs_dir, data_file.name) # check the destination directory exists if not os.path.exists(drs_dir): os.makedirs(drs_dir) # link if on same GWS, or else copy this_file_error = False try: os.rename(existing_path, drs_path) except OSError as exc: logger.error('Unable to link from {} to {}. {}'.format( existing_path, drs_path, str(exc))) errors_encountered = True this_file_error = True # update the file's location in the database if not this_file_error: data_file.directory = drs_dir if not data_file.online: data_file.online = True data_file.save() # if storing the files in an alternative location, create a sym link # from the primary DRS structure to the file if not is_same_gws(BASE_OUTPUT_DIR, drs_base_dir): primary_path = os.path.join(BASE_OUTPUT_DIR, drs_sub_path) try: if not os.path.exists(primary_path): os.makedirs(primary_path) os.symlink(drs_path, os.path.join(primary_path, data_file.name)) except OSError as exc: logger.error('Unable to link from {} to {}. {}'.format( drs_path, os.path.join(primary_path, data_file.name), str(exc))) errors_encountered = True # summarise what happened and keep the DB updated if not errors_encountered: logger.debug('All files copied with no errors. Data submission ' 'incoming directory can be deleted.') else: logger.error('Errors were encountered. Please fix these before ' 'deleting the incoming directory.') logger.debug('Completed incoming_to_drs.py')
def main(args): """ Main entry point """ logger.debug('Starting delete_request.py for retrieval {}'.format( args.retrieval_id)) deletion_retrieval = match_one(RetrievalRequest, id=args.retrieval_id) if not deletion_retrieval: logger.error('Unable to find retrieval id {}'.format( args.retrieval_id)) sys.exit(1) if deletion_retrieval.date_deleted: logger.error('Retrieval {} was already deleted, at {}.'.format( deletion_retrieval.id, deletion_retrieval.date_deleted.strftime('%Y-%m-%d %H:%M'))) sys.exit(1) if not deletion_retrieval.data_finished: logger.error('Retrieval {} is not marked as finished.'.format( deletion_retrieval.id)) sys.exit(1) problems_encountered = False directories_found = [] base_output_dir = Settings.get_solo().base_output_dir # loop through all of the data requests in this retrieval for data_req in deletion_retrieval.data_request.all(): online_req_files = data_req.datafile_set.filter( online=True, directory__isnull=False) files_to_delete = date_filter_files(online_req_files, deletion_retrieval.start_year, deletion_retrieval.end_year) if files_to_delete is None: continue if not args.force: # find any other retrieval requests that still need this data other_retrievals = RetrievalRequest.objects.filter( data_request=data_req, data_finished=False) # loop through the retrieval requests that still need this data # request for ret_req in other_retrievals: ret_online_files = data_req.datafile_set.filter( online=True, directory__isnull=False) ret_filtered_files = date_filter_files(ret_online_files, ret_req.start_year, ret_req.end_year) if ret_filtered_files is None: continue # remove from the list of files to delete the ones that we have # just found are still needed files_to_delete = files_to_delete.difference( ret_filtered_files) # list the parts of the data request that are still required logger.debug("{} {} to {} won't be deleted".format( data_req, ret_req.start_year, ret_req.end_year)) # don't (try to) delete anything that's in the CEDA archive files_to_delete.exclude(directory__startswith=CEDA_ARCHIVE) # do the deleting if args.dryrun: logger.debug('{} {} files can be deleted.'.format( data_req, files_to_delete.distinct().count())) else: logger.debug('{} {} files will be deleted.'.format( data_req, files_to_delete.distinct().count())) for data_file in files_to_delete: old_file_dir = data_file.directory try: os.remove(os.path.join(data_file.directory, data_file.name)) except OSError as exc: logger.error(str(exc)) problems_encountered = True else: if data_file.directory not in directories_found: directories_found.append(data_file.directory) data_file.online = False data_file.directory = None data_file.save() # if a symbolic link exists from the base output directory # then delete this too if not old_file_dir.startswith(base_output_dir): sym_link_dir = os.path.join(base_output_dir, construct_drs_path(data_file)) sym_link = os.path.join(sym_link_dir, data_file.name) if not os.path.islink(sym_link): logger.error( "Expected {} to be a link but it isn't. 
" "Leaving this file in place.".format(sym_link)) problems_encountered = True else: try: os.remove(sym_link) except OSError as exc: logger.error(str(exc)) problems_encountered = True else: if sym_link_dir not in directories_found: directories_found.append(sym_link_dir) if not args.dryrun: # delete any empty directories for directory in directories_found: if not os.listdir(directory): delete_drs_dir(directory) # set date_deleted in the db if not problems_encountered: deletion_retrieval.date_deleted = timezone.now() deletion_retrieval.save() else: logger.error( 'Errors were encountered and so retrieval {} has not ' 'been marked as deleted. All possible files have been ' 'deleted.'.format(args.retrieval_id)) logger.debug('Completed delete_request.py for retrieval {}'.format( args.retrieval_id))
def _construct_directory(self):
    """
    Construct the new directory path.
    """
    self.new_directory = os.path.join(get_gws(self.datafile.directory),
                                      construct_drs_path(self.datafile))
def test_out_name(self):
    expected = 't/HighResMIP/MOHC/t/t/r1i1p1/Amon/var/g2/v87654321'
    self.assertEqual(construct_drs_path(self.data_file2), expected)
def test_success(self):
    expected = 't/HighResMIP/MOHC/t/t/r1i1p1/Amon/var1/gn/v12345678'
    self.assertEqual(construct_drs_path(self.data_file1), expected)
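# For reference, the two tests above imply that construct_drs_path() builds
# a CMIP6-style DRS sub-path of the form
#   <mip_era>/<activity_id>/<institution_id>/<source_id>/<experiment_id>/
#   <member_id>/<table_id>/<variable_id>/<grid_label>/<version>
# A hypothetical sketch of such a helper; the attribute names are
# assumptions inferred from the query filters used elsewhere in these
# scripts, not the package's actual model fields:
import os


def construct_drs_path_sketch(data_file):
    """Build the DRS sub-path for a data file (illustrative only)."""
    return os.path.join(
        data_file.project.short_name,            # mip_era, e.g. 'CMIP6'
        data_file.activity_id.short_name,        # e.g. 'HighResMIP'
        data_file.institute.short_name,          # e.g. 'MOHC'
        data_file.climate_model.short_name,      # e.g. 'HadGEM3-GC31-HH'
        data_file.experiment.short_name,         # e.g. 'hist-1950'
        data_file.rip_code,                      # e.g. 'r1i1p1f1'
        data_file.variable_request.table_name,   # e.g. 'Amon'
        data_file.variable_request.cmor_name,    # e.g. 'tas'
        data_file.grid,                          # grid label, e.g. 'gn'
        data_file.version                        # e.g. 'v20200401'
    )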
def main(args): """ Main entry point """ dreqs1 = DataRequest.objects.filter( climate_model__short_name='MPI-ESM1-2-XR', experiment__short_name='highresSST-present', variable_request__cmor_name__in=['hus7h', 'ta7h', 'ua7h'] ) dreqs2 = DataRequest.objects.filter( climate_model__short_name__in=['MPI-ESM1-2-HR', 'MPI-ESM1-2-XR'], experiment__short_name='highresSST-present', variable_request__table_name='Amon', variable_request__cmor_name='tas' ) dreqs = dreqs1 | dreqs2 logger.debug(f'Found {dreqs.count()} data requests') for dreq in dreqs: logger.debug(f'Processing {dreq}') old_directories = [] for df in dreq.datafile_set.order_by('name'): if not df.online: logger.error(f'Not online {df.name}') continue if df.version == NEW_VERSION: logger.warning(f'Already at {NEW_VERSION} {df.name}') continue # save the sym link directory before we make any changes old_sym_link_dir = os.path.join(BASE_OUTPUT_DIR, construct_drs_path(df)) # now get back to updating the version df.version = NEW_VERSION gws = get_gws(df.directory) new_dir = os.path.join(gws, construct_drs_path(df)) old_directory = df.directory if not os.path.exists(new_dir): os.mkdir(new_dir) os.rename(os.path.join(df.directory, df.name), os.path.join(new_dir, df.name)) df.directory = new_dir df.save() if old_directory not in old_directories: old_directories.append(old_directory) # Update any sym links too sym_link_path = os.path.join(old_sym_link_dir, df.name) if os.path.lexists(sym_link_path): if os.path.islink(sym_link_path): os.remove(sym_link_path) if old_sym_link_dir not in old_directories: old_directories.append(old_sym_link_dir) sym_link_dir = os.path.join(BASE_OUTPUT_DIR, construct_drs_path(df)) if not os.path.exists(sym_link_dir): os.makedirs(sym_link_dir) sym_link_path = os.path.join(sym_link_dir, df.name) os.symlink(os.path.join(df.directory, df.name), sym_link_path) for directory in old_directories: if not os.listdir(directory): delete_drs_dir(directory) else: logger.error(f'Not empty {directory}')