def main(args):
    """
    Main entry point.

    Delete from disk all MPI-M spinup-1950 files at version v20171003,
    mark them as offline in the database and flag them for replacement.
    """
    query_set = DataFile.objects.filter(institute__short_name='MPI-M',
                                        experiment__short_name='spinup-1950',
                                        version='v20171003')
    logger.debug('{} files found'.format(query_set.count()))
    # directories that may have been emptied by the deletions
    directories_found = []
    for df in query_set:
        if df.online:
            try:
                os.remove(os.path.join(df.directory, df.name))
            except OSError as exc:
                # abort on the first failure to avoid half-done deletions
                logger.error(str(exc))
                sys.exit(1)
            else:
                if df.directory not in directories_found:
                    directories_found.append(df.directory)
        # mark every file in the set (online or not) as no longer on disk
        df.online = False
        df.directory = None
        df.save()
    # remove any DRS directories that are now empty
    for directory in directories_found:
        if not os.listdir(directory):
            delete_drs_dir(directory)
    # NOTE(review): message counts candidate directories, not only the
    # ones actually removed
    logger.debug('{} directories removed'.format(len(directories_found)))
    # presumably flags the files so they can be resubmitted;
    # replace_files is defined elsewhere
    replace_files(query_set)
def main(args):
    """
    Main entry point.

    Delete from disk the listed variable/table combinations for the
    EC-Earth models in the highresSST-present experiment, mark the files
    offline and flag them for replacement.
    """
    # 'cmor-name_table-name' pairs to delete
    var_tables = [
        'evspsbl_Amon',
        'evspsbl_Primday',
        'evspsbl_Prim3hr',
        'hfls_Amon',
        'hfls_day',
        'hfls_3hr',
        'hfss_Amon',
        'hfss_day',
        'hfss_3hr',
        'tso_3hr',
    ]
    models = ['EC-Earth3-HR', 'EC-Earth3']
    experiment = 'highresSST-present'
    for var_table in var_tables:
        # split 'var_table' into its two components at the first underscore
        var, __, table = var_table.partition('_')
        for model in models:
            query_set = DataFile.objects.filter(
                data_request__climate_model__short_name=model,
                data_request__experiment__short_name=experiment,
                variable_request__table_name=table,
                variable_request__cmor_name=var
            )
            logger.debug('{} {} {} {}'.format(model, table, var,
                                              query_set.count()))
            # directories that may have been emptied by the deletions
            directories_found = []
            for df in query_set:
                if df.online:
                    try:
                        os.remove(os.path.join(df.directory, df.name))
                    except OSError as exc:
                        # abort on the first failure
                        logger.error(str(exc))
                        sys.exit(1)
                    else:
                        if df.directory not in directories_found:
                            directories_found.append(df.directory)
                # mark the file as no longer on disk
                df.online = False
                df.directory = None
                df.save()
            # remove any DRS directories that are now empty
            for directory in directories_found:
                if not os.listdir(directory):
                    delete_drs_dir(directory)
            replace_files(query_set)
def main(args):
    """
    Main entry point.

    Change the version of the MPI-ESM1-2-XR highres-future v20190617 files
    to v20190517, moving each file (and its sym link in the primary GWS)
    into the DRS path for the corrected version, then remove the old
    directories.
    """
    dfs = DataFile.objects.filter(climate_model__short_name='MPI-ESM1-2-XR',
                                  experiment__short_name='highres-future',
                                  version='v20190617')
    # the primary group workspace that holds the sym links
    prim_gws = '/gws/nopw/j04/primavera5/stream1'
    old_dirs = []
    for df in dfs:
        # DRS path before the version changes; needed to find the old link
        old_drs_path = construct_drs_path(df)
        df.version = 'v20190517'
        df.save()
        if df.online:
            # file itself
            gws = get_gws(df.directory)
            old_dir = df.directory
            new_dir = os.path.join(gws, construct_drs_path(df))
            if not os.path.exists(new_dir):
                os.makedirs(new_dir)
            os.rename(os.path.join(df.directory, df.name),
                      os.path.join(new_dir, df.name))
            df.directory = new_dir
            df.save()
            if old_dir not in old_dirs:
                old_dirs.append(old_dir)
            # sym link
            if not is_same_gws(df.directory, prim_gws):
                old_sym_dir = os.path.join(prim_gws, old_drs_path)
                old_sym = os.path.join(old_sym_dir, df.name)
                # lexists() must be used rather than exists(): the link's
                # target has just been renamed, so the link is broken and
                # exists() would follow it and return False
                if os.path.lexists(old_sym):
                    if os.path.islink(old_sym):
                        os.remove(old_sym)
                    else:
                        logger.warning(f'Not symlink as expected: {old_sym}')
                new_sym_dir = os.path.join(prim_gws, construct_drs_path(df))
                if not os.path.exists(new_sym_dir):
                    os.makedirs(new_sym_dir)
                os.symlink(os.path.join(new_dir, df.name),
                           os.path.join(new_sym_dir, df.name))
                if old_sym_dir not in old_dirs:
                    old_dirs.append(old_sym_dir)
    logger.debug(f'Removing {len(old_dirs)} old dirs')
    for old_dir in old_dirs:
        delete_drs_dir(old_dir)
def main(args):
    """
    Main entry point.

    Delete the MPI-M tos data for control-1950 and hist-1950 from disk,
    resetting any published ESGF datasets back to CREATED, and then set
    all of the files' versions to v20191129.
    """
    dreqs = DataRequest.objects.filter(
        institute__short_name='MPI-M',
        experiment__short_name__in=['control-1950', 'hist-1950'],
        variable_request__cmor_name='tos',
        datafile__isnull=False).distinct()
    logger.debug(f'Found {dreqs.count()} datasets')
    for dreq in dreqs:
        if dreq.esgfdataset_set.all():
            # ESGF dataset's been created...
            esgf = dreq.esgfdataset_set.first()
            if esgf.status == 'PUBLISHED':
                # ... and published so the data's in the CEDA archive
                # and symlinked from the PRIMAVERA data structure
                # All sym links will be in one directory
                set_dir = os.path.join(
                    BASE_OUTPUT_DIR,
                    construct_drs_path(dreq.datafile_set.first()))
                for df in dreq.datafile_set.all():
                    file_path = os.path.join(set_dir, df.name)
                    if not os.path.islink(file_path):
                        # leave anything that isn't the expected sym link
                        logger.warning(f'Expected a sym link {file_path}')
                        continue
                    try:
                        os.remove(file_path)
                    except OSError as exc:
                        # log and carry on; the DB is still updated below
                        logger.error(str(exc))
                    df.online = False
                    df.directory = None
                    df.save()
                delete_drs_dir(set_dir)
                logger.debug(f'Removed files for ESGFDataset {esgf}')
                # reset so that the dataset can be re-published later
                esgf.status = 'CREATED'
                esgf.save()
                continue
        # The data's not been published so delete the files and their
        # sym links
        delete_files(dreq.datafile_set.all(), BASE_OUTPUT_DIR)
        logger.debug(f'Removed files for DataRequest {dreq}')
        dreq.datafile_set.update(directory=None, online=False)
    # finally, bump the version on every file in the affected requests
    for dreq in dreqs:
        dreq.datafile_set.update(version='v20191129')
def main(args):
    """
    Main entry point.

    Delete from disk the MPIESM-1-2-HR control-1950/hist-1950 files at
    version v20171003, mark them offline and flag them for replacement,
    then delete the obsolete DataSubmission records.
    """
    query_set = DataFile.objects.filter(
        institute__short_name='MPI-M',
        climate_model__short_name='MPIESM-1-2-HR',
        experiment__short_name__in=['control-1950', 'hist-1950'],
        version='v20171003'
    )
    logger.debug('{} files found'.format(query_set.count()))
    # directories that may have been emptied by the deletions
    directories_found = []
    for df in query_set:
        if df.online:
            try:
                os.remove(os.path.join(df.directory, df.name))
            except OSError as exc:
                # abort on the first failure
                logger.error(str(exc))
                sys.exit(1)
            else:
                if df.directory not in directories_found:
                    directories_found.append(df.directory)
        # mark the file as no longer on disk
        df.online = False
        df.directory = None
        df.save()
    # remove any DRS directories that are now empty
    for directory in directories_found:
        if not os.listdir(directory):
            delete_drs_dir(directory)
    logger.debug('{} directories removed'.format(len(directories_found)))
    replace_files(query_set)
    # QuerySet.delete() returns a (total, per-model-dict) tuple; unpack the
    # total so that the log message shows a number rather than the raw tuple
    num_deleted, _ = DataSubmission.objects.filter(
        incoming_directory__in=['/group_workspaces/jasmin2/primavera4/upload/'
                                'MPI-M/MPIESM-1-2-XR/incoming/20171027',
                                '/group_workspaces/jasmin2/primavera4/upload/'
                                'MPI-M/MPIESM-1-2-XR/incoming/20171019',
                                '/group_workspaces/jasmin2/primavera4/upload/'
                                'MPI-M/MPIESM-1-2-XR/incoming/20171010']
    ).delete()
    logger.debug('{} DataSubmissions deleted.'.format(num_deleted))
def main(args):
    """
    Main entry point.

    Walk the files below args.top_path and delete any whose DMT record
    shows that the primary copy is in the CEDA archive (directory below
    /badc). With --dryrun just report what would be deleted.
    """
    for path in ilist_files(args.top_path, ignore_symlinks=True):
        data_file = Path(path)
        try:
            django_file = DataFile.objects.get(name=data_file.name)
        except django.core.exceptions.ObjectDoesNotExist:
            logger.debug(f'Not in DMT: {path}')
            continue
        # directory is None for files marked as offline in the DMT, so
        # guard against it before calling startswith()
        if django_file.directory and django_file.directory.startswith('/badc'):
            if not args.dryrun:
                action = 'Deleting'
                data_file.unlink()
                # tidy up the DRS directory if the deletion emptied it
                delete_drs_dir(str(data_file.parent))
            else:
                action = 'Deletable'
            logger.debug(f'{action}: {path}')
def delete_files(query_set):
    """
    Delete any files online from the specified queryset
    """
    emptied_dirs = []
    for data_file in query_set.filter(online=True):
        file_path = os.path.join(data_file.directory, data_file.name)
        try:
            os.remove(file_path)
        except OSError as exc:
            # report the failure but keep processing the remaining files
            logger.error(str(exc))
        else:
            # remember the directory so empty ones can be tidied up later
            if data_file.directory not in emptied_dirs:
                emptied_dirs.append(data_file.directory)
            # record in the DB that the file is no longer on disk
            data_file.online = False
            data_file.directory = None
            data_file.save()
    for dir_path in emptied_dirs:
        if not os.listdir(dir_path):
            delete_drs_dir(dir_path)
    logger.debug('{} directories removed'.format(len(emptied_dirs)))
def _rename_file(self):
    """
    Rename the file on disk and move to its new directory. Update the link
    from the primary directory.

    Raises SymLinkIsFileError if the expected sym link in the primary
    directory turns out to be a regular file.
    """
    if not os.path.exists(self.new_directory):
        os.makedirs(self.new_directory)
    os.rename(os.path.join(self.old_directory, self.old_filename),
              os.path.join(self.new_directory, self.new_filename))
    # check for empty directory
    if not os.listdir(self.old_directory):
        delete_drs_dir(self.old_directory)
    # Update the symbolic link if required
    if not is_same_gws(self.old_directory, BASE_OUTPUT_DIR):
        old_link_path = os.path.join(self.old_sym_link_dir,
                                     self.old_filename)
        # lexists() is used as the link may now be broken (its target was
        # renamed above) and exists() would not detect it
        if os.path.lexists(old_link_path):
            if not os.path.islink(old_link_path):
                logger.error("{} exists and isn't a symbolic link.".format(
                    old_link_path))
                raise SymLinkIsFileError(old_link_path)
            else:
                # it is a link so remove it
                os.remove(old_link_path)
                # check for empty directory
                if not os.listdir(self.old_sym_link_dir):
                    delete_drs_dir(self.old_sym_link_dir)
        new_link_dir = os.path.join(BASE_OUTPUT_DIR,
                                    construct_drs_path(self.datafile))
        if not os.path.exists(new_link_dir):
            os.makedirs(new_link_dir)
        os.symlink(os.path.join(self.new_directory, self.new_filename),
                   os.path.join(new_link_dir, self.new_filename))
def main(args):
    """
    Main entry point.

    Set the version of all CMCC-CM2-VHR4 control-1950 files to v20200917
    and move any online files into the DRS directory for the new version.
    """
    dreqs = DataRequest.objects.filter(
        climate_model__short_name='CMCC-CM2-VHR4',
        experiment__short_name='control-1950',
        datafile__isnull=False
    ).distinct().order_by(
        'variable_request__table_name',
        'variable_request__cmor_name'
    )
    num_dreqs = dreqs.count()
    logger.info(f'{num_dreqs} data requests found')
    for dreq in dreqs:
        # update the version in the DB first so that construct_drs_path()
        # below generates the path for the new version
        dreq.datafile_set.update(version='v20200917')
        for df in dreq.datafile_set.filter(online=True).order_by('name'):
            old_dir = df.directory
            old_path = os.path.join(old_dir, df.name)
            if not os.path.exists(old_path):
                logger.error(f'{old_path} not found')
                continue
            new_dir = os.path.join(get_gws(df.directory),
                                   construct_drs_path(df))
            if df.directory != new_dir:
                if not os.path.exists(new_dir):
                    os.makedirs(new_dir)
                os.rename(old_path, os.path.join(new_dir, df.name))
                df.directory = new_dir
                df.save()
                # Delete original dir if it's now empty
                if not os.listdir(old_dir):
                    delete_drs_dir(old_dir)
def move_dirs(data_req, new_gws):
    """
    Move the files

    :param pdata_app.models.DataRequest data_req: the data request to move
    :param int new_gws: the number of the gws to move to
    """
    single_dir = '{}{}'.format(COMMON_GWS_NAME, new_gws)
    existing_dirs = data_req.directories()
    # ignore data that is offline
    if None in existing_dirs:
        existing_dirs.remove(None)
    # check that at least some of the request's data is already in the
    # destination group workspace
    use_single_dir = False
    for exist_dir in existing_dirs:
        if exist_dir.startswith(single_dir):
            use_single_dir = True
            break
    if not use_single_dir:
        # As a quick sanity check, generate an error if there is no
        # data already in the requested output directory
        logger.error('The new output directory is {} but no data from '
                     'this variable is currently in this directory.'.
                     format(single_dir))
        sys.exit(1)
    for exist_dir in existing_dirs:
        # skip directories already in the destination gws
        if exist_dir.startswith(single_dir):
            continue
        files_to_move = data_req.datafile_set.filter(directory=exist_dir)
        logger.debug('Moving {} files from {}'.format(
            files_to_move.count(), exist_dir))
        for file_to_move in files_to_move:
            # Move the file
            src = os.path.join(exist_dir, file_to_move.name)
            dest_path = os.path.join(single_dir, 'stream1',
                                     construct_drs_path(file_to_move))
            if not os.path.exists(dest_path):
                os.makedirs(dest_path)
            dest = os.path.join(dest_path, file_to_move.name)
            # remove existing link if about to write over it
            if dest.startswith(BASE_OUTPUT_DIR):
                if os.path.exists(dest):
                    if os.path.islink(dest):
                        os.remove(dest)
            # Move the file
            shutil.move(src, dest)
            # Update the file's location in the DB
            file_to_move.directory = dest_path
            file_to_move.save()
            # Check that it was safely copied
            actual_checksum = adler32(dest)
            db_checksum = file_to_move.checksum_set.first().checksum_value
            if not actual_checksum == db_checksum:
                # abort immediately; the DB now points at a possibly
                # corrupt copy and this needs manual investigation
                logger.error('For {}\ndatabase checksum: {}\n'
                             'actual checksum: {}'.
                             format(dest, db_checksum, actual_checksum))
                sys.exit(1)
            # Update the symlink
            if not is_same_gws(dest_path, BASE_OUTPUT_DIR):
                primary_path_dir = os.path.join(
                    BASE_OUTPUT_DIR, construct_drs_path(file_to_move))
                primary_path = os.path.join(primary_path_dir,
                                            file_to_move.name)
                if os.path.lexists(primary_path):
                    if not os.path.islink(primary_path):
                        logger.error("{} exists and isn't a symbolic "
                                     "link.".format(primary_path))
                        sys.exit(1)
                    else:
                        # it is a link so remove it
                        os.remove(primary_path)
                if not os.path.exists(primary_path_dir):
                    os.makedirs(primary_path_dir)
                os.symlink(dest, primary_path)
        # the source directory should now be empty
        delete_drs_dir(exist_dir)
def main(args):
    """
    Main entry point.

    Delete the files belonging to the retrieval request given by
    args.retrieval_id, keeping any files that other unfinished retrievals
    still need (unless --force) and never touching files in the CEDA
    archive. With --dryrun just report what would be deleted.
    """
    logger.debug('Starting delete_request.py for retrieval {}'.format(
        args.retrieval_id))
    deletion_retrieval = match_one(RetrievalRequest, id=args.retrieval_id)
    if not deletion_retrieval:
        logger.error('Unable to find retrieval id {}'.format(
            args.retrieval_id))
        sys.exit(1)
    if deletion_retrieval.date_deleted:
        logger.error('Retrieval {} was already deleted, at {}.'.format(
            deletion_retrieval.id,
            deletion_retrieval.date_deleted.strftime('%Y-%m-%d %H:%M')))
        sys.exit(1)
    if not deletion_retrieval.data_finished:
        logger.error('Retrieval {} is not marked as finished.'.format(
            deletion_retrieval.id))
        sys.exit(1)
    problems_encountered = False
    directories_found = []
    base_output_dir = Settings.get_solo().base_output_dir
    # loop through all of the data requests in this retrieval
    for data_req in deletion_retrieval.data_request.all():
        online_req_files = data_req.datafile_set.filter(
            online=True, directory__isnull=False)
        files_to_delete = date_filter_files(online_req_files,
                                            deletion_retrieval.start_year,
                                            deletion_retrieval.end_year)
        if files_to_delete is None:
            continue
        # don't (try to) delete anything that's in the CEDA archive.
        # BUG FIX: exclude() returns a new QuerySet and its result was
        # previously discarded, so archived files were not being excluded.
        # The exclusion is applied here, before any difference() calls,
        # because filter()/exclude() aren't supported on combined QuerySets.
        files_to_delete = files_to_delete.exclude(
            directory__startswith=CEDA_ARCHIVE)
        if not args.force:
            # find any other retrieval requests that still need this data
            other_retrievals = RetrievalRequest.objects.filter(
                data_request=data_req, data_finished=False)
            # loop through the retrieval requests that still need this data
            # request
            for ret_req in other_retrievals:
                ret_online_files = data_req.datafile_set.filter(
                    online=True, directory__isnull=False)
                ret_filtered_files = date_filter_files(ret_online_files,
                                                       ret_req.start_year,
                                                       ret_req.end_year)
                if ret_filtered_files is None:
                    continue
                # remove from the list of files to delete the ones that we
                # have just found are still needed
                files_to_delete = files_to_delete.difference(
                    ret_filtered_files)
                # list the parts of the data request that are still required
                logger.debug("{} {} to {} won't be deleted".format(
                    data_req, ret_req.start_year, ret_req.end_year))
        # do the deleting
        if args.dryrun:
            logger.debug('{} {} files can be deleted.'.format(
                data_req, files_to_delete.distinct().count()))
        else:
            logger.debug('{} {} files will be deleted.'.format(
                data_req, files_to_delete.distinct().count()))
            for data_file in files_to_delete:
                # remember the directory; it's set to None below on success
                old_file_dir = data_file.directory
                try:
                    os.remove(os.path.join(data_file.directory,
                                           data_file.name))
                except OSError as exc:
                    logger.error(str(exc))
                    problems_encountered = True
                else:
                    if data_file.directory not in directories_found:
                        directories_found.append(data_file.directory)
                    data_file.online = False
                    data_file.directory = None
                    data_file.save()
                # if a symbolic link exists from the base output directory
                # then delete this too
                if not old_file_dir.startswith(base_output_dir):
                    sym_link_dir = os.path.join(
                        base_output_dir, construct_drs_path(data_file))
                    sym_link = os.path.join(sym_link_dir, data_file.name)
                    if not os.path.islink(sym_link):
                        logger.error(
                            "Expected {} to be a link but it isn't. "
                            "Leaving this file in place.".format(sym_link))
                        problems_encountered = True
                    else:
                        try:
                            os.remove(sym_link)
                        except OSError as exc:
                            logger.error(str(exc))
                            problems_encountered = True
                        else:
                            if sym_link_dir not in directories_found:
                                directories_found.append(sym_link_dir)
    if not args.dryrun:
        # delete any empty directories
        for directory in directories_found:
            if not os.listdir(directory):
                delete_drs_dir(directory)
        # set date_deleted in the db
        if not problems_encountered:
            deletion_retrieval.date_deleted = timezone.now()
            deletion_retrieval.save()
        else:
            logger.error(
                'Errors were encountered and so retrieval {} has not '
                'been marked as deleted. All possible files have been '
                'deleted.'.format(args.retrieval_id))
    logger.debug('Completed delete_request.py for retrieval {}'.format(
        args.retrieval_id))
def main(args):
    """
    Main entry point.

    Move the selected MPI data requests to NEW_VERSION: rename each online
    file into the DRS directory for the new version and update the sym
    link from the base output directory when the file lives in a
    different group workspace.
    """
    dreqs1 = DataRequest.objects.filter(
        climate_model__short_name='MPI-ESM1-2-XR',
        experiment__short_name='highresSST-present',
        variable_request__cmor_name__in=['hus7h', 'ta7h', 'ua7h']
    )
    dreqs2 = DataRequest.objects.filter(
        climate_model__short_name__in=['MPI-ESM1-2-HR', 'MPI-ESM1-2-XR'],
        experiment__short_name='highresSST-present',
        variable_request__table_name='Amon',
        variable_request__cmor_name='tas'
    )
    dreqs = dreqs1 | dreqs2
    logger.debug(f'Found {dreqs.count()} data requests')
    for dreq in dreqs:
        logger.debug(f'Processing {dreq}')
        old_directories = []
        for df in dreq.datafile_set.order_by('name'):
            if not df.online:
                logger.error(f'Not online {df.name}')
                continue
            if df.version == NEW_VERSION:
                logger.warning(f'Already at {NEW_VERSION} {df.name}')
                continue
            # save the sym link directory before we make any changes.
            # BUG FIX: sym links only exist for files stored outside the
            # base output directory's group workspace; without this guard
            # the code below would try to create a link from a file onto
            # itself and fail with FileExistsError.
            if not is_same_gws(BASE_OUTPUT_DIR, df.directory):
                old_sym_link_dir = os.path.join(BASE_OUTPUT_DIR,
                                                construct_drs_path(df))
            # now get back to updating the version
            df.version = NEW_VERSION
            gws = get_gws(df.directory)
            new_dir = os.path.join(gws, construct_drs_path(df))
            old_directory = df.directory
            if not os.path.exists(new_dir):
                os.mkdir(new_dir)
            os.rename(os.path.join(df.directory, df.name),
                      os.path.join(new_dir, df.name))
            df.directory = new_dir
            df.save()
            if old_directory not in old_directories:
                old_directories.append(old_directory)
            # Update any sym links too
            if not is_same_gws(BASE_OUTPUT_DIR, df.directory):
                sym_link_path = os.path.join(old_sym_link_dir, df.name)
                if os.path.lexists(sym_link_path):
                    if os.path.islink(sym_link_path):
                        os.remove(sym_link_path)
                        if old_sym_link_dir not in old_directories:
                            old_directories.append(old_sym_link_dir)
                sym_link_dir = os.path.join(BASE_OUTPUT_DIR,
                                            construct_drs_path(df))
                if not os.path.exists(sym_link_dir):
                    os.makedirs(sym_link_dir)
                sym_link_path = os.path.join(sym_link_dir, df.name)
                os.symlink(os.path.join(df.directory, df.name),
                           sym_link_path)
        # tidy up any directories emptied by the moves
        for directory in old_directories:
            if not os.listdir(directory):
                delete_drs_dir(directory)
            else:
                logger.error(f'Not empty {directory}')
def main(args):
    """
    Main entry point.

    Move the CMCC sea-ice (SI*) data requests to NEW_VERSION: rename each
    online file into the DRS directory for the new version and update the
    sym link from the base output directory when the file lives in a
    different group workspace.
    """
    dreqs_hr = DataRequest.objects.filter(
        climate_model__short_name='CMCC-CM2-HR4',
        experiment__short_name__in=['hist-1950', 'control-1950'],
        variable_request__table_name__startswith='SI',
        datafile__isnull=False).distinct()
    dreqs_vhr = DataRequest.objects.filter(
        climate_model__short_name='CMCC-CM2-VHR4',
        experiment__short_name='hist-1950',
        variable_request__table_name__startswith='SI',
        datafile__isnull=False).distinct()
    dreqs = dreqs_hr | dreqs_vhr
    logger.debug(f'Found {dreqs.count()} data requests')
    for dreq in dreqs:
        logger.debug(f'Processing {dreq}')
        old_directories = []
        for df in dreq.datafile_set.order_by('name'):
            if not df.online:
                logger.error(f'Not online {df.name}')
                continue
            if df.version == NEW_VERSION:
                logger.warning(f'Already at {NEW_VERSION} {df.name}')
                continue
            # save the sym link directory before we make any changes
            # (links only exist for files outside the base gws)
            if not is_same_gws(BASE_OUTPUT_DIR, df.directory):
                old_sym_link_dir = os.path.join(BASE_OUTPUT_DIR,
                                                construct_drs_path(df))
            # now get back to updating the version
            df.version = NEW_VERSION
            gws = get_gws(df.directory)
            new_dir = os.path.join(gws, construct_drs_path(df))
            old_directory = df.directory
            if not os.path.exists(new_dir):
                os.mkdir(new_dir)
            os.rename(os.path.join(df.directory, df.name),
                      os.path.join(new_dir, df.name))
            df.directory = new_dir
            df.save()
            if old_directory not in old_directories:
                old_directories.append(old_directory)
            # Update any sym links too
            if not is_same_gws(BASE_OUTPUT_DIR, df.directory):
                sym_link_path = os.path.join(old_sym_link_dir, df.name)
                if os.path.lexists(sym_link_path):
                    if os.path.islink(sym_link_path):
                        os.remove(sym_link_path)
                        if old_sym_link_dir not in old_directories:
                            old_directories.append(old_sym_link_dir)
                sym_link_dir = os.path.join(BASE_OUTPUT_DIR,
                                            construct_drs_path(df))
                if not os.path.exists(sym_link_dir):
                    os.makedirs(sym_link_dir)
                sym_link_path = os.path.join(sym_link_dir, df.name)
                os.symlink(os.path.join(df.directory, df.name),
                           sym_link_path)
        # tidy up any directories emptied by the moves
        for directory in old_directories:
            if not os.listdir(directory):
                delete_drs_dir(directory)
            else:
                logger.error(f'Not empty {directory}')
def main(args):
    """
    Main entry point.

    Delete from disk the EC-Earth3-HR spinup-1950 radiation variables that
    came from the listed data submissions, mark the files offline and flag
    them for replacement.
    """
    # 'cmor-name_table-name' pairs to delete
    var_tables = [
        'rlus_3hr',
        'rsus_3hr',
        'rsuscs_3hr',
        'rlut_E3hr',
        'rlutcs_E3hr',
        'rsut_E3hr',
        'rlus_day',
        'rlut_day',
        'rsus_day',
        'rlutcs_CFday',
        'rsuscs_CFday',
        'rsut_CFday',
        'rsutcs_CFday',
        'rlus_Amon',
        'rlut_Amon',
        'rlutcs_Amon',
        'rsus_Amon',
        'rsuscs_Amon',
        'rsut_Amon',
        'rsutcs_Amon'
    ]
    # only files from these incoming submissions are affected
    submissions = [
        '/group_workspaces/jasmin2/primavera4/upload/EC-Earth-Consortium/EC-Earth-3-HR/incoming/v20171110',
        '/group_workspaces/jasmin2/primavera4/upload/EC-Earth-Consortium/EC-Earth-3-HR/incoming/v20171111',
        '/group_workspaces/jasmin2/primavera4/upload/EC-Earth-Consortium/EC-Earth-3-HR/incoming/v20171112',
        '/group_workspaces/jasmin2/primavera4/upload/EC-Earth-Consortium/EC-Earth-3-HR/incoming/v20171116',
        '/group_workspaces/jasmin2/primavera4/upload/EC-Earth-Consortium/EC-Earth-3-HR/incoming/v20171024',
        '/group_workspaces/jasmin2/primavera4/upload/EC-Earth-Consortium/EC-Earth-3-HR/incoming/v20171027',
        '/group_workspaces/jasmin2/primavera4/upload/EC-Earth-Consortium/EC-Earth-3-HR/incoming/v20171101',
        '/group_workspaces/jasmin2/primavera4/upload/EC-Earth-Consortium/EC-Earth-3-HR/incoming/v20171114',
        '/group_workspaces/jasmin2/primavera4/upload/EC-Earth-Consortium/EC-Earth-3-HR/incoming/v20171115',
        '/group_workspaces/jasmin2/primavera4/upload/EC-Earth-Consortium/EC-Earth-3-HR/incoming/v20171113'
    ]
    models = ['EC-Earth3-HR']
    experiment = 'spinup-1950'
    for var_table in var_tables:
        # split 'var_table' into its two components at the first underscore
        var, __, table = var_table.partition('_')
        for model in models:
            query_set = DataFile.objects.filter(
                data_request__climate_model__short_name=model,
                data_request__experiment__short_name=experiment,
                variable_request__table_name=table,
                variable_request__cmor_name=var,
                data_submission__incoming_directory__in=submissions
            )
            logger.debug('{} {} {} {}'.format(model, table, var,
                                              query_set.count()))
            # directories that may have been emptied by the deletions
            directories_found = []
            for df in query_set:
                if df.online:
                    try:
                        os.remove(os.path.join(df.directory, df.name))
                    except OSError as exc:
                        # abort on the first failure
                        logger.error(str(exc))
                        sys.exit(1)
                    else:
                        if df.directory not in directories_found:
                            directories_found.append(df.directory)
                # mark the file as no longer on disk
                df.online = False
                df.directory = None
                df.save()
            # remove any DRS directories that are now empty
            for directory in directories_found:
                if not os.listdir(directory):
                    delete_drs_dir(directory)
            replace_files(query_set)
def main():
    """
    Main entry point.

    Move the selected HadGEM3-GC31-HH data requests to NEW_VERSION_STRING:
    rename each online file into the DRS directory for the new version and
    update the sym link from the base output directory when the file lives
    in a different group workspace.
    """
    dreqs = DataRequest.objects.filter(
        climate_model__short_name='HadGEM3-GC31-HH',
        # experiment__short_name__in=['control-1950', 'hist-1950',
        #                             'highres-future'],
        # variable_request__table_name__in=['SImon', 'SIday', 'PrimSIday'],
        experiment__short_name='hist-1950',
        variable_request__table_name='SImon',
        variable_request__cmor_name='sisnhc',
        datafile__isnull=False).distinct().order_by(
        'experiment__short_name',
        'variable_request__table_name',
        'variable_request__cmor_name')
    num_dreqs = dreqs.count()
    logger.info(f'{num_dreqs} data requests found')
    for dreq in dreqs:
        logger.info(str(dreq))
        # all files in a dataset share a DRS path, so take it from the
        # first file before the version is updated
        old_drs_path = construct_drs_path(dreq.datafile_set.first())
        dreq.datafile_set.update(version=NEW_VERSION_STRING)
        for df in dreq.datafile_set.order_by('name'):
            if not df.online:
                logger.error(f'File not online {df.name}')
                continue
            old_dir = df.directory
            old_path = os.path.join(old_dir, df.name)
            if not os.path.exists(old_path):
                logger.error(f'File not found {old_path}')
                continue
            new_dir = os.path.join(get_gws(df.directory),
                                   construct_drs_path(df))
            if df.directory != new_dir:
                if not os.path.exists(new_dir):
                    os.makedirs(new_dir)
                os.rename(old_path, os.path.join(new_dir, df.name))
                df.directory = new_dir
                df.save()
                # Delete original dir if it's now empty
                if not os.listdir(old_dir):
                    delete_drs_dir(old_dir)
                # Update symbolic links on primavera5
                if not get_gws(df.directory) == BASE_OUTPUT_DIR:
                    old_link_dir = os.path.join(BASE_OUTPUT_DIR,
                                                old_drs_path)
                    old_link_path = os.path.join(old_link_dir, df.name)
                    # BUG FIX: lexists() must be used rather than exists();
                    # the link's target has just been renamed, so the link
                    # is broken and exists() would follow it, return False
                    # and leave the link unreplaced
                    if not os.path.lexists(old_link_path):
                        logger.error(f'Link not found {old_link_path}')
                        continue
                    if not os.path.islink(old_link_path):
                        logger.error(f'Not sym link {old_link_path}')
                        continue
                    os.remove(old_link_path)
                    new_link_dir = os.path.join(BASE_OUTPUT_DIR,
                                                construct_drs_path(df))
                    new_link_path = os.path.join(new_link_dir, df.name)
                    if not os.path.exists(new_link_dir):
                        os.makedirs(new_link_dir)
                    os.symlink(os.path.join(new_dir, df.name),
                               new_link_path)
                    if not os.listdir(old_link_dir):
                        delete_drs_dir(old_link_dir)