def main(args):
    """
    Main entry point
    """
    logger.debug('Starting file structure scan.')
    for nc_file in ilist_files(args.directory):
        nc_file_name = os.path.basename(nc_file)
        db_files = DataFile.objects.filter(name=nc_file_name)
        if db_files.count() == 0:
            logger.error('File not found in database: {}'.format(nc_file))
        elif db_files.count() > 1:
            logger.error('{} entries found in database for file: {}'.
                         format(db_files.count(), nc_file))
        else:
            db_file = db_files.first()
            size_on_disk = os.path.getsize(nc_file)
            if size_on_disk != db_file.size:
                # Size disagrees with the database record, so treat the copy
                # on disk as corrupt: mark it offline and delete it.
                logger.info('File %s has size %d', db_file.name, size_on_disk)
                db_file.online = False
                db_file.directory = None
                db_file.save()
                os.remove(nc_file)
                # If the file lived outside the base group workspace there is
                # a symlink back from BASE_OUTPUT_DIR; remove that too.
                if not is_same_gws(nc_file, BASE_OUTPUT_DIR):
                    sym_link_path = os.path.join(BASE_OUTPUT_DIR,
                                                 construct_drs_path(db_file),
                                                 db_file.name)
                    try:
                        if os.path.exists(sym_link_path):
                            os.remove(sym_link_path)
                    except OSError:
                        logger.error('Unable to delete sym link %s',
                                     sym_link_path)
def test_ilist_files_default_suffix(self):
    """With no suffix given, ilist_files yields only the .nc files."""
    found = sorted(ilist_files(self.temp_dir))
    expected = sorted(
        self.temp_dir.joinpath(name).as_posix()
        for name in ['file1.nc', 'dir1/file3.nc', 'dir1/file4.nc']
    )
    self.assertEqual(found, expected)
def test_ilist_files_ignore_symlinks(self):
    """With ignore_symlinks=True, symlinked files are omitted."""
    found = sorted(ilist_files(self.temp_dir, ignore_symlinks=True))
    expected = sorted(
        self.temp_dir.joinpath(name).as_posix()
        for name in ['file1.nc', 'dir1/file3.nc']
    )
    self.assertEqual(found, expected)
def main(args):
    """
    Main entry point.

    Move each extracted file back into its DRS location in a group
    workspace, verify its checksum against the DMT record first, and
    create a symlink from the base output directory when the file lands
    in a different group workspace.

    :param argparse.Namespace args: parsed arguments; ``args.top_dir`` is
        the directory tree containing the extracted files.
    """
    base_dir = Settings.get_solo().base_output_dir
    for extracted_file in ilist_files(args.top_dir):
        found_name = os.path.basename(extracted_file)
        try:
            data_file = DataFile.objects.get(name=found_name)
        except django.core.exceptions.ObjectDoesNotExist:
            logger.warning('Cannot find DMT entry. Skipping {}'.
                           format(extracted_file))
            continue
        # checksum_set.first() is None when no checksum has been recorded;
        # skip rather than crash with an AttributeError.
        expected_checksum = data_file.checksum_set.first()
        if expected_checksum is None:
            logger.warning('No checksum in DMT. Skipping {}'.
                           format(found_name))
            continue
        found_checksum = adler32(extracted_file)
        if found_checksum != expected_checksum.checksum_value:
            logger.warning("Checksum doesn't match. Skipping {}".
                           format(found_name))
            continue
        dest_dir = os.path.join(get_gws_any_dir(extracted_file), 'stream1',
                                construct_drs_path(data_file))
        dest_path = os.path.join(dest_dir, found_name)
        if os.path.exists(dest_path):
            logger.warning('Skipping {} as it already exists at {}'.
                           format(found_name, dest_path))
            continue
        # create the directory if it doesn't exist (exist_ok avoids a race
        # between the exists() test and makedirs())
        os.makedirs(dest_dir, exist_ok=True)
        os.rename(extracted_file, dest_path)
        # create a link from the base dir
        if not is_same_gws(dest_path, base_dir):
            link_dir = os.path.join(base_dir, construct_drs_path(data_file))
            link_path = os.path.join(link_dir, data_file.name)
            os.makedirs(link_dir, exist_ok=True)
            os.symlink(dest_path, link_path)
        data_file.online = True
        data_file.directory = dest_dir
        data_file.save()
def main(args):
    """
    Main entry point.

    Walk ``args.top_path`` and delete (or, in dry-run mode, just report)
    any file whose DMT entry says it now lives in the CEDA archive
    (a directory under ``/badc``).

    :param argparse.Namespace args: parsed arguments; uses ``args.top_path``
        and the boolean ``args.dryrun``.
    """
    for path in ilist_files(args.top_path, ignore_symlinks=True):
        data_file = Path(path)
        try:
            django_file = DataFile.objects.get(name=data_file.name)
        except django.core.exceptions.ObjectDoesNotExist:
            logger.debug(f'Not in DMT: {path}')
            continue
        # directory is None for files marked offline; guard so that
        # None.startswith() doesn't raise an AttributeError.
        if django_file.directory and django_file.directory.startswith('/badc'):
            if not args.dryrun:
                action = 'Deleting'
                data_file.unlink()
                delete_drs_dir(str(data_file.parent))
            else:
                action = 'Deletable'
            logger.debug(f'{action}: {path}')
def scan_file_structure(directory):
    """
    Start the scan of the file structure.

    :param str directory: the top level directory to scan
    """
    logger.debug('Starting file structure scan.')
    for nc_file in ilist_files(directory):
        nc_file_name = os.path.basename(nc_file)
        db_files = DataFile.objects.filter(name=nc_file_name)
        if db_files.count() == 0:
            logger.error('File not found in database: {}'.format(nc_file))
            continue
        if db_files.count() > 1:
            logger.error('{} entries found in database for file: {}'.
                         format(db_files.count(), nc_file))
            continue
        db_file = db_files.first()
        # Check for broken symbolic links
        # os.path.exists() returns False for broken links
        if not os.path.exists(nc_file):
            os.remove(nc_file)
            if db_file.directory:
                db_path = os.path.join(db_file.directory, db_file.name)
                if os.path.exists(db_path):
                    # The real file still exists, so recreate the link.
                    logger.warning('Replacing broken link for file {}'.
                                   format(db_file.name))
                    os.symlink(os.path.join(db_file.directory, db_file.name),
                               nc_file)
                else:
                    # Nothing to link to any more; record the file as offline.
                    logger.warning('Removing broken link for file {}'.
                                   format(db_file.name))
                    if db_file.online:
                        db_file.online = False
                        db_file.save()
            continue
        actual_path = os.path.realpath(nc_file)
        actual_dir = os.path.dirname(actual_path)
        # Re-read the record in case it changed while scanning.
        db_file.refresh_from_db()
        if not db_file.online:
            logger.warning('File status changed to online: {}'.
                           format(nc_file))
            db_file.online = True
            db_file.directory = actual_dir
            db_file.save()
        if db_file.directory is None:
            db_file.directory = actual_dir
            db_file.save()
        if db_file.directory != actual_dir:
            if db_file.directory.startswith(CEDA_BASE):
                # This file is believed to be in the archive
                logger.warning('File {} is in the CEDA archive according '
                               'to the database.'.format(nc_file))
            else:
                logger.warning('Directory for file {} changed from {} '
                               'to {}'.format(nc_file_name,
                                              db_file.directory,
                                              actual_dir))
                db_file.directory = actual_dir
                db_file.save()
    logger.debug('Completed file structure scan.')
def main(args):
    """
    Main entry point
    """
    def _obj_ref(class_name, **kwargs):
        # Serialised reference to a pdata_app model instance; key order is
        # kept identical to the hand-written literals it replaces.
        return {
            "__module__": "pdata_app.models",
            "__kwargs__": kwargs,
            "__class__": class_name,
        }

    for json_file in ilist_files(INPUT_JSON_DIR, '.json'):
        with open(json_file, 'r') as fh:
            metadata = json.load(fh)
        for nc_file in metadata:
            nc_file['activity_id'] = _obj_ref('ActivityId',
                                              short_name='HighResMIP')
            nc_file['experiment'] = _obj_ref('Experiment',
                                             short_name='highresSST-present')
            nc_file['climate_model'] = _obj_ref('ClimateModel',
                                                short_name='HadGEM3-GC31-LM')
            nc_file['institute'] = _obj_ref('Institute', short_name='MOHC')
            filename = nc_file['basename']
            var_name, table_name = filename.split('_')[0:2]
            # Prim* tables belong to the PRIMAVERA project; the rest to CMIP6.
            project_name = ('PRIMAVERA' if table_name.startswith('Prim')
                            else 'CMIP6')
            nc_file['project'] = _obj_ref('Project', short_name=project_name)
            cmor_name = _get_cmor_name(var_name, table_name)
            nc_file['variable'] = _obj_ref('VariableRequest',
                                           cmor_name=cmor_name,
                                           table_name=table_name)
            nc_file['data_request'] = _obj_ref(
                'DataRequest',
                variable_request__cmor_name=cmor_name,
                variable_request__table_name=table_name,
                climate_model__short_name='HadGEM3-GC31-LM',
                experiment__short_name='highresSST-present',
                institute__short_name='MOHC'
            )
        with open(os.path.join(OUTPUT_JSON_DIR,
                               os.path.basename(json_file)), 'w') as fh:
            json.dump(metadata, fh, indent=4)