def run_h5_convert(results_type=None):
    tantalus_api = TantalusApi()

    remote_storage_client = tantalus_api.get_storage_client(remote_storage_name)

    if results_type is not None:
        results_list = tantalus_api.list("resultsdataset", results_type=results_type)
        logging.info('converting results with results type {}'.format(results_type))
    else:
        results_list = tantalus_api.list("resultsdataset")
        logging.info('converting all results')

    for result in results_list:
        logging.info('processing results dataset {}'.format(result['id']))

        try:
            file_instances = tantalus_api.get_dataset_file_instances(
                result["id"],
                "resultsdataset",
                remote_storage_name,
            )

            existing_filenames = set(
                [i['file_resource']['filename'] for i in file_instances])

            found_csv_yaml = False
            for existing_filename in existing_filenames:
                if existing_filename.endswith('.csv.gz.yaml'):
                    found_csv_yaml = True
                    break

            if found_csv_yaml:
                logging.info('found filename {}, skipping conversion'.format(
                    existing_filename))
            else:
                print(result["id"])
                logging.info('no yaml found')

        except NotFoundError:
            logging.exception('no files found for conversion')

        except KeyboardInterrupt:
            raise

        except Exception:
            logging.exception('conversion failed')
def main(
        storage_name,
        dataset_type=None,
        dataset_id=None,
        tag_name=None,
        all_file_instances=False,
        dry_run=False,
        fix_corrupt=False,
        remove_missing=False,
):
    logging.info('checking integrity of storage {}'.format(storage_name))

    tantalus_api = TantalusApi()

    if all_file_instances:
        file_instances = tantalus_api.list('file_instance', storage__name=storage_name)
    else:
        file_instances = get_dataset_file_instances(
            tantalus_api, storage_name, dataset_type,
            dataset_id=dataset_id, tag_name=tag_name)

    for file_instance in file_instances:
        logging.info('checking file instance {} with path {}'.format(
            file_instance['id'], file_instance['filepath']))

        if file_instance['is_deleted']:
            logging.info('file instance {} marked as deleted'.format(
                file_instance['id']))
            continue

        file_corrupt = False
        file_missing = False
        try:
            tantalus_api.check_file(file_instance)
        except DataCorruptionError:
            file_corrupt = True
            logging.exception('check file failed')
        except DataMissingError:
            file_missing = True
            logging.exception('missing file')

        if file_corrupt and fix_corrupt:
            logging.info('updating file instance {} with path {}'.format(
                file_instance['id'], file_instance['filepath']))
            if not dry_run:
                tantalus_api.update_file(file_instance)

        if file_missing and remove_missing:
            logging.info('deleting file instance {} with path {}'.format(
                file_instance['id'], file_instance['filepath']))
            if not dry_run:
                file_instance = tantalus_api.update(
                    'file_instance',
                    id=file_instance['id'],
                    is_deleted=True,
                )
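# Hypothetical command-line wrapper for the integrity check above, included as a
# usage sketch; the flag names are illustrative assumptions, not the script's
# actual interface.
if __name__ == '__main__':
    import argparse
    import logging

    logging.basicConfig(level=logging.INFO)

    parser = argparse.ArgumentParser(description='check file integrity on a storage')
    parser.add_argument('storage_name')
    parser.add_argument('--dataset-type')
    parser.add_argument('--dataset-id', type=int)
    parser.add_argument('--tag-name')
    parser.add_argument('--all-file-instances', action='store_true')
    parser.add_argument('--dry-run', action='store_true')
    parser.add_argument('--fix-corrupt', action='store_true')
    parser.add_argument('--remove-missing', action='store_true')
    args = parser.parse_args()

    main(
        args.storage_name,
        dataset_type=args.dataset_type,
        dataset_id=args.dataset_id,
        tag_name=args.tag_name,
        all_file_instances=args.all_file_instances,
        dry_run=args.dry_run,
        fix_corrupt=args.fix_corrupt,
        remove_missing=args.remove_missing,
    )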
def fix_bams(jira_ticket=None, dry_run=False):
    tantalus_api = TantalusApi()

    analyses_list = []
    storage_name = "singlecellresults"

    if jira_ticket is not None:
        analyses_list.append(tantalus_api.get(
            'analysis',
            jira_ticket=jira_ticket,
            analysis_type__name="align",
            status="complete",
        ))
    else:
        # Get all completed align analyses run with specific versions.
        # The bams associated with these analyses are in the wrong storage account.
        for version in ('v0.5.2', 'v0.5.3'):
            analyses = tantalus_api.list(
                'analysis',
                analysis_type__name="align",
                status="complete",
                version=version,
            )
            analyses_list += [a for a in analyses]

    for analysis in analyses_list:
        jira_ticket = analysis["jira_ticket"]

        filename = f'{jira_ticket}/results/bams/metadata.yaml'
        logging.info(f'adding file {filename}')
        if not dry_run:
            file_instance, file_resource = tantalus_api.add_file(storage_name, filename)

        # Get all bam datasets associated with the jira ticket
        bam_datasets = tantalus_api.list(
            "sequencedataset",
            dataset_type="BAM",
            analysis__jira_ticket=jira_ticket,
        )

        for dataset in bam_datasets:
            dataset_id = dataset['id']
            logging.info(f'adding file to dataset {dataset_id}')

            if not dry_run:
                file_resource_ids = dataset['file_resources']
                # list.append returns None, so append in place instead of reassigning
                file_resource_ids.append(file_resource['id'])
                tantalus_api.update(
                    'sequencedataset',
                    id=dataset['id'],
                    file_resources=file_resource_ids,
                )
def download_datasets(results_type, from_storage_name, to_storage_name, dataset_id=None, jira_ticket=None):
    ''' Download a set of datasets by type. '''

    tantalus_api = TantalusApi()

    if dataset_id is not None:
        datasets = tantalus_api.list('results', id=dataset_id)
    elif jira_ticket is not None:
        datasets = tantalus_api.list('results', results_type=results_type, analysis__jira_ticket=jira_ticket)
    else:
        datasets = tantalus_api.list('results', results_type=results_type)

    dataset_ids = list()
    for dataset in datasets:
        dataset_ids.append(dataset['id'])

    # Download most recent first
    dataset_ids = reversed(sorted(dataset_ids))

    failed = False
    for dataset_id in dataset_ids:
        try:
            transfer_dataset(tantalus_api, dataset_id, 'resultsdataset', from_storage_name, to_storage_name)
        except Exception:
            logging.exception(f'failed to download {dataset_id}')
            failed = True

    if failed:
        raise Exception('one or more downloads failed')
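# Example invocation sketch for download_datasets; the results type and storage
# names below are illustrative assumptions, not values defined by this script.
if __name__ == '__main__':
    import logging

    logging.basicConfig(level=logging.INFO)

    download_datasets(
        'align',              # results_type (hypothetical)
        'singlecellresults',  # from_storage_name (hypothetical)
        'singlecellblob',     # to_storage_name (hypothetical)
        jira_ticket=None,
    )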
def glob_microscope_data(filepaths, storage_name, tag_name=None, update=False, remote_storage_name=None):
    tantalus_api = TantalusApi()

    chip_paths = collections.defaultdict(set)
    chip_libraries = collections.defaultdict(set)

    for filepath in filepaths:
        match = re.match(
            r".*/single_cell_indexing/Microscope/(\d+)_(A\d+[A-Z]*)", filepath)
        if match is None:
            logging.warning('skipping malformed {}'.format(filepath))
            continue

        fields = match.groups()
        date = fields[0]
        chip_id = fields[1]

        libraries = list(
            tantalus_api.list('dna_library', library_id__startswith=chip_id))

        if len(libraries) == 0:
            logging.error('skipping file with unknown library {}'.format(filepath))
            continue

        library_ids = set([library['library_id'] for library in libraries])

        chip_paths[chip_id].add(filepath)
        chip_libraries[chip_id].update(library_ids)

    for chip_id in chip_paths:
        add_microscope_results(
            chip_paths[chip_id],
            chip_id,
            chip_libraries[chip_id],
            storage_name,
            tag_name=tag_name,
            update=update,
            remote_storage_name=remote_storage_name,
        )
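# Self-contained check of the Microscope path pattern used above, to make the
# expected directory layout explicit; the example path and chip id are invented.
if __name__ == '__main__':
    import re

    example = '/archive/single_cell_indexing/Microscope/20180101_A96213A/image_001.tif'
    match = re.match(r".*/single_cell_indexing/Microscope/(\d+)_(A\d+[A-Z]*)", example)
    assert match is not None
    date, chip_id = match.groups()
    print(date, chip_id)  # -> 20180101 A96213A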
def catalog_cellenone_datasets(storage_name, tag_name=None, update=False, remote_storage_name=None):
    tantalus_api = TantalusApi()

    for dataset in tantalus_api.list('resultsdataset', results_type='CELLENONE'):
        # HACK: Check for metadata yaml file in dataset
        found_metadata = False
        try:
            file_resource = tantalus_api.get(
                'file_resource',
                resultsdataset__id=dataset['id'],
                filename__endswith='metadata.yaml')
            found_metadata = True
        except NotFoundError:
            logging.info(f"no metadata for dataset {dataset['id']}")

        if found_metadata:
            logging.info(f"found metadata for dataset {dataset['id']}, skipping")
            continue

        try:
            process_cellenone_dataset(
                dataset,
                storage_name,
                tag_name=tag_name,
                update=update,
                remote_storage_name=remote_storage_name)
        except KeyboardInterrupt:
            raise
        except Exception:
            logging.exception(f"catalog failed for dataset {dataset['id']}")
from dbclients.tantalus import TantalusApi
from dbclients.colossus import ColossusApi

import logging

tantalus_api = TantalusApi()
colossus_api = ColossusApi()

if __name__ == '__main__':
    print("STARTING")
    colossus_analyses = colossus_api.list('analysis_information')
    tantalus_analyses = tantalus_api.list('analysis', analysis_type__name="align")

    analysis_lane_dict = {}

    for analysis in tantalus_analyses:
        lane_set = set()
        for input_dataset in analysis['input_datasets']:
            dataset = tantalus_api.get('sequencedataset', id=input_dataset)
            for lane in dataset['sequence_lanes']:
                lane_set.add(str(lane['flowcell_id']) + "_" + str(lane['lane_number']))

        analysis_lane_dict[analysis['name']] = lane_set

    print(analysis_lane_dict)

    for analysis in colossus_analyses:
        key = analysis['analysis_jira_ticket'] + '_align'
        if key in analysis_lane_dict:
            lanes = []
def main(
        storage_name,
        dry_run=False,
        check_remote=None,
):
    tantalus_api = TantalusApi()

    storage_client = tantalus_api.get_storage_client(storage_name)

    remote_storage_client = None
    if check_remote is not None:
        remote_storage_client = tantalus_api.get_storage_client(check_remote)

    file_instances = tantalus_api.list(
        'file_instance', storage__name=storage_name, is_deleted=True)

    # DEBUG: check whether we are getting back
    # consistent ordered results from tantalus
    file_instances = list(file_instances)
    file_instance_ids = set([f['id'] for f in file_instances])
    if len(file_instances) != len(file_instance_ids):
        raise Exception('received duplicate results from tantalus')

    logging.info('processing {} file instances'.format(len(file_instance_ids)))
    logging.info('processing the following file instances: {}'.format(
        str(file_instance_ids)))

    for file_instance in file_instances:
        file_resource = tantalus_api.get(
            'file_resource', id=file_instance['file_resource']['id'])
        all_file_instances = list(
            tantalus_api.list('file_instance', file_resource=file_resource['id']))

        logging.info('checking file instance {}, file resource {}, filepath {}'.format(
            file_instance['id'], file_resource['id'], file_instance['filepath']))

        sequencedatasets = tantalus_api.list(
            'sequencedataset', file_resources__id=file_resource['id'])
        resultsdatasets = tantalus_api.list(
            'resultsdataset', file_resources__id=file_resource['id'])

        sequencedataset_ids = list(set([a['id'] for a in sequencedatasets]))
        resultsdataset_ids = list(set([a['id'] for a in resultsdatasets]))

        logging.info(
            'file resource {} belongs to sequencedataset {} and resultsdatasets {}'.format(
                file_resource['id'], sequencedataset_ids, resultsdataset_ids))

        # Optionally check for a remote version
        if remote_storage_client:
            remote_instance = None
            for other_instance in file_resource['file_instances']:
                if other_instance['storage']['name'] == check_remote:
                    remote_instance = other_instance

            if not remote_instance:
                logging.info('not deleting file instance {}, no other instance'.format(
                    file_instance['id']))
                continue

            if remote_instance['is_deleted']:
                logging.info('not deleting file instance {}, other instance {} deleted'.format(
                    file_instance['id'], remote_instance['id']))
                continue

            if not remote_storage_client.exists(file_resource['filename']):
                logging.info('not deleting file instance {}, other instance {} does not exist'.format(
                    file_instance['id'], remote_instance['id']))
                continue

            logging.info('deletion ok for file instance {}, found other instance {}'.format(
                file_instance['id'], remote_instance['id']))

        # Delete the file from the filesystem
        logging.info('deleting file {}'.format(file_instance['filepath']))
        if not dry_run:
            try:
                storage_client.delete(file_resource['filename'])
            except FileNotFoundError:
                logging.exception('file already deleted')

        # Delete the instance model from tantalus
        logging.info('deleting file instance {}'.format(file_instance['id']))
        if not dry_run:
            tantalus_api.delete('file_instance', id=file_instance['id'])

        # If this is the only file instance for this file resource, delete the file resource
        if len(all_file_instances) == 1:
            assert all_file_instances[0]['id'] == file_instance['id']
            logging.info('deleting file resource {}'.format(file_resource['id']))
            if not dry_run:
                tantalus_api.delete('file_resource', id=file_resource['id'])
def check_indices(library_id=None):
    tantalus_api = TantalusApi()
    colossus_api = ColossusApi()

    if library_id is None:
        library_ids = set([a['pool_id'] for a in colossus_api.list('library')])
    else:
        library_ids = [library_id]

    for library_id in library_ids:
        # Get colossus sublibrary indices
        sublibraries = colossus_api.list('sublibraries', library__pool_id=library_id)
        colossus_indices = set(
            [a['primer_i7'] + '-' + a['primer_i5'] for a in sublibraries])

        datasets = tantalus_api.list(
            'sequence_dataset',
            library__library_id=library_id,
            library__library_type__name='SC_WGS',
            dataset_type='FQ',
        )

        lane_datasets = collections.defaultdict(list)

        for dataset in datasets:
            assert len(dataset['sequence_lanes']) == 1

            flowcell_lane = '_'.join([
                dataset['sequence_lanes'][0]['flowcell_id'],
                dataset['sequence_lanes'][0]['lane_number'],
            ])

            lane_datasets[flowcell_lane].append(dataset)

        for flowcell_lane in lane_datasets:
            # Get tantalus sublibraries and indices
            tantalus_indices = set()
            tantalus_dataset_ids = []
            tantalus_sequencing_centre = set()

            for dataset in lane_datasets[flowcell_lane]:
                file_resources = list(
                    tantalus_api.list('file_resource', sequencedataset__id=dataset['id']))
                tantalus_indices.update(
                    set([a['sequencefileinfo']['index_sequence'] for a in file_resources]))
                tantalus_dataset_ids.append(dataset['id'])
                tantalus_sequencing_centre.update(
                    [a['sequencing_centre'] for a in dataset['sequence_lanes']])

            assert len(tantalus_sequencing_centre) == 1
            tantalus_sequencing_centre = list(tantalus_sequencing_centre)[0]

            if len(colossus_indices - tantalus_indices) > 0:
                print(
                    'library {}, datasets {}, lane {}, sequencing_centre {}: {} in colossus but not tantalus'.format(
                        library_id, tantalus_dataset_ids, flowcell_lane,
                        tantalus_sequencing_centre,
                        len(colossus_indices - tantalus_indices)))

            if len(tantalus_indices - colossus_indices) > 0:
                print(
                    'library {}, datasets {}, lane {}, sequencing_centre {}: {} in tantalus but not colossus'.format(
                        library_id, tantalus_dataset_ids, flowcell_lane,
                        tantalus_sequencing_centre,
                        len(tantalus_indices - colossus_indices)))

            if tantalus_indices == colossus_indices:
                print(
                    'library {}, datasets {}, lane {}, sequencing_centre {}: OK'.format(
                        library_id, tantalus_dataset_ids, flowcell_lane,
                        tantalus_sequencing_centre))
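# Minimal illustration of the comparison performed above: indices are compared
# per lane as "i7-i5" strings. The sequences below are invented examples.
if __name__ == '__main__':
    colossus_indices = {'AAGGTT-CCTTAA', 'GGCCAA-TTAACC'}
    tantalus_indices = {'AAGGTT-CCTTAA'}
    print(len(colossus_indices - tantalus_indices), 'in colossus but not tantalus')
    print(len(tantalus_indices - colossus_indices), 'in tantalus but not colossus')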
def fix_bams(jira_ticket=None, dry_run=False):
    logging.info(f'dry run: {dry_run}')

    tantalus_api = TantalusApi()

    SC_WGS_BAM_DIR_TEMPLATE = os.path.join(
        'single_cell_indexing',
        'bam',
        '{library_id}',
        '{ref_genome}',
        '{aligner_name}',
        'numlanes_{number_lanes}',
        '{jira_ticket}',
    )

    reference_genome_map = {
        'HG19': 'grch37',
        'MM10': 'mm10',
    }

    analyses_list = []
    from_storage_name = "singlecellresults"
    to_storage_name = "singlecellblob"
    from_storage_client = tantalus_api.get_storage_client(from_storage_name)
    to_storage_client = tantalus_api.get_storage_client(to_storage_name)
    to_storage_id = tantalus_api.get('storage', name=to_storage_name)['id']

    if jira_ticket is not None:
        analyses_list.append(
            tantalus_api.get(
                'analysis',
                jira_ticket=jira_ticket,
                analysis_type__name="align",
                status="complete",
            ))
    else:
        # Get all completed align analyses run with specific versions.
        # The bams associated with these analyses are in the wrong storage account.
        for version in ('v0.5.2', 'v0.5.3', 'v0.5.4'):
            analyses = tantalus_api.list(
                'analysis',
                analysis_type__name="align",
                status="complete",
                version=version,
            )
            analyses_list += [a for a in analyses]

    for analysis in analyses_list:
        jira_ticket = analysis["jira_ticket"]
        print(f"moving bams for {jira_ticket}")

        # Get all bam datasets associated with the jira ticket
        bam_datasets = tantalus_api.list(
            "sequencedataset",
            dataset_type="BAM",
            analysis__jira_ticket=jira_ticket,
        )

        for dataset in bam_datasets:
            # Get number of lanes from dataset for use with filepath
            lanes = set()
            for sequence_lane in dataset['sequence_lanes']:
                lane = "{}_{}".format(sequence_lane['flowcell_id'], sequence_lane['lane_number'])
                lanes.add(lane)
            number_lanes = len(lanes)

            try:
                file_instances = tantalus_api.get_dataset_file_instances(
                    dataset["id"],
                    "sequencedataset",
                    from_storage_name,
                )
            except dbclients.tantalus.DataNotOnStorageError:
                logging.info(f'dataset {dataset["id"]} not on {from_storage_name}, skipping')
                continue

            for file_instance in file_instances:
                blobname = file_instance["file_resource"]["filename"]

                # Get url of source blob
                blob_url = from_storage_client.get_url(blobname)

                bam_filename = blobname.split("/bams/")[1]
                new_blobname = os.path.join(
                    SC_WGS_BAM_DIR_TEMPLATE.format(
                        library_id=dataset["library"]["library_id"],
                        ref_genome=reference_genome_map[dataset["reference_genome"]],
                        aligner_name=dataset["aligner"],
                        number_lanes=number_lanes,
                        jira_ticket=jira_ticket,
                    ),
                    bam_filename,
                )

                # Copy blob to desired storage account with new blobname
                blob_filepath = f"{to_storage_client.prefix}/{new_blobname}"
                logging.info(
                    f'copying {new_blobname} to storage {to_storage_name} from {blob_url} to {blob_filepath}')
                if not dry_run:
                    to_storage_client.blob_service.copy_blob(
                        container_name="data",
                        blob_name=new_blobname,
                        copy_source=blob_url,
                    )

                file_resource_id = file_instance['file_resource']['id']
                file_instance_id = file_instance['id']

                logging.info(
                    f'updating file resource {file_resource_id} to have filename {new_blobname}')
                if not dry_run:
                    tantalus_api.update('file_resource', id=file_resource_id, filename=new_blobname)

                logging.info(
                    f'updating file instance {file_instance_id} to have storage with id {to_storage_id}')
                if not dry_run:
                    tantalus_api.update('file_instance', id=file_instance_id, storage=to_storage_id)
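# Sketch of how the destination blob name is assembled from the template above;
# the library id, aligner name, ticket, and bam filename are invented examples.
if __name__ == '__main__':
    import os

    template = os.path.join(
        'single_cell_indexing', 'bam', '{library_id}', '{ref_genome}',
        '{aligner_name}', 'numlanes_{number_lanes}', '{jira_ticket}')

    new_blobname = os.path.join(
        template.format(
            library_id='A96213A',
            ref_genome='grch37',
            aligner_name='BWA_ALN_0_5_7',
            number_lanes=2,
            jira_ticket='SC-1234',
        ),
        'cell_0001.bam',
    )
    print(new_blobname)
    # single_cell_indexing/bam/A96213A/grch37/BWA_ALN_0_5_7/numlanes_2/SC-1234/cell_0001.bam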
def run_h5_convert(cache_dir, dataset_id=None, results_type=None, redo=False, dry_run=False, check_done=False):
    tantalus_api = TantalusApi()

    local_cache_client = tantalus_api.get_cache_client(cache_dir)
    remote_storage_client = tantalus_api.get_storage_client(remote_storage_name)

    if dataset_id is not None:
        results_list = [tantalus_api.get("resultsdataset", id=dataset_id)]
        logging.info('converting results with id {}'.format(dataset_id))
    elif results_type is not None:
        results_list = tantalus_api.list("resultsdataset", results_type=results_type)
        logging.info('converting results with results type {}'.format(results_type))
    else:
        results_list = tantalus_api.list("resultsdataset")
        logging.info('converting all results')

    for result in results_list:
        logging.info('processing results dataset {}'.format(result['id']))

        try:
            file_instances = tantalus_api.get_dataset_file_instances(
                result["id"],
                "resultsdataset",
                remote_storage_name,
            )

            existing_filenames = set(
                [i['file_resource']['filename'] for i in file_instances])

            found_csv_yaml = False
            for existing_filename in existing_filenames:
                # Destruct outputs csv.yaml directly, check non destruct files
                if 'destruct' in existing_filename:
                    continue
                if existing_filename.endswith('.csv.gz.yaml'):
                    found_csv_yaml = True
                    break

            if found_csv_yaml and check_done:
                logging.info('found filename {}, skipping conversion'.format(
                    existing_filename))
                continue

            file_resource_ids = []
            filepaths_to_clean = []

            for file_instance in file_instances:
                if not file_instance['file_resource']['filename'].endswith('.h5'):
                    continue

                datamanagement.transfer_files.cache_file(
                    tantalus_api, file_instance, cache_dir)

                h5_filepath = local_cache_client.get_url(
                    file_instance['file_resource']['filename'])
                filepaths_to_clean.append(h5_filepath)

                logging.info('converting {}'.format(h5_filepath))

                for key, csv_filepath in get_h5_csv_info(h5_filepath):
                    if not csv_filepath.startswith(cache_dir):
                        raise Exception('unexpected csv path {}'.format(csv_filepath))

                    csv_filename = csv_filepath[len(cache_dir):]
                    csv_filename = csv_filename.lstrip('/')

                    if csv_filename in existing_filenames and not redo:
                        logging.info('file {} already exists, not converting'.format(
                            csv_filename))
                        continue

                    if dry_run:
                        logging.info('would convert {}, key {} to {}'.format(
                            h5_filepath, key, csv_filepath))
                        continue

                    logging.info('converting {}, key {} to {}'.format(
                        h5_filepath, key, csv_filepath))
                    convert_h5(h5_filepath, key, csv_filepath)

                    yaml_filename = csv_filename + '.yaml'
                    yaml_filepath = csv_filepath + '.yaml'

                    fileinfo_to_add = [
                        (csv_filename, csv_filepath),
                        (yaml_filename, yaml_filepath),
                    ]

                    for filename, filepath in fileinfo_to_add:
                        logging.info('creating file {} from path {}'.format(
                            filename, filepath))

                        remote_storage_client.create(filename, filepath, update=redo)
                        remote_filepath = os.path.join(remote_storage_client.prefix, filename)

                        logging.info('adding file {} from path {}'.format(
                            filename, remote_filepath))

                        (file_resource, file_instance) = tantalus_api.add_file(
                            remote_storage_name, remote_filepath, update=True)  #redo)

                        file_resource_ids.append(file_resource["id"])
                        filepaths_to_clean.append(filepath)

            if len(file_resource_ids) == 0:
                logging.warning('no files added')
                continue

            logging.info('adding file resources {} to dataset {}'.format(
                file_resource_ids, result["id"]))

            tantalus_api.update(
                "resultsdataset",
                result["id"],
                file_resources=result["file_resources"] + file_resource_ids,
            )

            for filepath in filepaths_to_clean:
                logging.info('removing file {}'.format(filepath))
                os.remove(filepath)

        except NotFoundError:
            logging.exception('no files found for conversion')

        except KeyboardInterrupt:
            raise

        except Exception:
            logging.exception('conversion failed')
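# The convert_h5 and get_h5_csv_info helpers used above are defined elsewhere in
# this module. Below is a minimal sketch of what a conversion step of this kind
# looks like, assuming pandas and PyYAML are available and that each HDF5 key
# maps to one gzipped CSV plus a small sidecar YAML describing its columns; it
# is not the script's actual implementation.
import pandas as pd
import yaml


def convert_h5_sketch(h5_filepath, key, csv_filepath):
    # Read a single table out of the HDF5 store and write it as csv.gz
    data = pd.read_hdf(h5_filepath, key)
    data.to_csv(csv_filepath, index=False, compression='gzip')

    # Write a sidecar YAML describing the columns, mirroring the .csv.gz.yaml
    # files this script looks for when deciding whether to skip a dataset
    metadata = {
        'header': True,
        'columns': [{'name': c, 'dtype': str(data[c].dtype)} for c in data.columns],
    }
    with open(csv_filepath + '.yaml', 'w') as f:
        yaml.safe_dump(metadata, f, default_flow_style=False)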
from dbclients.tantalus import TantalusApi
from dbclients.colossus import ColossusApi

import time

if __name__ == '__main__':
    print("TANTALUS CREATING...")
    tantalus_api = TantalusApi()

    print("COLOSSUS CREATING...")
    colossus_api = ColossusApi()

    instances = tantalus_api.list("file_instance")

    for instance in instances:
        print(instance["filepath"])
import logging
import sys

from datamanagement.utils.gsc import GSCAPI
from datamanagement.utils.constants import LOGGING_FORMAT
from dbclients.tantalus import TantalusApi
from dbclients.basicclient import NotFoundError

if __name__ == '__main__':
    logging.basicConfig(format=LOGGING_FORMAT, stream=sys.stderr, level=logging.INFO)

    gsc_api = GSCAPI()
    tantalus_api = TantalusApi()

    # List of relevant libraries from GSC lanes
    lanes = list(tantalus_api.list('sequencing_lane', sequencing_centre='GSC'))

    libraries = set()
    for lane in lanes:
        library = tantalus_api.get('dna_library', id=lane['dna_library'])
        if library['library_type'] == 'WGS':
            libraries.add(library['library_id'])

    lane_fixes = []
    for library_id in libraries:
        infos = gsc_api.query("library?name={}".format(library_id))

        if len(infos) == 0:
            logging.warning('unable to find {}'.format(library_id))
def fix():
    tantalus_api = TantalusApi()

    datasets = list(
        tantalus_api.list(
            'sequence_dataset',
            dataset_type='BAM',
            library__library_type__name='WGS',
        ))

    for dataset in datasets:
        bams = {}
        bais = {}
        specs = {}

        for file_resource_id in dataset['file_resources']:
            file_resource = tantalus_api.get('file_resource', id=file_resource_id)

            if file_resource['filename'].endswith('.bam'):
                bams[file_resource_id] = file_resource['filename']
            elif file_resource['filename'].endswith('.spec'):
                specs[file_resource_id] = file_resource['filename']
            elif file_resource['filename'].endswith('.bam.bai'):
                bais[file_resource_id] = file_resource['filename']

        if len(bams) == 0 and len(specs) == 0:
            print(dataset['id'])

        elif len(bams) > 1:
            logging.info(f"fixing {dataset['name']}, {bams}")

            to_remove_bam_id = max(bams.keys())

            to_remove_bai_id = None
            for id_, bai in bais.items():
                if bai.startswith(bams[to_remove_bam_id]):
                    assert to_remove_bai_id is None
                    to_remove_bai_id = id_
                    break
            assert to_remove_bai_id is not None

            logging.info((to_remove_bam_id, bams[to_remove_bam_id],
                          to_remove_bai_id, bais[to_remove_bai_id]))

            new_file_resources = dataset['file_resources']
            new_file_resources.remove(to_remove_bam_id)
            new_file_resources.remove(to_remove_bai_id)

            logging.info(f"updating {dataset['id']} to have files {new_file_resources}")

            tantalus_api.update('sequencedataset', id=dataset['id'],
                                file_resources=new_file_resources)

            assert dataset["name"].endswith(str(dataset["version_number"]))

            similar_datasets = list(
                tantalus_api.list(
                    "sequence_dataset",
                    name=dataset["name"],
                ))
            new_version_number = max(d['version_number'] for d in similar_datasets) + 1

            new_dataset_params = dict(
                sample=dataset['sample']['id'],
                library=dataset['library']['id'],
                sequence_lanes=[l['id'] for l in dataset['sequence_lanes']],
                aligner=dataset['aligner'],
                reference_genome=dataset['reference_genome'],
                name=dataset['name'][:-1] + str(new_version_number),
                dataset_type=dataset['dataset_type'],
                version_number=new_version_number,
                file_resources=[to_remove_bam_id, to_remove_bai_id],
            )
            logging.info(new_dataset_params)

            new_dataset = tantalus_api.create('sequencedataset', **new_dataset_params)
            logging.info(new_dataset)
def main(
        storage_name,
        dataset_type,
        dataset_id=None,
        tag_name=None,
        check_remote=None,
        dry_run=False,
):
    logging.info('cleaning up storage {}'.format(storage_name))

    if check_remote:
        logging.info('checking remote {}'.format(check_remote))
    else:
        logging.warning('not checking remote')

    tantalus_api = TantalusApi()
    storage_client = tantalus_api.get_storage_client(storage_name)

    remote_client = None
    if check_remote is not None:
        remote_client = tantalus_api.get_storage_client(check_remote)

    if dataset_id is None and tag_name is None:
        raise ValueError('require either dataset id or tag name')

    if dataset_id is not None and tag_name is not None:
        raise ValueError('require exactly one of dataset id or tag name')

    if dataset_id is not None:
        logging.info('cleaning up dataset {}, {}'.format(dataset_id, dataset_type))
        datasets = tantalus_api.list(dataset_type, id=dataset_id)

    if tag_name is not None:
        logging.info('cleaning up tag {}'.format(tag_name))
        datasets = tantalus_api.list(dataset_type, tags__name=tag_name)

    total_data_size = 0
    file_num_count = 0

    for dataset in datasets:
        logging.info('checking dataset with id {}, name {}'.format(
            dataset['id'], dataset['name']))

        # Optionally skip datasets not present and intact on the remote storage
        if check_remote is not None:
            if not tantalus_api.is_dataset_on_storage(
                    dataset['id'], 'sequencedataset', check_remote):
                logging.warning('not deleting dataset with id {}, not on remote storage {}'.format(
                    dataset['id'], check_remote))
                continue

            # For each file instance on the remote, check if it exists and has the correct size in tantalus
            remote_file_size_check = True
            for file_instance in tantalus_api.get_dataset_file_instances(
                    dataset['id'], dataset_type, check_remote):
                try:
                    tantalus_api.check_file(file_instance)
                except DataError:
                    logging.exception('check file failed')
                    remote_file_size_check = False

            # Skip this dataset if any files failed
            if not remote_file_size_check:
                logging.warning("skipping dataset {} that failed check on {}".format(
                    dataset['id'], check_remote))
                continue

        # Check consistency of the storage the files will be removed from
        file_size_check = True
        for file_instance in tantalus_api.get_dataset_file_instances(
                dataset['id'], dataset_type, storage_name):
            try:
                tantalus_api.check_file(file_instance)
            except DataError:
                logging.exception('check file failed')
                file_size_check = False

        # Skip this dataset if any files failed
        if not file_size_check:
            logging.warning("skipping dataset {} that failed check on {}".format(
                dataset['id'], storage_name))
            continue

        # Delete all files for this dataset
        for file_instance in tantalus_api.get_dataset_file_instances(
                dataset['id'], dataset_type, storage_name):
            if dry_run:
                logging.info("would delete file instance with id {}, filepath {}".format(
                    file_instance['id'], file_instance['filepath']))
            else:
                logging.info("deleting file instance with id {}, filepath {}".format(
                    file_instance['id'], file_instance['filepath']))
                tantalus_api.update("file_instance", id=file_instance['id'], is_deleted=True)
            total_data_size += file_instance['file_resource']['size']
            file_num_count += 1

    logging.info("deleted a total of {} files with size {} bytes".format(
        file_num_count, total_data_size))
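# Hypothetical entry point for the cleanup routine above; option names are
# illustrative assumptions rather than the script's actual interface.
if __name__ == '__main__':
    import argparse
    import logging

    logging.basicConfig(level=logging.INFO)

    parser = argparse.ArgumentParser(description='delete dataset files from a storage')
    parser.add_argument('storage_name')
    parser.add_argument('dataset_type', choices=['sequencedataset', 'resultsdataset'])
    parser.add_argument('--dataset-id', type=int)
    parser.add_argument('--tag-name')
    parser.add_argument('--check-remote')
    parser.add_argument('--dry-run', action='store_true')
    args = parser.parse_args()

    main(
        args.storage_name,
        args.dataset_type,
        dataset_id=args.dataset_id,
        tag_name=args.tag_name,
        check_remote=args.check_remote,
        dry_run=args.dry_run,
    )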