def glob_cellenone_data(filepaths, storage_name, tag_name=None, update=False, remote_storage_name=None):
    tantalus_api = TantalusApi()

    for filepath in filepaths:
        match = re.match(r".*/single_cell_indexing/Cellenone/Cellenone_images/(\d+)_(A\d+[A-Z]*)", filepath)
        if match is None:
            logging.warning('skipping malformed {}'.format(filepath))
            continue

        fields = match.groups()
        date = fields[0]
        library_id = fields[1]

        try:
            tantalus_api.get('dna_library', library_id=library_id)
        except NotFoundError:
            logging.warning('skipping file with unknown library {}'.format(filepath))
            continue

        try:
            process_cellenone_images(
                library_id,
                filepath,
                storage_name,
                tag_name=tag_name,
                update=update,
                remote_storage_name=remote_storage_name,
            )
        except ValueError:
            logging.exception(f'unable to process {library_id}, {filepath}')

def add_microscope_results(filepaths, chip_id, library_ids, storage_name, tag_name=None, update=False, remote_storage_name=None):
    colossus_api = ColossusApi()
    tantalus_api = TantalusApi()

    results_name = 'MICROSCOPE_{}'.format(chip_id)
    results_type = 'MICROSCOPE'
    results_version = None

    try:
        existing_results = tantalus_api.get('results', name=results_name)
    except NotFoundError:
        existing_results = None

    if existing_results is not None and not update:
        logging.info(f'results for {chip_id} exist, not processing')
        return

    results_dataset = add_generic_results(
        filepaths=filepaths,
        storage_name=storage_name,
        results_name=results_name,
        results_type=results_type,
        results_version=results_version,
        library_ids=library_ids,
        recursive=True,
        tag_name=tag_name,
        update=update,
        remote_storage_name=remote_storage_name,
    )

def cache_tagged_datasets(tag_name, from_storage_name, cache_directory, suffix_filter=None):
    """ Cache a set of tagged datasets
    """
    tantalus_api = TantalusApi()

    tag = tantalus_api.get("tag", name=tag_name)

    for dataset_id in tag['sequencedataset_set']:
        cache_dataset(tantalus_api, dataset_id, "sequencedataset", from_storage_name, cache_directory, suffix_filter=suffix_filter)

    for dataset_id in tag['resultsdataset_set']:
        cache_dataset(tantalus_api, dataset_id, "resultsdataset", from_storage_name, cache_directory, suffix_filter=suffix_filter)

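# Example usage (a sketch; the tag name, storage name, and cache path below
# are hypothetical and must already exist in your Tantalus deployment):
#
#   cache_tagged_datasets(
#       'pseudobulk_inputs',        # a tag previously created in Tantalus
#       'singlecellresults',        # storage holding the tagged datasets
#       '/datadrive/cache',         # local cache directory
#       suffix_filter='.bam',       # optionally restrict to bam files
#   )
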
def process_cellenone_images(
        library_id,
        source_dir,
        storage_name,
        tag_name=None,
        update=False,
        remote_storage_name=None,
):
    tantalus_api = TantalusApi()

    results_name = 'CELLENONE_IMAGES_{}'.format(library_id)
    results_type = 'CELLENONE_IMAGES'
    results_version = 'v1'

    try:
        existing_results = tantalus_api.get('results', name=results_name)
    except NotFoundError:
        existing_results = None

    if existing_results is not None and not update:
        logging.info(f'results for {library_id} exist, not processing')
        return

    storage = tantalus_api.get('storage', name=storage_name)
    storage_directory = storage['storage_directory']

    destination_dir = os.path.join(
        storage_directory,
        'single_cell_indexing',
        'Cellenone',
        'Cellenone_processed',
        library_id,
        results_version,
    )

    # Create the destination directory if it does not already exist
    os.makedirs(destination_dir, exist_ok=True)

    with tempfile.TemporaryDirectory() as temp_dir:
        filepaths = catalog_images(library_id, source_dir, destination_dir, temp_dir)

    results_dataset = add_generic_results(
        filepaths=filepaths,
        storage_name=storage_name,
        results_name=results_name,
        results_type=results_type,
        results_version=results_version,
        library_ids=[library_id],
        recursive=False,
        tag_name=tag_name,
        update=update,
        remote_storage_name=remote_storage_name,
    )

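# Example usage (a sketch; the library id and source path are hypothetical,
# but follow the {date}_{library_id} layout this module expects):
#
#   process_cellenone_images(
#       'A96123A',
#       '/datadrive/single_cell_indexing/Cellenone/Cellenone_images/20190603_A96123A',
#       'singlecellresults',
#       update=False,
#   )
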
def process_cellenone_dataset(dataset, storage_name, tag_name=None, update=False, remote_storage_name=None):
    assert len(dataset['libraries']) == 1
    library_id = dataset['libraries'][0]['library_id']

    tantalus_api = TantalusApi()

    if not tantalus_api.is_dataset_on_storage(dataset['id'], 'resultsdataset', storage_name):
        raise ValueError(
            f"dataset {dataset['id']} not on storage {storage_name}")

    # Assume all files in the raw dataset are under the directory:
    # single_cell_indexing/Cellenone/Cellenone_images/{date}_{library_id}
    filename_prefix = 'single_cell_indexing/Cellenone/Cellenone_images/'

    source_dir = None
    for file_resource in tantalus_api.get_dataset_file_resources(
            dataset['id'], 'resultsdataset'):

        if source_dir is None:
            if not file_resource['filename'].startswith(filename_prefix):
                raise ValueError(
                    f"file {file_resource['filename']} is not in directory {filename_prefix}")

            library_subdir = file_resource['filename'].split('/')[3]

            if not library_subdir.endswith(library_id):
                raise ValueError(
                    f"file {file_resource['filename']} is not in a directory ending with {library_id}")

            source_dir = '/'.join(file_resource['filename'].split('/')[:4])

        elif not file_resource['filename'].startswith(source_dir):
            raise ValueError(
                f"file {file_resource['filename']} is not in directory {source_dir}")

    assert source_dir is not None

    source_dir = tantalus_api.get_filepath(storage_name, source_dir)

    process_cellenone_images(
        library_id,
        source_dir,
        storage_name,
        tag_name=tag_name,
        update=update,
        remote_storage_name=remote_storage_name,
    )

def add_analysis(**kwargs):
    tantalus_api = TantalusApi()

    # Create new analysis object
    analysis = tantalus_api.get_or_create(
        "analysis",
        name=kwargs['name'],
        jira_ticket=kwargs['jira_id'],
        analysis_type=kwargs['type'],
        version=kwargs['version'],
    )

    logging.info("Successfully created analysis with ID {}".format(analysis["id"]))

def transfer_tagged_datasets(tag_name, from_storage_name, to_storage_name):
    """ Transfer a set of tagged datasets
    """
    tantalus_api = TantalusApi()

    tag = tantalus_api.get("tag", name=tag_name)

    for dataset_id in tag['sequencedataset_set']:
        transfer_dataset(tantalus_api, dataset_id, "sequencedataset", from_storage_name, to_storage_name)

    for dataset_id in tag['resultsdataset_set']:
        transfer_dataset(tantalus_api, dataset_id, "resultsdataset", from_storage_name, to_storage_name)

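# Example usage, mirroring cache_tagged_datasets above (hypothetical names):
#
#   transfer_tagged_datasets('pseudobulk_inputs', 'singlecellresults', 'shahlab')
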
def main(
        storage_name,
        dataset_type=None,
        dataset_id=None,
        tag_name=None,
        all_file_instances=False,
        dry_run=False,
        fix_corrupt=False,
        remove_missing=False,
):
    logging.info('checking integrity of storage {}'.format(storage_name))

    tantalus_api = TantalusApi()

    if all_file_instances:
        file_instances = tantalus_api.list('file_instance', storage__name=storage_name)
    else:
        file_instances = get_dataset_file_instances(
            tantalus_api, storage_name, dataset_type,
            dataset_id=dataset_id, tag_name=tag_name)

    for file_instance in file_instances:
        logging.info('checking file instance {} with path {}'.format(
            file_instance['id'], file_instance['filepath']))

        if file_instance['is_deleted']:
            logging.info('file instance {} marked as deleted'.format(
                file_instance['id']))
            continue

        file_corrupt = False
        file_missing = False
        try:
            tantalus_api.check_file(file_instance)
        except DataCorruptionError:
            file_corrupt = True
            logging.exception('check file failed')
        except DataMissingError:
            file_missing = True
            logging.exception('missing file')

        if file_corrupt and fix_corrupt:
            logging.info('updating file instance {} with path {}'.format(
                file_instance['id'], file_instance['filepath']))
            if not dry_run:
                tantalus_api.update_file(file_instance)

        if file_missing and remove_missing:
            logging.info('deleting file instance {} with path {}'.format(
                file_instance['id'], file_instance['filepath']))
            if not dry_run:
                file_instance = tantalus_api.update(
                    'file_instance',
                    id=file_instance['id'],
                    is_deleted=True,
                )

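# Example invocation of the integrity check (hypothetical storage name and
# dataset id); running with dry_run=True first reports what would change
# without modifying anything:
#
#   main('singlecellresults', dataset_type='resultsdataset', dataset_id=1234,
#        dry_run=True, fix_corrupt=True, remove_missing=True)
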
def add_cellenone_results(filepaths, library_id, storage_name, tag_name=None, update=False, remote_storage_name=None):
    colossus_api = ColossusApi()
    tantalus_api = TantalusApi()

    results_name = 'CELLENONE_{}'.format(library_id)
    results_type = 'CELLENONE'
    results_version = None

    results_dataset = add_generic_results(
        filepaths=filepaths,
        storage_name=storage_name,
        results_name=results_name,
        results_type=results_type,
        results_version=results_version,
        library_ids=[library_id],
        recursive=True,
        tag_name=tag_name,
        update=update,
        remote_storage_name=remote_storage_name,
    )

def main(storage_name, bam_file_path, **kwargs):
    """
    Imports the bam into tantalus by creating a sequence dataset and
    file resources.
    """
    logging.basicConfig(format=LOGGING_FORMAT, stream=sys.stderr, level=logging.INFO)

    tantalus_api = TantalusApi()

    sample = None
    if kwargs.get('sample_id') is not None:
        sample = tantalus_api.get_or_create(
            'sample',
            sample_id=kwargs['sample_id'],
        )

    library = None
    if kwargs.get('library_id') is not None:
        if kwargs.get('library_type') is not None and kwargs.get('index_format') is not None:
            library = tantalus_api.get_or_create(
                'dna_library',
                library_id=kwargs['library_id'],
                library_type=kwargs['library_type'],
                index_format=kwargs['index_format'],
            )
        else:
            library = tantalus_api.get(
                'dna_library',
                library_id=kwargs['library_id'],
            )

    dataset = import_bam(
        storage_name,
        bam_file_path,
        sample=sample,
        library=library,
        read_type=kwargs.get('read_type'),
        ref_genome=kwargs.get('ref_genome'),
        update=kwargs.get('update'),
        tag_name=kwargs.get('tag_name'),
    )

    print("dataset {}".format(dataset["id"]))

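# Example usage (a sketch; all ids, paths, and field values are hypothetical,
# and LOGGING_FORMAT is assumed to be a module level constant):
#
#   main(
#       'singlecellresults',
#       '/datadrive/bams/SA922_A90554B.bam',
#       sample_id='SA922',
#       library_id='A90554B',
#       library_type='SC_WGS',
#       index_format='D',
#       ref_genome='HG19',
#   )
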
def transfer_inputs(dataset_ids, results_ids, from_storage, to_storage):
    tantalus_api = TantalusApi()

    for dataset_id in dataset_ids:
        transfer_dataset(tantalus_api, dataset_id, 'sequencedataset', from_storage, to_storage)

    for results_id in results_ids:
        transfer_dataset(tantalus_api, results_id, 'resultsdataset', from_storage, to_storage)

def catalog_cellenone_dataset(library_id, storage_name, tag_name=None, update=False, remote_storage_name=None):
    tantalus_api = TantalusApi()

    dataset = tantalus_api.get('resultsdataset', results_type='CELLENONE', libraries__library_id=library_id)

    process_cellenone_dataset(
        dataset, storage_name,
        tag_name=tag_name, update=update, remote_storage_name=remote_storage_name)

def glob_microscope_data(filepaths, storage_name, tag_name=None, update=False, remote_storage_name=None):
    tantalus_api = TantalusApi()

    chip_paths = collections.defaultdict(set)
    chip_libraries = collections.defaultdict(set)

    for filepath in filepaths:
        match = re.match(
            r".*/single_cell_indexing/Microscope/(\d+)_(A\d+[A-Z]*)", filepath)
        if match is None:
            logging.warning('skipping malformed {}'.format(filepath))
            continue

        fields = match.groups()
        date = fields[0]
        chip_id = fields[1]

        libraries = list(
            tantalus_api.list('dna_library', library_id__startswith=chip_id))

        if len(libraries) == 0:
            logging.error('skipping file with unknown library {}'.format(filepath))
            continue

        library_ids = set([library['library_id'] for library in libraries])

        chip_paths[chip_id].add(filepath)
        chip_libraries[chip_id].update(library_ids)

    for chip_id in chip_paths:
        add_microscope_results(
            chip_paths[chip_id],
            chip_id,
            chip_libraries[chip_id],
            storage_name,
            tag_name=tag_name,
            update=update,
            remote_storage_name=remote_storage_name,
        )

def glob_cellenone_data(filepaths, storage_name, tag_name=None, update=False, skip_existing=False, remote_storage_name=None):
    tantalus_api = TantalusApi()

    library_paths = collections.defaultdict(set)

    for filepath in filepaths:
        match = re.match(
            r".*/single_cell_indexing/Cellenone/Cellenone_images/(\d+)_(A\d+[A-Z]*)/?$", filepath)
        if match is None:
            logging.warning('skipping malformed {}'.format(filepath))
            continue

        fields = match.groups()
        date = fields[0]
        library_id = fields[1]

        try:
            tantalus_api.get('dna_library', library_id=library_id)
        except NotFoundError:
            logging.warning('skipping file with unknown library {}'.format(filepath))
            continue

        logging.info(f'queueing library {library_id} data from {filepath}')
        library_paths[library_id].add(filepath)

    for library_id in library_paths:
        add_cellenone_results(
            library_paths[library_id],
            library_id,
            storage_name,
            tag_name=tag_name,
            update=update,
            skip_existing=skip_existing,
            remote_storage_name=remote_storage_name,
        )

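# Example usage: discover Cellenone image directories on disk and queue them
# for registration (a sketch; the root path is hypothetical):
#
#   import glob
#
#   dirpaths = glob.glob('/datadrive/single_cell_indexing/Cellenone/Cellenone_images/*')
#   glob_cellenone_data(dirpaths, 'singlecellresults', skip_existing=True)
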
def transfer_dataset_cmd(dataset_id, dataset_model, from_storage_name, to_storage_name, suffix_filter=None):
    tantalus_api = TantalusApi()

    transfer_dataset(tantalus_api, dataset_id, dataset_model, from_storage_name, to_storage_name, suffix_filter=suffix_filter)

def cache_dataset_cmd(dataset_id, dataset_model, from_storage_name, cache_directory, suffix_filter=None):
    tantalus_api = TantalusApi()

    cache_dataset(tantalus_api, dataset_id, dataset_model, from_storage_name, cache_directory, suffix_filter=suffix_filter)

def add_cellenone_results(filepaths, library_id, storage_name, tag_name=None, update=False, skip_existing=False, remote_storage_name=None):
    colossus_api = ColossusApi()
    tantalus_api = TantalusApi()

    results_name = 'CELLENONE_{}'.format(library_id)
    results_type = 'CELLENONE'
    results_version = None

    try:
        existing_results = tantalus_api.get('resultsdataset', name=results_name, results_type=results_type)
    except NotFoundError:
        existing_results = None

    if skip_existing and existing_results is not None:
        return existing_results

    results_dataset = add_generic_results(
        filepaths=filepaths,
        storage_name=storage_name,
        results_name=results_name,
        results_type=results_type,
        results_version=results_version,
        library_ids=[library_id],
        recursive=True,
        tag_name=tag_name,
        update=update,
        remote_storage_name=remote_storage_name,
    )

    return results_dataset

def download_datasets(results_type, from_storage_name, to_storage_name, dataset_id=None, jira_ticket=None):
    ''' Download a set of datasets by type.
    '''
    tantalus_api = TantalusApi()

    if dataset_id is not None:
        datasets = tantalus_api.list('results', id=dataset_id)
    elif jira_ticket is not None:
        datasets = tantalus_api.list('results', results_type=results_type, analysis__jira_ticket=jira_ticket)
    else:
        datasets = tantalus_api.list('results', results_type=results_type)

    dataset_ids = list()
    for dataset in datasets:
        dataset_ids.append(dataset['id'])

    # Download most recent first
    dataset_ids = sorted(dataset_ids, reverse=True)

    failed = False
    for dataset_id in dataset_ids:
        try:
            transfer_dataset(tantalus_api, dataset_id, 'resultsdataset', from_storage_name, to_storage_name)
        except Exception:
            logging.exception(f'failed to download {dataset_id}')
            failed = True

    if failed:
        raise Exception('one or more downloads failed')

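# Example usage (hypothetical names; 'SC-1234' is a placeholder ticket):
#
#   download_datasets('annotation', 'singlecellresults', 'shahlab', jira_ticket='SC-1234')
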
def catalog_cellenone_datasets(storage_name, tag_name=None, update=False, remote_storage_name=None):
    tantalus_api = TantalusApi()

    for dataset in tantalus_api.list('resultsdataset', results_type='CELLENONE'):
        # HACK: Check for metadata yaml file in dataset
        found_metadata = False
        try:
            file_resource = tantalus_api.get(
                'file_resource',
                resultsdataset__id=dataset['id'],
                filename__endswith='metadata.yaml')
            found_metadata = True
        except NotFoundError:
            logging.info(f"no metadata for dataset {dataset['id']}")

        if found_metadata:
            logging.info(f"found metadata for dataset {dataset['id']}, skipping")
            continue

        try:
            process_cellenone_dataset(
                dataset, storage_name,
                tag_name=tag_name, update=update, remote_storage_name=remote_storage_name)
        except KeyboardInterrupt:
            raise
        except Exception:
            logging.exception(f"catalog failed for dataset {dataset['id']}")

def add_cellenone_data(library_id, cellenone_filepath, storage_name, tag_name=None, update=False, remote_storage_name=None):
    tantalus_api = TantalusApi()

    process_cellenone_images(
        library_id,
        cellenone_filepath,
        storage_name,
        tag_name=tag_name,
        update=update,
        remote_storage_name=remote_storage_name,
    )

def add_cellenone_data(filepaths, library_id, storage_name, tag_name=None, update=False, remote_storage_name=None):
    tantalus_api = TantalusApi()

    add_cellenone_results(
        filepaths,
        library_id,
        storage_name,
        tag_name=tag_name,
        update=update,
        remote_storage_name=remote_storage_name,
    )

def main(**kwargs):
    try:
        df = pd.read_csv(kwargs["ids"])
    except IOError:
        raise Exception("The file {} could not be opened for reading".format(kwargs["ids"]))

    tantalus_api = TantalusApi()

    col_name = kwargs["id_type"] + "_id"

    df = df.apply(get_filepath, args=(tantalus_api, col_name), axis=1)

    df[[col_name, "shahlab_path", "blob_path", "rocks_path"]].to_csv(kwargs["output_file"], index=False)

def create_fastq_metadata_yaml(library_id, storage_name, dry_run=False):
    """
    Create a metadata.yaml file for all fastq datasets for a library id.
    """
    tantalus_api = TantalusApi()

    storage = tantalus_api.get_storage(storage_name)
    client = tantalus_api.get_storage_client(storage_name)

    for dataset_info, metadata in create_lane_fastq_metadata(tantalus_api, library_id):
        metadata_filename = os.path.join(dataset_info['base_dir'], 'metadata.yaml')
        metadata_filepath = tantalus_api.get_filepath(storage_name, metadata_filename)

        metadata_io = io.BytesIO()
        metadata_io.write(yaml.dump(metadata, default_flow_style=False).encode())

        logging.info(f'writing metadata to file {metadata_filepath}')
        client.write_data(metadata_filename, metadata_io)

        logging.info(f'adding {metadata_filepath} to tantalus')

        if not dry_run:
            file_resource, file_instance = tantalus_api.add_file(storage_name, metadata_filepath, update=True)

            for dataset_id in dataset_info['dataset_ids']:
                dataset = tantalus_api.get('sequencedataset', id=dataset_id)

                new_file_resources = set(dataset['file_resources'])
                new_file_resources.add(file_resource['id'])

                tantalus_api.update('sequencedataset', id=dataset_id, file_resources=list(new_file_resources))

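# Example usage (a sketch; the library id and storage name are hypothetical):
#
#   create_fastq_metadata_yaml('A96123A', 'scrna_fastq', dry_run=False)
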
def add_generic_dataset(**kwargs):
    tantalus_api = TantalusApi()

    file_resource_pks = []

    sample = tantalus_api.get("sample", sample_id=kwargs['sample_id'])

    library = tantalus_api.get("dna_library", library_id=kwargs['library_id'])

    # Add the file resources to tantalus
    for filepath in kwargs['filepaths']:
        logging.info("Adding file resource for {} to Tantalus".format(filepath))
        resource, instance = tantalus_api.add_file(
            storage_name=kwargs['storage_name'],
            filepath=filepath,
            update=kwargs['update'])
        file_resource_pks.append(resource["id"])

    if "tag_name" in kwargs:
        tag = tantalus_api.get("tag", name=kwargs["tag_name"])
        tags = [tag["id"]]
    else:
        tags = []

    ref_genome = kwargs.get("reference_genome")
    aligner = kwargs.get("aligner")

    # Default to no sequence lanes if none were provided
    sequence_pks = []
    if "sequence_lane_pks" in kwargs:
        sequence_pks = list(map(str, kwargs["sequence_lane_pks"]))

    # Add the dataset to tantalus
    sequence_dataset = tantalus_api.get_or_create(
        "sequence_dataset",
        name=kwargs['dataset_name'],
        dataset_type=kwargs['dataset_type'],
        sample=sample["id"],
        library=library["id"],
        sequence_lanes=sequence_pks,
        file_resources=file_resource_pks,
        reference_genome=ref_genome,
        aligner=aligner,
        tags=tags,
    )

    logging.info("Successfully created sequence dataset with ID {}".format(sequence_dataset["id"]))

def add_fastq_metadata_yaml(dataset_id, storage_name, dry_run=False):
    """
    Create a metadata.yaml file for a dataset and add to tantalus.
    """
    tantalus_api = TantalusApi()

    storage = tantalus_api.get_storage(storage_name)
    client = tantalus_api.get_storage_client(storage_name)

    metadata, base_dir = create_lane_fastq_metadata(tantalus_api, dataset_id)

    metadata_filename = os.path.join(base_dir, 'metadata.yaml')
    metadata_filepath = tantalus_api.get_filepath(storage_name, metadata_filename)

    metadata_io = io.BytesIO()
    metadata_io.write(yaml.dump(metadata, default_flow_style=False).encode())

    print(f'writing metadata to file {metadata_filepath}')
    client.write_data(metadata_filename, metadata_io)

    print(f'adding {metadata_filepath} to tantalus')

    if not dry_run:
        file_resource, file_instance = tantalus_api.add_file(storage_name, metadata_filepath, update=True)

        dataset = tantalus_api.get('sequencedataset', id=dataset_id)

        new_file_resources = set(dataset['file_resources'])
        new_file_resources.add(file_resource['id'])

        tantalus_api.update('sequencedataset', id=dataset_id, file_resources=list(new_file_resources))

def run_h5_convert(results_type=None):
    tantalus_api = TantalusApi()

    # NOTE: remote_storage_name is assumed to be defined at module level
    remote_storage_client = tantalus_api.get_storage_client(remote_storage_name)

    if results_type is not None:
        results_list = tantalus_api.list("resultsdataset", results_type=results_type)
        logging.info('converting results with results type {}'.format(results_type))
    else:
        results_list = tantalus_api.list("resultsdataset")
        logging.info('converting all results')

    for result in results_list:
        logging.info('processing results dataset {}'.format(result['id']))

        try:
            file_instances = tantalus_api.get_dataset_file_instances(
                result["id"],
                "resultsdataset",
                remote_storage_name,
            )

            existing_filenames = set([i['file_resource']['filename'] for i in file_instances])

            found_csv_yaml = False
            for existing_filename in existing_filenames:
                if existing_filename.endswith('.csv.gz.yaml'):
                    found_csv_yaml = True
                    break

            if found_csv_yaml:
                logging.info('found filename {}, skipping conversion'.format(existing_filename))
            else:
                print(result["id"])
                logging.info('no yaml found')

        except NotFoundError:
            logging.exception('no files found for conversion')
        except KeyboardInterrupt:
            raise
        except Exception:
            logging.exception('conversion failed')

def fix_bams(jira_ticket=None, dry_run=False):
    tantalus_api = TantalusApi()

    analyses_list = []
    storage_name = "singlecellresults"

    if jira_ticket is not None:
        analyses_list.append(tantalus_api.get(
            'analysis',
            jira_ticket=jira_ticket,
            analysis_type__name="align",
            status="complete"))
    else:
        # Get all completed align analyses ran with specific versions; the
        # bams associated with these analyses are in the wrong storage account
        for version in ('v0.5.2', 'v0.5.3'):
            analyses = tantalus_api.list(
                'analysis',
                analysis_type__name="align",
                status="complete",
                version=version)
            analyses_list += [a for a in analyses]

    for analysis in analyses_list:
        jira_ticket = analysis["jira_ticket"]

        filename = f'{jira_ticket}/results/bams/metadata.yaml'

        logging.info(f'adding file {filename}')
        if not dry_run:
            # add_file returns the file resource followed by the file instance
            file_resource, file_instance = tantalus_api.add_file(storage_name, filename)

        # get all bam datasets associated with the jira ticket
        bam_datasets = tantalus_api.list(
            "sequencedataset",
            dataset_type="BAM",
            analysis__jira_ticket=jira_ticket,
        )

        for dataset in bam_datasets:
            dataset_id = dataset['id']

            logging.info(f'adding file to dataset {dataset_id}')

            if not dry_run:
                file_resource_ids = dataset['file_resources']
                # list.append returns None, so append first, then pass the list
                file_resource_ids.append(file_resource['id'])
                tantalus_api.update('sequencedataset', id=dataset['id'], file_resources=file_resource_ids)

import logging

from datetime import datetime
from collections import defaultdict

from workflows.unanalyzed_data import *

import datamanagement.templates as templates
from dbclients.tantalus import TantalusApi
from dbclients.colossus import ColossusApi
from dbclients.basicclient import NotFoundError

from workflows.utils import file_utils
from workflows.utils import saltant_utils
from workflows.utils.colossus_utils import get_ref_genome

tantalus_api = TantalusApi()
colossus_api = ColossusApi()

log = logging.getLogger('sisyphus')
log.setLevel(logging.DEBUG)
stream_handler = logging.StreamHandler()
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
stream_handler.setFormatter(formatter)
log.addHandler(stream_handler)
log.propagate = False


def get_sequencings(library_info):
    ''' Given library id (str), return list of sequencings
import logging

from dbclients.tantalus import TantalusApi
from dbclients.colossus import ColossusApi

tantalus_api = TantalusApi()
colossus_api = ColossusApi()

if __name__ == '__main__':
    print("STARTING")
    colossus_analyses = colossus_api.list('analysis_information')
    tantalus_analyses = tantalus_api.list('analysis', analysis_type__name="align")

    analysis_lane_dict = {}

    for analysis in tantalus_analyses:
        lane_set = set()
        for input_dataset in analysis['input_datasets']:
            dataset = tantalus_api.get('sequencedataset', id=input_dataset)
            for lane in dataset['sequence_lanes']:
                lane_set.add(str(lane['flowcell_id'] + "_" + str(lane['lane_number'])))
        analysis_lane_dict[analysis['name']] = lane_set

    print(analysis_lane_dict)

    for analysis in colossus_analyses:
        key = analysis['analysis_jira_ticket'] + '_align'
        if key in analysis_lane_dict:
            lanes = []
def main(
        storage_name,
        dry_run=False,
        check_remote=None,
):
    tantalus_api = TantalusApi()

    storage_client = tantalus_api.get_storage_client(storage_name)

    remote_storage_client = None
    if check_remote is not None:
        remote_storage_client = tantalus_api.get_storage_client(check_remote)

    file_instances = tantalus_api.list('file_instance', storage__name=storage_name, is_deleted=True)

    # DEBUG: check whether we are getting back
    # consistent ordered results from tantalus
    file_instances = list(file_instances)
    file_instance_ids = set([f['id'] for f in file_instances])
    if len(file_instances) != len(file_instance_ids):
        raise Exception('received duplicate results from tantalus')

    logging.info('processing {} file instances'.format(len(file_instance_ids)))
    logging.info('processing the following file instances: {}'.format(str(file_instance_ids)))

    for file_instance in file_instances:
        file_resource = tantalus_api.get('file_resource', id=file_instance['file_resource']['id'])
        all_file_instances = list(tantalus_api.list('file_instance', file_resource=file_resource['id']))

        logging.info('checking file instance {}, file resource {}, filepath {}'.format(
            file_instance['id'], file_resource['id'], file_instance['filepath']))

        sequencedatasets = tantalus_api.list('sequencedataset', file_resources__id=file_resource['id'])
        resultsdatasets = tantalus_api.list('resultsdataset', file_resources__id=file_resource['id'])

        sequencedataset_ids = list(set([a['id'] for a in sequencedatasets]))
        resultsdataset_ids = list(set([a['id'] for a in resultsdatasets]))

        logging.info('file resource {} belongs to sequencedataset {} and resultsdatasets {}'.format(
            file_resource['id'], sequencedataset_ids, resultsdataset_ids))

        # Optionally check for a remote version
        if remote_storage_client:
            remote_instance = None
            for other_instance in file_resource['file_instances']:
                if other_instance['storage']['name'] == check_remote:
                    remote_instance = other_instance

            if not remote_instance:
                logging.info('not deleting file instance {}, no other instance'.format(
                    file_instance['id']))
                continue

            if remote_instance['is_deleted']:
                logging.info('not deleting file instance {}, other instance {} deleted'.format(
                    file_instance['id'], remote_instance['id']))
                continue

            if not remote_storage_client.exists(file_resource['filename']):
                logging.info("not deleting file instance {}, other instance {} doesn't exist".format(
                    file_instance['id'], remote_instance['id']))
                continue

            logging.info('deletion ok for file instance {}, found other instance {}'.format(
                file_instance['id'], remote_instance['id']))

        # Delete the file from the filesystem
        logging.info('deleting file {}'.format(file_instance['filepath']))
        if not dry_run:
            try:
                storage_client.delete(file_resource['filename'])
            except FileNotFoundError:
                logging.exception('file already deleted')

        # Delete the instance model from tantalus
        logging.info('deleting file instance {}'.format(file_instance['id']))
        if not dry_run:
            tantalus_api.delete('file_instance', id=file_instance['id'])

        # If this is the only file instance for this file resource, delete the file resource
        if len(all_file_instances) == 1:
            assert all_file_instances[0]['id'] == file_instance['id']
            logging.info('deleting file resource {}'.format(file_resource['id']))
            if not dry_run:
                tantalus_api.delete('file_resource', id=file_resource['id'])

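# Example usage: purge instances flagged is_deleted from a local storage, but
# only where a verified copy exists on a remote storage (hypothetical names);
# dry_run=True logs the planned deletions without performing them:
#
#   main('shahlab', dry_run=True, check_remote='singlecellblob')
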