def run_h5_convert(results_type=None):
    tantalus_api = TantalusApi()

    remote_storage_client = tantalus_api.get_storage_client(remote_storage_name)

    if results_type is not None:
        results_list = tantalus_api.list("resultsdataset", results_type=results_type)
        logging.info('converting results with results type {}'.format(results_type))
    else:
        results_list = tantalus_api.list("resultsdataset")
        logging.info('converting all results')

    for result in results_list:
        logging.info('processing results dataset {}'.format(result['id']))

        try:
            file_instances = tantalus_api.get_dataset_file_instances(
                result["id"],
                "resultsdataset",
                remote_storage_name,
            )

            existing_filenames = set(
                [i['file_resource']['filename'] for i in file_instances])

            found_csv_yaml = False
            for existing_filename in existing_filenames:
                if existing_filename.endswith('.csv.gz.yaml'):
                    found_csv_yaml = True
                    break

            if found_csv_yaml:
                logging.info('found filename {}, skipping conversion'.format(existing_filename))
            else:
                print(result["id"])
                logging.info('no yaml found')

        except NotFoundError:
            logging.exception('no files found for conversion')

        except KeyboardInterrupt:
            raise

        except Exception:
            logging.exception('conversion failed')
def create_fastq_metadata_yaml(library_id, storage_name, dry_run=False):
    """
    Create a metadata.yaml file for all fastq datasets for a library id.
    """
    tantalus_api = TantalusApi()

    storage = tantalus_api.get_storage(storage_name)
    client = tantalus_api.get_storage_client(storage_name)

    for dataset_info, metadata in create_lane_fastq_metadata(tantalus_api, library_id):
        metadata_filename = os.path.join(dataset_info['base_dir'], 'metadata.yaml')
        metadata_filepath = tantalus_api.get_filepath(storage_name, metadata_filename)

        metadata_io = io.BytesIO()
        metadata_io.write(yaml.dump(metadata, default_flow_style=False).encode())

        logging.info(f'writing metadata to file {metadata_filepath}')
        client.write_data(metadata_filename, metadata_io)

        logging.info(f'adding {metadata_filepath} to tantalus')

        if not dry_run:
            file_resource, file_instance = tantalus_api.add_file(
                storage_name, metadata_filepath, update=True)

            for dataset_id in dataset_info['dataset_ids']:
                dataset = tantalus_api.get('sequencedataset', id=dataset_id)

                new_file_resources = set(dataset['file_resources'])
                new_file_resources.add(file_resource['id'])

                tantalus_api.update(
                    'sequencedataset', id=dataset_id, file_resources=list(new_file_resources))
def add_fastq_metadata_yaml(dataset_id, storage_name, dry_run=False):
    """
    Create a metadata.yaml file for a dataset and add it to tantalus.
    """
    tantalus_api = TantalusApi()

    storage = tantalus_api.get_storage(storage_name)
    client = tantalus_api.get_storage_client(storage_name)

    metadata, base_dir = create_lane_fastq_metadata(tantalus_api, dataset_id)

    metadata_filename = os.path.join(base_dir, 'metadata.yaml')
    metadata_filepath = tantalus_api.get_filepath(storage_name, metadata_filename)

    metadata_io = io.BytesIO()
    metadata_io.write(yaml.dump(metadata, default_flow_style=False).encode())

    print(f'writing metadata to file {metadata_filepath}')
    client.write_data(metadata_filename, metadata_io)

    print(f'adding {metadata_filepath} to tantalus')

    if not dry_run:
        file_resource, file_instance = tantalus_api.add_file(
            storage_name, metadata_filepath, update=True)

        dataset = tantalus_api.get('sequencedataset', id=dataset_id)

        new_file_resources = set(dataset['file_resources'])
        new_file_resources.add(file_resource['id'])

        tantalus_api.update(
            'sequencedataset', id=dataset_id, file_resources=list(new_file_resources))
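The two helpers above share the same metadata-writing logic but differ in scope (per library vs. per dataset). Below is a minimal sketch of how one of them might be exposed as a command line entry point, in the click style used by the other scripts in this collection; the command and option names here are hypothetical, not the project's actual CLI.

# Hypothetical CLI wrapper for add_fastq_metadata_yaml; the command and
# option names are illustrative and not taken from the actual scripts.
@click.command()
@click.argument('dataset_id', type=int)
@click.argument('storage_name')
@click.option('--dry_run', is_flag=True)
def add_fastq_metadata_yaml_cmd(dataset_id, storage_name, dry_run):
    # Delegate to the helper above.
    add_fastq_metadata_yaml(dataset_id, storage_name, dry_run=dry_run)


if __name__ == '__main__':
    add_fastq_metadata_yaml_cmd()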
def main(
        storage_name,
        dry_run=False,
        check_remote=None,
):
    tantalus_api = TantalusApi()

    storage_client = tantalus_api.get_storage_client(storage_name)

    remote_storage_client = None
    if check_remote is not None:
        remote_storage_client = tantalus_api.get_storage_client(check_remote)

    file_instances = tantalus_api.list(
        'file_instance', storage__name=storage_name, is_deleted=True)

    # DEBUG: check whether we are getting back
    # consistent ordered results from tantalus
    file_instances = list(file_instances)
    file_instance_ids = set([f['id'] for f in file_instances])
    if len(file_instances) != len(file_instance_ids):
        raise Exception('received duplicate results from tantalus')

    logging.info('processing {} file instances'.format(len(file_instance_ids)))
    logging.info('processing the following file instances: {}'.format(
        str(file_instance_ids)))

    for file_instance in file_instances:
        file_resource = tantalus_api.get(
            'file_resource', id=file_instance['file_resource']['id'])
        all_file_instances = list(
            tantalus_api.list('file_instance', file_resource=file_resource['id']))

        logging.info(
            'checking file instance {}, file resource {}, filepath {}'.format(
                file_instance['id'], file_resource['id'], file_instance['filepath']))

        sequencedatasets = tantalus_api.list(
            'sequencedataset', file_resources__id=file_resource['id'])
        resultsdatasets = tantalus_api.list(
            'resultsdataset', file_resources__id=file_resource['id'])

        sequencedataset_ids = list(set([a['id'] for a in sequencedatasets]))
        resultsdataset_ids = list(set([a['id'] for a in resultsdatasets]))

        logging.info(
            'file resource {} belongs to sequencedatasets {} and resultsdatasets {}'.format(
                file_resource['id'], sequencedataset_ids, resultsdataset_ids))

        # Optionally check for a remote version
        if remote_storage_client:
            remote_instance = None
            for other_instance in file_resource['file_instances']:
                if other_instance['storage']['name'] == check_remote:
                    remote_instance = other_instance

            if not remote_instance:
                logging.info(
                    'not deleting file instance {}, no other instance'.format(
                        file_instance['id']))
                continue

            if remote_instance['is_deleted']:
                logging.info(
                    'not deleting file instance {}, other instance {} deleted'.format(
                        file_instance['id'], remote_instance['id']))
                continue

            if not remote_storage_client.exists(file_resource['filename']):
                logging.info(
                    'not deleting file instance {}, other instance {} does not exist'.format(
                        file_instance['id'], remote_instance['id']))
                continue

            logging.info(
                'deletion ok for file instance {}, found other instance {}'.format(
                    file_instance['id'], remote_instance['id']))

        # Delete the file from the filesystem
        logging.info('deleting file {}'.format(file_instance['filepath']))
        if not dry_run:
            try:
                storage_client.delete(file_resource['filename'])
            except FileNotFoundError:
                logging.exception('file already deleted')

        # Delete the instance model from tantalus
        logging.info('deleting file instance {}'.format(file_instance['id']))
        if not dry_run:
            tantalus_api.delete('file_instance', id=file_instance['id'])

        # If this is the only file instance for this file resource, delete the file resource
        if len(all_file_instances) == 1:
            assert all_file_instances[0]['id'] == file_instance['id']
            logging.info('deleting file resource {}'.format(file_resource['id']))
            if not dry_run:
                tantalus_api.delete('file_resource', id=file_resource['id'])
jira_api = JIRA('https://www.bcgsc.ca/jira/', basic_auth=(JIRA_USER, JIRA_PASSWORD))

filename_pattern_map = {
    "*_1_*.raw.fastq.gz": (1, True),
    "*_2_*.raw.fastq.gz": (2, True),
    "*_1_*.fastq.gz": (1, True),
    "*_2_*.fastq.gz": (2, True),
}

sequencing_instrument_map = {
    'HiSeqX': 'HX',
    'HiSeq2500': 'H2500',
    'NextSeq550': 'NextSeq550',
}

storage_client = tantalus_api.get_storage_client("scrna_fastq")


def get_existing_fastq_data(tantalus_api, library):
    '''
    Get the current set of fastq data in tantalus.

    Args:
        library (str): tenx library name

    Returns:
        existing_data: set of lanes (flowcell_id, lane_number)
    '''
    existing_flowcell_ids = []

    lanes = tantalus_api.list('sequencing_lane',
def fix_bams(jira_ticket=None, dry_run=False):
    logging.info(f'dry run: {dry_run}')

    tantalus_api = TantalusApi()

    SC_WGS_BAM_DIR_TEMPLATE = os.path.join(
        'single_cell_indexing',
        'bam',
        '{library_id}',
        '{ref_genome}',
        '{aligner_name}',
        'numlanes_{number_lanes}',
        '{jira_ticket}',
    )

    reference_genome_map = {
        'HG19': 'grch37',
        'MM10': 'mm10',
    }

    analyses_list = []
    from_storage_name = "singlecellresults"
    to_storage_name = "singlecellblob"
    from_storage_client = tantalus_api.get_storage_client(from_storage_name)
    to_storage_client = tantalus_api.get_storage_client(to_storage_name)
    to_storage_id = tantalus_api.get('storage', name=to_storage_name)['id']

    if jira_ticket is not None:
        analyses_list.append(
            tantalus_api.get(
                'analysis',
                jira_ticket=jira_ticket,
                analysis_type__name="align",
                status="complete",
            ))
    else:
        # Get all completed align analyses run with specific versions; the bams
        # associated with these analyses are in the wrong storage account
        for version in ('v0.5.2', 'v0.5.3', 'v0.5.4'):
            analyses = tantalus_api.list(
                'analysis',
                analysis_type__name="align",
                status="complete",
                version=version,
            )
            analyses_list += [a for a in analyses]

    for analysis in analyses_list:
        jira_ticket = analysis["jira_ticket"]
        print(f"moving bams for {jira_ticket}")

        # get all bam datasets associated with the jira ticket
        bam_datasets = tantalus_api.list(
            "sequencedataset",
            dataset_type="BAM",
            analysis__jira_ticket=jira_ticket,
        )

        for dataset in bam_datasets:
            # Get number of lanes from dataset for use with filepath
            lanes = set()
            for sequence_lane in dataset['sequence_lanes']:
                lane = "{}_{}".format(
                    sequence_lane['flowcell_id'], sequence_lane['lane_number'])
                lanes.add(lane)
            number_lanes = len(lanes)

            try:
                file_instances = tantalus_api.get_dataset_file_instances(
                    dataset["id"],
                    "sequencedataset",
                    from_storage_name,
                )
            except dbclients.tantalus.DataNotOnStorageError:
                logging.info(
                    f'dataset {dataset["id"]} not on {from_storage_name}, skipping')
                continue

            for file_instance in file_instances:
                blobname = file_instance["file_resource"]["filename"]

                # get url of source blob
                blob_url = from_storage_client.get_url(blobname)

                bam_filename = blobname.split("/bams/")[1]
                new_blobname = os.path.join(
                    SC_WGS_BAM_DIR_TEMPLATE.format(
                        library_id=dataset["library"]["library_id"],
                        ref_genome=reference_genome_map[dataset["reference_genome"]],
                        aligner_name=dataset["aligner"],
                        number_lanes=number_lanes,
                        jira_ticket=jira_ticket,
                    ),
                    bam_filename,
                )

                # copy blob to desired storage account with new blobname
                blob_filepath = f"{to_storage_client.prefix}/{new_blobname}"
                logging.info(
                    f'copying {new_blobname} to storage {to_storage_name} from {blob_url} to {blob_filepath}')

                if not dry_run:
                    to_storage_client.blob_service.copy_blob(
                        container_name="data",
                        blob_name=new_blobname,
                        copy_source=blob_url,
                    )

                file_resource_id = file_instance['file_resource']['id']
                file_instance_id = file_instance['id']

                logging.info(
                    f'updating file resource {file_resource_id} to have filename {new_blobname}')
                if not dry_run:
                    tantalus_api.update(
                        'file_resource', id=file_resource_id, filename=new_blobname)

                logging.info(
                    f'updating file instance {file_instance_id} to have storage with id {to_storage_id}')
                if not dry_run:
                    tantalus_api.update(
                        'file_instance', id=file_instance_id, storage=to_storage_id)
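For illustration, the template above maps a dataset onto a blob path like the following; the library, aligner, ticket and bam names below are made up, not taken from a real dataset.

# Illustrative only: shows the blob layout produced by fix_bams for made-up values.
example_template = os.path.join(
    'single_cell_indexing', 'bam', '{library_id}', '{ref_genome}',
    '{aligner_name}', 'numlanes_{number_lanes}', '{jira_ticket}')

example_blobname = os.path.join(
    example_template.format(
        library_id='A96213A',
        ref_genome='grch37',
        aligner_name='BWA_ALN',
        number_lanes=2,
        jira_ticket='SC-1234',
    ),
    'cell_0001.bam',
)
# -> single_cell_indexing/bam/A96213A/grch37/BWA_ALN/numlanes_2/SC-1234/cell_0001.bam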
def rename_fastqs(dataset_id, storage_name, dry_run=False, check_only=False):
    logging.info(f'dataset: {dataset_id}')
    logging.info(f'dry run: {dry_run}')

    tantalus_api = TantalusApi()

    storage_client = tantalus_api.get_storage_client(storage_name)

    dataset = tantalus_api.get('sequencedataset', id=dataset_id)

    file_instances = tantalus_api.get_dataset_file_instances(
        dataset['id'],
        'sequencedataset',
        storage_name,
    )

    for file_instance in file_instances:
        filename = file_instance['file_resource']['filename']

        if os.path.basename(filename) == 'metadata.yaml':
            continue

        assert len(dataset['sequence_lanes']) == 1

        parts = filename.split('/')
        basename = os.path.basename(filename)

        non_conforming = False
        try:
            assert parts[0] == 'single_cell_indexing'
            assert parts[1] == 'fastq'
            assert parts[2] == dataset['library']['library_id']
            assert parts[3].split('_')[0] == dataset['sequence_lanes'][0]['flowcell_id']
            assert parts[3].split('_')[1] == dataset['sequence_lanes'][0]['lane_number']
            assert parts[4] == dataset['sample']['sample_id']
        except AssertionError:
            non_conforming = True

        if check_only:
            if non_conforming:
                raise Exception(f'filename {filename} does not conform')
            continue

        new_filename = SC_WGS_FQ_TEMPLATE.format(
            dlp_library_id=dataset['library']['library_id'],
            flowcell_id=dataset['sequence_lanes'][0]['flowcell_id'],
            lane_number=dataset['sequence_lanes'][0]['lane_number'],
            cell_sample_id=dataset['sample']['sample_id'],
            cell_filename=basename,
        )

        if new_filename == filename:
            logging.info(f'skipping conforming {filename} on {storage_name}')
            continue

        logging.info(f'renaming {filename} to {new_filename} on {storage_name}')

        if not dry_run:
            if not storage_client.exists(new_filename):
                storage_client.copy(filename, new_filename, wait=True)
            tantalus_api.swap_file(file_instance, new_filename)
            storage_client.delete(filename)
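SC_WGS_FQ_TEMPLATE is referenced above but not defined in this excerpt. Judging from the path assertions in rename_fastqs, it presumably looks roughly like the sketch below; treat this as an inferred assumption rather than the canonical definition.

# Assumed layout, inferred from the assertions in rename_fastqs above;
# not the canonical definition of SC_WGS_FQ_TEMPLATE.
SC_WGS_FQ_TEMPLATE = os.path.join(
    'single_cell_indexing',
    'fastq',
    '{dlp_library_id}',
    '{flowcell_id}_{lane_number}',
    '{cell_sample_id}',
    '{cell_filename}',
)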
def run_h5_convert(cache_dir, dataset_id=None, results_type=None, redo=False, dry_run=False, check_done=False):
    tantalus_api = TantalusApi()

    local_cache_client = tantalus_api.get_cache_client(cache_dir)
    remote_storage_client = tantalus_api.get_storage_client(remote_storage_name)

    if dataset_id is not None:
        results_list = [tantalus_api.get("resultsdataset", id=dataset_id)]
        logging.info('converting results with id {}'.format(dataset_id))
    elif results_type is not None:
        results_list = tantalus_api.list("resultsdataset", results_type=results_type)
        logging.info('converting results with results type {}'.format(results_type))
    else:
        results_list = tantalus_api.list("resultsdataset")
        logging.info('converting all results')

    for result in results_list:
        logging.info('processing results dataset {}'.format(result['id']))

        try:
            file_instances = tantalus_api.get_dataset_file_instances(
                result["id"],
                "resultsdataset",
                remote_storage_name,
            )

            existing_filenames = set(
                [i['file_resource']['filename'] for i in file_instances])

            found_csv_yaml = False
            for existing_filename in existing_filenames:
                # Destruct outputs csv.yaml directly, check non destruct files
                if 'destruct' in existing_filename:
                    continue
                if existing_filename.endswith('.csv.gz.yaml'):
                    found_csv_yaml = True
                    break

            if found_csv_yaml and check_done:
                logging.info('found filename {}, skipping conversion'.format(existing_filename))
                continue

            file_resource_ids = []
            filepaths_to_clean = []

            for file_instance in file_instances:
                if not file_instance['file_resource']['filename'].endswith('.h5'):
                    continue

                datamanagement.transfer_files.cache_file(tantalus_api, file_instance, cache_dir)

                h5_filepath = local_cache_client.get_url(
                    file_instance['file_resource']['filename'])

                filepaths_to_clean.append(h5_filepath)

                logging.info('converting {}'.format(h5_filepath))

                for key, csv_filepath in get_h5_csv_info(h5_filepath):
                    if not csv_filepath.startswith(cache_dir):
                        raise Exception('unexpected csv path {}'.format(csv_filepath))

                    csv_filename = csv_filepath[len(cache_dir):]
                    csv_filename = csv_filename.lstrip('/')

                    if csv_filename in existing_filenames and not redo:
                        logging.info('file {} already exists, not converting'.format(csv_filename))
                        continue

                    if dry_run:
                        logging.info('would convert {}, key {} to {}'.format(
                            h5_filepath, key, csv_filepath))
                        continue

                    logging.info('converting {}, key {} to {}'.format(
                        h5_filepath, key, csv_filepath))
                    convert_h5(h5_filepath, key, csv_filepath)

                    yaml_filename = csv_filename + '.yaml'
                    yaml_filepath = csv_filepath + '.yaml'

                    fileinfo_to_add = [
                        (csv_filename, csv_filepath),
                        (yaml_filename, yaml_filepath),
                    ]

                    for filename, filepath in fileinfo_to_add:
                        logging.info('creating file {} from path {}'.format(filename, filepath))

                        remote_storage_client.create(filename, filepath, update=redo)
                        remote_filepath = os.path.join(remote_storage_client.prefix, filename)

                        logging.info('adding file {} from path {}'.format(filename, remote_filepath))

                        (file_resource, file_instance) = tantalus_api.add_file(
                            remote_storage_name, remote_filepath, update=True)  # redo

                        file_resource_ids.append(file_resource["id"])
                        filepaths_to_clean.append(filepath)

            if len(file_resource_ids) == 0:
                logging.warning('no files added')
                continue

            logging.info('adding file resources {} to dataset {}'.format(
                file_resource_ids, result["id"]))

            tantalus_api.update(
                "resultsdataset",
                result["id"],
                file_resources=result["file_resources"] + file_resource_ids,
            )

            for filepath in filepaths_to_clean:
                logging.info('removing file {}'.format(filepath))
                os.remove(filepath)

        except NotFoundError:
            logging.exception('no files found for conversion')

        except KeyboardInterrupt:
            raise

        except Exception:
            logging.exception('conversion failed')
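get_h5_csv_info and convert_h5 are not shown in this excerpt. Below is a minimal sketch of plausible implementations, assuming each HDF5 key becomes one gzipped CSV with a small sidecar .csv.gz.yaml describing its columns; the function names match the calls above, but the behaviour and YAML layout are assumptions.

import pandas as pd
import yaml


def get_h5_csv_info(h5_filepath):
    # Assumed behaviour: yield (key, csv_filepath) pairs, one gzipped CSV per HDF5 key.
    with pd.HDFStore(h5_filepath, 'r') as store:
        for key in store.keys():
            suffix = key.strip('/').replace('/', '_')
            csv_filepath = h5_filepath[:-len('.h5')] + '_' + suffix + '.csv.gz'
            yield key, csv_filepath


def convert_h5(h5_filepath, key, csv_filepath):
    # Assumed behaviour: dump one table to gzipped CSV and write a sidecar
    # .csv.gz.yaml describing the columns; the real YAML schema may differ.
    data = pd.read_hdf(h5_filepath, key)
    data.to_csv(csv_filepath, index=False, compression='gzip')
    metadata = {
        'header': {
            'columns': [{'name': c, 'dtype': str(data[c].dtype)} for c in data.columns],
        },
    }
    with open(csv_filepath + '.yaml', 'w') as f:
        yaml.safe_dump(metadata, f, default_flow_style=False)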
def import_bam(storage_name,
               bam_file_path,
               sample=None,
               library=None,
               lane_infos=None,
               read_type=None,
               ref_genome=None,
               tag_name=None,
               update=False):
    """
    Imports a bam into tantalus.

    Args:
        storage_name: (string) name of destination storage
        bam_file_path: (string) filepath to bam on destination storage
        sample: (dict) contains sample_id
        library: (dict) contains library_id, library_type, index_format
        lane_infos: (dict) contains flowcell_id, lane_number,
            adapter_index_sequence, sequencing_centre, read_type,
            reference_genome, aligner
        read_type: (string) read type for the run
        tag_name: (string)
        update: (boolean)
    Returns:
        sequence_dataset: (dict) sequence dataset created on tantalus
    """
    tantalus_api = TantalusApi()

    # Get a url allowing access regardless of whether the file
    # is in cloud or local storage
    storage_client = tantalus_api.get_storage_client(storage_name)
    bam_filename = tantalus_api.get_file_resource_filename(storage_name, bam_file_path)
    bam_url = storage_client.get_url(bam_filename)

    bam_header = pysam.AlignmentFile(bam_url).header
    bam_header_info = get_bam_header_info(bam_header)

    if ref_genome is None:
        ref_genome = get_bam_ref_genome(bam_header)

    aligner_name = get_bam_aligner_name(bam_header)

    logging.info(
        f"bam header shows reference genome {ref_genome} and aligner {aligner_name}")

    bai_file_path = None
    if storage_client.exists(bam_filename + ".bai"):
        bai_file_path = bam_file_path + ".bai"
    else:
        logging.info(f"no bam index found at {bam_filename + '.bai'}")

    # If no sample was specified assume it exists in tantalus and
    # search for it based on header info
    if sample is None:
        if len(bam_header_info["sample_ids"]) != 1:
            raise ValueError(
                f"found sample_ids={bam_header_info['sample_ids']}, please specify an override sample id")
        sample_id = list(bam_header_info["sample_ids"])[0]
        sample = tantalus_api.get('sample', sample_id=sample_id)

    # If no library was specified assume it exists in tantalus and
    # search for it based on header info
    if library is None:
        if len(bam_header_info["library_ids"]) != 1:
            raise ValueError(
                f"found library_ids={bam_header_info['library_ids']}, please specify an override library id")
        library_id = list(bam_header_info["library_ids"])[0]
        library = tantalus_api.get('dna_library', library_id=library_id)

    # Default to paired end reads
    if read_type is None:
        read_type = 'P'

    # If no lane infos were specified create them from header info
    if lane_infos is None:
        lane_infos = []
        for lane in bam_header_info["sequence_lanes"]:
            lane_info = {
                "flowcell_id": lane["flowcell_id"],
                "lane_number": lane["lane_number"],
                "library_id": lane["library_id"],
                "sequencing_centre": lane["sequencing_centre"],
                "read_type": read_type,
            }
            lane_infos.append(lane_info)

    # Add the sequence dataset to Tantalus
    sequence_dataset = add_sequence_dataset(
        tantalus_api,
        storage_name=storage_name,
        sample=sample,
        library=library,
        dataset_type="BAM",
        sequence_lanes=lane_infos,
        bam_file_path=bam_file_path,
        reference_genome=ref_genome,
        aligner=aligner_name,
        bai_file_path=bai_file_path,
        tag_name=tag_name,
        update=update,
    )

    return sequence_dataset
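get_bam_header_info, get_bam_ref_genome and get_bam_aligner_name are not included in this excerpt. As a rough sketch of the kind of information get_bam_header_info extracts, assuming a recent pysam where the header exposes to_dict() and the @RG lines carry SM/LB/PU/CN tags (the real helper may parse these differently):

import pysam


def get_bam_header_info(header):
    # Sketch only: collect sample, library and lane info from the @RG lines of
    # a pysam AlignmentHeader. The real helper may parse flowcell/lane differently.
    sample_ids = set()
    library_ids = set()
    sequence_lanes = []
    for read_group in header.to_dict().get('RG', []):
        sample_ids.add(read_group.get('SM'))
        library_ids.add(read_group.get('LB'))
        # PU is commonly <flowcell_id>.<lane_number>, but formats vary.
        flowcell_id, _, lane_number = read_group.get('PU', '').partition('.')
        sequence_lanes.append({
            'flowcell_id': flowcell_id,
            'lane_number': lane_number,
            'library_id': read_group.get('LB'),
            'sequencing_centre': read_group.get('CN'),
        })
    return {
        'sample_ids': sample_ids,
        'library_ids': library_ids,
        'sequence_lanes': sequence_lanes,
    }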
def add_generic_results(filepaths,
                        storage_name,
                        results_name,
                        results_type,
                        results_version,
                        sample_ids=(),
                        library_ids=(),
                        analysis_pk=None,
                        recursive=False,
                        tag_name=None,
                        update=False,
                        remote_storage_name=None):
    tantalus_api = TantalusApi()

    storage_client = tantalus_api.get_storage_client(storage_name)

    sample_pks = []
    for sample_id in sample_ids:
        sample = tantalus_api.get(
            "sample",
            sample_id=sample_id,
        )
        sample_pks.append(sample['id'])

    library_pks = []
    for library_id in library_ids:
        library = tantalus_api.get(
            "dna_library",
            library_id=library_id,
        )
        library_pks.append(library['id'])

    # Add the file resources to tantalus
    file_resource_pks = []
    for filepath in filepaths:
        if recursive:
            logging.info("Recursing directory {}".format(filepath))
            filename_prefix = tantalus_api.get_file_resource_filename(storage_name, filepath)
            add_filepaths = []
            for filename in storage_client.list(filename_prefix):
                add_filepaths.append(tantalus_api.get_filepath(storage_name, filename))
        else:
            add_filepaths = [filepath]

        for add_filepath in add_filepaths:
            logging.info("Adding file resource for {} to Tantalus".format(add_filepath))
            resource, instance = tantalus_api.add_file(
                storage_name=storage_name,
                filepath=add_filepath,
                update=update,
            )
            file_resource_pks.append(resource["id"])

    results_dataset_fields = dict(
        name=results_name,
        results_type=results_type,
        results_version=results_version,
        analysis=analysis_pk,
        samples=sample_pks,
        libraries=library_pks,
        file_resources=file_resource_pks,
    )

    # Add the dataset to tantalus
    try:
        results_id = tantalus_api.get("results", name=results_dataset_fields["name"])["id"]
    except NotFoundError:
        results_id = None

    if update and results_id is not None:
        logging.warning("results dataset {} exists, updating".format(
            results_dataset_fields["name"]))
        results_dataset = tantalus_api.update("results", id=results_id, **results_dataset_fields)
    else:
        logging.info("creating results dataset {}".format(results_dataset_fields["name"]))
        results_dataset = tantalus_api.get_or_create("results", **results_dataset_fields)

    if tag_name is not None:
        tantalus_api.tag(tag_name, resultsdataset_set=[results_dataset['id']])

    logging.info("Successfully created results dataset with ID {}".format(
        results_dataset["id"]))

    if remote_storage_name is not None:
        transfer_files.transfer_dataset(
            tantalus_api, results_dataset['id'], "resultsdataset",
            storage_name, remote_storage_name)

    return results_dataset
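A hedged example of calling add_generic_results for a directory of outputs; the storage name, path, sample and library ids below are made up purely for illustration.

# Illustrative call only; storage, path, sample and library names are made up.
results_dataset = add_generic_results(
    filepaths=['/path/to/results/MY-TICKET/outputs/'],
    storage_name='shahlab',
    results_name='MY-TICKET_custom_results',
    results_type='custom_analysis',
    results_version='v0.0.1',
    sample_ids=('SA039',),
    library_ids=('A96213A',),
    recursive=True,
)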
import os
import click
import logging

from dbclients.tantalus import TantalusApi
from dbclients.basicclient import NotFoundError

from datamanagement.utils.constants import LOGGING_FORMAT

logging.basicConfig(format=LOGGING_FORMAT, level=logging.INFO)
logging.getLogger("azure.storage.common.storageclient").setLevel(logging.WARNING)

tantalus_api = TantalusApi()

storage_client = tantalus_api.get_storage_client("singlecellresults")


@click.command()
@click.option('--jira_ticket')
def add_missing_annotations_files(jira_ticket=None):
    """
    There exist QC runs that have align and hmmcopy results but no annotations
    results on Tantalus. These analyses were run with version v0.2.25 of the
    single cell pipeline. Thus, filter for all align objects run with v0.2.25
    and check whether the ticket has annotation blobs on Azure. If so, check
    whether an annotations result dataset exists for that ticket, and create it
    and an annotations analysis if not. If the result dataset already exists,
    iterate through the annotation blobs and add any that are not yet tracked
    to Tantalus.
    """
    if jira_ticket is None:
        analyses = list(
def main(
        storage_name,
        dataset_type,
        dataset_id=None,
        tag_name=None,
        check_remote=None,
        dry_run=False,
):
    logging.info('cleaning up storage {}'.format(storage_name))

    if check_remote:
        logging.info('checking remote {}'.format(check_remote))
    else:
        logging.warning('not checking remote')

    tantalus_api = TantalusApi()
    storage_client = tantalus_api.get_storage_client(storage_name)

    remote_client = None
    if check_remote is not None:
        remote_client = tantalus_api.get_storage_client(check_remote)

    if dataset_id is None and tag_name is None:
        raise ValueError('require either dataset id or tag name')

    if dataset_id is not None and tag_name is not None:
        raise ValueError('require exactly one of dataset id or tag name')

    if dataset_id is not None:
        logging.info('cleaning up dataset {}, {}'.format(dataset_id, dataset_type))
        datasets = tantalus_api.list(dataset_type, id=dataset_id)

    if tag_name is not None:
        logging.info('cleaning up tag {}'.format(tag_name))
        datasets = tantalus_api.list(dataset_type, tags__name=tag_name)

    total_data_size = 0
    file_num_count = 0

    for dataset in datasets:
        logging.info('checking dataset with id {}, name {}'.format(
            dataset['id'], dataset['name']))

        # Optionally skip datasets not present and intact on the remote storage
        if check_remote is not None:
            if not tantalus_api.is_dataset_on_storage(
                    dataset['id'], 'sequencedataset', check_remote):
                logging.warning(
                    'not deleting dataset with id {}, not on remote storage {}'.format(
                        dataset['id'], check_remote))
                continue

            # For each file instance on the remote, check if it exists and has the correct size in tantalus
            remote_file_size_check = True
            for file_instance in tantalus_api.get_dataset_file_instances(
                    dataset['id'], dataset_type, check_remote):
                try:
                    tantalus_api.check_file(file_instance)
                except DataError:
                    logging.exception('check file failed')
                    remote_file_size_check = False

            # Skip this dataset if any files failed the check
            if not remote_file_size_check:
                logging.warning("skipping dataset {} that failed check on {}".format(
                    dataset['id'], check_remote))
                continue

        # Check consistency on the storage being cleaned up
        file_size_check = True
        for file_instance in tantalus_api.get_dataset_file_instances(
                dataset['id'], dataset_type, storage_name):
            try:
                tantalus_api.check_file(file_instance)
            except DataError:
                logging.exception('check file failed')
                file_size_check = False

        # Skip this dataset if any files failed the check
        if not file_size_check:
            logging.warning("skipping dataset {} that failed check on {}".format(
                dataset['id'], storage_name))
            continue

        # Delete all files for this dataset
        for file_instance in tantalus_api.get_dataset_file_instances(
                dataset['id'], dataset_type, storage_name):
            if dry_run:
                logging.info("would delete file instance with id {}, filepath {}".format(
                    file_instance['id'], file_instance['filepath']))
            else:
                logging.info("deleting file instance with id {}, filepath {}".format(
                    file_instance['id'], file_instance['filepath']))
                tantalus_api.update(
                    "file_instance", id=file_instance['id'], is_deleted=True)
            total_data_size += file_instance['file_resource']['size']
            file_num_count += 1

    logging.info("deleted a total of {} files with size {} bytes".format(
        file_num_count, total_data_size))
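A hedged example of a typical two-pass invocation, reporting first and deleting second; the storage, dataset type and tag names here are illustrative only.

# Illustrative only: storage and tag names are made up.
# First pass: report what would be deleted.
main('shahlab', 'sequencedataset', tag_name='old_project_bams',
     check_remote='singlecellblob', dry_run=True)

# Second pass: actually mark the local file instances as deleted.
main('shahlab', 'sequencedataset', tag_name='old_project_bams',
     check_remote='singlecellblob', dry_run=False)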