def clean_request_name(request_name: str,
                       whitelist: str = VALID_REQUEST_NAME_CHARS,
                       char_limit: int = REQUEST_NAME_CHAR_LIMIT) -> str:
    """
    Removes invalid characters from an API request name.

    Delegates whitelist filtering and length truncation to
    path_utils.clean_filename(), then maps any remaining ':' to '_'.
    """
    cleaned = path_utils.clean_filename(filename=request_name,
                                        whitelist=whitelist,
                                        char_limit=char_limit)
    return cleaned.replace(':', '_')
if (os.path.basename(basename) != basename or unquote(posixpath.basename(urlpath)) != basename): raise ValueError # reject '%2f' or 'dir%5Cbasename.ext' on Windows return basename #%% Enumerate blobs to files list_files = [] # folder_name = folder_names[0] for folder_name in folder_names: list_file = os.path.join( filename_base, job_set_name + '_' + path_utils.clean_filename(folder_name) + '_all.json') # If this is intended to be a folder, it needs to end in '/', otherwise files that start # with the same string will match too folder_name_suffix = folder_name folder_name_suffix = folder_name_suffix.replace('\\', '/') if (not len(folder_name) == 0) and (not folder_name_suffix.endswith('/')): folder_name_suffix = folder_name_suffix + '/' prefix = container_prefix + folder_name_suffix file_list = prepare_api_submission.enumerate_blobs_to_file( output_file=list_file, account_name=account_name, sas_token=read_only_sas_token, container_name=container_name, account_key=None, rmatch=None,
def enumerate_prefix(prefix, sas_url, output_folder, get_sizes=False):
    """
    Enumerates all blobs whose names start with [prefix] in the container
    pointed to by [sas_url], writing one blob name per line to a file in
    [output_folder].

    Args:
        prefix: str, blob-name prefix to enumerate
        sas_url: str, container-level SAS URL (account + container + token)
        output_folder: str, local folder in which to write the list file
        get_sizes: bool, whether to append a tab-separated blob size to
            each output line

    The output filename is derived from [prefix] via
    path_utils.clean_filename().  Depends on the module-level globals
    n_blobs_per_page, debug_max_files, sleep_time_per_page, and cnt.
    """
    account_name = sas_blob_utils.get_account_from_uri(sas_url)
    container_name = sas_blob_utils.get_container_from_uri(sas_url)
    ro_sas_token = sas_blob_utils.get_sas_token_from_uri(sas_url)

    # Normalize the token to start with '?', whether or not the URI's query
    # string already included one.  (This was previously an assert that
    # crashed on already-'?'-prefixed tokens; the conditional matches the
    # more tolerant handling in list_blobs_in_container.)
    if not ro_sas_token.startswith('?'):
        ro_sas_token = '?' + ro_sas_token

    storage_account_url_blob = 'https://' + account_name + '.blob.core.windows.net'

    # prefix = prefixes[0]; print(prefix)

    print('Starting enumeration for prefix {}'.format(prefix))

    # Open the output file
    fn = path_utils.clean_filename(prefix)
    output_file = os.path.join(output_folder, fn)

    # Create the container client
    blob_service_client = BlobServiceClient(
        account_url=storage_account_url_blob, credential=ro_sas_token)
    container_client = blob_service_client.get_container_client(container_name)

    # Enumerate
    with open(output_file, 'w') as output_f:

        continuation_token = ''
        hit_debug_limit = False
        i_blob = 0

        while (continuation_token is not None) and (not hit_debug_limit):

            blobs_iter = container_client.list_blobs(
                name_starts_with=prefix,
                results_per_page=n_blobs_per_page).by_page(
                continuation_token=continuation_token)

            # This is a paged list of BlobProperties objects
            blobs = next(blobs_iter)

            n_blobs_this_page = 0

            for blob in blobs:
                i_blob += 1
                n_blobs_this_page += 1
                if (debug_max_files > 0) and (i_blob > debug_max_files):
                    print('Hit debug path limit for prefix {}'.format(prefix))
                    # Undo the increment for the blob we did not write out
                    i_blob -= 1
                    hit_debug_limit = True
                    break
                else:
                    size_string = ''
                    if get_sizes:
                        size_string = '\t' + str(blob.size)
                    output_f.write(blob.name + size_string + '\n')

            # print('Enumerated {} blobs'.format(n_blobs_this_page))
            cnt.increment(n=n_blobs_this_page)

            continuation_token = blobs_iter.continuation_token

            # Optional throttling between pages
            if sleep_time_per_page > 0:
                time.sleep(sleep_time_per_page)

        # ...while we're enumerating

    # ...with open(output_file)

    print('Finished enumerating {} blobs for prefix {}'.format(i_blob, prefix))
if (os.path.basename(basename) != basename or unquote(posixpath.basename(urlpath)) != basename): raise ValueError # reject '%2f' or 'dir%5Cbasename.ext' on Windows return basename #%% Enumerate blobs to files # file_lists_by_folder will contain a list of local JSON file names, # each JSON file contains a list of blob names corresponding to an API taskgroup file_lists_by_folder = [] # folder_name = folder_names[0] for folder_name in folder_names: clean_folder_name = path_utils.clean_filename(folder_name) json_filename = f'{base_task_name}_{clean_folder_name}_all.json' list_file = os.path.join(filename_base, json_filename) # If this is intended to be a folder, it needs to end in '/', otherwise # files that start with the same string will match too folder_name = folder_name.replace('\\', '/') if len(folder_name) > 0 and (not folder_name.endswith('/')): folder_name = folder_name + '/' prefix = container_prefix + folder_name file_list = ai4e_azure_utils.enumerate_blobs_to_file( output_file=list_file, account_name=storage_account_name, container_name=container_name, sas_token=read_only_sas_token, blob_prefix=prefix)
# NOTE(review): the next few statements appear to be the tail of a
# URL-parsing helper whose `def` line is outside this view; `urlpath` is
# presumably bound above — confirm against the full file.
basename = posixpath.basename(unquote(urlpath))
if (os.path.basename(basename) != basename or
        unquote(posixpath.basename(urlpath)) != basename):
    raise ValueError  # reject '%2f' or 'dir%5Cbasename.ext' on Windows
return basename


#%% Enumerate blobs to files

# Local JSON list files, one per input folder; each becomes one API task group.
list_files = []

# folder_name = folder_names[0]
for folder_name in folder_names:

    # Output file name: <filename_base>/<job_set_name>_<cleaned folder>_all.json
    list_file = os.path.join(filename_base, job_set_name + '_' +
                             path_utils.clean_filename(folder_name) + '_all.json')

    # If this is intended to be a folder, it needs to end in '/', otherwise files that start
    # with the same string will match too
    folder_name_suffix = folder_name
    # Normalize Windows-style separators before building the blob prefix
    folder_name_suffix = folder_name_suffix.replace('\\', '/')
    if (not len(folder_name) == 0) and (not folder_name_suffix.endswith('/')):
        folder_name_suffix = folder_name_suffix + '/'
    prefix = container_prefix + folder_name_suffix

    # Enumerate all blobs under this prefix into the local JSON list file
    file_list = prepare_api_submission.enumerate_blobs_to_file(
        output_file=list_file,
        account_name=account_name, sas_token=read_only_sas_token,
        container_name=container_name,
        account_key=None,
        rmatch=None, prefix=prefix)

    list_files.append(list_file)
def list_blobs_in_container(container_name, account_name, sas_token,
                            output_folder, prefix=None):
    """
    Enumerates every blob in [container_name] (optionally restricted to
    [prefix]), writing one blob name per line to a .log file in
    [output_folder].

    Depends on the module-level globals n_blobs_per_page, debug_max_files,
    sleep_time_per_page, and cnt.
    """
    # SAS tokens are expected to carry a leading '?'
    if not sas_token.startswith('?'):
        sas_token = '?' + sas_token

    account_blob_url = 'https://' + account_name + '.blob.core.windows.net'

    # prefix = prefixes[0]; print(prefix)

    print('Starting enumeration for container {}'.format(container_name))

    # Destination: <output_folder>/<cleaned container name>.log
    fn = path_utils.clean_filename(container_name) + '.log'
    target_path = os.path.join(output_folder, fn)

    # Client for the target container
    service_client = BlobServiceClient(account_url=account_blob_url,
                                       credential=sas_token)
    container_client = service_client.get_container_client(container_name)

    n_blobs_written = 0
    reached_debug_limit = False
    page_token = ''

    with open(target_path, 'w') as target_f:

        # Page through the listing until the service stops returning a
        # continuation token, or we hit the debug file limit.
        while not (page_token is None or reached_debug_limit):

            page_iter = container_client.list_blobs(
                name_starts_with=prefix,
                results_per_page=n_blobs_per_page).by_page(
                continuation_token=page_token)

            # One page of BlobProperties objects
            page = next(page_iter)

            n_this_page = 0
            for blob in page:
                n_blobs_written += 1
                n_this_page += 1
                if (debug_max_files > 0) and (n_blobs_written > debug_max_files):
                    print('Hit debug path limit for prefix {}'.format(prefix))
                    # Undo the increment for the blob we did not write out
                    n_blobs_written -= 1
                    reached_debug_limit = True
                    break
                target_f.write(blob.name + '\n')

            # print('Enumerated {} blobs'.format(n_this_page))
            cnt.increment(n=n_this_page)

            page_token = page_iter.continuation_token

            # Optional throttling between pages
            if sleep_time_per_page > 0:
                time.sleep(sleep_time_per_page)

    print('Finished enumerating {} blobs for container {}'.format(
        n_blobs_written, container_name))