def iter_resources_objects(self, container_resource, resources, container_id):
    """
    Recursive function to add resource data to the resources list.
    """
    folder_data = []
    for resource in container_resource['data']:
        kind = resource['attributes']['kind']

        if kind == 'file':
            file = File(resource, self.session)
            file_obj = {
                'kind': file.kind,
                'kind_name': file.kind_name,
                'id': file.id,
                'container': container_id,
                'title': file.title
            }
            resources.append(file_obj)
        elif kind == 'folder':
            folder = Folder(resource, self.session)
            folder_obj = {
                'kind': folder.kind,
                'kind_name': folder.kind_name,
                'id': folder.id,
                'container': container_id,
                'title': folder.title
            }
            resources.append(folder_obj)
            # Keep track of all folders' file urls that need to be called.
            folder_data.append({
                'url': folder._files_url,
                'id': folder.id,
                'path': folder.materialized_path
            })

    # Asynchronously call all folder file urls to get each folder's top level resources.
    all_folders_resources = run_urls_async_with_pagination(
        self, [folder_dict['url'] for folder_dict in folder_data])

    # For each folder, get its container_id and resources.
    for folder_resources in all_folders_resources:
        if folder_resources['data']:
            resource_attr = folder_resources['data'][0]['attributes']
            if resource_attr['kind'] == 'folder':
                parent_path = resource_attr['materialized_path'][:-len(resource_attr['name']) - 1]
            else:
                parent_path = resource_attr['materialized_path'][:-len(resource_attr['name'])]

            # Find the corresponding parent_path in the folder_data list of dictionaries so we
            # can get the container id for this resource.
            container_id = get_dictionary_from_list(folder_data, 'path', parent_path)['id']

            self.iter_resources_objects(folder_resources, resources, container_id)
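# The container lookup above leans on the get_dictionary_from_list utility, which is
# used throughout this module. A minimal sketch of what such a helper likely does,
# assuming it returns the first dictionary whose given key matches the given value
# (hypothetical body; the real PresQT utility may differ):
def get_dictionary_from_list(list_of_dicts, key, value):
    """Return the first dictionary in list_of_dicts whose key equals value."""
    for dictionary in list_of_dicts:
        if dictionary[key] == value:
            return dictionary
    # Implicitly returns None when nothing matches; callers here index the result
    # directly, so they assume a match always exists.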
def create_upload_metadata(instance, file_metadata_list, action_metadata, project_id,
                           resources_ignored, resources_updated):
    """
    Create FTS file metadata for the action's resources.

    Parameters
    ----------
    instance: BaseResource Class Instance
        Class instance for the action
    file_metadata_list: list
        List of file metadata brought back from the upload function
    action_metadata: dict
        Metadata about the action itself
    project_id: str
        ID of the project the resource metadata should be uploaded to
    resources_ignored: list
        List of resource string paths that were ignored during upload
    resources_updated: list
        List of resource string paths that were updated during upload

    Returns
    -------
    Returns the result of schema validation against the final FTS metadata.
    Will be True if valid and an error string if invalid.
    """
    instance.action_metadata['destinationUsername'] = action_metadata['destinationUsername']

    # Put the file metadata in the correct file list
    instance.action_metadata['files'] = build_file_dict(
        instance.action_metadata['files']['created'], resources_ignored, resources_updated,
        'destinationPath')

    for resource in file_metadata_list:
        # Get the resource's metadata dict that has already been created
        fts_metadata_entry = get_dictionary_from_list(
            instance.new_fts_metadata_files, 'destinationPath',
            resource['actionRootPath'][len(instance.data_directory):])

        # Add destination metadata
        fts_metadata_entry['destinationHashes'] = {}
        if resource['destinationHash']:
            fts_metadata_entry['destinationHashes'][instance.hash_algorithm] = resource['destinationHash']

        fts_metadata_entry['destinationPath'] = resource['destinationPath']
        fts_metadata_entry['failedFixityInfo'] += resource['failed_fixity_info']

    # Create FTS metadata object
    from presqt.api_v1.utilities import create_fts_metadata
    instance.fts_metadata_data = create_fts_metadata(instance.all_keywords,
                                                     instance.action_metadata,
                                                     instance.source_fts_metadata_actions,
                                                     instance.extra_metadata)

    # Write the metadata file to the destination target and validate the metadata file
    metadata_validation = write_and_validate_metadata(instance, project_id,
                                                      instance.fts_metadata_data)
    return metadata_validation
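# build_file_dict, called above, sorts the uploaded entries into 'created',
# 'ignored', and 'updated' buckets keyed on a path field. A minimal sketch under
# the assumption that it partitions the created list against the ignored/updated
# path lists (hypothetical body; only the signature is taken from the call above):
def build_file_dict(created_list, resources_ignored, resources_updated, keyword):
    files = {'created': [], 'updated': [], 'ignored': []}
    for entry in created_list:
        if entry[keyword] in resources_ignored:
            files['ignored'].append(entry)
        elif entry[keyword] in resources_updated:
            files['updated'].append(entry)
        else:
            files['created'].append(entry)
    return files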
def github_download_resource(token, resource_id):
    """
    Fetch the requested resource from GitHub along with its hash information.

    Parameters
    ----------
    token : str
        User's GitHub token
    resource_id : str
        ID of the resource requested

    Returns
    -------
    Dictionary with the following keys: values
        'resources': List of dictionary objects that each hold a file and its information.
                     Dictionary must be in the following format:
                         {
                             'file': binary_file,
                             'hashes': {'hash_algorithm': 'the_hash'},
                             'title': 'file.jpg',
                             'path': '/path/to/file',
                             'source_path': '/full/path/to/file',
                             'extra_metadata': {'any': 'extra'}
                         }
        'empty_containers': List of string paths representing empty containers that must be
                            written.
                            Example: ['empty/folder/to/write/', 'another/empty/folder/']
        'action_metadata': Dictionary containing action metadata. Must be in the following format:
                           {
                               'sourceUsername': '******',
                           }
    """
    try:
        header, username = validation_check(token)
    except PresQTResponseException:
        raise PresQTResponseException("Token is invalid. Response returned a 401 status code.",
                                      status.HTTP_401_UNAUTHORIZED)

    project_url = 'https://api.github.com/repositories/{}'.format(resource_id)

    response = requests.get(project_url, headers=header)
    if response.status_code != 200:
        raise PresQTResponseException(
            'The resource with id, {}, does not exist for this user.'.format(resource_id),
            status.HTTP_404_NOT_FOUND)
    data = response.json()

    repo_name = data['name']
    # Strip off the unnecessary {+path} that's included in the url
    # Example: https://api.github.com/repos/eggyboi/djangoblog/contents/{+path} becomes
    # https://api.github.com/repos/eggyboi/djangoblog/contents
    contents_url = data['contents_url'].partition('/{+path}')[0]

    files, empty_containers, action_metadata = download_content(
        username, contents_url, header, repo_name, [])
    file_urls = [file['file'] for file in files]

    loop = asyncio.new_event_loop()
    download_data = loop.run_until_complete(async_main(file_urls, header))

    # Go through the file dictionaries and replace the file path with the binary_content
    for file in files:
        file['file'] = get_dictionary_from_list(download_data, 'url',
                                                file['file'])['binary_content']

    return {
        'resources': files,
        'empty_containers': empty_containers,
        'action_metadata': action_metadata
    }
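# The download functions in this module all hand a list of file urls to async_main
# and expect back one dict per url pairing it with the fetched bytes. A minimal
# sketch of such a coroutine using aiohttp, assuming the return shape
# {'url': ..., 'binary_content': ...} seen in the callers (hypothetical
# implementation; the project's real async_main also takes process-info arguments
# in the other targets' calls and may record hashes and progress):
import asyncio
import aiohttp

async def async_get(url, session, header):
    # Fetch a single file and pair the raw bytes with the requesting url.
    async with session.get(url, headers=header) as response:
        return {'url': url, 'binary_content': await response.read()}

async def async_main(url_list, header):
    # Open one session and schedule every download concurrently.
    async with aiohttp.ClientSession() as session:
        return await asyncio.gather(*[async_get(url, session, header) for url in url_list])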
def zenodo_download_resource(token, resource_id, process_info_path, action):
    """
    Fetch the requested resource from Zenodo along with its hash information.

    Parameters
    ----------
    token : str
        User's Zenodo token
    resource_id : str
        ID of the resource requested
    process_info_path: str
        Path to the process info file that keeps track of the action's progress
    action: str
        The action being performed

    Returns
    -------
    Dictionary with the following keys: values
        'resources': List of dictionary objects that each hold a file and its information.
                     Dictionary must be in the following format:
                         {
                             'file': binary_file,
                             'hashes': {'hash_algorithm': 'the_hash'},
                             'title': 'file.jpg',
                             'path': '/path/to/file',
                             'metadata': {
                                 'sourcePath': '/full/path/at/source.jpg',
                                 'title': 'file_title',
                                 'sourceHashes': {'hash_algorithm': 'the_hash'},
                                 'extra': {'any': 'extra'}
                             }
                         }
        'empty_containers': List of string paths representing empty containers that must be
                            written.
                            Example: ['empty/folder/to/write/', 'another/empty/folder/']
        'action_metadata': Dictionary containing action metadata. Must be in the following format:
                           {
                               'sourceUsername': '******',
                           }
    """
    try:
        auth_parameter = zenodo_validation_check(token)
    except PresQTResponseException:
        raise PresQTResponseException('Token is invalid. Response returned a 401 status code.',
                                      status.HTTP_401_UNAUTHORIZED)

    files = []
    empty_containers = []
    extra_metadata = {}
    base_url = None

    # If the resource_id is longer than 7 characters, the resource is an individual file
    if len(resource_id) > 7:
        # First we need to check if the file id given belongs to a public published record.
        zenodo_file = requests.get('https://zenodo.org/api/files/{}'.format(resource_id),
                                   params=auth_parameter)
        if zenodo_file.status_code != 200:
            # If not, we need to loop through their depositions to look for the file.
            zenodo_projects = requests.get('https://zenodo.org/api/deposit/depositions',
                                           params=auth_parameter).json()
            for entry in zenodo_projects:
                project_files = requests.get(entry['links']['self'],
                                             params=auth_parameter).json()
                for file in project_files['files']:
                    if file['id'] == resource_id:
                        base_url = entry['links']['self']
                        file_url = file['links']['self']
                        is_record = False
                        break
                else:
                    # If the file wasn't found we want to continue the loop.
                    continue
                break
        else:
            is_record = True
            base_url = 'https://zenodo.org/api/files/{}'.format(resource_id)
            file_url = 'https://zenodo.org/api/files/{}'.format(resource_id)

        if base_url is None:
            raise PresQTResponseException(
                "The resource with id, {}, does not exist for this user.".format(resource_id),
                status.HTTP_404_NOT_FOUND)

        update_process_info_message(process_info_path, action,
                                    'Downloading files from Zenodo...')
        # Add the total number of files to the process info file.
        # This is necessary to keep track of the progress of the request.
        update_process_info(process_info_path, 1, action, 'download')

        files, action_metadata = zenodo_download_helper(is_record, base_url, auth_parameter,
                                                        files, file_url)

        # Increment the number of files done in the process info file.
        increment_process_info(process_info_path, action, 'download')

    # Otherwise, it's a full project
    else:
        base_url = 'https://zenodo.org/api/records/{}'.format(resource_id)
        zenodo_record = requests.get(base_url, params=auth_parameter)
        is_record = True

        if zenodo_record.status_code != 200:
            base_url = 'https://zenodo.org/api/deposit/depositions/{}'.format(resource_id)
            is_record = False

        try:
            files, action_metadata = zenodo_download_helper(is_record, base_url,
                                                            auth_parameter, files)
        except PresQTResponseException:
            raise PresQTResponseException(
                "The resource with id, {}, does not exist for this user.".format(resource_id),
                status.HTTP_404_NOT_FOUND)

        extra_metadata = extra_metadata_helper(base_url, is_record, auth_parameter)

        file_urls = [file['file'] for file in files]

        update_process_info_message(process_info_path, action,
                                    'Downloading files from Zenodo...')
        # Add the total number of files to the process info file.
        # This is necessary to keep track of the progress of the request.
        update_process_info(process_info_path, len(file_urls), action, 'download')

        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        download_data = loop.run_until_complete(
            async_main(file_urls, auth_parameter, process_info_path, action))

        # Go through the file dictionaries and replace the file path with the binary_content
        for file in files:
            file['file'] = get_dictionary_from_list(download_data, 'url',
                                                    file['file'])['binary_content']

    return {
        'resources': files,
        'empty_containers': empty_containers,
        'action_metadata': action_metadata,
        'extra_metadata': extra_metadata
    }
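# Example call, assuming a valid Zenodo token; '3490481' is a placeholder record id
# and, being only 7 characters, routes to the full-project branch above:
#
#     result = zenodo_download_resource('ZENODO_TOKEN', '3490481',
#                                       '/path/to/process_info.json', 'resource_download')
#     first_file_bytes = result['resources'][0]['file']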
def curate_nd_download_resource(token, resource_id):
    """
    Fetch the requested resource from CurateND along with its hash information.

    Parameters
    ----------
    token : str
        User's CurateND token
    resource_id : str
        ID of the resource requested

    Returns
    -------
    Dictionary with the following keys: values
        'resources': List of dictionary objects that each hold a file and its information.
                     Dictionary must be in the following format:
                         {
                             'file': binary_file,
                             'hashes': {'hash_algorithm': 'the_hash'},
                             'title': 'file.jpg',
                             'path': '/path/to/file',
                             'source_path': '/full/path/to/file',
                             'extra_metadata': {'any': 'extra'}
                         }
        'empty_containers': List of string paths representing empty containers that must be
                            written.
                            Example: ['empty/folder/to/write/', 'another/empty/folder/']
        'action_metadata': Dictionary containing action metadata. Must be in the following format:
                           {
                               'sourceUsername': '******',
                           }
    """
    try:
        curate_instance = CurateND(token)
    except PresQTInvalidTokenError:
        raise PresQTValidationError("Token is invalid. Response returned a 401 status code.",
                                    status.HTTP_401_UNAUTHORIZED)

    # Get the resource
    resource = get_curate_nd_resource(resource_id, curate_instance)
    action_metadata = {"sourceUsername": resource.extra['depositor']}

    # Get all the files for the provided resources.
    files = []
    empty_containers = []
    if resource.kind_name == 'file':
        # Get the title of the Project to add to sourcePath
        project_title = requests.get(resource.extra['isPartOf'],
                                     headers={'X-Api-Token': '{}'.format(token)}).json()['title']
        # This is so we aren't missing the few extra keys that are pulled out for the PresQT payload
        resource.extra.update({"id": resource.id,
                               "date_submitted": resource.date_submitted})
        binary_file, curate_hash = resource.download()

        files.append({
            'file': binary_file,
            'hashes': {'md5': curate_hash},
            'title': resource.title,
            # If the file is the only resource we are downloading then we don't need its full path.
            'path': '/{}'.format(resource.title),
            'source_path': '/{}/{}'.format(project_title, resource.title),
            'extra_metadata': resource.extra
        })
    else:
        if not resource.extra['containedFiles']:
            empty_containers.append('{}'.format(resource.title))
        else:
            title_helper = {}
            hash_helper = {}
            file_urls = []
            project_title = resource.title
            file_metadata = []
            for file in resource.extra['containedFiles']:
                download_url = file['downloadUrl']
                contained_file = get_curate_nd_resource(file['id'], curate_instance)
                file_metadata_dict = {
                    "title": contained_file.title,
                    "extra": contained_file.extra
                }
                file_metadata.append(file_metadata_dict)
                title_helper[download_url] = contained_file.title
                hash_helper[download_url] = contained_file.md5
                file_urls.append(download_url)

            loop = asyncio.new_event_loop()
            download_data = loop.run_until_complete(async_main(file_urls, token))

            for file in download_data:
                title = title_helper[file['url']]
                hash = hash_helper[file['url']]
                files.append({
                    'file': file['binary_content'],
                    'hashes': {'md5': hash},
                    'title': title,
                    'source_path': '/{}/{}'.format(project_title, title),
                    'path': '/{}/{}'.format(resource.title, title),
                    'extra_metadata': get_dictionary_from_list(file_metadata, 'title',
                                                               title)['extra']
                })

    return {
        'resources': files,
        'empty_containers': empty_containers,
        'action_metadata': action_metadata
    }
def osf_download_resource(token, resource_id):
    """
    Fetch the requested resource from OSF along with its hash information.

    Parameters
    ----------
    token : str
        User's OSF token
    resource_id : str
        ID of the resource requested

    Returns
    -------
    Dictionary with the following keys: values
        'resources': List of dictionary objects that each hold a file and its information.
                     Dictionary must be in the following format:
                         {
                             'file': binary_file,
                             'hashes': {'hash_algorithm': 'the_hash'},
                             'title': 'file.jpg',
                             'path': '/path/to/file',
                             'source_path': '/full/path/to/file',
                             'extra_metadata': {'any': 'extra'}
                         }
        'empty_containers': List of string paths representing empty containers that must be
                            written.
                            Example: ['empty/folder/to/write/', 'another/empty/folder/']
        'action_metadata': Dictionary containing action metadata. Must be in the following format:
                           {
                               'sourceUsername': '******',
                           }
    """
    try:
        osf_instance = OSF(token)
    except PresQTInvalidTokenError:
        raise PresQTResponseException("Token is invalid. Response returned a 401 status code.",
                                      status.HTTP_401_UNAUTHORIZED)

    # Get contributor name
    contributor_name = requests.get(
        'https://api.osf.io/v2/users/me/',
        headers={'Authorization': 'Bearer {}'.format(token)}).json()[
            'data']['attributes']['full_name']
    action_metadata = {"sourceUsername": contributor_name}

    # Get the resource
    resource = get_osf_resource(resource_id, osf_instance)

    # Get all files for the provided resources.
    # The 'path' value will be the path that the file is eventually saved in. The root of the
    # path should be the resource.
    files = []
    empty_containers = []
    if resource.kind_name == 'file':
        project = osf_instance.project(resource.parent_project_id)
        files.append({
            "file": resource.download(),
            "hashes": resource.hashes,
            "title": resource.title,
            # If the file is the only resource we are downloading then we don't need its full path
            "path": '/{}'.format(resource.title),
            "source_path": '/{}/{}{}'.format(project.title, resource.provider,
                                             resource.materialized_path),
            "extra_metadata": osf_download_metadata(resource)
        })
    else:
        if resource.kind_name == 'project':
            resource.get_all_files('', files, empty_containers)
            project = resource
        elif resource.kind_name == 'storage':
            resource.get_all_files('/{}'.format(resource.title), files, empty_containers)
            project = osf_instance.project(resource.node)
        else:
            resource.get_all_files('', files, empty_containers)
            project = osf_instance.project(resource.parent_project_id)
            for file in files:
                # File Path needs to start at the folder and strip everything before it.
                # Example: If the resource is 'Docs2' and the starting path is
                # '/Project/Storage/Docs1/Docs2/file.jpeg' then the final path
                # needs to be '/Docs2/file.jpeg'
                path_to_strip = resource.materialized_path[:-(len(resource.title) + 2)]
                file['path'] = file['file'].materialized_path[len(path_to_strip):]

        # Asynchronously make all download requests
        file_urls = [file['file'].download_url for file in files]
        loop = asyncio.new_event_loop()
        download_data = loop.run_until_complete(async_main(file_urls, token))

        # Go through the file dictionaries and replace the file class with the binary_content
        for file in files:
            file['source_path'] = '/{}/{}{}'.format(project.title, file['file'].provider,
                                                    file['file'].materialized_path)
            file['file'] = get_dictionary_from_list(download_data, 'url',
                                                    file['file'].download_url)['binary_content']

    return {
        'resources': files,
        'empty_containers': empty_containers,
        'action_metadata': action_metadata
    }
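# Worked example of the folder-branch path stripping above, using the values from
# the inline comment: with resource.title == 'Docs2' and a materialized path of
# '/Project/Storage/Docs1/Docs2/', the slice drops the title plus its two slashes,
# leaving the prefix to strip from every file path.
materialized_path = '/Project/Storage/Docs1/Docs2/'
title = 'Docs2'
path_to_strip = materialized_path[:-(len(title) + 2)]   # '/Project/Storage/Docs1'
file_path = '/Project/Storage/Docs1/Docs2/file.jpeg'
assert file_path[len(path_to_strip):] == '/Docs2/file.jpeg'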
def handle(self, *args, **kwargs):
    targets_json = read_file('presqt/specs/targets.json', True)
    list_of_partners_in = []
    list_of_partners_out = []
    for target in targets_json:
        if target['supported_actions']['resource_transfer_in']:
            list_of_partners_in.append(target['name'])
        if target['supported_actions']['resource_transfer_out']:
            list_of_partners_out.append(target['name'])

    ##### Get Input From User #####
    while True:
        target_name = input('Enter target name (use underscores not spaces): ').lower()
        if set('[~! @#$%^&*()+{}":;[],.<>`=+-\']+$\\').intersection(target_name):
            print("Target name can't contain special characters or spaces")
        else:
            break

    human_readable_target_name = input('Enter human readable target name (format however): ')

    url_validator = URLValidator()
    while True:
        status_url = input('Enter target status url (include http:// or https://): ')
        try:
            url_validator(status_url)
        except ValidationError:
            print("Target status url must be a valid url")
        else:
            break

    while True:
        resource_collection = input('Does your target support the Resource Collection endpoint? (Y or N): ')
        if resource_collection not in ['Y', 'y', 'N', 'n']:
            print('Must input Y or N')
        else:
            resource_collection = resource_collection in ['Y', 'y']
            break

    while True:
        resource_detail = input('Does your target support the Resource Detail endpoint? (Y or N): ')
        if resource_detail not in ['Y', 'y', 'N', 'n']:
            print('Must input Y or N')
        else:
            resource_detail = resource_detail in ['Y', 'y']
            break

    while True:
        resource_download = input('Does your target support the Resource Download endpoint? (Y or N): ')
        if resource_download not in ['Y', 'y', 'N', 'n']:
            print('Must input Y or N')
        else:
            resource_download = resource_download in ['Y', 'y']
            break

    while True:
        resource_upload = input('Does your target support the Resource Upload endpoint? (Y or N): ')
        if resource_upload not in ['Y', 'y', 'N', 'n']:
            print('Must input Y or N')
        else:
            resource_upload = resource_upload in ['Y', 'y']
            break

    while True:
        resource_transfer_in = input('Does your target support the Resource Transfer In endpoint? (Y or N): ')
        if resource_transfer_in not in ['Y', 'y', 'N', 'n']:
            print('Must input Y or N')
        else:
            resource_transfer_in = resource_transfer_in in ['Y', 'y']
            break

    while True:
        resource_transfer_out = input('Does your target support the Resource Transfer Out endpoint? (Y or N): ')
        if resource_transfer_out not in ['Y', 'y', 'N', 'n']:
            print('Must input Y or N')
        else:
            resource_transfer_out = resource_transfer_out in ['Y', 'y']
            break

    while True:
        transfer_in = input("Which PresQT partners are you allowing to transfer into your "
                            "service? (comma separated list with no spaces (use underscores))"
                            "\nOptions are {}: ".format(list_of_partners_out))
        if ' ' in transfer_in:
            print("Input can't contain spaces")
            continue
        transfer_in = transfer_in.lower().split(',')
        for partner in transfer_in:
            if partner not in list_of_partners_out:
                print("{} is not a recognized target, or doesn't support resource_transfer_out.".format(partner))
                break
        else:
            break

    while True:
        transfer_out = input("Which PresQT partners are you allowing your service to transfer "
                             "to? (comma separated list with no spaces (use underscores))"
                             "\nOptions are {}: ".format(list_of_partners_in))
        if ' ' in transfer_out:
            print("Input can't contain spaces")
            continue
        transfer_out = transfer_out.lower().split(',')
        for partner in transfer_out:
            if partner not in list_of_partners_in:
                print("{} is not a recognized target, or doesn't support resource_transfer_in.".format(partner))
                break
        else:
            break

    while True:
        hash_algorithms = input('Enter your supported hash algorithms (comma separated list with no spaces): ')
        if ' ' in hash_algorithms:
            print("Input can't contain spaces")
            continue
        hash_algorithms = hash_algorithms.split(',')
        for hash_algorithm in hash_algorithms:
            if hash_algorithm not in hashlib.algorithms_available:
                print('{} is not supported by the hashlib Python library'.format(hash_algorithm))
                break
        else:
            break

    ##### Check if target exists in targets.json #####
    if get_dictionary_from_list(targets_json, 'name', target_name):
        print('Error! Target, {}, already exists in targets.json!'.format(target_name))
        return

    ##### Make Target Directory #####
    target_directory = 'presqt/targets/{}/'.format(target_name)
    try:
        os.makedirs(os.path.dirname(target_directory))
        print('Directory created: {}'.format(target_directory))
    except FileExistsError:
        print('Error! Target directory already exists!')
        return
    else:
        open('{}{}'.format(target_directory, '__init__.py'), 'a').close()

    ##### Make Target Function Directory #####
    target_function_dir = '{}{}/'.format(target_directory, 'functions')
    os.makedirs(os.path.dirname(target_function_dir))
    print('Directory created: {}'.format(target_function_dir))
    open('{}{}'.format(target_function_dir, '__init__.py'), 'a').close()
    print('File created: {}__init__.py'.format(target_function_dir))

    ##### Make Target Action Files #####
    target_functions = {}
    if resource_collection or resource_detail:
        with open('{}fetch.py'.format(target_function_dir), 'w') as file:
            target_functions['fetch'] = {}
            if resource_collection:
                resource_collection_function = '{}_fetch_resources'.format(target_name)
                target_functions['fetch']['{}_resource_collection'.format(target_name)] = resource_collection_function
                file.write('def {}(token, search_parameter):\n\tpass'.format(resource_collection_function))
                if resource_detail:
                    file.write('\n\n')
            if resource_detail:
                resource_detail_function = '{}_fetch_resource'.format(target_name)
                target_functions['fetch']['{}_resource_detail'.format(target_name)] = resource_detail_function
                file.write('def {}(token, resource_id):\n\tpass'.format(resource_detail_function))
        print('File created: {}fetch.py'.format(target_function_dir))

    if resource_download:
        with open('{}download.py'.format(target_function_dir), 'w') as file:
            resource_download_function = '{}_download_resource'.format(target_name)
            target_functions['download'] = {'{}_resource_download'.format(target_name): resource_download_function}
            file.write('def {}(token, resource_id):\n\tpass'.format(resource_download_function))
        print('File created: {}download.py'.format(target_function_dir))

    if resource_upload:
        with open('{}upload.py'.format(target_function_dir), 'w') as file:
            resource_upload_function = '{}_upload_resource'.format(target_name)
            target_functions['upload'] = {'{}_resource_upload'.format(target_name): resource_upload_function}
            file.write('def {}(token, resource_id, resource_main_dir, hash_algorithm, file_duplicate_action):\n\tpass'.format(resource_upload_function))
        print('File created: {}upload.py'.format(target_function_dir))

    ##### Write to function_router.py #####
    with open('presqt/api_v1/utilities/utils/function_router.py', 'a') as file:
        if target_functions:
            file.write('\n')
        for file_name, file_name_dict in target_functions.items():
            for variable_name, function_name in file_name_dict.items():
                file.write('    {} = {}\n'.format(variable_name, function_name))

    with open('presqt/api_v1/utilities/utils/function_router.py', 'r+') as file:
        content = file.read()
        file.seek(0, 0)
        new_imports = ''
        for file_name, file_name_dict in target_functions.items():
            new_imports += 'from presqt.targets.{}.functions.{} import {}\n'.format(
                target_name, file_name, ', '.join(file_name_dict.values()))
        file.write(new_imports + content)
    print('File updated: presqt/api_v1/utilities/utils/function_router.py')

    ##### Write to targets.json #####
    target_dict = {
        "name": target_name,
        "readable_name": human_readable_target_name,
        "status_url": status_url,
        "supported_actions": {
            "resource_collection": resource_collection,
            "resource_detail": resource_detail,
            "resource_download": resource_download,
            "resource_upload": resource_upload,
            "resource_transfer_in": resource_transfer_in,
            "resource_transfer_out": resource_transfer_out
        },
        "supported_transfer_partners": {
            "transfer_in": transfer_in,
            "transfer_out": transfer_out
        },
        "supported_hash_algorithms": hash_algorithms
    }
    data = read_file('presqt/specs/targets.json', True)
    data.append(target_dict)
    write_file('presqt/specs/targets.json', data, True)
    print('File updated: presqt/specs/targets.json')
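# handle() calls read_file/write_file with a boolean flag that, judging by the
# targets.json usage above, toggles JSON (de)serialization. A minimal sketch under
# that assumption (hypothetical bodies; the real PresQT helpers may differ):
import json

def read_file(file_path, is_json=False):
    if is_json:
        with open(file_path, 'r') as f:
            return json.load(f)
    with open(file_path, 'rb') as f:
        return f.read()

def write_file(file_path, contents, is_json=False):
    if is_json:
        with open(file_path, 'w') as f:
            json.dump(contents, f, indent=4)
    else:
        with open(file_path, 'wb') as f:
            f.write(contents)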
def figshare_download_resource(token, resource_id, process_info_path, action):
    """
    Fetch the requested resource from FigShare along with its hash information.

    Parameters
    ----------
    token : str
        User's FigShare token
    resource_id : str
        ID of the resource requested
    process_info_path: str
        Path to the process info file that keeps track of the action's progress
    action: str
        The action being performed

    Returns
    -------
    Dictionary with the following keys: values
        'resources': List of dictionary objects that each hold a file and its information.
                     Dictionary must be in the following format:
                         {
                             'file': binary_file,
                             'hashes': {'hash_algorithm': 'the_hash'},
                             'title': 'file.jpg',
                             'path': '/path/to/file',
                             'source_path': '/full/path/to/file',
                             'extra_metadata': {'any': 'extra'}
                         }
        'empty_containers': List of string paths representing empty containers that must be
                            written.
                            Example: ['empty/folder/to/write/', 'another/empty/folder/']
        'action_metadata': Dictionary containing action metadata. Must be in the following format:
                           {
                               'sourceUsername': '******',
                           }
    """
    try:
        headers, username = validation_check(token)
    except PresQTResponseException:
        raise PresQTResponseException("Token is invalid. Response returned a 401 status code.",
                                      status.HTTP_401_UNAUTHORIZED)

    split_id = str(resource_id).split(":")
    extra_metadata = {}

    # First we need to see whether this is a private project or a public one.
    project_url = "https://api.figshare.com/v2/account/projects/{}".format(split_id[0])
    response = requests.get(project_url, headers=headers)
    if response.status_code != 200:
        # Looking for a private project was unsuccessful, try a public project.
        project_url = "https://api.figshare.com/v2/projects/{}".format(split_id[0])
        response = requests.get(project_url, headers=headers)
        if response.status_code != 200:
            # Project id is invalid
            raise PresQTResponseException("The resource could not be found by the requesting user.",
                                          status.HTTP_404_NOT_FOUND)
    data = response.json()
    project_name = data['title']

    # Flags to be used for file checks.
    file_urls = None
    files = None

    if len(split_id) == 1:
        # Download the contents of the project and build the list of file urls to download.
        articles_url = project_url + "/articles"
        files, empty_containers, action_metadata = download_project(username, articles_url,
                                                                    headers, project_name, [])
        file_urls = [file['file'] for file in files]
        extra_metadata = extra_metadata_helper(project_url, headers)

    elif len(split_id) == 2 or len(split_id) == 3:
        # We have an article or a file so we need to get the article url
        article_url = "https://api.figshare.com/v2/account/projects/{}/articles/{}".format(
            split_id[0], split_id[1])
        response = requests.get(article_url, headers=headers)
        if response.status_code != 200:
            # Let's see if this is a public article....
            article_url = "https://api.figshare.com/v2/articles/{}".format(split_id[1])
            response = requests.get(article_url, headers=headers)
            if response.status_code != 200:
                # We couldn't find the article.
                raise PresQTResponseException("The resource could not be found by the requesting user.",
                                              status.HTTP_404_NOT_FOUND)

        if len(split_id) == 2:
            # Download the contents of the article and build the list of file urls to download.
            files, empty_containers, action_metadata = download_article(username, article_url,
                                                                        headers, project_name, [])
            file_urls = [file['file'] for file in files]

        elif len(split_id) == 3:
            update_process_info_message(process_info_path, action,
                                        'Downloading files from FigShare...')
            # Add the total number of files to the process info file.
            # This is necessary to keep track of the progress of the request.
            update_process_info(process_info_path, 1, action, 'download')

            # Single file download.
            data = response.json()
            for file in data['files']:
                if str(file['id']) == split_id[2]:
                    files = [{
                        "file": requests.get(file['download_url'], headers=headers).content,
                        "hashes": {"md5": file['computed_md5']},
                        "title": file['name'],
                        "path": "/{}".format(file['name']),
                        "source_path": "/{}/{}/{}".format(project_name, data['title'],
                                                          file['name']),
                        "extra_metadata": {"size": file['size']}
                    }]
                    # Increment the number of files done in the process info file.
                    increment_process_info(process_info_path, action, 'download')
            empty_containers = []
            action_metadata = {"sourceUsername": username}

    if not files:
        # We could not find the file.
        raise PresQTResponseException("The resource could not be found by the requesting user.",
                                      status.HTTP_404_NOT_FOUND)

    if file_urls:
        update_process_info_message(process_info_path, action,
                                    'Downloading files from FigShare...')
        # Add the total number of files to the process info file.
        # This is necessary to keep track of the progress of the request.
        update_process_info(process_info_path, len(file_urls), action, 'download')

        # Start the async calls for project or article downloads
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        download_data = loop.run_until_complete(
            async_main(file_urls, headers, process_info_path, action))

        # Go through the file dictionaries and replace the file path with the binary_content
        for file in files:
            file['file'] = get_dictionary_from_list(download_data, 'url',
                                                    file['file'])['binary_content']

    return {
        'resources': files,
        'empty_containers': empty_containers,
        'action_metadata': action_metadata,
        'extra_metadata': extra_metadata
    }
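# FigShare resource ids in this function are colon-delimited: 'project',
# 'project:article', or 'project:article:file'. A quick illustration with
# placeholder ids showing how split_id drives the branching above:
split_id = "17993:6818:12345".split(":")
assert split_id == ["17993", "6818", "12345"]   # len 3 -> single-file branch
# len(split_id) == 1 -> whole project, == 2 -> single article, == 3 -> single file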
def gitlab_download_resource(token, resource_id, process_info_path, action):
    """
    Fetch the requested resource from GitLab along with its hash information.

    Parameters
    ----------
    token : str
        User's GitLab token
    resource_id : str
        ID of the resource requested
    process_info_path: str
        Path to the process info file that keeps track of the action's progress
    action: str
        The action being performed

    Returns
    -------
    Dictionary with the following keys: values
        'resources': List of dictionary objects that each hold a file and its information.
                     Dictionary must be in the following format:
                         {
                             'file': binary_file,
                             'hashes': {'hash_algorithm': 'the_hash'},
                             'title': 'file.jpg',
                             'path': '/path/to/file',
                             'source_path': '/full/path/to/file',
                             'extra_metadata': {'any': 'extra'}
                         }
        'empty_containers': List of string paths representing empty containers that must be
                            written.
                            Example: ['empty/folder/to/write/', 'another/empty/folder/']
        'action_metadata': Dictionary containing action metadata. Must be in the following format:
                           {
                               'sourceUsername': '******',
                           }
    """
    try:
        header, user_id = validation_check(token)
    except PresQTResponseException:
        raise PresQTResponseException("Token is invalid. Response returned a 401 status code.",
                                      status.HTTP_401_UNAUTHORIZED)

    # Get the user's GitLab username for action metadata
    username = requests.get("https://gitlab.com/api/v4/user", headers=header).json()['username']

    partitioned_id = resource_id.partition(':')
    if ':' in resource_id:
        project_id = partitioned_id[0]
    else:
        project_id = resource_id

    project_url = 'https://gitlab.com/api/v4/projects/{}'.format(project_id)

    response = requests.get(project_url, headers=header)
    if response.status_code != 200:
        raise PresQTResponseException(
            'The resource with id, {}, does not exist for this user.'.format(resource_id),
            status.HTTP_404_NOT_FOUND)
    project_name = response.json()['name']

    extra_metadata = {}
    if ':' not in resource_id:
        # This is for a project
        all_files_url = "https://gitlab.com/api/v4/projects/{}/repository/tree?recursive=1".format(
            resource_id)
        data = gitlab_paginated_data(header, user_id, all_files_url)
        is_project = True
        # Get extra metadata
        extra_metadata = extra_metadata_helper(response.json(), header)

    elif ':' in resource_id and '%2E' not in resource_id:
        # This is for a directory
        all_files_url = "https://gitlab.com/api/v4/projects/{}/repository/tree?path={}&recursive=1".format(
            partitioned_id[0], partitioned_id[2].replace('+', ' '))
        data = gitlab_paginated_data(header, user_id, all_files_url)
        if not data:
            raise PresQTResponseException(
                'The resource with id, {}, does not exist for this user.'.format(resource_id),
                status.HTTP_404_NOT_FOUND)
        is_project = False

    else:
        update_process_info_message(process_info_path, action,
                                    'Downloading files from GitLab...')
        # Add the total number of files to the process info file.
        # This is necessary to keep track of the progress of the request.
        update_process_info(process_info_path, 1, action, 'download')

        # This is a single file
        data = requests.get(
            'https://gitlab.com/api/v4/projects/{}/repository/files/{}?ref=master'.format(
                project_id, partitioned_id[2].replace('+', ' ')), headers=header).json()
        if 'message' in data.keys():
            raise PresQTResponseException(
                'The resource with id, {}, does not exist for this user.'.format(resource_id),
                status.HTTP_404_NOT_FOUND)

        # Increment the number of files done in the process info file.
        increment_process_info(process_info_path, action, 'download')

        return {
            'resources': [{
                'file': base64.b64decode(data['content']),
                'hashes': {'sha256': data['content_sha256']},
                'title': data['file_name'],
                'path': '/{}'.format(data['file_name']),
                'source_path': data['file_path'],
                'extra_metadata': {}
            }],
            'empty_containers': [],
            'action_metadata': {'sourceUsername': username},
            'extra_metadata': extra_metadata
        }

    files, empty_containers, action_metadata = download_content(
        username, project_name, project_id, data, [], is_project)
    file_urls = [file['file'] for file in files]

    update_process_info_message(process_info_path, action, 'Downloading files from GitLab...')
    # Add the total number of files to the process info file.
    # This is necessary to keep track of the progress of the request.
    update_process_info(process_info_path, len(file_urls), action, 'download')

    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    download_data = loop.run_until_complete(
        async_main(file_urls, header, process_info_path, action))

    # Go through the file dictionaries and replace the file path with the binary_content
    # and replace the hashes with the correct file hashes
    for file in files:
        file['hashes'] = get_dictionary_from_list(download_data, 'url', file['file'])['hashes']
        file['file'] = get_dictionary_from_list(download_data, 'url',
                                                file['file'])['binary_content']

    return {
        'resources': files,
        'empty_containers': empty_containers,
        'action_metadata': action_metadata,
        'extra_metadata': extra_metadata
    }
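# The GitLab repository files API returns file contents base64-encoded in
# 'content' alongside a 'content_sha256' digest, which is why the single-file
# branch above decodes before returning. A self-contained illustration of that
# round trip with made-up data:
import base64

encoded = base64.b64encode(b'hello world').decode()  # shape of the API's 'content' field
assert base64.b64decode(encoded) == b'hello world'   # what ends up in the 'file' key
# The API-supplied content_sha256 lets the caller record a hash without computing one.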