async def async_get(url, session, header, process_info_path, action):
    """
    Coroutine that performs one GET with aiohttp and decodes the base64 payload.

    Meant to be scheduled concurrently with other GET coroutines.

    Parameters
    ----------
    url: str
        URL to call
    session: ClientSession object
        aiohttp ClientSession Object
    header: str
        Proper header for calls
    process_info_path: str
        Path to the process info file that keeps track of the action's progress
    action: str
        The action being performed

    Returns
    -------
    Dictionary holding the requested 'url', the base64-decoded 'binary_content' taken from
    the response JSON's 'content' field, and 'hashes' with the 'content_sha256' value.
    """
    async with session.get(url, headers=header) as resp:
        assert resp.status == 200
        payload = await resp.json()

        # One more file is done: record it in the process info file.
        increment_process_info(process_info_path, action, 'download')

        decoded_bytes = base64.b64decode(payload['content'])
        sha256_hash = payload['content_sha256']
        return {
            'url': url,
            'binary_content': decoded_bytes,
            'hashes': {'sha256': sha256_hash},
        }
def download_file(repo_data, resource_data, process_info_path, action):
    """
    Build the download entry for a single requested file.

    Parameters
    ----------
    repo_data: dict
        Repository data gathered in the repo GET request
    resource_data: dict
        Resource data gathered in the resource GET request
    process_info_path: str
        Path to the process info file that keeps track of the action's progress
    action: str
        The action being performed

    Returns
    -------
    A list containing exactly one dictionary that describes the requested file
    (decoded bytes, title, path, source path and empty hashes/extra metadata).
    """
    repo_name = repo_data['name']

    # The single file counts as done: record it in the process info file.
    increment_process_info(process_info_path, action, 'download')

    file_bytes = base64.b64decode(resource_data['content'])
    file_name = resource_data['name']
    file_entry = {
        'file': file_bytes,
        'hashes': {},
        'title': file_name,
        'path': '/{}'.format(file_name),
        'source_path': '/{}/{}'.format(repo_name, resource_data['path']),
        'extra_metadata': {},
    }
    return [file_entry]
async def async_get(url, session, params, process_info_path, action):
    """
    Coroutine that performs one GET with aiohttp and returns the raw response body.

    Meant to be scheduled concurrently with other GET coroutines.

    Parameters
    ----------
    url: str
        URL to call
    session: ClientSession object
        aiohttp ClientSession Object
    params: str
        Query parameters sent with the request
    process_info_path: str
        Path to the process info file that keeps track of the action's progress
    action: str
        The action being performed

    Returns
    -------
    Dictionary holding the requested 'url' and the response bytes as 'binary_content'.
    """
    async with session.get(url, params=params) as resp:
        assert resp.status == 200
        raw_body = await resp.read()

        # One more file is done: record it in the process info file.
        increment_process_info(process_info_path, action, 'download')

        result = {'url': url, 'binary_content': raw_body}
        return result
def get_resources(self, process_info_path, url=None):
    """
    Collect all of the user's resources into one flat list.

    Calls are batched asynchronously by level: projects first, then the projects'
    storages, then the resources found inside each storage.

    Parameters
    ----------
    process_info_path: str
        Path to the process info file that keeps track of the action's progress
    url: str
        Optional URL handed to self.projects()

    Returns
    -------
    The list of resources that was filled in by the project, storage and resource iterators.
    """
    resources = []
    all_projects, top_level_projects = self.projects(url)

    # Top level projects and their subprojects go into the list first...
    self.iter_project_hierarchy(all_projects, top_level_projects, resources)
    # ...followed by every storage, which also yields the storage links to fetch.
    user_storages_links = self.iter_project_storages(all_projects, resources)

    # Initial resources for all storages.
    all_storages_resources = run_urls_async_with_pagination(self, user_storages_links)

    # The number of storages is the total used to track the progress of the request.
    update_process_info(process_info_path, len(all_storages_resources),
                        'resource_collection', 'fetch')

    # Each storage is either skipped (nothing usable came back) or walked to add
    # its resources, and their children, to the main list.
    for storage_resources in all_storages_resources:
        increment_process_info(process_info_path, 'resource_collection', 'fetch')

        # TODO: the first check avoids private file errors; look into it.
        if not storage_resources or not storage_resources['data']:
            continue

        # The container id is '<parent project id>:<storage provider>'.
        first_entry = storage_resources['data'][0]
        parent_project_id = first_entry['relationships']['node']['data']['id']
        parent_storage = first_entry['attributes']['provider']
        container_id = '{}:{}'.format(parent_project_id, parent_storage)

        self.iter_resources_objects(storage_resources, resources, container_id)

    return resources
def figshare_upload_resource(token, resource_id, resource_main_dir, hash_algorithm,
                             file_duplicate_action, process_info_path, action):
    """
    Upload the files found in the resource_main_dir to the target.

    Parameters
    ----------
    token : str
        User's token.
    resource_id : str
        ID of the resource requested. Falsy to create a new project, '<project_id>' to
        create a new article in an existing project, '<project_id>:<article_id>' to upload
        into an existing article.
    resource_main_dir : str
        Path to the main directory for the resources to be uploaded.
    hash_algorithm : str
        Hash algorithm we are using to check for fixity.
        Not read by this function: the destination hash is always computed with md5.
    file_duplicate_action : str
        The action to take when a duplicate file is found.
        Not read by this function.
    process_info_path: str
        Path to the process info file that keeps track of the action's progress
    action: str
        The action being performed

    Returns
    -------
    Dictionary with the following keys: values
        'resources_ignored' : Array of string file paths of files that were ignored when
        uploading the resource. Path should have the same base as resource_main_dir.
                                Example:
                                    ['path/to/ignored/file.pg', 'another/ignored/file.jpg']

        'resources_updated' : Array of string file paths of files that were updated when
         uploading the resource. Path should have the same base as resource_main_dir.
                                 Example:
                                    ['path/to/updated/file.jpg']
        'action_metadata': Dictionary containing action metadata. Must be in the following format:
                            {
                                'destinationUsername': '******'
                            }
        'file_metadata_list': List of dictionaries for each file that contains metadata
                              and hash info. Must be in the following format:
                                {
                                    "actionRootPath": '/path/on/disk',
                                    "destinationPath": '/path/on/target/destination',
                                    "title": 'file_title',
                                    "destinationHash": {'hash_algorithm': 'the_hash'}}
                                }
        'project_id': ID of the parent project for this upload. Needed for metadata upload.
        'project_link': The link to either the resource or the home page of the user if not
                        available through API

    Raises
    ------
    PresQTResponseException
        401 when the token is invalid, 400 when the project or article cannot be found by
        the requesting user or when resource_id points at a file.

    FigShare's Upload Process
        1. Initiate new file upload (POST) within the article. Send file size, md5, and name
           but no file contents yet.
        2. Send a GET request to the 'Uploader Service' to determine that the status is
           "Pending" and how many parts to split the upload into.
        3. Split the file into the correct number of parts and upload each using a PUT request.
        4. Send a POST request to complete the upload.
    """
    try:
        headers, username = validation_check(token)
    except PresQTResponseException:
        raise PresQTResponseException(
            "Token is invalid. Response returned a 401 status code.",
            status.HTTP_401_UNAUTHORIZED)

    os_path = next(os.walk(resource_main_dir))
    total_files = upload_total_files(resource_main_dir)

    # Update process info file
    update_process_info(process_info_path, total_files, action, 'upload')
    update_process_info_message(process_info_path, action, "Uploading files to FigShare...")

    resources_ignored = []
    resources_updated = []
    file_metadata_list = []
    action_metadata = {'destinationUsername': username}

    # Upload a new project
    if not resource_id:
        project_title = os_path[1][0]
        # Create a new project with the name being the top level directory's name.
        _, project_id = create_project(project_title, headers, token)
        # Create article, for now we'll name it the same as the project
        article_id = create_article(project_title, headers, project_id)
    else:
        # Upload to an existing project
        split_id = str(resource_id).split(":")
        project_id = split_id[0]

        try:
            project_title = requests.get(
                "https://api.figshare.com/v2/account/projects/{}".format(project_id),
                headers=headers).json()['title']
        except KeyError:
            raise PresQTResponseException(
                "Project with id, {}, could not be found by the requesting user."
                .format(project_id), status.HTTP_400_BAD_REQUEST)

        if len(split_id) == 1:
            # We only have a project and we need to make a new article id
            # Check to see if an article with this name already exists
            articles = requests.get(
                "https://api.figshare.com/v2/account/projects/{}/articles".
                format(project_id), headers=headers).json()
            article_titles = [article['title'] for article in articles]
            new_title = get_duplicate_title(project_title, article_titles, "(PresQT*)")
            article_id = create_article(new_title, headers, resource_id)
        elif len(split_id) == 2:
            article_id = split_id[1]
        else:
            # Can't upload to file
            raise PresQTResponseException(
                "Can not upload into an existing file.", status.HTTP_400_BAD_REQUEST)

    # Get the article title
    try:
        article_title = requests.get(
            "https://api.figshare.com/v2/account/articles/{}".format(article_id),
            headers=headers).json()['title']
    except KeyError:
        raise PresQTResponseException(
            "Article with id, {}, could not be found by the requesting user.".
            format(article_id), status.HTTP_400_BAD_REQUEST)

    # Get md5, size and name of zip file to be uploaded
    for path, _, files in os.walk(resource_main_dir):
        for name in files:
            file_path = os.path.join(path, name)
            # The handle is closed as soon as the upload is done, even when the upload
            # raises. It is handed to the upload helper after being read for the hash,
            # exactly as before.
            with open(file_path, 'rb') as file_info:
                zip_hash = hash_generator(file_info.read(), 'md5')
                figshare_file_upload_process(file_info, headers, name, article_id,
                                             file_type='zip', path=path)

            file_metadata_list.append({
                'actionRootPath': file_path,
                'destinationPath': '/{}/{}/{}'.format(project_title, article_title, name),
                'title': name,
                'destinationHash': zip_hash
            })
            increment_process_info(process_info_path, action, 'upload')

    return {
        "resources_ignored": resources_ignored,
        "resources_updated": resources_updated,
        "action_metadata": action_metadata,
        "file_metadata_list": file_metadata_list,
        "project_id": "{}:{}".format(project_id, article_id),
        "project_link": "https://figshare.com/account/home#/projects"
    }
def _upload_resource(self):
    """
    Upload resources to the target and perform a fixity check on the resulting hashes.

    Progress and the final outcome are written to the process info file through
    self.process_info_obj. The target specific upload function is looked up through
    FunctionRouter and its result is stored on self.func_dict.

    Returns
    -------
    False when structure validation, the finite depth helper or the target's upload function
    raises a PresQTResponseException (the failure is recorded in the process info file first).
    True otherwise.
    """
    # The router key is always 'resource_upload', even when self.action is a transfer.
    action = 'resource_upload'

    # This doesn't happen during an upload, so it won't be an error. If there is an error during
    # transfer this will be overwritten.
    self.keyword_enhancement_successful = True

    # Write the process id to the process_info file
    self.process_info_obj['function_process_id'] = self.function_process.pid
    update_or_create_process_info(self.process_info_obj, self.action, self.ticket_number)

    # Data directory in the bag
    self.data_directory = '{}/data'.format(self.resource_main_dir)

    # If we are uploading (not transferring) then create the initial metadata based on the
    # zipped bag provided.
    if self.action == 'resource_upload':
        update_process_info_message(self.process_info_path, self.action,
                                    "Creating PRESQT_FTS_METADATA...")
        self.new_fts_metadata_files = []
        for path, subdirs, files in os.walk(self.data_directory):
            for name in files:
                # Source and destination paths are both relative to the data directory.
                self.new_fts_metadata_files.append({
                    'destinationHashes': {},
                    'destinationPath': os.path.join(path, name)[len(self.data_directory):],
                    'failedFixityInfo': [],
                    'title': name,
                    'sourceHashes': {
                        self.hash_algorithm: self.file_hashes[os.path.join(path, name)]
                    },
                    'sourcePath': os.path.join(path, name)[len(self.data_directory):],
                    'extra': {}
                })

        destination_target_data = get_target_data(self.destination_target_name)
        self.details = "PresQT Upload to {}".format(destination_target_data['readable_name'])
        self.action_metadata = {
            'id': str(uuid4()),
            'details': self.details,
            'actionDateTime': str(timezone.now()),
            'actionType': self.action,
            'sourceTargetName': 'Local Machine',
            'sourceUsername': None,
            'destinationTargetName': self.destination_target_name,
            'destinationUsername': None,
            'keywords': {},
            'files': {
                'created': self.new_fts_metadata_files,
                'updated': [],
                'ignored': []
            }
        }

    # If the target destination's storage hierarchy has a finite depth then zip the resources
    # to be uploaded along with their metadata.
    # Also, create metadata files for the new zip file to be uploaded.
    if self.infinite_depth is False:
        try:
            structure_validation(self)
            finite_depth_upload_helper(self)
        except PresQTResponseException as e:
            # Catch any errors that happen within the target fetch.
            # Update the server process_info file appropriately.
            self.process_info_obj['status_code'] = e.status_code
            self.process_info_obj['status'] = 'failed'
            if self.action == 'resource_transfer_in':
                self.process_info_obj['upload_status'] = 'failed'
            self.process_info_obj['message'] = e.data
            # Update the expiration from 5 hours to 1 hour from now. We can delete this faster because
            # it's an incomplete/failed directory.
            self.process_info_obj['expiration'] = str(timezone.now() + relativedelta(hours=1))
            update_or_create_process_info(self.process_info_obj, self.action, self.ticket_number)
            return False

    # Fetch the proper function to call
    func = FunctionRouter.get_function(self.destination_target_name, action)

    # Upload the resources. func_dict has the following format:
    #   {
    #        'resources_ignored': resources_ignored,
    #        'resources_updated': resources_updated,
    #        'action_metadata': action_metadata,
    #        'file_metadata_list': file_metadata_list,
    #        'project_id': title
    #    }
    try:
        # NOTE(review): structure_validation also ran above when infinite_depth is False;
        # confirm whether the second call is intentional.
        structure_validation(self)
        self.func_dict = func(self.destination_token, self.destination_resource_id,
                              self.data_directory, self.hash_algorithm,
                              self.file_duplicate_action, self.process_info_path, self.action)
    except PresQTResponseException as e:
        # Catch any errors that happen within the target fetch.
        # Update the server process_info file appropriately.
        self.process_info_obj['status_code'] = e.status_code
        self.process_info_obj['status'] = 'failed'
        if self.action == 'resource_transfer_in':
            self.process_info_obj['upload_status'] = 'failed'
        self.process_info_obj['message'] = e.data
        # Update the expiration from 5 hours to 1 hour from now. We can delete this faster
        # because it's an incomplete/failed directory.
        self.process_info_obj['expiration'] = str(timezone.now() + relativedelta(hours=1))
        update_or_create_process_info(self.process_info_obj, self.action, self.ticket_number)
        return False

    # Re-read the process info from disk: the upload function was handed
    # self.process_info_path and presumably updated the file while running.
    self.process_info_obj = read_file(self.process_info_path, True)[self.action]

    # Check if fixity has failed on any files during a transfer. If so, update the
    # process_info_data file.
    self.upload_fixity = True
    self.upload_failed_fixity = []

    for resource in self.func_dict['file_metadata_list']:
        resource['failed_fixity_info'] = []
        # A file fails fixity when its destination hash differs from the locally known hash,
        # unless the file was ignored by the target.
        if resource['destinationHash'] != self.file_hashes[resource['actionRootPath']] \
                and resource['actionRootPath'] not in self.func_dict['resources_ignored']:
            self.upload_fixity = False
            self.upload_failed_fixity.append(
                resource['actionRootPath'][len(self.data_directory):])
            resource['failed_fixity_info'].append({
                'NewGeneratedHash': self.file_hashes[resource['actionRootPath']],
                'algorithmUsed': self.hash_algorithm,
                'reasonFixityFailed': "Either the destination did not provide a hash "
                                      "or fixity failed during upload."
            })

    # Strip the server created directory prefix of the file paths for ignored and updated files
    resources_ignored = [
        file[len(self.data_directory):] for file in self.func_dict['resources_ignored']
    ]
    self.process_info_obj['resources_ignored'] = resources_ignored
    resources_updated = [
        file[len(self.data_directory):] for file in self.func_dict['resources_updated']
    ]
    self.process_info_obj['resources_updated'] = resources_updated

    if self.action == 'resource_transfer_in':
        self.keyword_enhancement_successful = True
        # A transfer into a brand new project adopts the id the target just created.
        if not self.destination_resource_id:
            self.destination_resource_id = self.func_dict['project_id']
        if self.supports_keywords:
            self.keyword_enhancement_successful, self.destination_initial_keywords = update_targets_keywords(
                self, self.func_dict['project_id'])

            # Add the destination initial keywords to all keywords for accurate metadata list
            self.all_keywords = self.all_keywords + self.destination_initial_keywords

    self.metadata_validation = create_upload_metadata(
        self, self.func_dict['file_metadata_list'], self.func_dict['action_metadata'],
        self.func_dict['project_id'], resources_ignored, resources_updated)

    # Increment process_info one last time
    increment_process_info(self.process_info_path, self.action, 'upload')

    # Validate the final metadata
    upload_message = get_action_message(self, 'Upload', self.upload_fixity,
                                        self.metadata_validation, self.action_metadata)
    self.process_info_obj['message'] = upload_message

    # Only a plain upload finalizes the process info file (and sends the email) here.
    if self.action == 'resource_upload':
        # Update server process file
        self.process_info_obj['status_code'] = '200'
        self.process_info_obj['status'] = 'finished'
        self.process_info_obj['hash_algorithm'] = self.hash_algorithm
        self.process_info_obj['failed_fixity'] = self.upload_failed_fixity
        self.process_info_obj['upload_status'] = upload_message
        self.process_info_obj['link_to_resource'] = self.func_dict["project_link"]
        update_or_create_process_info(self.process_info_obj, self.action, self.ticket_number)

        if self.email:
            context = {
                "upload_url": self.func_dict["project_link"],
                "upload_message": upload_message,
                "failed_fixity": self.upload_failed_fixity
            }
            email_blaster(self.email, "PresQT Upload Complete", context,
                          "emails/upload_email.html")

    return True
def download_directory(header, path_to_resource, repo_data, process_info_path, action):
    """
    Go through a repo's tree and download all files inside of a given resource directory path.

    Parameters
    ----------
    header: dict
        API header expected by GitHub
    path_to_resource: str
        The path to the requested directory
    repo_data: dict
        Repository data gathered in the repo GET request
    process_info_path: str
        Path to the process info file that keeps track of the action's progress
    action: str
        The action being performed

    Returns
    -------
    A list of dictionaries for each file being downloaded. Each dictionary holds the decoded
    'file' bytes, empty 'hashes', the file name as 'title', the 'path' relative to the
    requested directory's parent, the 'source_path' inside the repository and empty
    'extra_metadata'.
    """
    repo_name = repo_data['name']

    # Strip {/sha} off the end
    trees_base_url = repo_data['trees_url'][:-6]
    contents = requests.get('{}/master?recursive=1'.format(trees_base_url),
                            headers=header).json()
    # An error payload carries a 'message' key; retry with 'main' for repositories whose
    # default branch is not 'master' (same fallback the GitHub upload uses).
    if 'message' in contents:
        contents = requests.get('{}/main?recursive=1'.format(trees_base_url),
                                headers=header).json()

    # Only blobs (files) located under the requested directory are downloaded.
    requested_blobs = [
        resource for resource in contents['tree']
        if resource['path'].startswith(path_to_resource) and resource['type'] == 'blob'
    ]

    # Add the total number of files to the process info file.
    # This is necessary to keep track of the progress of the request.
    update_process_info(process_info_path, len(requested_blobs), action, 'download')
    update_process_info_message(process_info_path, action, 'Downloading files from GitHub...')

    # The requested directory's parents get stripped off of every file path.
    path_to_strip = path_to_resource.rpartition('/')[0]

    files = []
    for resource in requested_blobs:
        if path_to_strip:
            directory_path = resource['path'].partition(path_to_strip)[2]
        else:
            directory_path = '/{}'.format(resource['path'])

        # Send the API header with the blob request as well, so the call is authenticated
        # like the tree request above.
        file_data = requests.get(resource['url'], headers=header).json()

        files.append({
            'file': base64.b64decode(file_data['content']),
            'hashes': {},
            # The last path component is the file name.
            'title': resource['path'].rpartition('/')[2],
            'path': directory_path,
            'source_path': '/{}/{}'.format(repo_name, resource['path']),
            'extra_metadata': {}
        })
        # Increment the number of files done in the process info file.
        increment_process_info(process_info_path, action, 'download')

    return files
def zenodo_download_resource(token, resource_id, process_info_path, action):
    """
    Fetch the requested resource from Zenodo along with its hash information.

    Parameters
    ----------
    token : str
        User's Zenodo token
    resource_id : str
        ID of the resource requested
    process_info_path: str
        Path to the process info file that keeps track of the action's progress
    action: str
        The action being performed

    Returns
    -------
    Dictionary with the following keys: values
        'resources': List of dictionary objects that each hold a file and its information.
                     Dictionary must be in the following format:
                         {
                            'file': binary_file,
                            'hashes': {'hash_algorithm': 'the_hash'},
                            'title': 'file.jpg',
                            'path': '/path/to/file',
                            'metadata': {
                                'sourcePath': '/full/path/at/source.jpg',
                                'title': 'file_title',
                                'sourceHashes': {'hash_algorithm': 'the_hash'},
                                'extra': {'any': 'extra'}
                             }
                         }
        'empty_containers': List of string paths representing empty containers that must be
                            written.
                              Example: ['empty/folder/to/write/', 'another/empty/folder/']
        'action_metadata': Dictionary containing action metadata. Must be in the following format:
                              {
                                'sourceUsername': '******',
                              }
        'extra_metadata': Dictionary of extra metadata; only filled in for full projects.
    """
    try:
        auth_parameter = zenodo_validation_check(token)
    except PresQTResponseException:
        raise PresQTResponseException(
            'Token is invalid. Response returned a 401 status code.',
            status.HTTP_401_UNAUTHORIZED)

    files = []
    empty_containers = []
    extra_metadata = {}
    base_url = None

    # If the resource_id is longer than 7 characters, the resource is an individual file
    if len(resource_id) > 7:
        published_file_url = 'https://zenodo.org/api/files/{}'.format(resource_id)

        # A 200 here means the file id belongs to a public published record.
        zenodo_file = requests.get(published_file_url, params=auth_parameter)
        if zenodo_file.status_code == 200:
            is_record = True
            base_url = published_file_url
            file_url = published_file_url
        else:
            # Otherwise walk the user's depositions until one of them contains the file.
            depositions = requests.get('https://zenodo.org/api/deposit/depositions',
                                       params=auth_parameter).json()
            for deposition in depositions:
                deposition_detail = requests.get(deposition['links']['self'],
                                                 params=auth_parameter).json()
                matching_file = next(
                    (candidate for candidate in deposition_detail['files']
                     if candidate['id'] == resource_id),
                    None)
                if matching_file is not None:
                    base_url = deposition['links']['self']
                    file_url = matching_file['links']['self']
                    is_record = False
                    break

        if base_url is None:
            raise PresQTResponseException(
                "The resource with id, {}, does not exist for this user.".format(resource_id),
                status.HTTP_404_NOT_FOUND)

        update_process_info_message(process_info_path, action,
                                    'Downloading files from Zenodo...')
        # A single file is being fetched, so the total tracked for progress is 1.
        update_process_info(process_info_path, 1, action, 'download')

        files, action_metadata = zenodo_download_helper(
            is_record, base_url, auth_parameter, files, file_url)

        # Increment the number of files done in the process info file.
        increment_process_info(process_info_path, action, 'download')

    # Otherwise, it's a full project
    else:
        base_url = 'https://zenodo.org/api/records/{}'.format(resource_id)
        zenodo_record = requests.get(base_url, params=auth_parameter)
        is_record = zenodo_record.status_code == 200
        if not is_record:
            # Not a published record: address it as one of the user's depositions instead.
            base_url = 'https://zenodo.org/api/deposit/depositions/{}'.format(resource_id)

        try:
            files, action_metadata = zenodo_download_helper(
                is_record, base_url, auth_parameter, files)
        except PresQTResponseException:
            raise PresQTResponseException(
                "The resource with id, {}, does not exist for this user.".format(resource_id),
                status.HTTP_404_NOT_FOUND)

        extra_metadata = extra_metadata_helper(base_url, is_record, auth_parameter)

        # At this point each entry's 'file' value is the URL to download from.
        file_urls = [file_entry['file'] for file_entry in files]

        update_process_info_message(process_info_path, action,
                                    'Downloading files from Zenodo...')
        # The number of files is the total used to track the progress of the request.
        update_process_info(process_info_path, len(file_urls), action, 'download')

        event_loop = asyncio.new_event_loop()
        asyncio.set_event_loop(event_loop)
        download_data = event_loop.run_until_complete(
            async_main(file_urls, auth_parameter, process_info_path, action))

        # Swap every file URL for the binary content that was downloaded from it.
        for file_entry in files:
            downloaded = get_dictionary_from_list(download_data, 'url', file_entry['file'])
            file_entry['file'] = downloaded['binary_content']

    return {
        'resources': files,
        'empty_containers': empty_containers,
        'action_metadata': action_metadata,
        'extra_metadata': extra_metadata
    }
def zenodo_upload_loop(action_metadata, resource_id, resource_main_dir, post_url,
                       auth_parameter, title, file_duplicate_action, process_info_path,
                       action):
    """
    Loop through the files to be uploaded and return the dictionary.

    Parameters
    ----------
    action_metadata : dict
        The metadata for this PresQT action.
        NOTE: the value passed in is not read; it is replaced by
        {'destinationUsername': None} before being returned.
    resource_id : str
        The id of the resource the upload is happening on
    resource_main_dir : str
        Path to the main directory for the resources to be uploaded
    post_url : str
        The url to upload files to
    auth_parameter : dict
        Zenodo's authorization parameter
    title : str
        The title of the project created
    file_duplicate_action : str
        The action to take when a duplicate file is found
    process_info_path: str
        Path to the process info file that keeps track of the action's progress
    action: str
        The action being performed

    Returns
    -------
    Dictionary with the following keys: values
        'resources_ignored' : Array of string file paths of files that were ignored when
        uploading the resource. Path should have the same base as resource_main_dir.
                                Example:
                                    ['path/to/ignored/file.pg', 'another/ignored/file.jpg']

        'resources_updated' : Array of string file paths of files that were updated when
         uploading the resource. Path should have the same base as resource_main_dir.
                                 Example:
                                    ['path/to/updated/file.jpg']
        'action_metadata': Dictionary containing action metadata. Must be in the following format:
                            {
                                'destinationUsername': '******'
                            }
        'file_metadata_list': List of dictionaries for each file that contains metadata
                              and hash info. Must be in the following format:
                                {
                                    "actionRootPath": '/path/on/disk',
                                    "destinationPath": '/path/on/target/destination',
                                    "title": 'file_title',
                                    "destinationHash": {'hash_algorithm': 'the_hash'}}
                                }
        'project_id': ID of the parent project for this upload. Needed for metadata upload.
        'project_link': Link to the user's Zenodo deposit page.

    Raises
    ------
    PresQTResponseException
        400 when Zenodo rejects the deletion of a file being updated or rejects an upload.
    """
    resources_ignored = []
    resources_updated = []
    file_metadata_list = []
    action_metadata = {'destinationUsername': None}

    # Get current files associated with the resource.
    project_url = "https://zenodo.org/api/deposit/depositions/{}".format(resource_id)
    current_file_list = requests.get(project_url, params=auth_parameter).json()['files']
    file_title_list = [entry['filename'] for entry in current_file_list]

    for path, subdirs, files in os.walk(resource_main_dir):
        # Empty directories can't be uploaded, so they are reported as ignored.
        if not subdirs and not files:
            resources_ignored.append(path)

        for name in files:
            local_path = os.path.join(path, name)
            # Spaces are replaced with underscores in the name used on Zenodo.
            formatted_name = name.replace(' ', '_')
            is_duplicate = formatted_name in file_title_list

            if is_duplicate and file_duplicate_action == 'ignore':
                resources_ignored.append(local_path)
                continue

            data = {'name': formatted_name}
            # The file is opened before any deletion so that an unreadable local file fails
            # before the remote copy is removed; the handle is always closed afterwards.
            with open(local_path, "rb") as upload_handle:
                if is_duplicate and file_duplicate_action == 'update':
                    # First we need to delete the old file
                    for entry in current_file_list:
                        if formatted_name == entry['filename']:
                            delete_response = requests.delete(entry['links']['self'],
                                                              params=auth_parameter)
                            if delete_response.status_code != 204:
                                raise PresQTResponseException(
                                    "Zenodo returned an error trying to update {}".format(name),
                                    status.HTTP_400_BAD_REQUEST)
                            # Add this resource to the updated list
                            resources_updated.append(local_path)

                # Make the upload request....
                response = requests.post(post_url, params=auth_parameter, data=data,
                                         files={'file': upload_handle})

            if response.status_code != 201:
                raise PresQTResponseException(
                    "Zenodo returned an error trying to upload {}".format(name),
                    status.HTTP_400_BAD_REQUEST)

            # Increment process info file
            increment_process_info(process_info_path, action, 'upload')

            file_metadata_list.append({
                'actionRootPath': local_path,
                'destinationPath': '/{}/{}'.format(title, formatted_name),
                'title': formatted_name,
                'destinationHash': response.json()['checksum']})

    return {
        "resources_ignored": resources_ignored,
        "resources_updated": resources_updated,
        "action_metadata": action_metadata,
        "file_metadata_list": file_metadata_list,
        "project_id": resource_id,
        "project_link": "https://zenodo.org/deposit?page=1&size=20"
    }
def create_directory(self, directory_path, file_duplicate_action, file_hashes,
                     resources_ignored, resources_updated, file_metadata_list,
                     process_info_path, action):
    """
    Recreate the folders and files found in the given directory_path under this container.

    Files directly inside directory_path are created first; every sub folder is then created
    and filled by calling create_directory on the newly created folder.

    Parameters
    ----------
    directory_path : str
        Directory to find the resources to create.
    file_duplicate_action : str
        Flag for how to handle the case of the file already existing.
    file_hashes : dict
        Dictionary of uploaded file hashes. Filled in place, keyed by local file path.
    resources_ignored : list
        List of duplicate resources ignored. Appended to in place.
    resources_updated : list
        List of duplicate resources updated. Appended to in place.
    file_metadata_list: list
        List of file metadata. Appended to in place.
    process_info_path: str
        Path to the process info file that keeps track of the action's progress
    action: str
        The action being performed

    Returns
    -------
    None. Results are delivered through the mutable arguments described above.
    """
    root, child_folders, child_files = next(os.walk(directory_path))

    # Which result list a file lands in, depending on what create_file reports.
    duplicate_buckets = {'ignored': resources_ignored, 'updated': resources_updated}

    for file_name in child_files:
        local_path = '{}/{}'.format(root, file_name)
        contents = read_file(local_path)
        outcome, created_file = self.create_file(file_name, contents, file_duplicate_action)

        destination_path = '{}{}'.format(created_file.provider,
                                         created_file.materialized_path)
        file_metadata_list.append({
            "actionRootPath": local_path,
            "destinationPath": destination_path,
            "title": created_file.title,
            "destinationHash": created_file.hashes
        })
        increment_process_info(process_info_path, action, 'upload')

        file_hashes[local_path] = created_file.hashes

        bucket = duplicate_buckets.get(outcome)
        if bucket is not None:
            bucket.append(local_path)

    for folder_name in child_folders:
        new_folder = self.create_folder(folder_name)
        local_folder_path = '{}/{}'.format(root, folder_name)
        new_folder.create_directory(local_folder_path, file_duplicate_action, file_hashes,
                                    resources_ignored, resources_updated,
                                    file_metadata_list, process_info_path, action)
def github_upload_resource(token, resource_id, resource_main_dir, hash_algorithm,
                           file_duplicate_action, process_info_path, action):
    """
    Upload the files found in the resource_main_dir to the target.

    Parameters
    ----------
    token : str
        User's token.
    resource_id : str
        ID of the resource requested. Falsy to create a new repository, '<repo_id>' to
        upload into the root of an existing repository, '<repo_id>:<encoded path>' to upload
        into an existing directory.
    resource_main_dir : str
        Path to the main directory for the resources to be uploaded.
    hash_algorithm : str
        Hash algorithm we are using to check for fixity.
        Not read by this function: every 'destinationHash' is None.
    file_duplicate_action : str
        The action to take when a duplicate file is found
    process_info_path: str
        Path to the process info file that keeps track of the action's progress
    action: str
        The action being performed

    Returns
    -------
    Dictionary with the following keys: values
        'resources_ignored' : Array of string file paths of files that were ignored when
        uploading the resource. Path should have the same base as resource_main_dir.
                                Example:
                                    ['path/to/ignored/file.pg', 'another/ignored/file.jpg']

        'resources_updated' : Array of string file paths of files that were updated when
         uploading the resource. Path should have the same base as resource_main_dir.
                                 Example:
                                    ['path/to/updated/file.jpg']
        'action_metadata': Dictionary containing action metadata. Must be in the following format:
                            {
                                'destinationUsername': '******'
                            }
        'file_metadata_list': List of dictionaries for each file that contains metadata
                              and hash info. Must be in the following format:
                                {
                                    "actionRootPath": '/path/on/disk',
                                    "destinationPath": '/path/on/target/destination',
                                    "title": 'file_title',
                                    "destinationHash": {'hash_algorithm': 'the_hash'}}
                                }
        'project_id': ID of the parent project for this upload. Needed for metadata upload.
        'project_link': The link to either the resource or the home page of the user if not
                        available through API

    Raises
    ------
    PresQTResponseException
        401 when the token is invalid, 404 when the repository can't be fetched, 400 when the
        resource is an existing file or when GitHub rejects a file upload.
    """
    try:
        header, username = validation_check(token)
    except PresQTResponseException:
        raise PresQTResponseException(
            "Token is invalid. Response returned a 401 status code.",
            status.HTTP_401_UNAUTHORIZED)

    os_path = next(os.walk(resource_main_dir))

    # Get total amount of files
    total_files = upload_total_files(resource_main_dir)
    update_process_info(process_info_path, total_files, action, 'upload')
    update_process_info_message(process_info_path, action, "Uploading files to GitHub...")

    # Upload a new repository
    if not resource_id:
        # Create a new repository with the name being the top level directory's name.
        # Note: GitHub doesn't allow spaces, parentheses or colons in repo names
        repo_title = os_path[1][0].replace(' ', '_').replace("(", "-").replace(
            ")", "-").replace(":", "-")
        repo_name, repo_id, repo_url = create_repository(repo_title, token)

        resources_ignored = []
        resources_updated = []
        action_metadata = {"destinationUsername": username}
        file_metadata_list = []
        for path, subdirs, files in os.walk(resource_main_dir):
            # Empty directories can't be uploaded, so they are reported as ignored.
            if not subdirs and not files:
                resources_ignored.append(path)
            for name in files:
                local_path = os.path.join(path, name)
                # Extract and encode the file bytes in the way expected by GitHub.
                with open(local_path, 'rb') as local_file:
                    file_bytes = local_file.read()
                encoded_file = base64.b64encode(file_bytes).decode('utf-8')

                # A relative path to the file is what is added to the GitHub PUT address.
                # The first component below '/data/' (the top level directory that named the
                # repository) is dropped.
                path_to_add = os.path.join(path.partition('/data/')[2], name)
                path_to_add_to_url = path_to_add.partition('/')[2].replace(' ', '_')
                finished_path = '/' + repo_name + '/' + path_to_add_to_url
                file_metadata_list.append({
                    "actionRootPath": local_path,
                    "destinationPath": finished_path,
                    "title": name,
                    "destinationHash": None
                })
                put_url = "https://api.github.com/repos/{}/{}/contents/{}".format(
                    username, repo_name, path_to_add_to_url)
                data = {
                    "message": "PresQT Upload",
                    "committer": {
                        "name": "PresQT",
                        "email": "N/A"
                    },
                    "content": encoded_file
                }

                file_response = requests.put(put_url, headers=header, data=json.dumps(data))
                if file_response.status_code != 201:
                    raise PresQTResponseException(
                        "Github returned the following error: '{}'".format(
                            str(file_response.json()['message'])),
                        status.HTTP_400_BAD_REQUEST)

                # Increment the file counter
                increment_process_info(process_info_path, action, 'upload')
    else:
        # Upload to an existing repository
        if ':' not in resource_id:
            repo_id = resource_id
            path_to_upload_to = ''
        # Upload to an existing directory
        else:
            partitioned_id = resource_id.partition(':')
            repo_id = partitioned_id[0]
            path_to_upload_to = '/{}'.format(partitioned_id[2]).replace(
                '%2F', '/').replace('%2E', '.')

        # Get initial repo data for the resource requested
        repo_url = 'https://api.github.com/repositories/{}'.format(repo_id)
        response = requests.get(repo_url, headers=header)
        if response.status_code != 200:
            raise PresQTResponseException(
                'The resource with id, {}, does not exist for this user.'.format(resource_id),
                status.HTTP_404_NOT_FOUND)
        repo_data = response.json()
        repo_name = repo_data['name']
        repo_url = repo_data['svn_url']

        # Get all repo resources so we can check if any files already exist.
        # Strip {/sha} off the end of the trees url; fall back to 'main' when the 'master'
        # request comes back with an error payload (which carries a 'message' key).
        trees_base_url = repo_data['trees_url'][:-6]
        repo_resources = requests.get('{}/master?recursive=1'.format(trees_base_url),
                                      headers=header).json()
        if 'message' in repo_resources:
            repo_resources = requests.get('{}/main?recursive=1'.format(trees_base_url),
                                          headers=header).json()

        # Only used for membership tests, so a set is sufficient.
        current_file_paths = {
            '/' + resource['path']
            for resource in repo_resources['tree'] if resource['type'] == 'blob'
        }

        # Check if the provided path to upload to is actually a path to an existing file
        if path_to_upload_to in current_file_paths:
            raise PresQTResponseException(
                'The Resource provided, {}, is not a container'.format(resource_id),
                status.HTTP_400_BAD_REQUEST)

        resources_ignored = []
        resources_updated = []
        file_metadata_list = []
        action_metadata = {"destinationUsername": username}

        for path, subdirs, files in os.walk(resource_main_dir):
            # Empty directories can't be uploaded, so they are reported as ignored.
            if not subdirs and not files:
                resources_ignored.append(path)
            for name in files:
                local_path = os.path.join(path, name)
                path_to_file = os.path.join('/', path.partition('/data/')[2],
                                            name).replace(' ', '_')

                # The sha belongs to one specific existing file, so it is reset for every
                # file; otherwise a new file uploaded after an updated one would be sent
                # with the previous file's sha.
                sha = None

                # Check if the file already exists in this repository
                full_file_path = '{}{}'.format(path_to_upload_to, path_to_file)
                if full_file_path in current_file_paths:
                    if file_duplicate_action == 'ignore':
                        resources_ignored.append(local_path)
                        continue
                    resources_updated.append(local_path)
                    # Get the sha of the existing file; it is sent along with the update.
                    sha_url = 'https://api.github.com/repos/{}/contents{}'.format(
                        repo_data['full_name'], full_file_path)
                    sha_response = requests.get(sha_url, headers=header)
                    sha = sha_response.json()['sha']

                # Extract and encode the file bytes in the way expected by GitHub.
                with open(local_path, 'rb') as local_file:
                    file_bytes = local_file.read()
                encoded_file = base64.b64encode(file_bytes).decode('utf-8')

                file_metadata_list.append({
                    "actionRootPath": local_path,
                    "destinationPath": '/{}{}{}'.format(repo_name, path_to_upload_to,
                                                        path_to_file),
                    "title": name,
                    "destinationHash": None
                })
                # A relative path to the file is what is added to the GitHub PUT address
                put_url = 'https://api.github.com/repos/{}/contents{}{}'.format(
                    repo_data['full_name'], path_to_upload_to, path_to_file)
                data = {
                    "message": "PresQT Upload",
                    "sha": sha,
                    "committer": {
                        "name": "PresQT",
                        "email": "N/A"
                    },
                    "content": encoded_file
                }

                upload_response = requests.put(put_url, headers=header,
                                               data=json.dumps(data))
                if upload_response.status_code not in [200, 201]:
                    raise PresQTResponseException(
                        'Upload failed with a status code of {}'.format(
                            upload_response.status_code),
                        status.HTTP_400_BAD_REQUEST)

                # Increment the file counter
                increment_process_info(process_info_path, action, 'upload')

    return {
        'resources_ignored': resources_ignored,
        'resources_updated': resources_updated,
        'action_metadata': action_metadata,
        'file_metadata_list': file_metadata_list,
        'project_id': repo_id,
        "project_link": repo_url
    }
def osf_download_resource(token, resource_id, process_info_path, action):
    """
    Download the requested OSF resource and describe every file it contains.

    A single file is downloaded directly through the resource object. For any other
    resource kind the file objects are first gathered with get_all_files() and their
    contents are then fetched together through async_main().

    Parameters
    ----------
    token : str
        User's OSF token
    resource_id : str
        ID of the resource requested
    process_info_path: str
        Path to the process info file that keeps track of the action's progress
    action: str
        The action being performed

    Returns
    -------
    Dictionary with the following keys: values
        'resources': List of file dictionaries. The entry built here for a single file has
                     the keys 'file', 'hashes', 'title', 'path', 'source_path' and
                     'extra_metadata'. Entries for other resource kinds are created by
                     get_all_files(); this function sets their 'source_path', replaces their
                     'file' value with the downloaded binary content and, for folders,
                     rewrites their 'path'.
        'empty_containers': List filled in by get_all_files() (stays empty for a single file).
        'action_metadata': {'sourceUsername': <full name of the token's OSF user>}
        'extra_metadata': Result of extra_metadata_helper() for projects, otherwise {}.

    Raises
    ------
    PresQTResponseException
        With a 401 status when OSF rejects the token.
    """
    try:
        osf_instance = OSF(token)
    except PresQTInvalidTokenError:
        raise PresQTResponseException(
            "Token is invalid. Response returned a 401 status code.",
            status.HTTP_401_UNAUTHORIZED)

    # The full name of the user owning the token is reported as the source username.
    user_response = requests.get(
        'https://api.osf.io/v2/users/me/',
        headers={'Authorization': 'Bearer {}'.format(token)})
    user_attributes = user_response.json()['data']['attributes']
    action_metadata = {"sourceUsername": user_attributes['full_name']}

    resource = get_osf_resource(resource_id, osf_instance)
    kind = resource.kind_name

    files = []
    empty_containers = []
    extra_metadata = {}

    if kind == 'file':
        update_process_info_message(process_info_path, action,
                                    'Downloading files from OSF...')
        # Exactly one file will be downloaded.
        update_process_info(process_info_path, 1, action, 'download')

        project = osf_instance.project(resource.parent_project_id)
        single_file = {
            "file": resource.download(),
            "hashes": resource.hashes,
            "title": resource.title,
            # A lone file is saved directly under the root, so its full path is not needed.
            "path": '/{}'.format(resource.title),
            "source_path": '/{}/{}{}'.format(
                project.title, resource.provider, resource.materialized_path),
            "extra_metadata": osf_download_metadata(resource)
        }
        files.append(single_file)
        increment_process_info(process_info_path, action, 'download')

        return {
            'resources': files,
            'empty_containers': empty_containers,
            'action_metadata': action_metadata,
            'extra_metadata': extra_metadata
        }

    # Containers: collect the file objects and remember the project they belong to.
    if kind == 'project':
        extra_metadata = extra_metadata_helper(
            resource_id, {'Authorization': 'Bearer {}'.format(token)})
        resource.get_all_files('', files, empty_containers)
        project = resource
    elif kind == 'storage':
        resource.get_all_files('/{}'.format(resource.title), files, empty_containers)
        project = osf_instance.project(resource.node)
    else:
        resource.get_all_files('', files, empty_containers)
        project = osf_instance.project(resource.parent_project_id)
        for entry in files:
            # The saved path must start at the requested folder. Example: for resource
            # 'Docs2' and a file at '/Docs1/Docs2/file.jpeg' the final path is
            # '/Docs2/file.jpeg', so everything before the folder name is cut off.
            prefix = resource.materialized_path[:-(len(resource.title) + 2)]
            entry['path'] = entry['file'].materialized_path[len(prefix):]

    file_urls = [entry['file'].download_url for entry in files]

    update_process_info_message(process_info_path, action,
                                'Downloading files from OSF...')
    # The total number of files is needed to track the progress of the request.
    update_process_info(process_info_path, len(file_urls), action, 'download')

    # Run all download requests on a fresh event loop.
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    download_data = loop.run_until_complete(
        async_main(file_urls, token, process_info_path, action))

    # Swap each OSF file object for the binary content downloaded from its url.
    for entry in files:
        osf_file = entry['file']
        entry['source_path'] = '/{}/{}{}'.format(
            project.title, osf_file.provider, osf_file.materialized_path)
        downloaded = get_dictionary_from_list(
            download_data, 'url', osf_file.download_url)
        entry['file'] = downloaded['binary_content']

    return {
        'resources': files,
        'empty_containers': empty_containers,
        'action_metadata': action_metadata,
        'extra_metadata': extra_metadata
    }
def figshare_download_resource(token, resource_id, process_info_path, action):
    """
    Fetch the requested resource from FigShare along with its hash information.

    The resource id is split on ':' and may have one part (project), two parts
    (project:article) or three parts (project:article:file). Private ("account")
    endpoints are tried first and the public endpoints are used as a fallback.

    Parameters
    ----------
    token : str
        User's FigShare token
    resource_id : str
        ID of the resource requested
    process_info_path: str
        Path to the process info file that keeps track of the action's progress
    action: str
        The action being performed

    Returns
    -------
    Dictionary with the following keys: values
        'resources': List of file dictionaries. For a single file the dictionary built here
                     has the keys 'file', 'hashes', 'title', 'path', 'source_path' and
                     'extra_metadata'. For projects and articles the dictionaries come from
                     download_project() / download_article() and their 'file' value (a url)
                     is replaced with the downloaded binary content.
        'empty_containers': List returned by download_project() / download_article(),
                            or [] for a single file.
        'action_metadata': Dictionary returned by download_project() / download_article(),
                           or {'sourceUsername': username} for a single file.
        'extra_metadata': Result of extra_metadata_helper() for projects, otherwise {}.

    Raises
    ------
    PresQTResponseException
        With a 401 status when the token is rejected, and with a 404 status when the
        project, article or file cannot be found or the id has more than three parts.
    """
    try:
        headers, username = validation_check(token)
    except PresQTResponseException:
        raise PresQTResponseException(
            "Token is invalid. Response returned a 401 status code.",
            status.HTTP_401_UNAUTHORIZED)

    split_id = str(resource_id).split(":")
    extra_metadata = {}

    # Try the private project endpoint first, then fall back to the public one.
    project_url = "https://api.figshare.com/v2/account/projects/{}".format(split_id[0])
    response = requests.get(project_url, headers=headers)
    if response.status_code != 200:
        project_url = "https://api.figshare.com/v2/projects/{}".format(split_id[0])
        response = requests.get(project_url, headers=headers)
        if response.status_code != 200:
            # Project id is invalid
            raise PresQTResponseException(
                "The resource could not be found by the requesting user.",
                status.HTTP_404_NOT_FOUND)
    project_name = response.json()['title']

    # file_urls stays None for a single file, which is downloaded directly below.
    file_urls = None
    files = None

    if len(split_id) == 1:
        # Whole project: gather the files of every article.
        articles_url = project_url + "/articles"
        files, empty_containers, action_metadata = download_project(
            username, articles_url, headers, project_name, [])
        file_urls = [file['file'] for file in files]
        extra_metadata = extra_metadata_helper(project_url, headers)

    elif len(split_id) in (2, 3):
        # Article or file: locate the article, private endpoint first and then public.
        article_url = "https://api.figshare.com/v2/account/projects/{}/articles/{}".format(
            split_id[0], split_id[1])
        response = requests.get(article_url, headers=headers)
        if response.status_code != 200:
            article_url = "https://api.figshare.com/v2/articles/{}".format(split_id[1])
            response = requests.get(article_url, headers=headers)
            if response.status_code != 200:
                # We couldn't find the article.
                raise PresQTResponseException(
                    "The resource could not be found by the requesting user.",
                    status.HTTP_404_NOT_FOUND)

        if len(split_id) == 2:
            # Whole article: gather its files.
            files, empty_containers, action_metadata = download_article(
                username, article_url, headers, project_name, [])
            file_urls = [file['file'] for file in files]
        else:
            update_process_info_message(process_info_path, action,
                                        'Downloading files from FigShare...')
            # Exactly one file will be downloaded.
            update_process_info(process_info_path, 1, action, 'download')

            # Single file download: look the file id up in the article's file list.
            article = response.json()
            for file in article['files']:
                if str(file['id']) == split_id[2]:
                    files = [{
                        "file": requests.get(file['download_url'], headers=headers).content,
                        "hashes": {"md5": file['computed_md5']},
                        "title": file['name'],
                        "path": "/{}".format(file['name']),
                        "source_path": "/{}/{}/{}".format(
                            project_name, article['title'], file['name']),
                        "extra_metadata": {"size": file['size']}
                    }]
                    increment_process_info(process_info_path, action, 'download')
                    # The requested file has been found; no need to keep scanning.
                    break

            if not files:
                # We could not find the file.
                raise PresQTResponseException(
                    "The resource could not be found by the requesting user.",
                    status.HTTP_404_NOT_FOUND)

            empty_containers = []
            action_metadata = {"sourceUsername": username}

    else:
        # An id with more than three parts does not name a project, article or file.
        # Without this branch the return statement below would fail on unbound locals.
        raise PresQTResponseException(
            "The resource could not be found by the requesting user.",
            status.HTTP_404_NOT_FOUND)

    if file_urls:
        update_process_info_message(process_info_path, action,
                                    'Downloading files from FigShare...')
        # The total number of files is needed to track the progress of the request.
        update_process_info(process_info_path, len(file_urls), action, 'download')

        # Run the project or article downloads on a fresh event loop.
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        download_data = loop.run_until_complete(
            async_main(file_urls, headers, process_info_path, action))

        # Replace each file url with the binary content downloaded from it.
        for file in files:
            file['file'] = get_dictionary_from_list(
                download_data, 'url', file['file'])['binary_content']

    return {
        'resources': files,
        'empty_containers': empty_containers,
        'action_metadata': action_metadata,
        'extra_metadata': extra_metadata
    }
def curate_nd_download_resource(token, resource_id, process_info_path, action):
    """
    Fetch the requested resource from CurateND along with its hash information.

    A file resource is downloaded directly. Any other resource is treated as a container:
    it is reported as an empty container when it has no contained files, otherwise all of
    its contained files are downloaded together through async_main().

    Parameters
    ----------
    token : str
        User's CurateND token
    resource_id : str
        ID of the resource requested
    process_info_path: str
        Path to the process info file that keeps track of the action's progress
    action: str
        The action being performed

    Returns
    -------
    Dictionary with the following keys: values
        'resources': List of dictionaries, one per downloaded file, with the keys
                     'file' (binary content), 'hashes' ({'md5': ...}), 'title', 'path',
                     'source_path' and 'extra_metadata'.
        'empty_containers': [resource title] when the container has no files, otherwise [].
        'action_metadata': {'sourceUsername': <the resource's 'depositor' value>}
        'extra_metadata': Result of extra_metadata_helper() for a container with files,
                          otherwise {}.

    Raises
    ------
    PresQTValidationError
        With a 401 status when CurateND rejects the token.
    """
    try:
        curate_instance = CurateND(token)
    except PresQTInvalidTokenError:
        raise PresQTValidationError(
            "Token is invalid. Response returned a 401 status code.",
            status.HTTP_401_UNAUTHORIZED)

    resource = get_curate_nd_resource(resource_id, curate_instance)
    action_metadata = {"sourceUsername": resource.extra['depositor']}
    extra_metadata = {}

    files = []
    empty_containers = []

    if resource.kind_name == 'file':
        # 'isPartOf' may hold one url or a list of urls; use the first one.
        title_url = resource.extra['isPartOf']
        if isinstance(title_url, list):
            title_url = title_url[0]
        # The title of the parent item is needed for the source path.
        project_title = requests.get(
            title_url, headers={'X-Api-Token': '{}'.format(token)}).json()['title']

        # Keep the few extra keys that are pulled out for the PresQT payload.
        resource.extra.update({
            "id": resource.id,
            "date_submitted": resource.date_submitted
        })

        update_process_info_message(process_info_path, action,
                                    'Downloading files from CurateND...')
        # Exactly one file will be downloaded.
        update_process_info(process_info_path, 1, action, 'download')

        binary_file, curate_hash = resource.download()
        files.append({
            'file': binary_file,
            'hashes': {'md5': curate_hash},
            'title': resource.title,
            # A lone file is saved directly under the root, so its full path is not needed.
            'path': '/{}'.format(resource.title),
            'source_path': '/{}/{}'.format(project_title, resource.title),
            'extra_metadata': resource.extra
        })
        increment_process_info(process_info_path, action, 'download')

    elif not resource.extra['containedFiles']:
        empty_containers.append('{}'.format(resource.title))

    else:
        update_process_info_message(process_info_path, action,
                                    'Downloading files from CurateND...')
        # The total number of files is needed to track the progress of the request.
        update_process_info(process_info_path, len(resource.extra['containedFiles']),
                            action, 'download')

        project_title = resource.title
        extra_metadata = extra_metadata_helper(resource)

        # Everything known about a contained file is keyed by its download url, because
        # the url is what identifies each entry in the async download results.
        title_helper = {}
        hash_helper = {}
        extra_helper = {}
        file_urls = []
        for contained in resource.extra['containedFiles']:
            download_url = contained['downloadUrl']
            contained_file = get_curate_nd_resource(contained['id'], curate_instance)

            title_helper[download_url] = contained['label']
            hash_helper[download_url] = contained_file.md5
            extra_helper[download_url] = contained_file.extra
            file_urls.append(download_url)

        # Run all download requests on a fresh event loop.
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        download_data = loop.run_until_complete(
            async_main(file_urls, token, process_info_path, action))

        for downloaded in download_data:
            url = downloaded['url']
            title = title_helper[url]
            files.append({
                'file': downloaded['binary_content'],
                'hashes': {'md5': hash_helper[url]},
                'title': title,
                "source_path": '/{}/{}'.format(project_title, title),
                'path': '/{}/{}'.format(resource.title, title),
                'extra_metadata': extra_helper[url]
            })

    return {
        'resources': files,
        'empty_containers': empty_containers,
        'action_metadata': action_metadata,
        'extra_metadata': extra_metadata
    }
def gitlab_download_resource(token, resource_id, process_info_path, action):
    """
    Fetch the requested resource from GitLab along with its hash information.

    The resource id is either a project id, or 'project_id:encoded_path'. An id with a
    path that does not contain '%2E' (an encoded '.') is treated as a directory; an id
    with a path that does contain '%2E' is treated as a single file.

    Parameters
    ----------
    token : str
        User's GitLab token
    resource_id : str
        ID of the resource requested
    process_info_path: str
        Path to the process info file that keeps track of the action's progress
    action: str
        The action being performed

    Returns
    -------
    Dictionary with the following keys: values
        'resources': List of file dictionaries. For a single file the dictionary built here
                     has the keys 'file' (decoded content), 'hashes' ({'sha256': ...}),
                     'title', 'path', 'source_path' and 'extra_metadata'. For projects and
                     directories the dictionaries come from download_content(); their
                     'hashes' and 'file' values are replaced with the downloaded data.
        'empty_containers': List returned by download_content(), or [] for a single file.
        'action_metadata': Dictionary returned by download_content(), or
                           {'sourceUsername': username} for a single file.
        'extra_metadata': Result of extra_metadata_helper() for projects, otherwise {}.

    Raises
    ------
    PresQTResponseException
        With a 401 status when the token is rejected, and with a 404 status when the
        project, directory or file cannot be found.
    """
    try:
        header, user_id = validation_check(token)
    except PresQTResponseException:
        raise PresQTResponseException(
            "Token is invalid. Response returned a 401 status code.",
            status.HTTP_401_UNAUTHORIZED)

    # Get the user's GitLab username for action metadata
    username = requests.get("https://gitlab.com/api/v4/user",
                            headers=header).json()['username']

    # Without a ':' partition() yields the whole id as the project id, an empty
    # separator and an empty path.
    project_id, separator, encoded_path = resource_id.partition(':')

    project_url = 'https://gitlab.com/api/v4/projects/{}'.format(project_id)
    response = requests.get(project_url, headers=header)
    if response.status_code != 200:
        raise PresQTResponseException(
            'The resource with id, {}, does not exist for this user.'.format(resource_id),
            status.HTTP_404_NOT_FOUND)
    project_data = response.json()
    project_name = project_data['name']
    extra_metadata = {}

    if not separator:
        # This is for a project
        all_files_url = "https://gitlab.com/api/v4/projects/{}/repository/tree?recursive=1".format(
            resource_id)
        data = gitlab_paginated_data(header, user_id, all_files_url)
        is_project = True
        extra_metadata = extra_metadata_helper(project_data, header)

    elif '%2E' not in resource_id:
        # This is for a directory
        all_files_url = "https://gitlab.com/api/v4/projects/{}/repository/tree?path={}&recursive=1".format(
            project_id, encoded_path.replace('+', ' '))
        data = gitlab_paginated_data(header, user_id, all_files_url)
        if not data:
            raise PresQTResponseException(
                'The resource with id, {}, does not exist for this user.'.format(resource_id),
                status.HTTP_404_NOT_FOUND)
        is_project = False

    else:
        # This is a single file
        update_process_info_message(process_info_path, action,
                                    'Downloading files from GitLab...')
        # Exactly one file will be downloaded.
        update_process_info(process_info_path, 1, action, 'download')

        data = requests.get(
            'https://gitlab.com/api/v4/projects/{}/repository/files/{}?ref=master'.format(
                project_id, encoded_path.replace('+', ' ')),
            headers=header).json()
        # A 'message' key in the payload is treated as "file not found".
        if 'message' in data:
            raise PresQTResponseException(
                'The resource with id, {}, does not exist for this user.'.format(resource_id),
                status.HTTP_404_NOT_FOUND)

        increment_process_info(process_info_path, action, 'download')
        return {
            'resources': [{
                'file': base64.b64decode(data['content']),
                'hashes': {'sha256': data['content_sha256']},
                'title': data['file_name'],
                'path': '/{}'.format(data['file_name']),
                'source_path': data['file_path'],
                'extra_metadata': {}}],
            'empty_containers': [],
            'action_metadata': {'sourceUsername': username},
            'extra_metadata': extra_metadata
        }

    files, empty_containers, action_metadata = download_content(
        username, project_name, project_id, data, [], is_project)
    file_urls = [file['file'] for file in files]

    update_process_info_message(process_info_path, action,
                                'Downloading files from GitLab...')
    # The total number of files is needed to track the progress of the request.
    update_process_info(process_info_path, len(file_urls), action, 'download')

    # Run all download requests on a fresh event loop.
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    download_data = loop.run_until_complete(
        async_main(file_urls, header, process_info_path, action))

    # Replace each file url with the downloaded content and its hashes.
    # The download result is looked up once per file and used for both values.
    for file in files:
        downloaded = get_dictionary_from_list(download_data, 'url', file['file'])
        file['hashes'] = downloaded['hashes']
        file['file'] = downloaded['binary_content']

    return {
        'resources': files,
        'empty_containers': empty_containers,
        'action_metadata': action_metadata,
        'extra_metadata': extra_metadata
    }
def gitlab_upload_resource(token, resource_id, resource_main_dir, hash_algorithm,
                           file_duplicate_action, process_info_path, action):
    """
    Upload the files found in the resource_main_dir to GitLab.

    Without a resource_id a new public project is created, named after the top level
    directory of resource_main_dir, and every file is uploaded to it. With a resource_id
    ('project_id' or 'project_id:encoded_directory_path') the files are uploaded into the
    existing project; files that already exist there are ignored or updated.

    Parameters
    ----------
    token : str
        User's token.
    resource_id : str
        ID of the resource requested.
    resource_main_dir : str
        Path to the main directory for the resources to be uploaded.
    hash_algorithm : str
        Hash algorithm we are using to check for fixity. Not used by this function.
    file_duplicate_action : str
        The action to take when a duplicate file is found. 'ignore' skips the file;
        any other value updates the file when its sha256 differs from GitLab's.
    process_info_path: str
        Path to the process info file that keeps track of the action's progress
    action: str
        The action being performed

    Returns
    -------
    Dictionary with the following keys: values
        'resources_ignored' : List of paths (based on resource_main_dir) of empty
                              directories and of files that were not uploaded.
        'resources_updated' : List of paths (based on resource_main_dir) of files that
                              replaced an existing file.
        'action_metadata': {'destinationUsername': <GitLab username>}
        'file_metadata_list': List of dictionaries, one per uploaded file, with the keys
                              'actionRootPath', 'destinationPath', 'title' and
                              'destinationHash' (the 'content_sha256' value GitLab reports).
        'project_id': ID of the project the files were uploaded to.
        'project_link': The project's 'web_url'.

    Raises
    ------
    PresQTResponseException
        With a 401 status when the token is rejected, a 404 status when the project
        cannot be found and a 400 status when the resource id belongs to a file, the
        project cannot be created or a file upload fails.
    """
    base_url = "https://gitlab.com/api/v4/"

    try:
        headers, user_id = validation_check(token)
    except PresQTResponseException:
        raise PresQTResponseException(
            "Token is invalid. Response returned a 401 status code.",
            status.HTTP_401_UNAUTHORIZED)

    username = requests.get("https://gitlab.com/api/v4/user",
                            headers=headers).json()['username']
    action_metadata = {"destinationUsername": username}

    os_path = next(os.walk(resource_main_dir))

    # Get total amount of files
    total_files = upload_total_files(resource_main_dir)
    update_process_info(process_info_path, total_files, action, 'upload')
    update_process_info_message(process_info_path, action, "Uploading files to GitLab...")

    resources_ignored = []
    resources_updated = []
    file_metadata_list = []

    if not resource_id:
        #*** CREATE NEW PROJECT ***#
        # The project is named after the top level directory. get_duplicate_title() is
        # given the user's existing project names and the '-PresQT*-' pattern.
        project_title = os_path[1][0]
        titles = [data['name'] for data in gitlab_paginated_data(headers, user_id)]
        title = get_duplicate_title(project_title, titles, '-PresQT*-').replace(
            '(', '-').replace(')', '-')

        # The name goes through `params` so characters such as '&', '#' or '+' are
        # escaped instead of corrupting the query string.
        response = requests.post('{}projects'.format(base_url),
                                 params={'name': title, 'visibility': 'public'},
                                 headers=headers)
        if response.status_code != 201:
            raise PresQTResponseException(
                "Response has status code {} while creating project {}.".format(
                    response.status_code, project_title),
                status.HTTP_400_BAD_REQUEST)
        project_data = response.json()
        project_id = project_data['id']
        project_name = project_data['name']
        web_url = project_data['web_url']

        #*** UPLOAD FILES ***#
        base_repo_path = "{}projects/{}/repository/files/".format(base_url, project_id)
        for path, subdirs, files in os.walk(resource_main_dir):
            if not subdirs and not files:
                resources_ignored.append(path)
            for name in files:
                path_on_disk = os.path.join(path, name)
                # Strip server directories and the project directory from the file path
                relative_file_path = os.path.join(
                    path.partition('/data/{}/'.format(project_title))[2], name)

                # Extract and encode the file bytes in the way expected by GitLab.
                with open(path_on_disk, 'rb') as file_handle:
                    encoded_file = base64.b64encode(file_handle.read())

                # The encoded relative path is what is added to the GitLab POST address
                encoded_file_path = relative_file_path.replace(
                    '/', '%2F').replace('.', '%2E')
                file_url = "{}{}".format(base_repo_path, encoded_file_path)
                request_data = {"branch": "master",
                                "commit_message": "PresQT Upload",
                                "encoding": "base64",
                                "content": encoded_file}
                upload_response = requests.post(file_url, headers=headers,
                                                data=request_data)
                # Fail with the same error as uploads to an existing project instead of
                # a KeyError on the hash lookup below.
                if upload_response.status_code not in [201, 200]:
                    raise PresQTResponseException(
                        'Upload failed with a status code of {}'.format(
                            upload_response.status_code),
                        status.HTTP_400_BAD_REQUEST)

                # Get the file hash
                file_json = requests.get("{}?ref=master".format(file_url),
                                         headers=headers).json()

                # Increment files finished
                increment_process_info(process_info_path, action, 'upload')
                file_metadata_list.append({
                    "actionRootPath": path_on_disk,
                    # project_name is the name GitLab returned, so the path reflects any
                    # renaming done to avoid a duplicate title.
                    "destinationPath": os.path.join(
                        project_name,
                        path.partition('/data/')[2].partition('/')[2],
                        name),
                    "title": name,
                    "destinationHash": file_json['content_sha256']
                })
    else:
        #*** UPLOAD TO EXISTING PROJECT ***#
        # Without a ':' partition() yields the whole id as the project id and an empty
        # directory path, meaning the files go to the repository root.
        project_id, _, encoded_directory = resource_id.partition(':')
        base_repo_url = "{}projects/{}/repository/files/{}".format(
            base_url, project_id, encoded_directory)
        string_path_to_resource = encoded_directory.replace(
            '%2F', '/').replace('%2E', '.')

        # Check if the resource_id belongs to a file
        tree_url = 'https://gitlab.com/api/v4/projects/{}/repository/tree?recursive=1'.format(
            project_id)
        file_data = gitlab_paginated_data(headers, None, tree_url)
        for data in file_data:
            if data['path'] == string_path_to_resource and data['type'] == 'blob':
                raise PresQTResponseException(
                    "Resource with id, {}, belongs to a file.".format(resource_id),
                    status.HTTP_400_BAD_REQUEST)
        # Paths already present in the repository, for duplicate detection.
        existing_paths = {data['path'] for data in file_data}

        # Get project data
        project = requests.get('{}projects/{}'.format(base_url, project_id),
                               headers=headers)
        if project.status_code != 200:
            raise PresQTResponseException(
                "Project with id, {}, could not be found.".format(project_id),
                status.HTTP_404_NOT_FOUND)
        project_data = project.json()
        project_name = project_data['name']
        web_url = project_data['web_url']

        for path, subdirs, files in os.walk(resource_main_dir):
            if not subdirs and not files:
                resources_ignored.append(path)
            for name in files:
                path_on_disk = os.path.join(path, name)
                # Strip server directories from file path
                relative_file_path = os.path.join(path.partition('/data/')[2], name)

                # The encoded relative path is what is added to the GitLab address. When
                # uploading into a directory it is joined to that directory with '%2F'.
                encoded_file_path = relative_file_path.replace(
                    '/', '%2F').replace('.', '%2E')
                if encoded_directory:
                    encoded_file_path = '%2F{}'.format(encoded_file_path)
                full_encoded_url = '{}{}'.format(base_repo_url, encoded_file_path)

                upload_request = requests.post
                file_bytes = None

                # Check if this file exists already
                if os.path.join(string_path_to_resource, relative_file_path) in existing_paths:
                    if file_duplicate_action == 'ignore':
                        resources_ignored.append(path_on_disk)
                        continue

                    file_response = requests.get(
                        '{}?ref=master'.format(full_encoded_url), headers=headers)
                    with open(path_on_disk, 'rb') as file_handle:
                        file_bytes = file_handle.read()
                    # Identical content does not need to be uploaded again.
                    if hash_generator(file_bytes, 'sha256') == file_response.json()['content_sha256']:
                        resources_ignored.append(path_on_disk)
                        continue

                    # The content differs, so the existing file is replaced with a PUT.
                    resources_updated.append(path_on_disk)
                    upload_request = requests.put

                # Extract and encode the file bytes in the way expected by GitLab.
                if file_bytes is None:
                    with open(path_on_disk, 'rb') as file_handle:
                        file_bytes = file_handle.read()
                encoded_file = base64.b64encode(file_bytes)

                request_data = {"branch": "master",
                                "commit_message": "PresQT Upload",
                                "encoding": "base64",
                                "content": encoded_file}
                response = upload_request("{}".format(full_encoded_url),
                                          headers=headers, data=request_data)
                if response.status_code not in [201, 200]:
                    raise PresQTResponseException(
                        'Upload failed with a status code of {}'.format(response.status_code),
                        status.HTTP_400_BAD_REQUEST)

                # Get the file hash
                file_json = requests.get("{}?ref=master".format(full_encoded_url),
                                         headers=headers).json()

                # Increment files finished
                increment_process_info(process_info_path, action, 'upload')
                file_metadata_list.append({
                    "actionRootPath": path_on_disk,
                    "destinationPath": os.path.join(
                        project_name, path.partition('/data/')[2], name),
                    "title": name,
                    "destinationHash": file_json['content_sha256']
                })

    return {
        'resources_ignored': resources_ignored,
        'resources_updated': resources_updated,
        'action_metadata': action_metadata,
        'file_metadata_list': file_metadata_list,
        'project_id': project_id,
        'project_link': web_url
    }