def _upload_resource(self):
    """
    Upload resources to the target and perform a fixity check on the resulting hashes.
    """
    action = 'resource_upload'

    # Keyword enhancement doesn't happen during an upload, so it can't fail here. If there is
    # an error during a transfer this will be overwritten.
    self.keyword_enhancement_successful = True

    # Write the process id to the process_info file
    self.process_info_obj['function_process_id'] = self.function_process.pid
    update_or_create_process_info(self.process_info_obj, self.action, self.ticket_number)

    # Data directory in the bag
    self.data_directory = '{}/data'.format(self.resource_main_dir)

    # If we are uploading (not transferring) then create the initial metadata based on the
    # zipped bag provided.
    if self.action == 'resource_upload':
        update_process_info_message(self.process_info_path, self.action,
                                    "Creating PRESQT_FTS_METADATA...")
        self.new_fts_metadata_files = []
        for path, subdirs, files in os.walk(self.data_directory):
            for name in files:
                self.new_fts_metadata_files.append({
                    'destinationHashes': {},
                    'destinationPath': os.path.join(path, name)[len(self.data_directory):],
                    'failedFixityInfo': [],
                    'title': name,
                    'sourceHashes': {
                        self.hash_algorithm: self.file_hashes[os.path.join(path, name)]
                    },
                    'sourcePath': os.path.join(path, name)[len(self.data_directory):],
                    'extra': {}
                })

        destination_target_data = get_target_data(self.destination_target_name)
        self.details = "PresQT Upload to {}".format(destination_target_data['readable_name'])
        self.action_metadata = {
            'id': str(uuid4()),
            'details': self.details,
            'actionDateTime': str(timezone.now()),
            'actionType': self.action,
            'sourceTargetName': 'Local Machine',
            'sourceUsername': None,
            'destinationTargetName': self.destination_target_name,
            'destinationUsername': None,
            'keywords': {},
            'files': {
                'created': self.new_fts_metadata_files,
                'updated': [],
                'ignored': []
            }
        }

    # If the target destination's storage hierarchy has a finite depth then zip the resources
    # to be uploaded along with their metadata.
    # Also, create metadata files for the new zip file to be uploaded.
    if self.infinite_depth is False:
        try:
            structure_validation(self)
            finite_depth_upload_helper(self)
        except PresQTResponseException as e:
            # Catch any errors that happen within the target fetch.
            # Update the server process_info file appropriately.
            self.process_info_obj['status_code'] = e.status_code
            self.process_info_obj['status'] = 'failed'
            if self.action == 'resource_transfer_in':
                self.process_info_obj['upload_status'] = 'failed'
            self.process_info_obj['message'] = e.data
            # Update the expiration from 5 hours to 1 hour from now. We can delete this faster
            # because it's an incomplete/failed directory.
            self.process_info_obj['expiration'] = str(timezone.now() + relativedelta(hours=1))
            update_or_create_process_info(self.process_info_obj, self.action, self.ticket_number)
            return False

    # Fetch the proper function to call
    func = FunctionRouter.get_function(self.destination_target_name, action)

    # Upload the resources. func_dict has the following format:
    # {
    #     'resources_ignored': resources_ignored,
    #     'resources_updated': resources_updated,
    #     'action_metadata': action_metadata,
    #     'file_metadata_list': file_metadata_list,
    #     'project_id': title
    # }
    try:
        structure_validation(self)
        self.func_dict = func(self.destination_token, self.destination_resource_id,
                              self.data_directory, self.hash_algorithm,
                              self.file_duplicate_action, self.process_info_path, self.action)
    except PresQTResponseException as e:
        # Catch any errors that happen within the target fetch.
        # Update the server process_info file appropriately.
        self.process_info_obj['status_code'] = e.status_code
        self.process_info_obj['status'] = 'failed'
        if self.action == 'resource_transfer_in':
            self.process_info_obj['upload_status'] = 'failed'
        self.process_info_obj['message'] = e.data
        # Update the expiration from 5 hours to 1 hour from now. We can delete this faster
        # because it's an incomplete/failed directory.
        self.process_info_obj['expiration'] = str(timezone.now() + relativedelta(hours=1))
        update_or_create_process_info(self.process_info_obj, self.action, self.ticket_number)
        return False

    self.process_info_obj = read_file(self.process_info_path, True)[self.action]

    # Check if fixity has failed on any files during a transfer. If so, update the
    # process_info_data file.
    self.upload_fixity = True
    self.upload_failed_fixity = []

    for resource in self.func_dict['file_metadata_list']:
        resource['failed_fixity_info'] = []
        if resource['destinationHash'] != self.file_hashes[resource['actionRootPath']] \
                and resource['actionRootPath'] not in self.func_dict['resources_ignored']:
            self.upload_fixity = False
            self.upload_failed_fixity.append(
                resource['actionRootPath'][len(self.data_directory):])
            resource['failed_fixity_info'].append({
                'NewGeneratedHash': self.file_hashes[resource['actionRootPath']],
                'algorithmUsed': self.hash_algorithm,
                'reasonFixityFailed': "Either the destination did not provide a hash "
                                      "or fixity failed during upload."
            })

    # Strip the server created directory prefix off the file paths for ignored and updated files
    resources_ignored = [file[len(self.data_directory):]
                         for file in self.func_dict['resources_ignored']]
    self.process_info_obj['resources_ignored'] = resources_ignored
    resources_updated = [file[len(self.data_directory):]
                         for file in self.func_dict['resources_updated']]
    self.process_info_obj['resources_updated'] = resources_updated

    if self.action == 'resource_transfer_in':
        self.keyword_enhancement_successful = True
        if not self.destination_resource_id:
            self.destination_resource_id = self.func_dict['project_id']
        if self.supports_keywords:
            self.keyword_enhancement_successful, self.destination_initial_keywords = \
                update_targets_keywords(self, self.func_dict['project_id'])

            # Add the destination initial keywords to all keywords for an accurate metadata list
            self.all_keywords = self.all_keywords + self.destination_initial_keywords

    self.metadata_validation = create_upload_metadata(
        self, self.func_dict['file_metadata_list'], self.func_dict['action_metadata'],
        self.func_dict['project_id'], resources_ignored, resources_updated)

    # Increment process_info one last time
    increment_process_info(self.process_info_path, self.action, 'upload')

    # Validate the final metadata
    upload_message = get_action_message(self, 'Upload', self.upload_fixity,
                                        self.metadata_validation, self.action_metadata)
    self.process_info_obj['message'] = upload_message

    if self.action == 'resource_upload':
        # Update server process file
        self.process_info_obj['status_code'] = '200'
        self.process_info_obj['status'] = 'finished'
        self.process_info_obj['hash_algorithm'] = self.hash_algorithm
        self.process_info_obj['failed_fixity'] = self.upload_failed_fixity
        self.process_info_obj['upload_status'] = upload_message
        self.process_info_obj['link_to_resource'] = self.func_dict["project_link"]
        update_or_create_process_info(self.process_info_obj, self.action, self.ticket_number)

        if self.email:
            context = {
                "upload_url": self.func_dict["project_link"],
                "upload_message": upload_message,
                "failed_fixity": self.upload_failed_fixity
            }
            email_blaster(self.email, "PresQT Upload Complete", context,
                          "emails/upload_email.html")

    return True

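# Hedged sketch (not part of the PresQT codebase): the shape of the dictionary that
# _upload_resource expects back from the routed target upload function. Any target function
# plugged into FunctionRouter for 'resource_upload' should return something of this form; the
# key names mirror the func_dict accesses above, while the values below are placeholders.
def _example_target_upload(token, resource_id, data_directory, hash_algorithm,
                           file_duplicate_action, process_info_path, action):
    # ... upload the files found under data_directory to the target here ...
    return {
        'resources_ignored': [],   # absolute paths under data_directory that were skipped
        'resources_updated': [],   # absolute paths under data_directory that were overwritten
        'action_metadata': {'destinationUsername': 'example_user'},
        'file_metadata_list': [{
            'actionRootPath': '/path/on/disk/file.txt',
            'destinationPath': '/project/file.txt',
            'title': 'file.txt',
            'destinationHash': 'abc123'  # hash in the requested hash_algorithm, or None
        }],
        'project_id': 'new_or_existing_project_id',
        'project_link': 'https://example.target/projects/new_or_existing_project_id'
    }
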
def figshare_upload_resource(token, resource_id, resource_main_dir, hash_algorithm,
                             file_duplicate_action, process_info_path, action):
    """
    Upload the files found in the resource_main_dir to the target.

    Parameters
    ----------
    token : str
        User's token.
    resource_id : str
        ID of the resource requested.
    resource_main_dir : str
        Path to the main directory for the resources to be uploaded.
    hash_algorithm : str
        Hash algorithm we are using to check for fixity.
    file_duplicate_action : str
        The action to take when a duplicate file is found.
    process_info_path: str
        Path to the process info file that keeps track of the action's progress.
    action: str
        The action being performed.

    Returns
    -------
    Dictionary with the following keys: values
        'resources_ignored' : Array of string file paths of files that were ignored when
        uploading the resource. Path should have the same base as resource_main_dir.
        Example: ['path/to/ignored/file.pg', 'another/ignored/file.jpg']

        'resources_updated' : Array of string file paths of files that were updated when
        uploading the resource. Path should have the same base as resource_main_dir.
        Example: ['path/to/updated/file.jpg']

        'action_metadata': Dictionary containing action metadata. Must be in the following format:
            {
                'destinationUsername': '******'
            }

        'file_metadata_list': List of dictionaries for each file that contains metadata
        and hash info. Must be in the following format:
            {
                "actionRootPath": '/path/on/disk',
                "destinationPath": '/path/on/target/destination',
                "title": 'file_title',
                "destinationHash": {'hash_algorithm': 'the_hash'}
            }

        'project_id': ID of the parent project for this upload. Needed for metadata upload.

        'project_link': The link to either the resource or the home page of the user if not
        available through the API.

    FigShare's Upload Process
        1. Initiate a new file upload (POST) within the article. Send the file size, md5, and
           name but no file contents yet.
        2. Send a GET request to the 'Uploader Service' to determine that the status is
           "Pending" and how many parts to split the upload into.
        3. Split the file into the correct number of parts and upload each using a PUT request.
        4. Send a POST request to complete the upload.
    """
    try:
        headers, username = validation_check(token)
    except PresQTResponseException:
        raise PresQTResponseException("Token is invalid. Response returned a 401 status code.",
                                      status.HTTP_401_UNAUTHORIZED)

    os_path = next(os.walk(resource_main_dir))
    total_files = upload_total_files(resource_main_dir)

    # Update process info file
    update_process_info(process_info_path, total_files, action, 'upload')
    update_process_info_message(process_info_path, action, "Uploading files to FigShare...")

    resources_ignored = []
    resources_updated = []
    file_metadata_list = []
    action_metadata = {'destinationUsername': username}

    # Upload a new project
    if not resource_id:
        project_title = os_path[1][0]
        # Create a new project with the name being the top level directory's name.
        project_name, project_id = create_project(project_title, headers, token)
        # Create an article. For now we'll name it the same as the project.
        article_id = create_article(project_title, headers, project_id)
    else:
        # Upload to an existing project
        split_id = str(resource_id).split(":")
        project_id = split_id[0]
        try:
            project_title = requests.get(
                "https://api.figshare.com/v2/account/projects/{}".format(project_id),
                headers=headers).json()['title']
        except KeyError:
            raise PresQTResponseException(
                "Project with id, {}, could not be found by the requesting user.".format(
                    project_id),
                status.HTTP_400_BAD_REQUEST)

        if len(split_id) == 1:
            # We only have a project so we need to make a new article id.
            # Check to see if an article with this name already exists.
            articles = requests.get(
                "https://api.figshare.com/v2/account/projects/{}/articles".format(project_id),
                headers=headers).json()
            article_titles = [article['title'] for article in articles]
            new_title = get_duplicate_title(project_title, article_titles, "(PresQT*)")
            article_id = create_article(new_title, headers, resource_id)
        elif len(split_id) == 2:
            article_id = split_id[1]
        else:
            # Can't upload to a file
            raise PresQTResponseException("Can not upload into an existing file.",
                                          status.HTTP_400_BAD_REQUEST)

    # Get the article title
    try:
        article_title = requests.get(
            "https://api.figshare.com/v2/account/articles/{}".format(article_id),
            headers=headers).json()['title']
    except KeyError:
        raise PresQTResponseException(
            "Article with id, {}, could not be found by the requesting user.".format(article_id),
            status.HTTP_400_BAD_REQUEST)

    # Get the md5, size and name of the zip file to be uploaded
    for path, subdirs, files in os.walk(resource_main_dir):
        for name in files:
            file_info = open(os.path.join(path, name), 'rb')
            zip_hash = hash_generator(file_info.read(), 'md5')

            figshare_file_upload_process(file_info, headers, name, article_id,
                                         file_type='zip', path=path)

            file_metadata_list.append({
                'actionRootPath': os.path.join(path, name),
                'destinationPath': '/{}/{}/{}'.format(project_title, article_title, name),
                'title': name,
                'destinationHash': zip_hash
            })
            increment_process_info(process_info_path, action, 'upload')

    return {
        "resources_ignored": resources_ignored,
        "resources_updated": resources_updated,
        "action_metadata": action_metadata,
        "file_metadata_list": file_metadata_list,
        "project_id": "{}:{}".format(project_id, article_id),
        "project_link": "https://figshare.com/account/home#/projects"
    }

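# Hedged sketch of the four-step FigShare upload flow described in the docstring above. This is
# illustrative only; the real implementation lives in figshare_file_upload_process. The endpoint
# and response shapes follow FigShare's documented upload service, but the helper name, response
# handling, and error checks here are simplified assumptions.
def _example_figshare_part_upload(file_bytes, file_name, article_id, headers):
    import hashlib

    # 1. Initiate the upload: send size, md5, and name but no contents yet.
    initiate = requests.post(
        "https://api.figshare.com/v2/account/articles/{}/files".format(article_id),
        headers=headers,
        data=json.dumps({
            "md5": hashlib.md5(file_bytes).hexdigest(),
            "name": file_name,
            "size": len(file_bytes)
        })).json()
    file_info = requests.get(initiate['location'], headers=headers).json()

    # 2. Ask the 'Uploader Service' how many parts the upload is split into.
    upload_info = requests.get(file_info['upload_url'], headers=headers).json()

    # 3. PUT each part's byte range to the uploader service.
    for part in upload_info['parts']:
        part_bytes = file_bytes[part['startOffset']:part['endOffset'] + 1]
        requests.put("{}/{}".format(file_info['upload_url'], part['partNo']), data=part_bytes)

    # 4. Complete the upload with an empty POST against the file endpoint.
    requests.post(
        "https://api.figshare.com/v2/account/articles/{}/files/{}".format(
            article_id, file_info['id']),
        headers=headers)
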
def download_directory(header, path_to_resource, repo_data, process_info_path, action):
    """
    Go through a repo's tree and download all files inside of a given resource directory path.

    Parameters
    ----------
    header: dict
        API header expected by GitHub.
    path_to_resource: str
        The path to the requested directory.
    repo_data: dict
        Repository data gathered in the repo GET request.
    process_info_path: str
        Path to the process info file that keeps track of the action's progress.
    action: str
        The action being performed.

    Returns
    -------
    A list of dictionaries for each file being downloaded.
    """
    repo_name = repo_data['name']
    # Strip {/sha} off the end
    trees_url = '{}/master?recursive=1'.format(repo_data['trees_url'][:-6])
    contents = requests.get(trees_url, headers=header).json()

    number_of_files = len([
        file for file in contents['tree']
        if file['path'].startswith(path_to_resource) and file['type'] == 'blob'
    ])
    # Add the total number of repository files to the process info file.
    # This is necessary to keep track of the progress of the request.
    update_process_info(process_info_path, number_of_files, action, 'download')
    update_process_info_message(process_info_path, action, 'Downloading files from GitHub...')

    files = []
    for resource in contents['tree']:
        if resource['path'].startswith(path_to_resource) and resource['type'] == 'blob':
            # Strip the requested directory's parents off the directory path
            path_to_strip = path_to_resource.rpartition('/')[0]
            if path_to_strip:
                directory_path = '{}'.format(resource['path'].partition(path_to_strip)[2])
            else:
                directory_path = '/{}'.format(resource['path'])

            file_data = requests.get(resource['url']).json()

            files.append({
                'file': base64.b64decode(file_data['content']),
                'hashes': {},
                'title': resource['path'].rpartition('/')[2],
                'path': directory_path,
                'source_path': '/{}/{}'.format(repo_name, resource['path']),
                'extra_metadata': {}
            })
            # Increment the number of files done in the process info file.
            increment_process_info(process_info_path, action, 'download')
    return files

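# Hedged illustration of the trees_url handling above: GitHub exposes 'trees_url' as a URI
# template ending in '{/sha}', so the slice [:-6] removes that template suffix before a branch
# name and '?recursive=1' are appended. The example value is illustrative, not live data.
example_trees_url = 'https://api.github.com/repos/owner/repo/git/trees{/sha}'
assert '{}/master?recursive=1'.format(example_trees_url[:-6]) == \
    'https://api.github.com/repos/owner/repo/git/trees/master?recursive=1'
# Each entry in contents['tree'] then looks roughly like:
# {'path': 'docs/readme.md', 'type': 'blob', 'url': 'https://api.github.com/.../git/blobs/<sha>'}
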
def _download_resource(self):
    """
    Downloads the resources from the target, performs a fixity check, and zips them up
    in BagIt format.
    """
    action = 'resource_download'

    # Write the process id to the process_info file
    self.process_info_obj['function_process_id'] = self.function_process.pid
    update_or_create_process_info(self.process_info_obj, self.action, self.ticket_number)

    # Fetch the proper function to call
    func = FunctionRouter.get_function(self.source_target_name, action)

    # Fetch the resources. func_dict is in the format:
    # {
    #     'resources': files,
    #     'empty_containers': empty_containers,
    #     'action_metadata': action_metadata
    # }
    try:
        func_dict = func(self.source_token, self.source_resource_id, self.process_info_path,
                         self.action)
        # If the resource is being transferred, has only one file, and that file is the
        # PresQT metadata then raise an error.
        if self.action == 'resource_transfer_in' and \
                len(func_dict['resources']) == 1 and \
                func_dict['resources'][0]['title'] == 'PRESQT_FTS_METADATA.json':
            raise PresQTResponseException(
                'PresQT Error: PresQT FTS metadata cannot be transferred by itself.',
                status.HTTP_400_BAD_REQUEST)
    except PresQTResponseException as e:
        # TODO: Functionalize this error section
        # Catch any errors that happen within the target fetch.
        # Update the server process_info file appropriately.
        self.process_info_obj['status_code'] = e.status_code
        self.process_info_obj['status'] = 'failed'
        if self.action == 'resource_transfer_in':
            self.process_info_obj['download_status'] = 'failed'
        self.process_info_obj['message'] = e.data
        # Update the expiration from 5 hours to 1 hour from now. We can delete this faster
        # because it's an incomplete/failed directory.
        self.process_info_obj['expiration'] = str(timezone.now() + relativedelta(hours=1))
        update_or_create_process_info(self.process_info_obj, self.action, self.ticket_number)
        return False

    # Get the latest contents of the job's process_info.json file
    self.process_info_obj = read_file(self.process_info_path, True)[self.action]

    # The directory all files should be saved in.
    self.resource_main_dir = os.path.join(self.ticket_path, self.base_directory_name)

    update_process_info_message(self.process_info_path, self.action,
                                'Performing fixity checks and gathering metadata...')

    self.extra_metadata = func_dict['extra_metadata']

    # For each resource, perform fixity check, gather metadata, and save it to disk.
    fixity_info = []
    self.download_fixity = True
    self.download_failed_fixity = []
    self.source_fts_metadata_actions = []
    self.new_fts_metadata_files = []
    self.all_keywords = []
    self.initial_keywords = []
    self.manual_keywords = []
    self.enhanced_keywords = []

    for resource in func_dict['resources']:
        # Perform the fixity check and add extra info to the returned fixity object.
        # Note: This method of calling the function needs to stay this way for test Mocks.
        fixity_obj, self.download_fixity = download_fixity_checker.download_fixity_checker(
            resource)
        fixity_info.append(fixity_obj)

        if not fixity_obj['fixity']:
            self.download_failed_fixity.append(resource['path'])

        # Create metadata for this resource or validate the metadata file
        if resource['title'] == 'PRESQT_FTS_METADATA.json':
            is_valid = validate_metadata(self, resource)
            if not is_valid:
                resource['path'] = resource['path'].replace('PRESQT_FTS_METADATA.json',
                                                            'INVALID_PRESQT_FTS_METADATA.json')
                create_download_metadata(self, resource, fixity_obj)
                write_file('{}{}'.format(self.resource_main_dir, resource['path']),
                           resource['file'])
        else:
            create_download_metadata(self, resource, fixity_obj)
            write_file('{}{}'.format(self.resource_main_dir, resource['path']),
                       resource['file'])

    # Enhance the source keywords
    self.keyword_dict = {}
    if self.action == 'resource_transfer_in':
        if self.supports_keywords:
            if self.keyword_action == 'automatic':
                self.keyword_dict = automatic_keywords(self)
            elif self.keyword_action == 'manual':
                self.keyword_dict = manual_keywords(self)
    self.keyword_enhancement_successful = True

    # Create PresQT action metadata
    update_process_info_message(self.process_info_path, self.action,
                                "Creating PRESQT_FTS_METADATA...")
    self.source_username = func_dict['action_metadata']['sourceUsername']
    if self.action == 'resource_transfer_in':
        source_target_data = get_target_data(self.source_target_name)
        destination_target_data = get_target_data(self.destination_target_name)
        self.details = "PresQT Transfer from {} to {}".format(
            source_target_data['readable_name'], destination_target_data['readable_name'])
    else:
        source_target_data = get_target_data(self.source_target_name)
        self.details = "PresQT Download from {}".format(source_target_data['readable_name'])

    self.action_metadata = {
        'id': str(uuid4()),
        'details': self.details,
        'actionDateTime': str(timezone.now()),
        'actionType': self.action,
        'sourceTargetName': self.source_target_name,
        'sourceUsername': self.source_username,
        'destinationTargetName': 'Local Machine',
        'destinationUsername': None,
        'keywords': self.keyword_dict,
        'files': {
            'created': self.new_fts_metadata_files,
            'updated': [],
            'ignored': []
        }
    }

    # TODO: Move this up to make it occur after we loop through func_dict['resources'] and
    # write resources
    # Write empty containers to disk
    for container_path in func_dict['empty_containers']:
        # Make sure the container_path has a '/' at the beginning and end
        if container_path[-1] != '/':
            container_path += '/'
        if container_path[0] != '/':
            container_path = '/' + container_path
        os.makedirs(os.path.dirname('{}{}'.format(self.resource_main_dir, container_path)))

    # If we are transferring the downloaded resource then bag it for the resource_upload method
    if self.action == 'resource_transfer_in':
        self.action_metadata['destinationTargetName'] = self.destination_target_name

        # Make a BagIt 'bag' of the resources.
        bagit.make_bag(self.resource_main_dir,
                       checksums=['md5', 'sha1', 'sha256', 'sha512'])
        self.process_info_obj['download_status'] = get_action_message(
            self, 'Download', self.download_fixity, True, self.action_metadata)
        return True
    # If we are only downloading the resource then create metadata, bag, zip,
    # and update the server process file.
    else:
        # Create the metadata file
        final_fts_metadata_data = create_fts_metadata(self.all_keywords, self.action_metadata,
                                                      self.source_fts_metadata_actions,
                                                      self.extra_metadata)
        # Validate the final metadata
        metadata_validation = schema_validator('presqt/json_schemas/metadata_schema.json',
                                               final_fts_metadata_data)
        self.process_info_obj['message'] = get_action_message(
            self, 'Download', self.download_fixity, metadata_validation, self.action_metadata)

        # Make a BagIt 'bag' of the resources.
        bagit.make_bag(self.resource_main_dir,
                       checksums=['md5', 'sha1', 'sha256', 'sha512'])

        # Write the metadata file.
        write_file(os.path.join(self.resource_main_dir, 'PRESQT_FTS_METADATA.json'),
                   final_fts_metadata_data, True)

        # Add the fixity file to the disk directory
        write_file(os.path.join(self.resource_main_dir, 'fixity_info.json'), fixity_info, True)

        # Zip the BagIt 'bag' to send forward.
        zip_directory(self.resource_main_dir, "{}.zip".format(self.resource_main_dir),
                      self.ticket_path)

        # Everything was a success so update the server metadata file.
        self.process_info_obj['status_code'] = '200'
        self.process_info_obj['status'] = 'finished'
        self.process_info_obj['zip_name'] = '{}.zip'.format(self.base_directory_name)
        self.process_info_obj['failed_fixity'] = self.download_failed_fixity
        update_or_create_process_info(self.process_info_obj, self.action, self.ticket_number)

        if self.email:
            # Build the link to retrieve the download
            download_reverse = reverse('job_status', kwargs={
                "action": "download",
                "response_format": "zip"
            })
            download_url = self.request.build_absolute_uri(download_reverse)
            final_download_url = "{}?ticket_number={}".format(download_url, self.ticket_number)
            context = {
                "download_url": final_download_url,
                "download_message": self.process_info_obj['message'],
                "failed_fixity": self.process_info_obj['failed_fixity']
            }
            email_blaster(self.email, "PresQT Download Complete", context,
                          "emails/download_email.html")
        return True

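# Hedged sketch (not part of the PresQT codebase): the dictionary a routed target download
# function is expected to hand back to _download_resource. Key names mirror the func_dict
# accesses above; the values are placeholders.
def _example_target_download(token, resource_id, process_info_path, action):
    # ... fetch the requested files from the target here ...
    return {
        'resources': [{
            'file': b'binary file contents',
            'hashes': {'md5': 'abc123'},
            'title': 'file.jpg',
            'path': '/folder/file.jpg',  # path the file is saved under locally
            'source_path': '/project/storage/folder/file.jpg',
            'extra_metadata': {}
        }],
        'empty_containers': ['empty/folder/to/write/'],
        'action_metadata': {'sourceUsername': 'example_user'},
        'extra_metadata': {}  # project-level metadata, may be empty
    }
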
def zenodo_upload_resource(token, resource_id, resource_main_dir, hash_algorithm,
                           file_duplicate_action, process_info_path, action):
    """
    Upload the files found in the resource_main_dir to the target.

    Parameters
    ----------
    token : str
        User's token.
    resource_id : str
        ID of the resource requested.
    resource_main_dir : str
        Path to the main directory for the resources to be uploaded.
    hash_algorithm : str
        Hash algorithm we are using to check for fixity.
    file_duplicate_action : str
        The action to take when a duplicate file is found.
    process_info_path: str
        Path to the process info file that keeps track of the action's progress.
    action: str
        The action being performed.

    Returns
    -------
    Dictionary with the following keys: values
        'resources_ignored' : Array of string file paths of files that were ignored when
        uploading the resource. Path should have the same base as resource_main_dir.
        Example: ['path/to/ignored/file.pg', 'another/ignored/file.jpg']

        'resources_updated' : Array of string file paths of files that were updated when
        uploading the resource. Path should have the same base as resource_main_dir.
        Example: ['path/to/updated/file.jpg']

        'action_metadata': Dictionary containing action metadata. Must be in the following format:
            {
                'destinationUsername': '******'
            }

        'file_metadata_list': List of dictionaries for each file that contains metadata
        and hash info. Must be in the following format:
            {
                "actionRootPath": '/path/on/disk',
                "destinationPath": '/path/on/target/destination',
                "title": 'file_title',
                "destinationHash": {'hash_algorithm': 'the_hash'}
            }

        'project_id': ID of the parent project for this upload. Needed for metadata upload.

        'project_link': The link to either the resource or the home page of the user if not
        available through the API.
    """
    try:
        auth_parameter = zenodo_validation_check(token)
    except PresQTValidationError:
        raise PresQTValidationError("Token is invalid. Response returned a 401 status code.",
                                    status.HTTP_401_UNAUTHORIZED)

    os_path = next(os.walk(resource_main_dir))
    total_files = upload_total_files(resource_main_dir)

    # Update process info file
    update_process_info(process_info_path, total_files, action, 'upload')
    update_process_info_message(process_info_path, action, "Uploading files to Zenodo...")

    # Since Zenodo is a finite depth target, the checks for path validity have already been done.
    if resource_id:
        name_helper = requests.get("https://zenodo.org/api/deposit/depositions/{}".format(
            resource_id), params=auth_parameter).json()
        try:
            final_title = name_helper['title']
        except KeyError:
            raise PresQTResponseException(
                "Can't find the resource with id {}, on Zenodo".format(resource_id),
                status.HTTP_404_NOT_FOUND)
        action_metadata = {"destinationUsername": None}
    else:
        action_metadata = {"destinationUsername": None}
        project_title = os_path[1][0]
        name_helper = requests.get("https://zenodo.org/api/deposit/depositions",
                                   params=auth_parameter).json()
        titles = [project['title'] for project in name_helper]
        final_title = get_duplicate_title(project_title, titles, ' (PresQT*)')
        resource_id = zenodo_upload_helper(auth_parameter, final_title)

    post_url = "https://zenodo.org/api/deposit/depositions/{}/files".format(resource_id)

    upload_dict = zenodo_upload_loop(action_metadata, resource_id, resource_main_dir, post_url,
                                     auth_parameter, final_title, file_duplicate_action,
                                     process_info_path, action)

    return upload_dict

def zenodo_download_resource(token, resource_id, process_info_path, action):
    """
    Fetch the requested resource from Zenodo along with its hash information.

    Parameters
    ----------
    token : str
        User's Zenodo token.
    resource_id : str
        ID of the resource requested.
    process_info_path: str
        Path to the process info file that keeps track of the action's progress.
    action: str
        The action being performed.

    Returns
    -------
    Dictionary with the following keys: values
        'resources': List of dictionary objects that each hold a file and its information.
                     Dictionary must be in the following format:
                         {
                             'file': binary_file,
                             'hashes': {'hash_algorithm': 'the_hash'},
                             'title': 'file.jpg',
                             'path': '/path/to/file',
                             'metadata': {
                                 'sourcePath': '/full/path/at/source.jpg',
                                 'title': 'file_title',
                                 'sourceHashes': {'hash_algorithm': 'the_hash'},
                                 'extra': {'any': 'extra'}
                             }
                         }
        'empty_containers': List of string paths representing empty containers that must be
                            written.
                            Example: ['empty/folder/to/write/', 'another/empty/folder/']
        'action_metadata': Dictionary containing action metadata. Must be in the following format:
                               {
                                   'sourceUsername': '******',
                               }
    """
    try:
        auth_parameter = zenodo_validation_check(token)
    except PresQTResponseException:
        raise PresQTResponseException('Token is invalid. Response returned a 401 status code.',
                                      status.HTTP_401_UNAUTHORIZED)

    files = []
    empty_containers = []
    extra_metadata = {}
    base_url = None

    # If the resource_id is longer than 7 characters, the resource is an individual file
    if len(resource_id) > 7:
        # First we need to check if the file id given belongs to a public published record.
        zenodo_file = requests.get('https://zenodo.org/api/files/{}'.format(resource_id),
                                   params=auth_parameter)
        if zenodo_file.status_code != 200:
            # If not, we need to loop through their depositions to look for the file.
            zenodo_projects = requests.get('https://zenodo.org/api/deposit/depositions',
                                           params=auth_parameter).json()
            for entry in zenodo_projects:
                project_files = requests.get(entry['links']['self'],
                                             params=auth_parameter).json()
                for file in project_files['files']:
                    if file['id'] == resource_id:
                        base_url = entry['links']['self']
                        file_url = file['links']['self']
                        is_record = False
                        break
                else:
                    # If the file wasn't found we want to continue the loop.
                    continue
                break
        else:
            is_record = True
            base_url = 'https://zenodo.org/api/files/{}'.format(resource_id)
            file_url = 'https://zenodo.org/api/files/{}'.format(resource_id)

        if base_url is None:
            raise PresQTResponseException(
                "The resource with id, {}, does not exist for this user.".format(resource_id),
                status.HTTP_404_NOT_FOUND)

        update_process_info_message(process_info_path, action,
                                    'Downloading files from Zenodo...')
        # Add the total number of files to the process info file.
        # This is necessary to keep track of the progress of the request.
        update_process_info(process_info_path, 1, action, 'download')

        files, action_metadata = zenodo_download_helper(is_record, base_url, auth_parameter,
                                                        files, file_url)

        # Increment the number of files done in the process info file.
        increment_process_info(process_info_path, action, 'download')

    # Otherwise, it's a full project
    else:
        base_url = 'https://zenodo.org/api/records/{}'.format(resource_id)
        zenodo_record = requests.get(base_url, params=auth_parameter)
        is_record = True

        if zenodo_record.status_code != 200:
            base_url = 'https://zenodo.org/api/deposit/depositions/{}'.format(resource_id)
            is_record = False

        try:
            files, action_metadata = zenodo_download_helper(is_record, base_url, auth_parameter,
                                                            files)
        except PresQTResponseException:
            raise PresQTResponseException(
                "The resource with id, {}, does not exist for this user.".format(resource_id),
                status.HTTP_404_NOT_FOUND)

        extra_metadata = extra_metadata_helper(base_url, is_record, auth_parameter)

        file_urls = [file['file'] for file in files]

        update_process_info_message(process_info_path, action,
                                    'Downloading files from Zenodo...')
        # Add the total number of files to the process info file.
        # This is necessary to keep track of the progress of the request.
        update_process_info(process_info_path, len(file_urls), action, 'download')

        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        download_data = loop.run_until_complete(
            async_main(file_urls, auth_parameter, process_info_path, action))

        # Go through the file dictionaries and replace the file path with the binary_content
        for file in files:
            file['file'] = get_dictionary_from_list(download_data, 'url',
                                                    file['file'])['binary_content']

    return {
        'resources': files,
        'empty_containers': empty_containers,
        'action_metadata': action_metadata,
        'extra_metadata': extra_metadata
    }

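# Hedged sketch of what the async_main helper used above (and in the other download functions)
# might look like. This is a simplified stand-in assuming aiohttp is available; the real helper
# also threads authentication headers and per-file progress updates (increment_process_info)
# through the requests.
import aiohttp


async def _example_async_download(url, session, params):
    async with session.get(url, params=params) as response:
        return {'url': url, 'binary_content': await response.read()}


async def _example_async_main(file_urls, params):
    async with aiohttp.ClientSession() as session:
        return await asyncio.gather(
            *[_example_async_download(url, session, params) for url in file_urls])

# Usage mirrors the call sites above:
# loop = asyncio.new_event_loop()
# download_data = loop.run_until_complete(_example_async_main(file_urls, auth_parameter))
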
def github_upload_resource(token, resource_id, resource_main_dir, hash_algorithm,
                           file_duplicate_action, process_info_path, action):
    """
    Upload the files found in the resource_main_dir to the target.

    Parameters
    ----------
    token : str
        User's token.
    resource_id : str
        ID of the resource requested.
    resource_main_dir : str
        Path to the main directory for the resources to be uploaded.
    hash_algorithm : str
        Hash algorithm we are using to check for fixity.
    file_duplicate_action : str
        The action to take when a duplicate file is found.
    process_info_path: str
        Path to the process info file that keeps track of the action's progress.
    action: str
        The action being performed.

    Returns
    -------
    Dictionary with the following keys: values
        'resources_ignored' : Array of string file paths of files that were ignored when
        uploading the resource. Path should have the same base as resource_main_dir.
        Example: ['path/to/ignored/file.pg', 'another/ignored/file.jpg']

        'resources_updated' : Array of string file paths of files that were updated when
        uploading the resource. Path should have the same base as resource_main_dir.
        Example: ['path/to/updated/file.jpg']

        'action_metadata': Dictionary containing action metadata. Must be in the following format:
            {
                'destinationUsername': '******'
            }

        'file_metadata_list': List of dictionaries for each file that contains metadata
        and hash info. Must be in the following format:
            {
                "actionRootPath": '/path/on/disk',
                "destinationPath": '/path/on/target/destination',
                "title": 'file_title',
                "destinationHash": {'hash_algorithm': 'the_hash'}
            }

        'project_id': ID of the parent project for this upload. Needed for metadata upload.

        'project_link': The link to either the resource or the home page of the user if not
        available through the API.
    """
    try:
        header, username = validation_check(token)
    except PresQTResponseException:
        raise PresQTResponseException("Token is invalid. Response returned a 401 status code.",
                                      status.HTTP_401_UNAUTHORIZED)

    os_path = next(os.walk(resource_main_dir))

    # Get total amount of files
    total_files = upload_total_files(resource_main_dir)
    update_process_info(process_info_path, total_files, action, 'upload')
    update_process_info_message(process_info_path, action, "Uploading files to GitHub...")

    # Upload a new repository
    if not resource_id:
        # Create a new repository with the name being the top level directory's name.
        # Note: GitHub doesn't allow spaces or certain punctuation (parentheses, colons)
        # in repo names.
        repo_title = os_path[1][0].replace(' ', '_').replace("(", "-").replace(
            ")", "-").replace(":", "-")
        repo_name, repo_id, repo_url = create_repository(repo_title, token)

        resources_ignored = []
        resources_updated = []
        action_metadata = {"destinationUsername": username}
        file_metadata_list = []

        for path, subdirs, files in os.walk(resource_main_dir):
            if not subdirs and not files:
                resources_ignored.append(path)
            for name in files:
                # Extract and encode the file bytes in the way expected by GitHub.
                file_bytes = open(os.path.join(path, name), 'rb').read()
                encoded_file = base64.b64encode(file_bytes).decode('utf-8')
                # A relative path to the file is what is added to the GitHub PUT address
                path_to_add = os.path.join(path.partition('/data/')[2], name)
                path_to_add_to_url = path_to_add.partition('/')[2].replace(' ', '_')
                finished_path = '/' + repo_name + '/' + path_to_add_to_url
                file_metadata_list.append({
                    "actionRootPath": os.path.join(path, name),
                    "destinationPath": finished_path,
                    "title": name,
                    "destinationHash": None
                })
                put_url = "https://api.github.com/repos/{}/{}/contents/{}".format(
                    username, repo_name, path_to_add_to_url)
                data = {
                    "message": "PresQT Upload",
                    "committer": {
                        "name": "PresQT",
                        "email": "N/A"
                    },
                    "content": encoded_file
                }

                file_response = requests.put(put_url, headers=header, data=json.dumps(data))
                if file_response.status_code != 201:
                    raise PresQTResponseException(
                        "Github returned the following error: '{}'".format(
                            str(file_response.json()['message'])),
                        status.HTTP_400_BAD_REQUEST)
                # Increment the file counter
                increment_process_info(process_info_path, action, 'upload')
    else:
        # Upload to an existing repository
        if ':' not in resource_id:
            repo_id = resource_id
            path_to_upload_to = ''
        # Upload to an existing directory
        else:
            partitioned_id = resource_id.partition(':')
            repo_id = partitioned_id[0]
            path_to_upload_to = '/{}'.format(partitioned_id[2]).replace('%2F', '/').replace(
                '%2E', '.')

        # Get initial repo data for the resource requested
        repo_url = 'https://api.github.com/repositories/{}'.format(repo_id)
        response = requests.get(repo_url, headers=header)
        if response.status_code != 200:
            raise PresQTResponseException(
                'The resource with id, {}, does not exist for this user.'.format(resource_id),
                status.HTTP_404_NOT_FOUND)
        repo_data = response.json()
        repo_name = repo_data['name']
        repo_url = repo_data['svn_url']

        # Get all repo resources so we can check if any files already exist
        repo_resources = requests.get('{}/master?recursive=1'.format(
            repo_data['trees_url'][:-6]), headers=header).json()
        if 'message' in repo_resources:
            repo_resources = requests.get('{}/main?recursive=1'.format(
                repo_data['trees_url'][:-6]), headers=header).json()

        # current_file_paths = ['/' + resource['path'] for resource in repo_resources['tree']
        #                       if resource['type'] == 'blob']
        current_file_paths = []
        for resource in repo_resources['tree']:
            if resource['type'] == 'blob':
                current_file_paths.append('/' + resource['path'])

        # Check if the provided path to upload to is actually a path to an existing file
        if path_to_upload_to in current_file_paths:
            raise PresQTResponseException(
                'The Resource provided, {}, is not a container'.format(resource_id),
                status.HTTP_400_BAD_REQUEST)

        resources_ignored = []
        resources_updated = []
        file_metadata_list = []
        sha = None
        action_metadata = {"destinationUsername": username}

        for path, subdirs, files in os.walk(resource_main_dir):
            if not subdirs and not files:
                resources_ignored.append(path)
            for name in files:
                path_to_file = os.path.join('/', path.partition('/data/')[2],
                                            name).replace(' ', '_')
                # Check if the file already exists in this repository
                full_file_path = '{}{}'.format(path_to_upload_to, path_to_file)
                if full_file_path in current_file_paths:
                    if file_duplicate_action == 'ignore':
                        resources_ignored.append(os.path.join(path, name))
                        continue
                    else:
                        resources_updated.append(os.path.join(path, name))
                        # Get the sha
                        sha_url = 'https://api.github.com/repos/{}/contents{}'.format(
                            repo_data['full_name'], full_file_path)
                        sha_response = requests.get(sha_url, headers=header)
                        sha = sha_response.json()['sha']

                # Extract and encode the file bytes in the way expected by GitHub.
                file_bytes = open(os.path.join(path, name), 'rb').read()
                encoded_file = base64.b64encode(file_bytes).decode('utf-8')
                # A relative path to the file is what is added to the GitHub PUT address
                file_metadata_list.append({
                    "actionRootPath": os.path.join(path, name),
                    "destinationPath": '/{}{}{}'.format(repo_name, path_to_upload_to,
                                                        path_to_file),
                    "title": name,
                    "destinationHash": None
                })
                put_url = 'https://api.github.com/repos/{}/contents{}{}'.format(
                    repo_data['full_name'], path_to_upload_to, path_to_file)
                data = {
                    "message": "PresQT Upload",
                    "sha": sha,
                    "committer": {
                        "name": "PresQT",
                        "email": "N/A"
                    },
                    "content": encoded_file
                }

                upload_response = requests.put(put_url, headers=header, data=json.dumps(data))
                if upload_response.status_code not in [200, 201]:
                    raise PresQTResponseException(
                        'Upload failed with a status code of {}'.format(
                            upload_response.status_code),
                        status.HTTP_400_BAD_REQUEST)
                # Increment the file counter
                increment_process_info(process_info_path, action, 'upload')

    return {
        'resources_ignored': resources_ignored,
        'resources_updated': resources_updated,
        'action_metadata': action_metadata,
        'file_metadata_list': file_metadata_list,
        'project_id': repo_id,
        "project_link": repo_url
    }

def osf_upload_resource(token, resource_id, resource_main_dir, hash_algorithm,
                        file_duplicate_action, process_info_path, action):
    """
    Upload the files found in the resource_main_dir to OSF.

    Parameters
    ----------
    token : str
        User's OSF token.
    resource_id : str
        ID of the resource requested.
    resource_main_dir : str
        Path to the main directory for the resources to be uploaded.
    hash_algorithm : str
        Hash algorithm we are using to check for fixity.
    file_duplicate_action : str
        The action to take when a duplicate file is found.
    process_info_path: str
        Path to the process info file that keeps track of the action's progress.
    action: str
        The action being performed.

    Returns
    -------
    Dictionary with the following keys: values
        'resources_ignored' : Array of string file paths of files that were ignored when
        uploading the resource. Path should have the same base as resource_main_dir.
        Example: ['path/to/ignored/file.pg', 'another/ignored/file.jpg']

        'resources_updated' : Array of string file paths of files that were updated when
        uploading the resource. Path should have the same base as resource_main_dir.
        Example: ['path/to/updated/file.jpg']

        'action_metadata': Dictionary containing FTS action metadata. Must be in the following
        format:
            {
                'destinationUsername': '******'
            }

        'file_metadata_list': List of dictionaries for each file that contains FTS metadata
        and hash info. Must be in the following format:
            {
                "actionRootPath": '/path/on/disk',
                "destinationPath": '/path/on/target/destination',
                "title": 'file_title',
                "destinationHash": {'hash_algorithm': 'the_hash'}
            }

        'project_id': ID of the parent project for this upload. Needed for metadata upload.

        'project_link': The link to either the resource or the home page of the user if not
        available through the API.
    """
    try:
        osf_instance = OSF(token)
    except PresQTInvalidTokenError:
        raise PresQTResponseException("Token is invalid. Response returned a 401 status code.",
                                      status.HTTP_401_UNAUTHORIZED)

    # Get contributor name
    contributor_name = requests.get(
        'https://api.osf.io/v2/users/me/',
        headers={'Authorization': 'Bearer {}'.format(token)}
    ).json()['data']['attributes']['full_name']
    action_metadata = {"destinationUsername": contributor_name}

    hashes = {}
    resources_ignored = []
    resources_updated = []
    file_metadata_list = []

    # Get total amount of files
    total_files = upload_total_files(resource_main_dir)
    update_process_info(process_info_path, total_files, action, 'upload')
    update_process_info_message(process_info_path, action, "Uploading files to OSF...")

    # If we are uploading to an existing container
    if resource_id:
        # Get the resource
        resource = get_osf_resource(resource_id, osf_instance)

        # The resource being uploaded to must not be a file
        if resource.kind_name == 'file':
            raise PresQTResponseException(
                "The Resource provided, {}, is not a container".format(resource_id),
                status.HTTP_400_BAD_REQUEST)
        elif resource.kind_name == 'project':
            project = resource
            project_id = project.id
            resource.storage('osfstorage').create_directory(
                resource_main_dir, file_duplicate_action, hashes, resources_ignored,
                resources_updated, file_metadata_list, process_info_path, action)
        else:  # Folder or Storage
            resource.create_directory(
                resource_main_dir, file_duplicate_action, hashes, resources_ignored,
                resources_updated, file_metadata_list, process_info_path, action)
            # Get the project class for later metadata work
            if resource.kind_name == 'storage':
                project_id = resource.node
            else:
                project_id = resource.parent_project_id
            project = osf_instance.project(project_id)
    # else we are uploading a new project
    else:
        os_path = next(os.walk(resource_main_dir))

        # Get the actual data we want to upload
        data_to_upload_path = '{}/{}'.format(os_path[0], os_path[1][0])

        # Create a new project with the name being the top level directory's name.
        project = osf_instance.create_project(os_path[1][0])
        project_id = project.id

        # Upload resources into OSFStorage for the new project.
        project.storage('osfstorage').create_directory(
            data_to_upload_path, file_duplicate_action, hashes, resources_ignored,
            resources_updated, file_metadata_list, process_info_path, action)

    for file_metadata in file_metadata_list:
        # Only send forward the hash we need based on the hash_algorithm provided
        file_metadata['destinationHash'] = file_metadata['destinationHash'][hash_algorithm]
        # Prepend the project title to each resource's metadata destinationPath
        file_metadata['destinationPath'] = '/{}/{}'.format(project.title,
                                                           file_metadata['destinationPath'])

    return {
        'resources_ignored': resources_ignored,
        'resources_updated': resources_updated,
        'action_metadata': action_metadata,
        'file_metadata_list': file_metadata_list,
        'project_id': project_id,
        "project_link": "https://osf.io/{}".format(project_id)
    }

def osf_download_resource(token, resource_id, process_info_path, action):
    """
    Fetch the requested resource from OSF along with its hash information.

    Parameters
    ----------
    token : str
        User's OSF token.
    resource_id : str
        ID of the resource requested.
    process_info_path: str
        Path to the process info file that keeps track of the action's progress.
    action: str
        The action being performed.

    Returns
    -------
    Dictionary with the following keys: values
        'resources': List of dictionary objects that each hold a file and its information.
                     Dictionary must be in the following format:
                         {
                             'file': binary_file,
                             'hashes': {'hash_algorithm': 'the_hash'},
                             'title': 'file.jpg',
                             'path': '/path/to/file',
                             'source_path': '/full/path/to/file',
                             'extra_metadata': {'any': 'extra'}
                         }
        'empty_containers': List of string paths representing empty containers that must be
                            written.
                            Example: ['empty/folder/to/write/', 'another/empty/folder/']
        'action_metadata': Dictionary containing action metadata. Must be in the following format:
                               {
                                   'sourceUsername': '******',
                               }
    """
    try:
        osf_instance = OSF(token)
    except PresQTInvalidTokenError:
        raise PresQTResponseException("Token is invalid. Response returned a 401 status code.",
                                      status.HTTP_401_UNAUTHORIZED)

    # Get contributor name
    contributor_name = requests.get(
        'https://api.osf.io/v2/users/me/',
        headers={'Authorization': 'Bearer {}'.format(token)}
    ).json()['data']['attributes']['full_name']
    action_metadata = {"sourceUsername": contributor_name}

    # Get the resource
    resource = get_osf_resource(resource_id, osf_instance)

    # Get all files for the provided resources.
    # The 'path' value will be the path that the file is eventually saved in. The root of the
    # path should be the resource.
    files = []
    empty_containers = []
    extra_metadata = {}
    if resource.kind_name == 'file':
        update_process_info_message(process_info_path, action, 'Downloading files from OSF...')
        # Add the total number of files to the process info file.
        # This is necessary to keep track of the progress of the request.
        update_process_info(process_info_path, 1, action, 'download')

        project = osf_instance.project(resource.parent_project_id)
        files.append({
            "file": resource.download(),
            "hashes": resource.hashes,
            "title": resource.title,
            # If the file is the only resource we are downloading then we don't need its full path
            "path": '/{}'.format(resource.title),
            "source_path": '/{}/{}{}'.format(project.title, resource.provider,
                                             resource.materialized_path),
            "extra_metadata": osf_download_metadata(resource)
        })
        # Increment the number of files done in the process info file.
        increment_process_info(process_info_path, action, 'download')
    else:
        if resource.kind_name == 'project':
            extra_metadata = extra_metadata_helper(
                resource_id, {'Authorization': 'Bearer {}'.format(token)})
            resource.get_all_files('', files, empty_containers)
            project = resource
        elif resource.kind_name == 'storage':
            resource.get_all_files('/{}'.format(resource.title), files, empty_containers)
            project = osf_instance.project(resource.node)
        else:
            resource.get_all_files('', files, empty_containers)
            project = osf_instance.project(resource.parent_project_id)
            for file in files:
                # The file path needs to start at the folder and strip everything before it.
                # Example: If the resource is 'Docs2' and the starting path is
                # '/Project/Storage/Docs1/Docs2/file.jpeg' then the final path
                # needs to be '/Docs2/file.jpeg'
                path_to_strip = resource.materialized_path[:-(len(resource.title) + 2)]
                file['path'] = file['file'].materialized_path[len(path_to_strip):]

        file_urls = [file['file'].download_url for file in files]

        update_process_info_message(process_info_path, action, 'Downloading files from OSF...')
        # Add the total number of files to the process info file.
        # This is necessary to keep track of the progress of the request.
        update_process_info(process_info_path, len(file_urls), action, 'download')

        # Asynchronously make all download requests
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        download_data = loop.run_until_complete(
            async_main(file_urls, token, process_info_path, action))

        # Go through the file dictionaries and replace the file class with the binary_content
        for file in files:
            file['source_path'] = '/{}/{}{}'.format(project.title, file['file'].provider,
                                                    file['file'].materialized_path)
            file['file'] = get_dictionary_from_list(
                download_data, 'url', file['file'].download_url)['binary_content']

    return {
        'resources': files,
        'empty_containers': empty_containers,
        'action_metadata': action_metadata,
        'extra_metadata': extra_metadata
    }

def figshare_download_resource(token, resource_id, process_info_path, action):
    """
    Fetch the requested resource from FigShare along with its hash information.

    Parameters
    ----------
    token : str
        User's FigShare token.
    resource_id : str
        ID of the resource requested.
    process_info_path: str
        Path to the process info file that keeps track of the action's progress.
    action: str
        The action being performed.

    Returns
    -------
    Dictionary with the following keys: values
        'resources': List of dictionary objects that each hold a file and its information.
                     Dictionary must be in the following format:
                         {
                             'file': binary_file,
                             'hashes': {'hash_algorithm': 'the_hash'},
                             'title': 'file.jpg',
                             'path': '/path/to/file',
                             'source_path': '/full/path/to/file',
                             'extra_metadata': {'any': 'extra'}
                         }
        'empty_containers': List of string paths representing empty containers that must be
                            written.
                            Example: ['empty/folder/to/write/', 'another/empty/folder/']
        'action_metadata': Dictionary containing action metadata. Must be in the following format:
                               {
                                   'sourceUsername': '******',
                               }
    """
    try:
        headers, username = validation_check(token)
    except PresQTResponseException:
        raise PresQTResponseException("Token is invalid. Response returned a 401 status code.",
                                      status.HTTP_401_UNAUTHORIZED)

    split_id = str(resource_id).split(":")
    extra_metadata = {}

    # First we need to see whether this is a private project or a public project.
    project_url = "https://api.figshare.com/v2/account/projects/{}".format(split_id[0])

    response = requests.get(project_url, headers=headers)
    if response.status_code != 200:
        # Looking for a private project was unsuccessful, try a public project.
        project_url = "https://api.figshare.com/v2/projects/{}".format(split_id[0])
        response = requests.get(project_url, headers=headers)
        if response.status_code != 200:
            # Project id is invalid
            raise PresQTResponseException(
                "The resource could not be found by the requesting user.",
                status.HTTP_404_NOT_FOUND)

    data = response.json()
    project_name = data['title']

    # Flags to be used for file checks.
    file_urls = None
    files = None

    if len(split_id) == 1:
        # Download the contents of the project and build the list of file urls to download.
        articles_url = project_url + "/articles"
        files, empty_containers, action_metadata = download_project(
            username, articles_url, headers, project_name, [])
        file_urls = [file['file'] for file in files]
        extra_metadata = extra_metadata_helper(project_url, headers)

    elif len(split_id) == 2 or len(split_id) == 3:
        # We have an article or a file so we need to get the article url
        article_url = "https://api.figshare.com/v2/account/projects/{}/articles/{}".format(
            split_id[0], split_id[1])
        response = requests.get(article_url, headers=headers)
        if response.status_code != 200:
            # Let's see if this is a public article....
            article_url = "https://api.figshare.com/v2/articles/{}".format(split_id[1])
            response = requests.get(article_url, headers=headers)
            if response.status_code != 200:
                # We couldn't find the article.
                raise PresQTResponseException(
                    "The resource could not be found by the requesting user.",
                    status.HTTP_404_NOT_FOUND)

        if len(split_id) == 2:
            # Download the contents of the article and build the list of file urls to download.
            files, empty_containers, action_metadata = download_article(
                username, article_url, headers, project_name, [])
            file_urls = [file['file'] for file in files]

        elif len(split_id) == 3:
            update_process_info_message(process_info_path, action,
                                        'Downloading files from FigShare...')
            # Add the total number of files to the process info file.
            # This is necessary to keep track of the progress of the request.
            update_process_info(process_info_path, 1, action, 'download')

            # Single file download.
            data = response.json()
            for file in data['files']:
                if str(file['id']) == split_id[2]:
                    files = [{
                        "file": requests.get(file['download_url'], headers=headers).content,
                        "hashes": {"md5": file['computed_md5']},
                        "title": file['name'],
                        "path": "/{}".format(file['name']),
                        "source_path": "/{}/{}/{}".format(project_name, data['title'],
                                                          file['name']),
                        "extra_metadata": {"size": file['size']}
                    }]
                    # Increment the number of files done in the process info file.
                    increment_process_info(process_info_path, action, 'download')
            empty_containers = []
            action_metadata = {"sourceUsername": username}

    if not files:
        # We could not find the file.
        raise PresQTResponseException(
            "The resource could not be found by the requesting user.",
            status.HTTP_404_NOT_FOUND)

    if file_urls:
        update_process_info_message(process_info_path, action,
                                    'Downloading files from FigShare...')
        # Add the total number of files to the process info file.
        # This is necessary to keep track of the progress of the request.
        update_process_info(process_info_path, len(file_urls), action, 'download')

        # Start the async calls for project or article downloads
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        download_data = loop.run_until_complete(
            async_main(file_urls, headers, process_info_path, action))

        # Go through the file dictionaries and replace the file path with the binary_content
        for file in files:
            file['file'] = get_dictionary_from_list(download_data, 'url',
                                                    file['file'])['binary_content']

    return {
        'resources': files,
        'empty_containers': empty_containers,
        'action_metadata': action_metadata,
        'extra_metadata': extra_metadata
    }

def curate_nd_download_resource(token, resource_id, process_info_path, action):
    """
    Fetch the requested resource from CurateND along with its hash information.

    Parameters
    ----------
    token : str
        User's CurateND token.
    resource_id : str
        ID of the resource requested.
    process_info_path: str
        Path to the process info file that keeps track of the action's progress.
    action: str
        The action being performed.

    Returns
    -------
    Dictionary with the following keys: values
        'resources': List of dictionary objects that each hold a file and its information.
                     Dictionary must be in the following format:
                         {
                             'file': binary_file,
                             'hashes': {'hash_algorithm': 'the_hash'},
                             'title': 'file.jpg',
                             'path': '/path/to/file',
                             'source_path': '/full/path/to/file',
                             'extra_metadata': {'any': 'extra'}
                         }
        'empty_containers': List of string paths representing empty containers that must be
                            written.
                            Example: ['empty/folder/to/write/', 'another/empty/folder/']
        'action_metadata': Dictionary containing action metadata. Must be in the following format:
                               {
                                   'sourceUsername': '******',
                               }
    """
    try:
        curate_instance = CurateND(token)
    except PresQTInvalidTokenError:
        raise PresQTValidationError("Token is invalid. Response returned a 401 status code.",
                                    status.HTTP_401_UNAUTHORIZED)

    # Get the resource
    resource = get_curate_nd_resource(resource_id, curate_instance)
    action_metadata = {"sourceUsername": resource.extra['depositor']}
    extra_metadata = {}

    # Get all the files for the provided resources.
    files = []
    empty_containers = []
    if resource.kind_name == 'file':
        title_url = resource.extra['isPartOf']
        if type(title_url) is list:
            title_url = resource.extra['isPartOf'][0]

        # Get the title of the project to add to sourcePath
        project_title = requests.get(
            title_url, headers={'X-Api-Token': '{}'.format(token)}).json()['title']

        # This is so we aren't missing the few extra keys that are pulled out for the
        # PresQT payload
        resource.extra.update({"id": resource.id,
                               "date_submitted": resource.date_submitted})

        update_process_info_message(process_info_path, action,
                                    'Downloading files from CurateND...')
        # Add the total number of items to the process info file.
        # This is necessary to keep track of the progress of the request.
        update_process_info(process_info_path, 1, action, 'download')

        binary_file, curate_hash = resource.download()

        files.append({
            'file': binary_file,
            'hashes': {'md5': curate_hash},
            'title': resource.title,
            # If the file is the only resource we are downloading then we don't need its
            # full path.
            'path': '/{}'.format(resource.title),
            'source_path': '/{}/{}'.format(project_title, resource.title),
            'extra_metadata': resource.extra
        })
        # Increment the number of files done in the process info file.
        increment_process_info(process_info_path, action, 'download')
    else:
        if not resource.extra['containedFiles']:
            empty_containers.append('{}'.format(resource.title))
        else:
            update_process_info_message(process_info_path, action,
                                        'Downloading files from CurateND...')
            # Add the total number of items to the process info file.
            # This is necessary to keep track of the progress of the request.
            update_process_info(process_info_path, len(resource.extra['containedFiles']),
                                action, 'download')

            title_helper = {}
            hash_helper = {}
            file_urls = []
            project_title = resource.title
            file_metadata = []

            extra_metadata = extra_metadata_helper(resource)

            for file in resource.extra['containedFiles']:
                download_url = file['downloadUrl']
                contained_file = get_curate_nd_resource(file['id'], curate_instance)
                file_metadata_dict = {
                    "title": contained_file.title,
                    "extra": contained_file.extra
                }
                file_metadata.append(file_metadata_dict)

                title_helper[download_url] = contained_file.title
                hash_helper[download_url] = contained_file.md5

                title_helper[file['downloadUrl']] = file['label']
                file_urls.append(file['downloadUrl'])

            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)
            download_data = loop.run_until_complete(
                async_main(file_urls, token, process_info_path, action))

            for file in download_data:
                title = title_helper[file['url']]
                hash = hash_helper[file['url']]
                files.append({
                    'file': file['binary_content'],
                    'hashes': {'md5': hash},
                    'title': title,
                    "source_path": '/{}/{}'.format(project_title, title),
                    'path': '/{}/{}'.format(resource.title, title),
                    'extra_metadata': get_dictionary_from_list(file_metadata, 'title',
                                                               title)['extra']
                })

    return {
        'resources': files,
        'empty_containers': empty_containers,
        'action_metadata': action_metadata,
        'extra_metadata': extra_metadata
    }

def gitlab_download_resource(token, resource_id, process_info_path, action):
    """
    Fetch the requested resource from GitLab along with its hash information.

    Parameters
    ----------
    token : str
        User's GitLab token
    resource_id : str
        ID of the resource requested
    process_info_path: str
        Path to the process info file that keeps track of the action's progress
    action: str
        The action being performed

    Returns
    -------
    Dictionary with the following keys: values
        'resources': List of dictionary objects that each hold a file and its information.
                     Dictionary must be in the following format:
                     {
                        'file': binary_file,
                        'hashes': {'hash_algorithm': 'the_hash'},
                        'title': 'file.jpg',
                        'path': '/path/to/file',
                        'source_path': '/full/path/to/file',
                        'extra_metadata': {'any': 'extra'}
                     }
        'empty_containers': List of string paths representing empty containers that must be
                            written.
                            Example: ['empty/folder/to/write/', 'another/empty/folder/']
        'action_metadata': Dictionary containing action metadata. Must be in the following format:
                           {
                              'sourceUsername': '******',
                           }
    """
    try:
        header, user_id = validation_check(token)
    except PresQTResponseException:
        raise PresQTResponseException("Token is invalid. Response returned a 401 status code.",
                                      status.HTTP_401_UNAUTHORIZED)

    # Get the user's GitLab username for action metadata
    username = requests.get("https://gitlab.com/api/v4/user", headers=header).json()['username']

    partitioned_id = resource_id.partition(':')
    if ':' in resource_id:
        project_id = partitioned_id[0]
    else:
        project_id = resource_id

    project_url = 'https://gitlab.com/api/v4/projects/{}'.format(project_id)

    response = requests.get(project_url, headers=header)
    if response.status_code != 200:
        raise PresQTResponseException(
            'The resource with id, {}, does not exist for this user.'.format(resource_id),
            status.HTTP_404_NOT_FOUND)
    project_name = response.json()['name']

    extra_metadata = {}
    if ':' not in resource_id:
        # This is for a project
        all_files_url = "https://gitlab.com/api/v4/projects/{}/repository/tree?recursive=1".format(
            resource_id)
        data = gitlab_paginated_data(header, user_id, all_files_url)
        is_project = True
        # Get extra metadata
        extra_metadata = extra_metadata_helper(response.json(), header)
    elif ':' in resource_id and '%2E' not in resource_id:
        # This is for a directory
        all_files_url = "https://gitlab.com/api/v4/projects/{}/repository/tree?path={}&recursive=1".format(
            partitioned_id[0], partitioned_id[2].replace('+', ' '))
        data = gitlab_paginated_data(header, user_id, all_files_url)
        if not data:
            raise PresQTResponseException(
                'The resource with id, {}, does not exist for this user.'.format(resource_id),
                status.HTTP_404_NOT_FOUND)
        is_project = False
    else:
        update_process_info_message(process_info_path, action, 'Downloading files from GitLab...')
        # Add the total number of files to the process info file.
        # This is necessary to keep track of the progress of the request.
        update_process_info(process_info_path, 1, action, 'download')

        # This is a single file
        data = requests.get('https://gitlab.com/api/v4/projects/{}/repository/files/{}?ref=master'.format(
            project_id, partitioned_id[2].replace('+', ' ')), headers=header).json()
        if 'message' in data.keys():
            raise PresQTResponseException(
                'The resource with id, {}, does not exist for this user.'.format(resource_id),
                status.HTTP_404_NOT_FOUND)
        # Increment the number of files done in the process info file.
        increment_process_info(process_info_path, action, 'download')

        return {
            'resources': [{
                'file': base64.b64decode(data['content']),
                'hashes': {'sha256': data['content_sha256']},
                'title': data['file_name'],
                'path': '/{}'.format(data['file_name']),
                'source_path': data['file_path'],
                'extra_metadata': {}}],
            'empty_containers': [],
            'action_metadata': {'sourceUsername': username},
            'extra_metadata': extra_metadata
        }

    files, empty_containers, action_metadata = download_content(
        username, project_name, project_id, data, [], is_project)
    file_urls = [file['file'] for file in files]

    update_process_info_message(process_info_path, action, 'Downloading files from GitLab...')
    # Add the total number of files to the process info file.
    # This is necessary to keep track of the progress of the request.
    update_process_info(process_info_path, len(file_urls), action, 'download')

    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    download_data = loop.run_until_complete(
        async_main(file_urls, header, process_info_path, action))

    # Go through the file dictionaries and replace the file path with the binary_content
    # and replace the hashes with the correct file hashes
    for file in files:
        file['hashes'] = get_dictionary_from_list(download_data, 'url', file['file'])['hashes']
        file['file'] = get_dictionary_from_list(download_data, 'url', file['file'])['binary_content']

    return {
        'resources': files,
        'empty_containers': empty_containers,
        'action_metadata': action_metadata,
        'extra_metadata': extra_metadata
    }
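
# Hedged usage sketch only (not part of the module's call path): every value below is a
# placeholder. A bare project id downloads the whole project, while an id of the form
# '<project_id>:<url-encoded/path>' targets a directory or single file, matching the
# partition-on-':' logic above. Defined but never invoked.
def _example_gitlab_download():  # pragma: no cover
    download_result = gitlab_download_resource(
        token='gitlab-token-placeholder',
        resource_id='12345678',
        process_info_path='/tmp/presqt/ticket/process_info.json',
        action='resource_download')
    # Hashes are sha256 values reported by the GitLab repository files API.
    return [(item['title'], item['hashes']) for item in download_result['resources']]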
def gitlab_upload_resource(token, resource_id, resource_main_dir, hash_algorithm,
                           file_duplicate_action, process_info_path, action):
    """
    Upload the files found in the resource_main_dir to the target.

    Parameters
    ----------
    token : str
        User's token.
    resource_id : str
        ID of the resource requested.
    resource_main_dir : str
        Path to the main directory for the resources to be uploaded.
    hash_algorithm : str
        Hash algorithm we are using to check for fixity.
    file_duplicate_action : str
        The action to take when a duplicate file is found
    process_info_path: str
        Path to the process info file that keeps track of the action's progress
    action: str
        The action being performed

    Returns
    -------
    Dictionary with the following keys: values
        'resources_ignored' : Array of string file paths of files that were ignored when
                              uploading the resource. Path should have the same base as
                              resource_main_dir.
                              Example: ['path/to/ignored/file.jpg', 'another/ignored/file.jpg']
        'resources_updated' : Array of string file paths of files that were updated when
                              uploading the resource. Path should have the same base as
                              resource_main_dir.
                              Example: ['path/to/updated/file.jpg']
        'action_metadata': Dictionary containing action metadata. Must be in the following format:
                           {
                              'destinationUsername': '******'
                           }
        'file_metadata_list': List of dictionaries for each file that contains metadata and
                              hash info. Must be in the following format:
                              {
                                 "actionRootPath": '/path/on/disk',
                                 "destinationPath": '/path/on/target/destination',
                                 "title": 'file_title',
                                 "destinationHash": {'hash_algorithm': 'the_hash'}
                              }
        'project_id': ID of the parent project for this upload. Needed for metadata upload.
        'project_link': The link to either the resource or the home page of the user if not
                        available through API
    """
    base_url = "https://gitlab.com/api/v4/"

    try:
        headers, user_id = validation_check(token)
    except PresQTResponseException:
        raise PresQTResponseException("Token is invalid. Response returned a 401 status code.",
                                      status.HTTP_401_UNAUTHORIZED)

    username = requests.get("https://gitlab.com/api/v4/user", headers=headers).json()['username']
    action_metadata = {"destinationUsername": username}

    os_path = next(os.walk(resource_main_dir))

    # Get total amount of files
    total_files = upload_total_files(resource_main_dir)
    update_process_info(process_info_path, total_files, action, 'upload')
    update_process_info_message(process_info_path, action, "Uploading files to GitLab...")

    resources_ignored = []
    resources_updated = []
    file_metadata_list = []

    #*** CREATE NEW PROJECT ***#
    # Create a new project with the name being the top level directory's name.
    # Check if a project with this name exists for this user
    if not resource_id:
        project_title = os_path[1][0]
        titles = [data['name'] for data in gitlab_paginated_data(headers, user_id)]
        title = get_duplicate_title(project_title, titles, '-PresQT*-').replace(
            '(', '-').replace(')', '-')
        response = requests.post('{}projects?name={}&visibility=public'.format(
            base_url, title), headers=headers)

        if response.status_code == 201:
            project_id = response.json()['id']
            project_name = response.json()['name']
            web_url = response.json()['web_url']
        else:
            raise PresQTResponseException(
                "Response has status code {} while creating project {}.".format(
                    response.status_code, project_title),
                status.HTTP_400_BAD_REQUEST)

        #*** UPLOAD FILES ***#
        # Upload files to project's repository
        base_repo_path = "{}projects/{}/repository/files/".format(base_url, project_id)
        for path, subdirs, files in os.walk(resource_main_dir):
            if not subdirs and not files:
                resources_ignored.append(path)
            for name in files:
                # Strip server directories from file path
                relative_file_path = os.path.join(path.partition('/data/{}/'.format(
                    project_title))[2], name)

                # Extract and encode the file bytes in the way expected by GitLab.
                with open(os.path.join(path, name), 'rb') as upload_file:
                    file_bytes = upload_file.read()
                encoded_file = base64.b64encode(file_bytes)

                # A relative path to the file is what is added to the GitLab POST address
                encoded_file_path = relative_file_path.replace('/', '%2F').replace('.', '%2E')

                request_data = {"branch": "master",
                                "commit_message": "PresQT Upload",
                                "encoding": "base64",
                                "content": encoded_file}
                requests.post("{}{}".format(base_repo_path, encoded_file_path),
                              headers=headers, data=request_data)

                # Get the file hash
                file_json = requests.get("{}{}?ref=master".format(base_repo_path, encoded_file_path),
                                         headers=headers)
                # Increment files finished
                increment_process_info(process_info_path, action, 'upload')

                file_metadata_list.append({
                    "actionRootPath": os.path.join(path, name),
                    # This ensures that the title is up to date if there are duplicates
                    "destinationPath": os.path.join(project_name, path.partition(
                        '/data/')[2].partition('/')[2], name),
                    "title": name,
                    "destinationHash": file_json.json()['content_sha256']})
    else:
        if ':' not in resource_id:
            project_id = resource_id
            base_repo_url = "{}projects/{}/repository/files/".format(base_url, project_id)
            string_path_to_resource = ''
        else:
            partitioned_id = resource_id.partition(':')
            project_id = partitioned_id[0]
            base_repo_url = "{}projects/{}/repository/files/{}".format(
                base_url, project_id, partitioned_id[2])
            string_path_to_resource = partitioned_id[2].replace('%2F', '/').replace('%2E', '.')

        # Check if the resource_id belongs to a file
        tree_url = 'https://gitlab.com/api/v4/projects/{}/repository/tree?recursive=1'.format(
            project_id)
        file_data = gitlab_paginated_data(headers, None, tree_url)
        for data in file_data:
            if data['path'] == string_path_to_resource:
                if data['type'] == 'blob':
                    raise PresQTResponseException("Resource with id, {}, belongs to a file.".format(
                        resource_id), status.HTTP_400_BAD_REQUEST)

        # Get project data
        project = requests.get('{}projects/{}'.format(base_url, project_id), headers=headers)
        if project.status_code != 200:
            raise PresQTResponseException("Project with id, {}, could not be found.".format(
                project_id), status.HTTP_404_NOT_FOUND)
        project_name = project.json()['name']
        web_url = project.json()['web_url']

        for path, subdirs, files in os.walk(resource_main_dir):
            if not subdirs and not files:
                resources_ignored.append(path)
            for name in files:
                # Strip server directories from file path
                relative_file_path = os.path.join(path.partition('/data/')[2], name)

                # A relative path to the file is what is added to the GitLab POST address
                if base_repo_url == "{}projects/{}/repository/files/".format(base_url, project_id):
                    encoded_file_path = relative_file_path.replace('/', '%2F').replace('.', '%2E')
                else:
                    encoded_file_path = '%2F{}'.format(
                        relative_file_path.replace('/', '%2F').replace('.', '%2E'))

                full_encoded_url = '{}{}'.format(base_repo_url, encoded_file_path)

                ignore_file = False
                upload_request = requests.post
                file_bytes = None
                # Check if this file exists already
                for file in file_data:
                    if os.path.join(string_path_to_resource, relative_file_path) == file['path']:
                        if file_duplicate_action == 'ignore':
                            resources_ignored.append(os.path.join(path, name))
                            ignore_file = True
                            break
                        else:
                            file_url = '{}?ref=master'.format(full_encoded_url)
                            file_response = requests.get(file_url, headers=headers)
                            with open(os.path.join(path, name), 'rb') as duplicate_file:
                                file_bytes = duplicate_file.read()
                            if hash_generator(file_bytes, 'sha256') == file_response.json()['content_sha256']:
                                resources_ignored.append(os.path.join(path, name))
                                ignore_file = True
                            else:
                                resources_updated.append(os.path.join(path, name))
                                upload_request = requests.put
                            # Break out of this for loop and attempt to upload this duplicate
                            break

                # If we find a file to ignore then move onto the next file in the os.walk
                if ignore_file:
                    continue

                # Extract and encode the file bytes in the way expected by GitLab.
                if not file_bytes:
                    with open(os.path.join(path, name), 'rb') as new_file:
                        file_bytes = new_file.read()
                encoded_file = base64.b64encode(file_bytes)

                request_data = {"branch": "master",
                                "commit_message": "PresQT Upload",
                                "encoding": "base64",
                                "content": encoded_file}
                response = upload_request("{}".format(full_encoded_url),
                                          headers=headers, data=request_data)

                if response.status_code not in [201, 200]:
                    raise PresQTResponseException(
                        'Upload failed with a status code of {}'.format(response.status_code),
                        status.HTTP_400_BAD_REQUEST)

                # Get the file hash
                file_json = requests.get("{}?ref=master".format(full_encoded_url),
                                         headers=headers).json()
                # Increment files finished
                increment_process_info(process_info_path, action, 'upload')

                file_metadata_list.append({
                    "actionRootPath": os.path.join(path, name),
                    "destinationPath": os.path.join(project_name, path.partition('/data/')[2], name),
                    "title": name,
                    "destinationHash": file_json['content_sha256']})

    return {
        'resources_ignored': resources_ignored,
        'resources_updated': resources_updated,
        'action_metadata': action_metadata,
        'file_metadata_list': file_metadata_list,
        'project_id': project_id,
        'project_link': web_url
    }
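
# Hedged usage sketch only (not called anywhere in PresQT): every value below is a
# placeholder chosen for illustration. A real caller passes a live GitLab token and a
# server-created bag directory whose layout includes the '/data/<project>/' prefix this
# function strips from file paths. Defined but never invoked.
def _example_gitlab_upload():  # pragma: no cover
    upload_result = gitlab_upload_resource(
        token='gitlab-token-placeholder',
        resource_id=None,  # A falsy id takes the "create a new project from the bag" branch
        resource_main_dir='/tmp/presqt/ticket/resource_upload',
        hash_algorithm='sha256',
        file_duplicate_action='ignore',
        process_info_path='/tmp/presqt/ticket/process_info.json',
        action='resource_upload')
    return upload_result['project_link'], upload_result['resources_ignored']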