def search_validator(search_parameter):
    """
    Ensure the query parameter passed into the API view is valid.

    Parameters
    ----------
    search_parameter : dict
        The query parameter passed to the view.

    Raises
    ------
    PresQTResponseException
        If the query has more than one key, uses an unsupported key, or has a
        'title' value containing special characters.
    """
    # Only a single search key is allowed per request.
    if len(search_parameter) > 1:
        raise PresQTResponseException('PresQT Error: The search query is not formatted correctly.',
                                      status.HTTP_400_BAD_REQUEST)

    list_of_search_params = ['id', 'title']
    # Check that the query parameter is in the list of accepted searches.
    if list(search_parameter.keys())[0] not in list_of_search_params:
        raise PresQTResponseException('PresQT Error: The search query is not formatted correctly.',
                                      status.HTTP_400_BAD_REQUEST)

    # Ensure that there are no special characters in the title search.
    # Raw string so the backslash reaches the regex engine unchanged.
    regex = re.compile(r'[@_!#$%^&*()<>?/\|}{~:]')
    if 'title' in search_parameter:
        if regex.search(search_parameter['title']) is not None:
            raise PresQTResponseException('PresQT Error: The search query is not formatted correctly.',
                                          status.HTTP_400_BAD_REQUEST)
def create_file(self, file_name, file_to_write, file_duplicate_action): """ Upload a file to a container. Parameters ---------- file_name : str Name of the file to create. file_to_write : bytes File to create. file_duplicate_action : str Flag for how to handle the case of the file already existing. Returns ------- Class instance of the created file. """ # When uploading a large file (>a few MB) that already exists # we sometimes get a ConnectionError instead of a status == 409. connection_error = False try: response = self.put(self._new_file_url, params={'name': file_name}, data=file_to_write) except ConnectionError: connection_error = True # If the file is a duplicate then either ignore or update it if connection_error or response.status_code == 409: original_file = self.get_file_by_name(file_name) if file_duplicate_action == 'ignore': return 'ignored', original_file elif file_duplicate_action == 'update': # Only attempt to update the file if the new file is different than the original if hash_generator(file_to_write, 'md5') != original_file.hashes['md5']: response = self.get_file_by_name(file_name).update( file_to_write) if response.status_code == 200: return 'updated', self.get_file_by_name(file_name) else: raise PresQTResponseException( "Response has status code {} while updating file {}" .format(response.status_code, file_name), status.HTTP_400_BAD_REQUEST) else: return 'ignored', original_file # File uploaded successfully elif response.status_code == 201: return 'created', self.get_file_by_name(file_name) else: raise PresQTResponseException( "Response has status code {} while creating file {}".format( response.status_code, file_name), status.HTTP_400_BAD_REQUEST)
def structure_validation(instance):
    """
    Validate the structure of the files or project being uploaded.

    Parameters
    ----------
    instance: BaseResource class instance
        Class we want to add the attributes to
    """
    # Inspect only the top level of the data directory.
    os_path, folders, files = next(os.walk(instance.data_directory))

    # At most one directory may exist at the top level.
    if len(folders) > 1:
        raise PresQTResponseException(
            "PresQT Error: Repository is not formatted correctly. Multiple directories exist at the top level.",
            status.HTTP_400_BAD_REQUEST)

    # Loose top-level files are only a problem when creating a brand new resource.
    loose_top_level_files = len(files) > 0 and instance.destination_resource_id is None

    if loose_top_level_files and instance.action == 'resource_upload':
        raise PresQTResponseException(
            "PresQT Error: Repository is not formatted correctly. Files exist at the top level.",
            status.HTTP_400_BAD_REQUEST)

    if loose_top_level_files and instance.action == 'resource_transfer_in':
        raise PresQTResponseException(
            "PresQT Error: You need to select a resource to transfer into, as a single file can not be uploaded as a new project.",
            status.HTTP_400_BAD_REQUEST)
def figshare_upload_keywords(token, resource_id, keywords):
    """
    Upload the keywords to a given resource id.

    Parameters
    ----------
    token: str
        User's FigShare token
    resource_id: str
        ID of the resource requested
    keywords: list
        List of new keywords to upload.

    Returns
    -------
    A dictionary object that represents the updated FigShare resource keywords.
    Dictionary must be in the following format:
        {
            "updated_keywords": ['eggs', 'EGG', 'Breakfast']
        }
    """
    # Ids are formatted '<project>[:<article>[:<file>]]'; only articles
    # (two segments) carry keywords.
    split_id = resource_id.split(":")
    if len(split_id) == 3:
        # Fixed message typo: "do no have" -> "do not have".
        raise PresQTResponseException(
            "FigShare projects/files do not have keywords.",
            status.HTTP_400_BAD_REQUEST)
    elif len(split_id) == 1:
        return {'updated_keywords': keywords, 'project_id': resource_id}

    from presqt.targets.figshare.functions.fetch import figshare_fetch_resource
    # This will raise an error if the id is invalid
    figshare_fetch_resource(token, resource_id)

    headers = {"Authorization": "token {}".format(token)}
    put_url = "https://api.figshare.com/v2/account/articles/{}".format(split_id[1])

    data = {"tags": keywords}
    response = requests.put(put_url, headers=headers, data=json.dumps(data))

    if response.status_code != 205:
        raise PresQTResponseException(
            "FigShare returned a {} error trying to update keywords.".format(
                response.status_code), status.HTTP_400_BAD_REQUEST)

    return {'updated_keywords': keywords, 'project_id': resource_id}
def upload_parts(headers, upload_url, parts, file):
    """
    Upload each part of a file to FigShare.

    File offsets are determined by the initial FigShare POST upload.

    Parameters
    ----------
    headers: dict
        The user's FigShare Auth headers
    upload_url: str
        The url to upload the file
    parts: list
        List of parts to be uploaded
    file: bytes
        The file itself
    """
    # NOTE(review): this mutates the caller's headers dict in place — confirm
    # callers do not reuse it for non-binary requests afterwards.
    headers["Content-Type"] = "application/binary"

    for part in parts:
        start = part['startOffset']
        end = part['endOffset']
        # Read exactly the byte range FigShare assigned to this part.
        file.seek(start)
        chunk = file.read(end - start + 1)

        part_response = requests.put(
            "{}/{}".format(upload_url, part['partNo']),
            headers=headers,
            data=chunk)
        if part_response.status_code != 200:
            raise PresQTResponseException(
                "FigShare returned an error trying to upload. Some items may still have been created on FigShare.",
                status.HTTP_400_BAD_REQUEST)
def validation_check(token):
    """
    Ensure a proper FigShare API token has been provided.

    Parameters
    ----------
    token : str
        User's FigShare token

    Returns
    -------
    The properly formatted FigShare Auth header and the requesting user's
    username (email).
    """
    headers = {"Authorization": "token {}".format(token)}
    # Use HTTPS so the auth token is never transmitted in plaintext
    # (the previous URL used plain 'http://').
    response = requests.get("https://api.figshare.com/v2/account", headers=headers)
    if response.status_code == 403:
        raise PresQTResponseException(
            "Token is invalid. Response returned a 403 status code.",
            status.HTTP_401_UNAUTHORIZED)
    username = response.json()['email']
    return headers, username
def keyword_enhancer(keywords):
    """
    Send a list of keywords to SciGraph to be enhanced.

    Parameters
    ----------
    keywords: list
        The list of keywords to be enhanced.

    Returns
    -------
    Tuple of (newly suggested keywords, all keywords), each de-duplicated.
    """
    if not keywords:
        raise PresQTResponseException(
            'There are no keywords to enhance for this resource.',
            status.HTTP_400_BAD_REQUEST)

    lowered_keywords = [keyword.lower() for keyword in keywords]
    suggested_keywords = []
    all_keywords = []

    for keyword in lowered_keywords:
        all_keywords.append(keyword)
        # Ask SciGraph for 'term' suggestions related to this keyword.
        response = requests.get(
            'http://ec-scigraph.sdsc.edu:9000/scigraph/vocabulary/term/{}?limit=20'
            .format(keyword))
        if response.status_code != 200:
            # No suggestions for this keyword; keep going with the rest.
            continue
        for label in response.json()[0]['labels']:
            lowered_label = label.lower()
            # Only labels that are genuinely new count as suggestions.
            if lowered_label not in lowered_keywords:
                suggested_keywords.append(lowered_label)
                all_keywords.append(lowered_label)

    return list(set(suggested_keywords)), list(set(all_keywords))
def zenodo_download_helper(is_record, base_url, auth_parameter, files, file_url=None):
    """
    Helper used by Zenodo's download function.

    Parameters
    ----------
    is_record : boolean
        Flag for if the download is a public record
    base_url : str
        The url of the Zenodo project.
    auth_parameter : str
        The Authentication parameter expected by Zenodo.
    files : list
        The list of files to append to.
    file_url : str
        If the download is a single file, we also pass the link to the file.

    Returns
    -------
    The list of file dictionaries and action_metadata.
    """
    project_response = requests.get(base_url, auth_parameter)
    if project_response.status_code != 200:
        raise PresQTResponseException(
            'The response returned a 404 not found status code.',
            status.HTTP_404_NOT_FOUND)

    project_json = project_response.json()

    if is_record is True:
        # Record endpoints are inconsistent, so there are a few checks that need to happen.
        try:
            username = project_json['owners'][0]
        except KeyError:
            username = None
        try:
            project_name = project_json['metadata']['title']
        except KeyError:
            project_name = None
    else:
        # The deposition endpoints are consistent.
        username = project_json['owner']
        project_name = project_json['title']

    action_metadata = {"sourceUsername": username}

    if file_url:
        # Single-file download: fetch the file's metadata and delegate.
        file_json = requests.get(file_url, params=auth_parameter).json()
        files = zenodo_file_download_helper(
            auth_parameter, is_record, project_name, file_json, files)
    else:
        # Whole-project download.
        files = zenodo_project_download_helper(
            is_record, project_name, project_json, files)

    return files, action_metadata
def get(self, url, *args, **kwargs):
    """
    Handle any errors that may pop up while making GET requests through the session.

    Parameters
    ----------
    url: str
        URL to make the GET request to.

    Returns
    -------
    HTTP Response object

    Raises
    ------
    PresQTResponseException
        If the resource is no longer available (410).
    OSFNotFoundError
        If the resource does not exist (404).
    OSFForbiddenError
        If the token does not grant access to the resource (403).
    """
    response = self.session.get(url, *args, **kwargs)
    if response.status_code == 200:
        return response
    elif response.status_code == 410:
        raise PresQTResponseException(
            "The requested resource is no longer available.",
            status.HTTP_410_GONE)
    elif response.status_code == 404:
        raise OSFNotFoundError("Resource not found.",
                               status.HTTP_404_NOT_FOUND)
    elif response.status_code == 403:
        raise OSFForbiddenError(
            "User does not have access to this resource with the token provided.",
            status.HTTP_403_FORBIDDEN)
    # NOTE(review): any other status code falls through and implicitly returns
    # None — confirm callers handle that, or add a catch-all raise.
def create_repository(title, token):
    """
    Create a GitHub repository.

    Parameters
    ----------
    title : str
        The title of the repo being created
    token : str
        The users GitHub API token.

    Returns
    -------
    The title the repository was ultimately created with.
    """
    header = {"Authorization": "token {}".format(token)}
    repository_payload = {"name": title}

    # (Removed a dead `.format(token)` call on this URL — it has no placeholder.)
    response = requests.post('https://api.github.com/user/repos',
                             headers=header,
                             data=json.dumps(repository_payload))

    if response.status_code == 201:
        return title
    elif response.status_code == 422:
        # 422 means the repo name is taken: find a unique variant and retry.
        # This is a little gross, but there isn't a better way to do it that I'm aware of.
        from presqt.targets.github.utilities import github_paginated_data
        titles = [data['name'] for data in github_paginated_data(token)]
        title = get_duplicate_title(title, titles, '-PresQT*-')
        return create_repository(title, token)
    else:
        raise PresQTResponseException(
            "Response has status code {} while creating repository {}".format(
                response.status_code, title), status.HTTP_400_BAD_REQUEST)
def create_folder(self, folder_name):
    """
    Create a new sub-folder for this container.

    Parameters
    ----------
    folder_name : str
        Name of the folder to create.

    Returns
    -------
    Class instance of the created folder.
    """
    response = self.put(self._new_folder_url, params={'name': folder_name})

    # 201: newly created; 409: a folder with this name already exists.
    # Either way, fetch and return the folder instance.
    if response.status_code in (409, 201):
        return self.get_folder_by_name(folder_name)

    raise PresQTResponseException(
        "Response has status code {} while creating folder {}".format(
            response.status_code, folder_name),
        status.HTTP_400_BAD_REQUEST)
def resource(self, resource_id):
    """
    Get an item or file with the given resource_id.

    Parameters
    ----------
    resource_id : str
        id of the resource we want to fetch.

    Returns
    -------
    Instance of the desired resource.
    """
    url = self.session.build_url(resource_id)
    response_data = self.get(url)
    response_json = response_data.json()

    # If the id given can't be found or is of type person, we want to raise an exception.
    # Errors are only present in the payload if an error occurred.
    if 'error' in response_json:
        raise PresQTResponseException(
            'The resource, {}, could not be found on CurateND.'.format(
                resource_id), status.HTTP_404_NOT_FOUND)

    # Items carry a 'containedFiles' key; payloads without one are files.
    # Reuse the already-parsed payload instead of re-parsing the body
    # (the original called response_data.json() up to three times).
    if 'containedFiles' in response_json:
        return Item(response_json, self.session)
    return File(response_json, self.session)
def create_article(article_title, headers, project_id):
    """
    Create a FigShare article inside a project.

    Parameters
    ----------
    article_title : str
        The title of the article being created
    headers : dict
        The users FigShare Auth header
    project_id : str
        Id of the project to create the article under

    Returns
    -------
    The id of the newly created article.
    """
    response = requests.post(
        "https://api.figshare.com/v2/account/projects/{}/articles".format(project_id),
        headers=headers,
        data=json.dumps({"title": article_title}))

    if response.status_code != 201:
        raise PresQTResponseException(
            "Response has status code {} while creating article {}".format(response.status_code,
                                                                           article_title),
            status.HTTP_400_BAD_REQUEST)

    # FigShare returns the new article's url under 'location'; fetch it to get the id.
    article_json = requests.get(response.json()['location'], headers=headers).json()
    return article_json['id']
def create_project(self, title):
    """
    Create a project for this user.

    Parameters
    ----------
    title : str
        Requested project title; de-duplicated against the user's existing projects.

    Returns
    -------
    Instance of the newly created project.
    """
    # De-duplicate the title against this user's existing projects.
    existing_titles = [project.title for project in self.projects()[1]]
    unique_title = get_duplicate_title(title, existing_titles, ' (PresQT*)')

    payload = {
        "data": {
            "type": "nodes",
            "attributes": {
                "title": unique_title,
                "category": "project"
            }
        }
    }
    response = self.post(self.session.build_url('nodes'),
                         data=json.dumps(payload),
                         headers={'content-type': 'application/json'})

    if response.status_code != 201:
        raise PresQTResponseException(
            "Response has status code {} while creating project {}".format(
                response.status_code, unique_title),
            status.HTTP_400_BAD_REQUEST)

    return self.project(response.json()['data']['id'])
def gitlab_fetch_keywords(token, resource_id):
    """
    Fetch the keywords of a given resource id.

    Parameters
    ----------
    token: str
        User's GitLab token
    resource_id: str
        ID of the resource requested

    Returns
    -------
    A dictionary object that represents the GitLab resource keywords.
    Dictionary must be in the following format:
        {
            "topics": ["eggs", "ham", "bacon"],
            "keywords": ["eggs", "ham", "bacon"]
        }
    """
    headers, user_id = validation_check(token)

    from presqt.targets.gitlab.functions.fetch import gitlab_fetch_resource
    resource = gitlab_fetch_resource(token, resource_id)

    # Keywords only exist at the project level on GitLab.
    if resource['kind_name'] in ['dir', 'file']:
        raise PresQTResponseException(
            "On GitLab only projects have keywords, not files or directories, therefore PresQT keyword features are not supported at GitLab's file or directory levels.",
            status.HTTP_400_BAD_REQUEST)

    # Look inside the project for PresQT metadata; enhanced keywords may be stored there.
    metadata_url = "https://gitlab.com/api/v4/projects/{}/repository/files/PRESQT_FTS_METADATA.json?ref=master".format(
        resource_id)
    metadata_file_response = requests.get(metadata_url, headers=headers)

    metadata = None
    if metadata_file_response.status_code == 200:
        # File contents come back base64 encoded.
        metadata = json.loads(base64.b64decode(metadata_file_response.json()['content']))

    if metadata:
        try:
            keywords = list(set(resource['extra']['tag_list'] + metadata['allEnhancedKeywords']))
        except KeyError:
            keywords = list(set(resource['extra']['tag_list']))
    else:
        keywords = list(set(resource['extra']['tag_list']))

    return {'tag_list': keywords, 'keywords': keywords}
def curate_nd_fetch_keywords(token, resource_id):
    """
    Fetch the keywords of a given resource id.

    Parameters
    ----------
    token: str
        User's CurateND token
    resource_id: str
        ID of the resource requested

    Returns
    -------
    A dictionary object that represents the CurateND resource keywords.
    Dictionary must be in the following format:
        {
            "subject": ["eggs", "ham", "bacon"],
            "keywords": ["eggs", "ham", "bacon"]
        }
    """
    try:
        curate_instance = CurateND(token)
    except PresQTInvalidTokenError:
        raise PresQTResponseException(
            "Token is invalid. Response returned a 401 status code.",
            status.HTTP_401_UNAUTHORIZED)

    resource = get_curate_nd_resource(resource_id, curate_instance)

    # Only resources exposing a 'subject' field support keywords.
    if 'subject' not in resource.extra.keys():
        raise PresQTResponseException(
            "The given resource id does not support keywords.",
            status.HTTP_400_BAD_REQUEST)

    subjects = resource.extra['subject']
    return {'subject': subjects, 'keywords': subjects}
def curate_nd_fetch_resource(token, resource_id):
    """
    Fetch the CurateND resource matching the resource_id given.

    Parameters
    ----------
    token : str
        User's CurateND token
    resource_id : str
        ID of the resource requested

    Returns
    -------
    A dictionary object that represents the CurateND resource.
    Dictionary must be in the following format:
        {
            "kind": "item",
            "kind_name": "file",
            "id": "12345",
            "title": "23296359282_934200ec59_o.jpg",
            "date_created": "2019-05-13T14:54:17.129170Z",
            "date_modified": "2019-05-13T14:54:17.129170Z",
            "hashes": {"md5": "aaca7ef067dcab7cb8d79c36243823e4"},
            "extra": {"any": extra, "values": here}
        }
    """
    try:
        curate_instance = CurateND(token)
    except PresQTInvalidTokenError:
        raise PresQTResponseException(
            "Token is invalid. Response returned a 401 status code.",
            status.HTTP_401_UNAUTHORIZED,
        )

    resource = get_curate_nd_resource(resource_id, curate_instance)

    # Flatten the resource object into the dictionary shape PresQT expects.
    return {
        "kind": resource.kind,
        "kind_name": resource.kind_name,
        "id": resource.id,
        "title": resource.title,
        "date_created": resource.date_submitted,
        "date_modified": resource.modified,
        "hashes": {
            "md5": resource.md5
        },
        "extra": resource.extra
    }
def gitlab_upload_keywords(token, resource_id, keywords):
    """
    Upload the keywords to a given resource id.

    Parameters
    ----------
    token: str
        User's GitLab token
    resource_id: str
        ID of the resource requested
    keywords: list
        List of new keywords to upload.

    Returns
    -------
    A dictionary object that represents the updated GitLab resource keywords.
    Dictionary must be in the following format:
        {
            "updated_keywords": ['eggs', 'EGG', 'Breakfast']
        }
    """
    from presqt.targets.gitlab.functions.fetch import gitlab_fetch_resource
    # This will raise an error if not a project.
    resource = gitlab_fetch_resource(token, resource_id)

    project_id = resource_id
    if resource['kind_name'] in ['file', 'dir']:
        # File/dir ids are formatted '<project_id>:<path>'.
        project_id = resource['id'].partition(':')[0]

    headers = {"Private-Token": "{}".format(token)}
    put_url = 'https://gitlab.com/api/v4/projects/{}'.format(project_id)

    new_keywords = [keyword.lower() for keyword in keywords]
    new_keywords_string = ','.join(list(set(new_keywords)))

    # Pass tag_list through `params` so requests URL-encodes it; the previous
    # hand-built query string broke on keywords containing spaces or other
    # special characters.
    response = requests.put(put_url,
                            params={'tag_list': new_keywords_string},
                            headers=headers)

    if response.status_code != 200:
        raise PresQTResponseException(
            "GitLab returned a {} error trying to update keywords.".format(
                response.status_code), status.HTTP_400_BAD_REQUEST)

    return {
        'updated_keywords': response.json()['tag_list'],
        'project_id': project_id
    }
def zenodo_upload_helper(auth_parameter, project_title=None):
    """
    Initialize a new project on Zenodo.

    Parameters
    ----------
    auth_parameter : str
        The Authentication parameter expected by Zenodo.
    project_title : str
        Title to set in the new deposition's metadata.

    Returns
    -------
    The new Project ID.
    """
    headers = {"Content-Type": "application/json"}
    creation_response = requests.post('https://zenodo.org/api/deposit/depositions',
                                      params=auth_parameter,
                                      json={},
                                      headers=headers)

    if creation_response.status_code != 201:
        raise PresQTResponseException(
            "Zenodo returned a {} status code while trying to create the project."
            .format(creation_response.status_code), status.HTTP_400_BAD_REQUEST)

    project_json = creation_response.json()
    project_id = project_json['id']
    project_owner = project_json['owner']

    # Attach minimal required metadata to the freshly created deposition.
    metadata_payload = {
        'metadata': {
            'title': project_title,
            'upload_type': 'other',
            'description': 'PresQT Upload',
            'creators': [{
                'name': str(project_owner)
            }]
        }
    }
    # NOTE(review): the response of this metadata update is not checked —
    # confirm that silently ignoring a failure here is intended.
    requests.put(
        'https://zenodo.org/api/deposit/depositions/{}'.format(project_id),
        params=auth_parameter,
        data=json.dumps(metadata_payload),
        headers=headers)

    return project_id
def get_all_paginated_data(url, token):
    """
    Get all data for the requesting user.

    Parameters
    ----------
    url : str
        URL to the current data to get
    token: str
        User's OSF token

    Returns
    -------
    Data dictionary of the data points gathered up until now.
    """
    headers = {'Authorization': 'Bearer {}'.format(token)}
    # Get initial data
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        response_json = response.json()
    elif response.status_code == 410:
        raise PresQTResponseException("The requested resource is no longer available.",
                                      status.HTTP_410_GONE)
    elif response.status_code == 404:
        raise OSFNotFoundError("Resource not found.", status.HTTP_404_NOT_FOUND)
    elif response.status_code == 403:
        raise OSFForbiddenError(
            "User does not have access to this resource with the token provided.",
            status.HTTP_403_FORBIDDEN)

    data = response_json['data']
    meta = response_json['links']['meta']

    # Calculate pagination pages
    if '?filter' in url or '?page' in url:
        # We already have all the data we need for this request
        return data

    page_total = get_page_total(meta['total'], meta['per_page'])
    url_list = ['{}?page={}'.format(url, number) for number in range(2, page_total + 1)]

    # Call all pagination pages asynchronously
    from presqt.targets.osf.utilities.utils.async_functions import run_urls_async
    children_data = run_urls_async(url_list, headers)
    # Plain loop instead of a side-effect list comprehension, which built a
    # throwaway list of Nones.
    for child in children_data:
        data.extend(child['data'])
    return data
def osf_fetch_resources(token, search_parameter):
    """
    Fetch all OSF resources for the user connected to the given token.

    Parameters
    ----------
    token : str
        User's OSF token
    search_parameter : dict
        The search parameter passed to the API View
        Gets passed formatted as {'title': 'search_info'}

    Returns
    -------
    List of dictionary objects that represent OSF resources.
    Dictionary must be in the following format:
        {
            "kind": "container",
            "kind_name": "folder",
            "id": "12345",
            "container": "None",
            "title": "Folder Name",
        }
    """
    try:
        osf_instance = OSF(token)
    except PresQTInvalidTokenError:
        raise PresQTResponseException(
            "Token is invalid. Response returned a 401 status code.",
            status.HTTP_401_UNAUTHORIZED)

    # Default to None so `url` is always bound, even when no search is given
    # (the original left it unassigned in that case).
    url = None
    if search_parameter:
        if 'title' in search_parameter:
            # Format the search that is coming in to be passed to the OSF API
            search_parameters = search_parameter['title'].replace(' ', '+')
            url = 'https://api.osf.io/v2/nodes/?filter[title]={}'.format(
                search_parameters)
        elif 'id' in search_parameter:
            url = 'https://api.osf.io/v2/nodes/?filter[id]={}'.format(
                search_parameter['id'])

    # PresQTValidationError from get_resources propagates to the caller
    # (the previous `except ... as e: raise e` was a no-op).
    return osf_instance.get_resources(url)
def curate_nd_fetch_resources(token, search_parameter):
    """
    Fetch all CurateND resources for the user connected to the given token.

    Parameters
    ----------
    token : str
        User's CurateND token
    search_parameter : dict
        The search parameter passed to the API View
        Gets passed formatted as {'title': 'search_info'}

    Returns
    -------
    List of dictionary objects that represent CurateND resources.
    Dictionary must be in the following format:
        {
            "kind": "container",
            "kind_name": "folder",
            "id": "12345",
            "container": "None",
            "title": "Folder Name",
        }
    """
    try:
        curate_instance = CurateND(token)
    except PresQTInvalidTokenError:
        raise PresQTResponseException(
            "Token is invalid. Response returned a 401 status code.",
            status.HTTP_401_UNAUTHORIZED)

    if search_parameter:
        if 'title' in search_parameter:
            # Format the search that is coming in to be passed to the Curate API
            search_parameters = search_parameter['title'].replace(' ', '+')
            search_url = 'https://curate.nd.edu/api/items?q={}'.format(
                search_parameters)
            # PresQTValidationError from get_resources propagates to the caller
            # (the previous `except ... as e: raise e` was a no-op).
            resources = curate_instance.get_resources(search_url)
        elif 'id' in search_parameter:
            resources = get_curate_nd_resources_by_id(token, search_parameter['id'])
    else:
        resources = curate_instance.get_resources()
    return resources
def get_osf_resource(resource_id, osf_instance):
    """
    Get an OSF resource based on a given id.

    Parameters
    ----------
    resource_id : str
        Resource ID to retrieve.
    osf_instance : OSF class object
        Instance of the OSF class we want to use to get the resource from.

    Returns
    -------
    The class object for the resource requested.
    """
    # The id's type is unknown, so probe each kind in order:
    # storage provider ('<project>:<storage>'), then file/folder, then project.
    id_parts = resource_id.split(':')
    try:
        return osf_instance.project(id_parts[0]).storage(id_parts[1])
    except (OSFNotFoundError, IndexError):
        pass

    try:
        return osf_instance.resource(resource_id)
    except OSFNotFoundError:
        pass

    # If it's not a folder/file then it's a project or it doesn't exist.
    try:
        return osf_instance.project(resource_id)
    except OSFNotFoundError as e:
        raise PresQTResponseException(
            "Resource with id '{}' not found for this user.".format(resource_id),
            e.status_code)
def validation_check(token):
    """
    Ensure a proper GitLab API token has been provided.

    Parameters
    ----------
    token : str
        User's GitLab token

    Returns
    -------
    The properly formatted GitLab Auth header and the requesting user's id.
    """
    headers = {"Private-Token": "{}".format(token)}
    response = requests.get("https://gitlab.com/api/v4/user", headers=headers)

    # GitLab answers 401 for a bad token.
    if response.status_code == 401:
        raise PresQTResponseException("Token is invalid. Response returned a 401 status code.",
                                      status.HTTP_401_UNAUTHORIZED)

    return headers, response.json()['id']
def validation_check(token):
    """
    Ensure a proper GitHub API token has been provided.

    Parameters
    ----------
    token : str
        User's GitHub token

    Returns
    -------
    The properly formatted GitHub Auth header and the requesting user's username.
    """
    header = {"Authorization": "token {}".format(token),
              "Accept": "application/vnd.github.mercy-preview+json"}

    validation = requests.get("https://api.github.com/user", headers=header).json()
    try:
        username = validation['login']
    except (KeyError, TypeError):
        # A bad token yields an error payload with no 'login' key. The original
        # bare `except:` also swallowed KeyboardInterrupt/SystemExit; catch only
        # the lookup failures a bad payload can cause.
        raise PresQTResponseException("Token is invalid. Response returned a 401 status code.",
                                      status.HTTP_401_UNAUTHORIZED)
    return header, username
def create_project(project_title, headers, token):
    """
    Create a FigShare project.

    Parameters
    ----------
    project_title : str
        The title of the project being created
    headers : dict
        The users FigShare Auth header
    token : str
        The users Auth token

    Returns
    -------
    Tuple of the requested title and the new project's id.
    """
    from presqt.targets.figshare.functions.fetch import figshare_fetch_resources

    # Collect existing project titles so the new name can be de-duplicated.
    figshare_resources, pages = figshare_fetch_resources(token, None)
    existing_titles = [entry['title'] for entry in figshare_resources
                       if entry['kind_name'] == 'project']
    title = get_duplicate_title(project_title, existing_titles, '(PresQT*)')

    response = requests.post("https://api.figshare.com/v2/account/projects",
                             headers=headers,
                             data=json.dumps({"title": title}))

    if response.status_code != 201:
        raise PresQTResponseException(
            "Response has status code {} while creating project {}".format(
                response.status_code, project_title),
            status.HTTP_400_BAD_REQUEST)

    # The project id is the final segment of the returned 'location' url.
    # NOTE(review): this returns the original project_title even when a
    # de-duplicated title was used for creation — confirm callers expect that.
    return project_title, response.json()['location'].rpartition('/')[2]
def get_curate_nd_resource(resource_id, curate_nd_instance):
    """
    Get a CurateND resource based on a given id.

    Parameters
    ----------
    resource_id : str
        Resource ID to retrieve
    curate_nd_instance : CurateND class object
        Instance of the CurateND class we want to use to get the resource from.

    Returns
    -------
    The class object for the resource requested.
    """
    try:
        return curate_nd_instance.resource(resource_id)
    except CurateNDForbiddenError as e:
        # Translate the target-specific error into the generic PresQT exception.
        raise PresQTResponseException(
            "User does not have access to this resource with the token provided.",
            e.status_code)
def figshare_upload_resource(token, resource_id, resource_main_dir,
                             hash_algorithm, file_duplicate_action,
                             process_info_path, action):
    """
    Upload the files found in the resource_main_dir to the target.

    Parameters
    ----------
    token : str
        User's token.
    resource_id : str
        ID of the resource requested.
    resource_main_dir : str
        Path to the main directory for the resources to be uploaded.
    hash_algorithm : str
        Hash algorithm we are using to check for fixity.
    file_duplicate_action : str
        The action to take when a duplicate file is found
    process_info_path: str
        Path to the process info file that keeps track of the action's progress
    action: str
        The action being performed

    Returns
    -------
    Dictionary with the following keys: values
        'resources_ignored' : Array of string file paths of files that were ignored
            when uploading the resource.
        'resources_updated' : Array of string file paths of files that were updated
            when uploading the resource.
        'action_metadata': Dictionary containing action metadata, format:
            {'destinationUsername': '******'}
        'file_metadata_list': List of dictionaries for each file that contains
            metadata and hash info, format:
            {"actionRootPath": '/path/on/disk',
             "destinationPath": '/path/on/target/destination',
             "title": 'file_title',
             "destinationHash": {'hash_algorithm': 'the_hash'}}
        'project_id': ID of the parent project for this upload. Needed for metadata upload.
        'project_link': The link to either the resource or the home page of the
            user if not available through API

    FigShare's Upload Process
        1. Initiate new file upload (POST) within the article. Send file size, md5,
           and name but no file contents yet.
        2. Send a GET request to the 'Uploader Service' to determine that the status
           is "Pending" and how many parts to split the upload into.
        3. Split the file into the correct number of parts and upload each using a
           PUT request.
        4. Send a POST request to complete the upload.
    """
    try:
        headers, username = validation_check(token)
    except PresQTResponseException:
        raise PresQTResponseException(
            "Token is invalid. Response returned a 401 status code.",
            status.HTTP_401_UNAUTHORIZED)

    os_path = next(os.walk(resource_main_dir))
    total_files = upload_total_files(resource_main_dir)

    # Update process info file so progress can be tracked.
    update_process_info(process_info_path, total_files, action, 'upload')
    update_process_info_message(process_info_path, action,
                                "Uploading files to FigShare...")

    resources_ignored = []
    resources_updated = []
    file_metadata_list = []
    action_metadata = {'destinationUsername': username}

    if not resource_id:
        # Upload a new project named after the top level directory, plus an
        # article of the same name to hold the files.
        project_title = os_path[1][0]
        project_name, project_id = create_project(project_title, headers, token)
        article_id = create_article(project_title, headers, project_id)
    else:
        # Upload to an existing project. Ids look like '<project>[:<article>[:<file>]]'.
        split_id = str(resource_id).split(":")
        project_id = split_id[0]
        try:
            project_title = requests.get(
                "https://api.figshare.com/v2/account/projects/{}".format(
                    project_id), headers=headers).json()['title']
        except KeyError:
            raise PresQTResponseException(
                "Project with id, {}, could not be found by the requesting user."
                .format(project_id), status.HTTP_400_BAD_REQUEST)

        if len(split_id) == 1:
            # Only a project was given; create a new article with a unique title.
            articles = requests.get(
                "https://api.figshare.com/v2/account/projects/{}/articles".format(
                    project_id), headers=headers).json()
            article_titles = [article['title'] for article in articles]
            new_title = get_duplicate_title(project_title, article_titles,
                                            "(PresQT*)")
            article_id = create_article(new_title, headers, resource_id)
        elif len(split_id) == 2:
            article_id = split_id[1]
        else:
            # Can't upload to a file.
            raise PresQTResponseException(
                "Can not upload into an existing file.",
                status.HTTP_400_BAD_REQUEST)

    # Get the article title
    try:
        article_title = requests.get(
            "https://api.figshare.com/v2/account/articles/{}".format(
                article_id), headers=headers).json()['title']
    except KeyError:
        raise PresQTResponseException(
            "Article with id, {}, could not be found by the requesting user.".
            format(article_id), status.HTTP_400_BAD_REQUEST)

    # Get md5, size and name of each zip file to be uploaded.
    for path, subdirs, files in os.walk(resource_main_dir):
        for name in files:
            # Context manager guarantees the handle is closed
            # (the previous code leaked the open file object).
            with open(os.path.join(path, name), 'rb') as file_info:
                zip_hash = hash_generator(file_info.read(), 'md5')
                figshare_file_upload_process(file_info,
                                             headers,
                                             name,
                                             article_id,
                                             file_type='zip',
                                             path=path)
            file_metadata_list.append({
                'actionRootPath': os.path.join(path, name),
                'destinationPath': '/{}/{}/{}'.format(project_title, article_title, name),
                'title': name,
                'destinationHash': zip_hash
            })
            increment_process_info(process_info_path, action, 'upload')

    return {
        "resources_ignored": resources_ignored,
        "resources_updated": resources_updated,
        "action_metadata": action_metadata,
        "file_metadata_list": file_metadata_list,
        "project_id": "{}:{}".format(project_id, article_id),
        "project_link": "https://figshare.com/account/home#/projects"
    }
def github_download_resource(token, resource_id):
    """
    Fetch the requested resource from GitHub along with its hash information.

    Parameters
    ----------
    token : str
        User's GitHub token
    resource_id : str
        ID of the resource requested

    Returns
    -------
    Dictionary with the following keys: values
        'resources': List of dictionary objects that each hold a file and its information.
                     Dictionary must be in the following format:
                         {
                            'file': binary_file,
                            'hashes': {'hash_algorithm': 'the_hash'},
                            'title': 'file.jpg',
                            'path': '/path/to/file',
                            'source_path: '/full/path/to/file',
                            'extra_metadata': {'any': 'extra'}
                         }
        'empty_containers: List of string paths representing empty containers that must be written.
                              Example: ['empty/folder/to/write/', 'another/empty/folder/]
        'action_metadata': Dictionary containing action metadata. Must be in the following format:
                              {
                              'sourceUsername': '******',
                              }

    Raises
    ------
    PresQTResponseException
        401 if the token fails validation; 404 if the repository id doesn't
        exist for this user.
    """
    try:
        header, username = validation_check(token)
    except PresQTResponseException:
        raise PresQTResponseException(
            "Token is invalid. Response returned a 401 status code.",
            status.HTTP_401_UNAUTHORIZED)

    project_url = 'https://api.github.com/repositories/{}'.format(resource_id)

    response = requests.get(project_url, headers=header)
    if response.status_code != 200:
        raise PresQTResponseException(
            'The resource with id, {}, does not exist for this user.'.format(
                resource_id), status.HTTP_404_NOT_FOUND)
    data = response.json()

    repo_name = data['name']
    # Strip off the unnecessary {+path} that's included in the url
    # Example: https://api.github.com/repos/eggyboi/djangoblog/contents/{+path} becomes
    # https://api.github.com/repos/eggyboi/djangoblog/contents
    contents_url = data['contents_url'].partition('/{+path}')[0]

    files, empty_containers, action_metadata = download_content(
        username, contents_url, header, repo_name, [])
    file_urls = [file_entry['file'] for file_entry in files]

    # BUG FIX: close the event loop when finished. Previously the loop created
    # here was never closed, leaking an event loop (and its selector/file
    # descriptors) on every download.
    loop = asyncio.new_event_loop()
    try:
        download_data = loop.run_until_complete(async_main(file_urls, header))
    finally:
        loop.close()

    # Go through the file dictionaries and replace the file path with the binary_content
    for file_entry in files:
        file_entry['file'] = get_dictionary_from_list(
            download_data, 'url', file_entry['file'])['binary_content']

    return {
        'resources': files,
        'empty_containers': empty_containers,
        'action_metadata': action_metadata
    }
def _download_resource(self):
    """
    Downloads the resources from the target, performs a fixity check,
    zips them up in BagIt format.

    Returns
    -------
    bool
        True on success (download finished, or transfer-download step
        finished); False if the target fetch raised a PresQTResponseException,
        in which case the process_info file is updated with the failure.
    """
    action = 'resource_download'

    # Write the process id to the process_info file
    self.process_info_obj['function_process_id'] = self.function_process.pid
    update_or_create_process_info(self.process_info_obj, self.action, self.ticket_number)

    # Fetch the proper function to call
    func = FunctionRouter.get_function(self.source_target_name, action)

    # Fetch the resources. func_dict is in the format:
    #   {
    #       'resources': files,
    #       'empty_containers': empty_containers,
    #       'action_metadata': action_metadata
    #   }
    try:
        func_dict = func(self.source_token, self.source_resource_id,
                         self.process_info_path, self.action)

        # If the resource is being transferred, has only one file, and that file is the
        # PresQT metadata then raise an error.
        # BUG FIX: error message previously read "cannot not be transferred"
        # (double negative).
        if self.action == 'resource_transfer_in' and \
                len(func_dict['resources']) == 1 \
                and func_dict['resources'][0]['title'] == 'PRESQT_FTS_METADATA.json':
            raise PresQTResponseException(
                'PresQT Error: PresQT FTS metadata cannot be transferred by itself.',
                status.HTTP_400_BAD_REQUEST)
    except PresQTResponseException as e:
        # TODO: Functionalize this error section
        # Catch any errors that happen within the target fetch.
        # Update the server process_info file appropriately.
        self.process_info_obj['status_code'] = e.status_code
        self.process_info_obj['status'] = 'failed'
        if self.action == 'resource_transfer_in':
            self.process_info_obj['download_status'] = 'failed'
        self.process_info_obj['message'] = e.data
        # Update the expiration from 5 hours to 1 hour from now. We can delete this faster
        # because it's an incomplete/failed directory.
        self.process_info_obj['expiration'] = str(timezone.now() + relativedelta(hours=1))
        update_or_create_process_info(self.process_info_obj, self.action, self.ticket_number)
        return False

    # Get the latest contents of the job's process_info.json file
    self.process_info_obj = read_file(self.process_info_path, True)[self.action]

    # The directory all files should be saved in.
    self.resource_main_dir = os.path.join(self.ticket_path, self.base_directory_name)
    update_process_info_message(self.process_info_path, self.action,
                                'Performing fixity checks and gathering metadata...')
    self.extra_metadata = func_dict['extra_metadata']

    # For each resource, perform fixity check, gather metadata, and save it to disk.
    fixity_info = []
    self.download_fixity = True
    self.download_failed_fixity = []
    self.source_fts_metadata_actions = []
    self.new_fts_metadata_files = []
    self.all_keywords = []
    self.initial_keywords = []
    self.manual_keywords = []
    self.enhanced_keywords = []
    for resource in func_dict['resources']:
        # Perform the fixity check and add extra info to the returned fixity object.
        # Note: This method of calling the function needs to stay this way for test Mock
        fixity_obj, self.download_fixity = download_fixity_checker.download_fixity_checker(
            resource)
        fixity_info.append(fixity_obj)

        if not fixity_obj['fixity']:
            self.download_failed_fixity.append(resource['path'])

        # Create metadata for this resource or validate the metadata file
        if resource['title'] == 'PRESQT_FTS_METADATA.json':
            is_valid = validate_metadata(self, resource)
            if not is_valid:
                # Keep invalid metadata on disk, but under a renamed file so it
                # can't be mistaken for valid PresQT metadata.
                resource['path'] = resource['path'].replace(
                    'PRESQT_FTS_METADATA.json', 'INVALID_PRESQT_FTS_METADATA.json')
                create_download_metadata(self, resource, fixity_obj)
                write_file('{}{}'.format(self.resource_main_dir, resource['path']),
                           resource['file'])
        else:
            create_download_metadata(self, resource, fixity_obj)
            write_file('{}{}'.format(self.resource_main_dir, resource['path']),
                       resource['file'])

    # Enhance the source keywords
    self.keyword_dict = {}
    if self.action == 'resource_transfer_in':
        if self.supports_keywords:
            if self.keyword_action == 'automatic':
                self.keyword_dict = automatic_keywords(self)
            elif self.keyword_action == 'manual':
                self.keyword_dict = manual_keywords(self)
    self.keyword_enhancement_successful = True

    # Create PresQT action metadata
    update_process_info_message(self.process_info_path, self.action,
                                "Creating PRESQT_FTS_METADATA...")
    self.source_username = func_dict['action_metadata']['sourceUsername']
    if self.action == 'resource_transfer_in':
        source_target_data = get_target_data(self.source_target_name)
        destination_target_data = get_target_data(self.destination_target_name)
        self.details = "PresQT Transfer from {} to {}".format(
            source_target_data['readable_name'], destination_target_data['readable_name'])
    else:
        source_target_data = get_target_data(self.source_target_name)
        self.details = "PresQT Download from {}".format(source_target_data['readable_name'])

    self.action_metadata = {
        'id': str(uuid4()),
        'details': self.details,
        'actionDateTime': str(timezone.now()),
        'actionType': self.action,
        'sourceTargetName': self.source_target_name,
        'sourceUsername': self.source_username,
        'destinationTargetName': 'Local Machine',
        'destinationUsername': None,
        'keywords': self.keyword_dict,
        'files': {
            'created': self.new_fts_metadata_files,
            'updated': [],
            'ignored': []
        }
    }

    # TODO: Move this up to make it occur after we loop through func_dict['resources'] and
    # write resources
    # Write empty containers to disk
    for container_path in func_dict['empty_containers']:
        # Make sure the container_path has a '/' at the beginning and end
        if container_path[-1] != '/':
            container_path += '/'
        if container_path[0] != '/':
            container_path = '/' + container_path
        os.makedirs(os.path.dirname('{}{}'.format(self.resource_main_dir, container_path)))

    # If we are transferring the downloaded resource then bag it for the resource_upload method
    if self.action == 'resource_transfer_in':
        self.action_metadata['destinationTargetName'] = self.destination_target_name

        # Make a BagIt 'bag' of the resources.
        bagit.make_bag(self.resource_main_dir,
                       checksums=['md5', 'sha1', 'sha256', 'sha512'])
        self.process_info_obj['download_status'] = get_action_message(
            self, 'Download', self.download_fixity, True, self.action_metadata)
        return True
    # If we are only downloading the resource then create metadata, bag, zip,
    # and update the server process file.
    else:
        # Create Metadata file
        final_fts_metadata_data = create_fts_metadata(self.all_keywords,
                                                      self.action_metadata,
                                                      self.source_fts_metadata_actions,
                                                      self.extra_metadata)

        # Validate the final metadata
        metadata_validation = schema_validator(
            'presqt/json_schemas/metadata_schema.json', final_fts_metadata_data)
        self.process_info_obj['message'] = get_action_message(
            self, 'Download', self.download_fixity, metadata_validation, self.action_metadata)

        # Make a BagIt 'bag' of the resources.
        bagit.make_bag(self.resource_main_dir,
                       checksums=['md5', 'sha1', 'sha256', 'sha512'])

        # Write metadata file.
        write_file(os.path.join(self.resource_main_dir, 'PRESQT_FTS_METADATA.json'),
                   final_fts_metadata_data, True)

        # Add the fixity file to the disk directory
        write_file(os.path.join(self.resource_main_dir, 'fixity_info.json'),
                   fixity_info, True)

        # Zip the BagIt 'bag' to send forward.
        zip_directory(self.resource_main_dir, "{}.zip".format(self.resource_main_dir),
                      self.ticket_path)

        # Everything was a success so update the server metadata file.
        self.process_info_obj['status_code'] = '200'
        self.process_info_obj['status'] = 'finished'
        self.process_info_obj['zip_name'] = '{}.zip'.format(self.base_directory_name)
        self.process_info_obj['failed_fixity'] = self.download_failed_fixity
        update_or_create_process_info(self.process_info_obj, self.action, self.ticket_number)

        if self.email:
            # Build link to retrieve the download
            download_reverse = reverse('job_status', kwargs={
                "action": "download",
                "response_format": "zip"
            })
            download_url = self.request.build_absolute_uri(download_reverse)
            final_download_url = "{}?ticket_number={}".format(download_url, self.ticket_number)
            context = {
                "download_url": final_download_url,
                "download_message": self.process_info_obj['message'],
                "failed_fixity": self.process_info_obj['failed_fixity']
            }
            email_blaster(self.email, "PresQT Download Complete", context,
                          "emails/download_email.html")
        return True