def update_bag(self):
    """
    Synchronously bring this resource's bag up to date when its flags require it.

    The Django signal pre_check_bag_flag is sent first so that receivers can
    prepare collections. The AVUs 'metadata_dirty' and 'bag_modified' are then
    both read before any regeneration happens: a truthy 'metadata_dirty'
    regenerates the metadata files, and a truthy 'bag_modified' rebuilds the
    bag. Each flag is reset to False after its step has run.

    The call does not return until the update is finished.
    """
    from hs_core.tasks import create_bag_by_irods
    from hs_core.hydroshare.resource import check_resource_type
    from hs_core.hydroshare.hs_bagit import create_bag_files

    # notify receivers before the flags are inspected
    pre_check_bag_flag.send(sender=check_resource_type(self.resource_type), resource=self)

    # both AVU values are fetched up front; they are used as plain truth values below
    metadata_stale, bag_stale = self.getAVU('metadata_dirty'), self.getAVU('bag_modified')

    if metadata_stale:
        create_bag_files(self)
        self.setAVU('metadata_dirty', False)

    # the ticket system does synchronous bag creation.
    # async bag creation isn't supported.
    if bag_stale:
        create_bag_by_irods(self.short_id)
        self.setAVU('bag_modified', False)
def update_bag(self):
    """
    Synchronously bring this resource's bag up to date when its flags require it.

    The Django signal pre_check_bag_flag is sent first so that receivers can
    prepare collections. The AVUs 'metadata_dirty' and 'bag_modified' are then
    both read before any regeneration happens: a truthy 'metadata_dirty'
    regenerates the metadata files, and a truthy 'bag_modified' rebuilds the
    bag. Each flag is reset to False after its step has run.

    The call does not return until the update is finished.
    """
    from hs_core.tasks import create_bag_by_irods
    from hs_core.hydroshare.resource import check_resource_type
    from hs_core.hydroshare.hs_bagit import create_bag_files

    # notify receivers before the flags are inspected
    pre_check_bag_flag.send(sender=check_resource_type(self.resource_type), resource=self)

    # both AVU values are fetched up front; they are used as plain truth values below
    metadata_stale, bag_stale = self.getAVU('metadata_dirty'), self.getAVU('bag_modified')

    if metadata_stale:
        create_bag_files(self)
        self.setAVU('metadata_dirty', False)

    # the ticket system does synchronous bag creation.
    # async bag creation isn't supported.
    if bag_stale:
        create_bag_by_irods(self.short_id)
        self.setAVU('bag_modified', False)
def resource_modified(resource, by_user=None, overwrite_bag=True):
    """
    Set an AVU flag that forces the bag to be recreated before fetch.

    This indicates that some content of the bag has been edited. Before the
    flag is set, the resource records who changed it and when, copies its
    title from the metadata title, and is saved. If a metadata date of type
    'modified' exists, it is passed to ``update_element``.

    :param resource: the resource that was modified.
    :param by_user: the user who made the change; stored as ``last_changed_by``.
    :param overwrite_bag: when True, ``create_bag_files`` is called for the
        resource before the dirty-bag flag is set.
    """
    resource.last_changed_by = by_user
    resource.updated = now().isoformat()
    # seems this is the best place to sync resource title with metadata title
    resource.title = resource.metadata.title.value
    resource.save()

    # Build the 'modified' dates query once. The truth test evaluates it and the
    # index below reuses those results, instead of running the same query twice.
    # (Assumes ``dates`` is a Django manager, whose QuerySets cache their results.)
    modified_dates = resource.metadata.dates.all().filter(type='modified')
    if modified_dates:
        resource.metadata.update_element('date', modified_dates[0].id)

    if overwrite_bag:
        create_bag_files(resource)

    # set bag_modified-true AVU pair for the modified resource in iRODS to indicate
    # the resource is modified for on-demand bagging.
    set_dirty_bag_flag(resource)
def update_metadata_files(self):
    """
    Bring the metadata files resourcemetadata.xml and resourcemap.xml up to date.

    The 'metadata_dirty' AVU is consulted first; the files are regenerated and
    the AVU reset to False only when that flag is truthy.
    """
    from hs_core.hydroshare.hs_bagit import create_bag_files

    if not self.getAVU('metadata_dirty'):
        # flag is not set: nothing to regenerate
        return

    create_bag_files(self)
    self.setAVU('metadata_dirty', False)
def update_metadata_files(self):
    """
    Bring the metadata files resourcemetadata.xml and resourcemap.xml up to date.

    The 'metadata_dirty' AVU is consulted first; the files are regenerated and
    the AVU reset to False only when that flag is truthy.
    """
    from hs_core.hydroshare.hs_bagit import create_bag_files

    if not self.getAVU('metadata_dirty'):
        # flag is not set: nothing to regenerate
        return

    create_bag_files(self)
    self.setAVU('metadata_dirty', False)
def resource_modified(resource, by_user=None, overwrite_bag=True):
    """
    Record a modification of ``resource`` and flag its bag as modified.

    The resource's ``last_changed_by`` and ``updated`` fields are set and the
    resource is saved. If a metadata date of type 'modified' exists, it is
    passed to ``update_element``. Finally the 'bag_modified' AVU is set to
    "true" for the resource in iRODS.

    :param resource: the resource that was modified.
    :param by_user: the user who made the change; stored as ``last_changed_by``.
    :param overwrite_bag: when True, ``create_bag_files`` is called for the
        resource before the AVU is set.
    """
    resource.last_changed_by = by_user
    resource.updated = now().isoformat()
    resource.save()

    # Build the 'modified' dates query once. The truth test evaluates it and the
    # index below reuses those results, instead of running the same query twice.
    # (Assumes ``dates`` is a Django manager, whose QuerySets cache their results.)
    modified_dates = resource.metadata.dates.all().filter(type='modified')
    if modified_dates:
        resource.metadata.update_element('date', modified_dates[0].id)

    if overwrite_bag:
        create_bag_files(resource)

    istorage = IrodsStorage()
    # set bag_modified-true AVU pair for the modified resource in iRODS to indicate
    # the resource is modified for on-demand bagging.
    istorage.setAVU(resource.short_id, "bag_modified", "true")
def create_or_update_from_package(resource, term, **kwargs):
    """
    Create or update the metadata element named by ``term.term``.

    ``term.term`` is mapped to the attribute of ``resource.metadata`` holding
    the current element. A falsy attribute means the element is created from
    ``kwargs``; otherwise the existing element (looked up by its ``id``) is
    updated with ``kwargs``.

    :param resource: resource whose ``metadata`` is modified.
    :param term: object whose ``term`` attribute is one of the supported
        element names listed below.
    :param kwargs: passed through to ``create_element`` / ``update_element``.
    :raises KeyError: if ``term.term`` is not a supported element name.
    """
    attribute_by_term = {
        'StressPeriod': 'stress_period',
        'GroundWaterFlow': 'ground_water_flow',
        'BoundaryCondition': 'boundary_condition',
        'ModelCalibration': 'model_calibration',
        'GeneralElements': 'general_elements',
        'GridDimensions': 'grid_dimensions',
        'StudyArea': 'study_area',
    }
    existing = getattr(resource.metadata, attribute_by_term[term.term])
    if existing:
        resource.metadata.update_element(term.term, existing.id, **kwargs)
    else:
        resource.metadata.create_element(term.term, **kwargs)
    # NOTE(review): the flattened original does not show whether this call was nested
    # under the update branch; it is kept at function level (runs after either
    # branch) -- confirm against the upstream file.
    create_bag_files(resource)
def resource_modified(resource, by_user=None, overwrite_bag=True):
    """
    Record a modification of ``resource`` and flag its bag as modified.

    The resource's ``last_changed_by`` and ``updated`` fields are set and the
    resource is saved. If a metadata date of type 'modified' exists, it is
    passed to ``update_element``. Finally the 'bag_modified' AVU is set to
    "true" for the resource in iRODS.

    :param resource: the resource that was modified.
    :param by_user: the user who made the change; stored as ``last_changed_by``.
    :param overwrite_bag: when True, ``create_bag_files`` is called for the
        resource before the AVU is set.
    """
    resource.last_changed_by = by_user
    resource.updated = now().isoformat()
    resource.save()

    # Build the 'modified' dates query once. The truth test evaluates it and the
    # index below reuses those results, instead of running the same query twice.
    # (Assumes ``dates`` is a Django manager, whose QuerySets cache their results.)
    modified_dates = resource.metadata.dates.all().filter(type='modified')
    if modified_dates:
        resource.metadata.update_element('date', modified_dates[0].id)

    if overwrite_bag:
        create_bag_files(resource)

    istorage = IrodsStorage()
    # set bag_modified-true AVU pair for the modified resource in iRODS to indicate
    # the resource is modified for on-demand bagging.
    istorage.setAVU(resource.short_id, "bag_modified", "true")
def resource_modified(resource, by_user=None, overwrite_bag=True):
    """
    Set an AVU flag that forces the bag to be recreated before fetch.

    This indicates that some content of the bag has been edited. Before the
    flag is set, the resource records who changed it and when, copies its
    title from the metadata title, and is saved. If a metadata date of type
    'modified' exists, it is passed to ``update_element``.

    :param resource: the resource that was modified.
    :param by_user: the user who made the change; stored as ``last_changed_by``.
    :param overwrite_bag: when True, ``create_bag_files`` is called for the
        resource before the dirty-bag flag is set.
    """
    resource.last_changed_by = by_user
    resource.updated = now().isoformat()
    # seems this is the best place to sync resource title with metadata title
    resource.title = resource.metadata.title.value
    resource.save()

    # Build the 'modified' dates query once. The truth test evaluates it and the
    # index below reuses those results, instead of running the same query twice.
    # (Assumes ``dates`` is a Django manager, whose QuerySets cache their results.)
    modified_dates = resource.metadata.dates.all().filter(type='modified')
    if modified_dates:
        resource.metadata.update_element('date', modified_dates[0].id)

    if overwrite_bag:
        create_bag_files(resource)

    # set bag_modified-true AVU pair for the modified resource in iRODS to indicate
    # the resource is modified for on-demand bagging.
    set_dirty_bag_flag(resource)
def create_or_update_from_package(resource, term, **kwargs):
    """
    Create or update the metadata element named by ``term.term``.

    ``term.term`` is mapped to the attribute of ``resource.metadata`` holding
    the current element. A falsy attribute means the element is created from
    ``kwargs``; otherwise the existing element (looked up by its ``id``) is
    updated with ``kwargs``.

    :param resource: resource whose ``metadata`` is modified.
    :param term: object whose ``term`` attribute is one of the supported
        element names listed below.
    :param kwargs: passed through to ``create_element`` / ``update_element``.
    :raises KeyError: if ``term.term`` is not a supported element name.
    """
    attribute_by_term = {
        'StressPeriod': 'stress_period',
        'GroundWaterFlow': 'ground_water_flow',
        'BoundaryCondition': 'boundary_condition',
        'ModelCalibration': 'model_calibration',
        'GeneralElements': 'general_elements',
        'GridDimensions': 'grid_dimensions',
        'StudyArea': 'study_area',
    }
    existing = getattr(resource.metadata, attribute_by_term[term.term])
    if existing:
        resource.metadata.update_element(term.term, existing.id, **kwargs)
    else:
        resource.metadata.create_element(term.term, **kwargs)
    # NOTE(review): the flattened original does not show whether this call was nested
    # under the update branch; it is kept at function level (runs after either
    # branch) -- confirm against the upstream file.
    create_bag_files(resource)
def download(request, path, rest_call=False, use_async=True, use_reverse_proxy=True,
             *args, **kwargs):
    """
    Perform a download request, either asynchronously or synchronously.

    :param request: the request object.
    :param path: the path of the thing to be downloaded.
    :param rest_call: True if calling from REST API
    :param use_async: True means to utilize asynchronous creation of objects to download.
    :param use_reverse_proxy: True means to utilize NGINX reverse proxy for streaming.
    :param kwargs: an optional 'environment' key (primary key of a RodsEnvironment) selects
        the iRODS session for non-federated resources.
    :returns: an HttpResponse, HttpResponseRedirect or FileResponse, depending on the case.
    :raises PermissionDenied: if `rest_call` is True and the request is not authorized.
    :raises KeyError: if the resource is not federated and no iRODS session can be determined.

    The following variables are computed:

    * `path` is the public path of the thing to be downloaded.
    * `irods_path` is the location of `path` in irods.
    * `output_path` is the output path to be reported in the response object.
    * `irods_output_path` is the location of `output_path` in irods

    The `zipped` query parameter signals that the download should be zipped:
        - folders are always zipped regardless of this parameter
        - single file aggregations are zipped with the aggregation metadata files

    There are six cases. A path may point to:
    1. a single file
    2. a single-file-aggregation object in a composite resource.
    3. a folder
    4. a metadata object that may need updating.
    5. a bag that needs to be updated and then returned.
    6. a previously zipped file that was zipped asynchronously.

    """
    if __debug__:
        logger.debug("request path is {}".format(path))

    # NOTE(review): an empty path, or one made only of slashes, empties the list here and
    # the next [-1] lookup raises IndexError.
    split_path_strs = path.split('/')
    while split_path_strs[-1] == '':
        split_path_strs.pop()
    path = u'/'.join(split_path_strs)  # no trailing slash

    # initialize case variables
    is_bag_download = False
    is_zip_download = False
    is_zip_request = request.GET.get('zipped', "False").lower() == "true"
    is_sf_agg_file = False
    is_sf_request = False

    if split_path_strs[0] == 'bags':
        is_bag_download = True
        # format is bags/{rid}.zip
        res_id = os.path.splitext(split_path_strs[1])[0]
    elif split_path_strs[0] == 'zips':
        is_zip_download = True
        # zips prefix means that we are following up on an asynchronous download request
        # format is zips/{date}/{zip-uuid}/{public-path}.zip where {public-path} contains the rid
        res_id = split_path_strs[3]
    else:  # regular download request
        res_id = split_path_strs[0]

    if __debug__:
        logger.debug("resource id is {}".format(res_id))

    # now we have the resource Id and can authorize the request
    # if the resource does not exist in django, authorized will be false
    res, authorized, _ = authorize(request, res_id,
                                   needed_permission=ACTION_TO_AUTHORIZE.VIEW_RESOURCE,
                                   raises_exception=False)
    if not authorized:
        # REST callers get an exception; other callers get a 401 response with an HTML body
        response = HttpResponse(status=401)
        content_msg = "You do not have permission to download this resource!"
        if rest_call:
            raise PermissionDenied(content_msg)
        else:
            response.content = "<h1>" + content_msg + "</h1>"
            return response

    # default values are changed later as needed

    istorage = res.get_irods_storage()
    if res.is_federated:
        irods_path = os.path.join(res.resource_federation_path, path)
    else:
        irods_path = path
    # in many cases, path and output_path are the same.
    output_path = path

    irods_output_path = irods_path
    # folder requests are automatically zipped
    if not is_bag_download and not is_zip_download:  # path points into resource: should I zip it?
        store_path = u'/'.join(split_path_strs[1:])  # data/contents/{path-to-something}
        if res.is_folder(store_path):  # automatically zip folders
            is_zip_request = True
            daily_date = datetime.datetime.today().strftime('%Y-%m-%d')
            output_path = "zips/{}/{}/{}.zip".format(daily_date, uuid4().hex, path)
            if res.is_federated:
                irods_output_path = os.path.join(res.resource_federation_path, output_path)
            else:
                irods_output_path = output_path
            if __debug__:
                logger.debug("automatically zipping folder {} to {}".format(path, output_path))
        elif istorage.exists(irods_path):
            if __debug__:
                logger.debug("request for single file {}".format(path))
            is_sf_request = True

            # check for single file aggregations
            if "data/contents/" in path:  # not a metadata file
                for f in ResourceFile.objects.filter(object_id=res.id):
                    if path == f.storage_path:
                        is_sf_agg_file = True
                        if not is_zip_request and f.has_logical_file and \
                                f.logical_file.is_single_file_aggregation:
                            download_url = request.GET.get('url_download', 'false').lower()
                            if download_url == 'false':
                                # redirect to referenced url in the url file instead
                                redirect_url = f.logical_file.redirect_url
                                if redirect_url:
                                    return HttpResponseRedirect(redirect_url)
                        if __debug__:
                            logger.debug(
                                "request for single file aggregation {}".format(path))
                        # only the first file whose storage path matches is considered
                        break

            if is_zip_request:
                daily_date = datetime.datetime.today().strftime('%Y-%m-%d')
                output_path = "zips/{}/{}/{}.zip".format(daily_date, uuid4().hex, path)
                if res.is_federated:
                    irods_output_path = os.path.join(res.resource_federation_path, output_path)
                else:
                    irods_output_path = output_path

    # After this point, we have valid path, irods_path, output_path, and irods_output_path
    # * is_zip_request: signals download should be zipped, folders are always zipped
    # * is_sf_agg_file: path is a single-file aggregation in Composite Resource
    # * is_sf_request: path is a single-file
    # flags for download:
    # * is_bag_download: download a bag in format bags/{rid}.zip
    # * is_zip_download: download a zipfile in format zips/{date}/{random guid}/{path}.zip
    # if none of these are set, it's a normal download

    # determine active session
    if res.is_federated:
        # the resource is stored in federated zone
        session = icommands.ACTIVE_SESSION
    else:
        # TODO: From Alva: I do not understand the use case for changing the environment.
        # TODO: This seems an enormous potential vulnerability, as arguments are
        # TODO: passed from the URI directly to IRODS without verification.
        if 'environment' in kwargs:
            logger.warn("setting iRODS from environment")
            environment = int(kwargs['environment'])
            environment = m.RodsEnvironment.objects.get(pk=environment)
            session = Session("/tmp/django_irods", settings.IRODS_ICOMMANDS_PATH,
                              session_id=uuid4())
            session.create_environment(environment)
            session.run('iinit', None, environment.auth)
        elif getattr(settings, 'IRODS_GLOBAL_SESSION', False):
            if __debug__:
                logger.debug("using GLOBAL_SESSION")
            session = GLOBAL_SESSION
        elif icommands.ACTIVE_SESSION:
            if __debug__:
                logger.debug("using ACTIVE_SESSION")
            session = icommands.ACTIVE_SESSION
        else:
            raise KeyError('settings must have IRODS_GLOBAL_SESSION set '
                           'if there is no environment object')

    resource_cls = check_resource_type(res.resource_type)

    if is_zip_request:

        if use_async:
            task = create_temp_zip.apply_async((res_id, irods_path, irods_output_path,
                                                is_sf_agg_file, is_sf_request))
            delete_zip.apply_async((irods_output_path, ),
                                   countdown=(60 * 60 * 24))  # delete after 24 hours
            if rest_call:
                return HttpResponse(
                    json.dumps({
                        'zip_status': 'Not ready',
                        'task_id': task.task_id,
                        'download_path': '/django_irods/rest_download/' + output_path}),
                    content_type="application/json")
            else:
                # return status to the UI
                request.session['task_id'] = task.task_id
                # TODO: this is mistaken for a bag download in the UI!
                # TODO: multiple asynchronous downloads don't stack!
                request.session['download_path'] = '/django_irods/download/' + output_path
                # redirect to resource landing page, which interprets session variables.
                return HttpResponseRedirect(res.get_absolute_url())

        else:  # synchronous creation of download
            ret_status = create_temp_zip(res_id, irods_path, irods_output_path,
                                         is_sf_agg_file, is_sf_request)
            delete_zip.apply_async((irods_output_path, ),
                                   countdown=(60 * 60 * 24))  # delete after 24 hours
            if not ret_status:
                # NOTE(review): no status is passed here, so this failure response uses
                # HttpResponse's default status code.
                content_msg = "Zip could not be created."
                response = HttpResponse()
                if rest_call:
                    response.content = content_msg
                else:
                    response.content = "<h1>" + content_msg + "</h1>"
                return response
            # At this point, output_path presumably exists and contains a zipfile
            # to be streamed below

    elif is_bag_download:
        # Shorten request if it contains extra junk at the end
        bag_file_name = res_id + '.zip'
        output_path = os.path.join('bags', bag_file_name)
        if not res.is_federated:
            irods_output_path = output_path
        else:
            irods_output_path = os.path.join(res.resource_federation_path, output_path)

        bag_modified = res.getAVU('bag_modified')
        # recreate the bag if it doesn't exist even if bag_modified is "false".
        if __debug__:
            logger.debug(u"irods_output_path is {}".format(irods_output_path))
        if bag_modified is None or not bag_modified:
            if not istorage.exists(irods_output_path):
                bag_modified = True

        # send signal for pre_check_bag_flag
        # this generates metadata other than that generated by create_bag_files.
        pre_check_bag_flag.send(sender=resource_cls, resource=res)

        metadata_dirty = res.getAVU('metadata_dirty')
        if metadata_dirty is None or metadata_dirty:
            create_bag_files(res)  # sets metadata_dirty to False
            # regenerated metadata files force the bag itself to be rebuilt below
            bag_modified = "True"

        if bag_modified is None or bag_modified:
            if use_async:
                # task parameter has to be passed in as a tuple or list, hence (res_id,) is needed
                # Note that since we are using JSON for task parameter serialization, no complex
                # object can be passed as parameters to a celery task
                task = create_bag_by_irods.apply_async((res_id,), countdown=3)
                if rest_call:
                    return HttpResponse(json.dumps({'bag_status': 'Not ready',
                                                    'task_id': task.task_id}),
                                        content_type="application/json")

                request.session['task_id'] = task.task_id
                request.session['download_path'] = request.path
                return HttpResponseRedirect(res.get_absolute_url())
            else:
                ret_status = create_bag_by_irods(res_id)
                if not ret_status:
                    content_msg = "Bag cannot be created successfully. Check log for details."
                    response = HttpResponse()
                    if rest_call:
                        response.content = content_msg
                    else:
                        response.content = "<h1>" + content_msg + "</h1>"
                    return response

    else:  # regular file download
        # if fetching main metadata files, then these need to be refreshed.
        if path.endswith("resourcemap.xml") or path.endswith('resourcemetadata.xml'):
            metadata_dirty = res.getAVU("metadata_dirty")
            if metadata_dirty is None or metadata_dirty:
                create_bag_files(res)  # sets metadata_dirty to False

        # send signal for pre download file
        # TODO: does not contain subdirectory information: duplicate refreshes possible
        download_file_name = split_path_strs[-1]  # end of path
        # this logs the download request in the tracking system
        pre_download_file.send(sender=resource_cls, resource=res,
                               download_file_name=download_file_name,
                               request=request)

    # If we get this far,
    # * path and irods_path point to true input
    # * output_path and irods_output_path point to true output.
    # Try to stream the file back to the requester.

    # obtain mime_type to set content_type
    mtype = 'application-x/octet-stream'
    mime_type = mimetypes.guess_type(output_path)
    if mime_type[0] is not None:
        mtype = mime_type[0]
    # retrieve file size to set up Content-Length header
    # TODO: standardize this to make it less brittle
    # NOTE(review): this size lookup runs before the existence check in the reverse-proxy
    # branch below; confirm how session.run behaves for a missing path.
    stdout = session.run("ils", None, "-l", irods_output_path)[0].split()
    flen = int(stdout[3])  # the fourth whitespace-separated field is used as the size

    # Allow reverse proxy if request was forwarded by nginx (HTTP_X_DJANGO_REVERSE_PROXY='true')
    # and reverse proxy is possible according to configuration (SENDFILE_ON=True)
    # and reverse proxy isn't overridden by user (use_reverse_proxy=True).
    if use_reverse_proxy and getattr(settings, 'SENDFILE_ON', False) and \
            'HTTP_X_DJANGO_REVERSE_PROXY' in request.META:

        # The NGINX sendfile abstraction is invoked as follows:
        # 1. The request to download a file enters this routine via the /rest_download or /download
        #    url in ./urls.py. It is redirected here from Django. The URI contains either the
        #    unqualified resource path or the federated resource path, depending upon whether
        #    the request is local or federated.
        # 2. This deals with unfederated resources by redirecting them to the uri
        #    /irods-data/{resource-id}/... on nginx. This URI is configured to read the file
        #    directly from the iRODS vault via NFS, and does not work for direct access to the
        #    vault due to the 'internal;' declaration in NGINX.
        # 3. This deals with federated resources by reading their path, matching local vaults, and
        #    redirecting to URIs that are in turn mapped to read from appropriate iRODS vaults. At
        #    present, the only one of these is /irods-user, which handles files whose federation
        #    path is stored in the variable 'userpath'.
        # 4. If there is no vault available for the resource, the file is transferred without
        #    NGINX, exactly as it was transferred previously.

        # If this path is resource_federation_path, then the file is a local user file
        userpath = '/' + os.path.join(
            getattr(settings, 'HS_USER_IRODS_ZONE', 'hydroshareuserZone'),
            'home',
            getattr(settings, 'HS_IRODS_PROXY_USER_IN_USER_ZONE', 'localHydroProxy'))

        # stop NGINX targets that are non-existent from hanging forever.
        if not istorage.exists(irods_output_path):
            content_msg = "file path {} does not exist in iRODS".format(output_path)
            response = HttpResponse(status=404)
            if rest_call:
                response.content = content_msg
            else:
                response.content = "<h1>" + content_msg + "</h1>"
            return response

        if not res.is_federated:
            # track download count
            res.update_download_count()
            # invoke X-Accel-Redirect on physical vault file in nginx
            response = HttpResponse(content_type=mtype)
            response['Content-Disposition'] = 'attachment; filename="{name}"'.format(
                name=output_path.split('/')[-1])
            response['Content-Length'] = flen
            response['X-Accel-Redirect'] = '/'.join([
                getattr(settings, 'IRODS_DATA_URI', '/irods-data'), output_path])
            if __debug__:
                logger.debug("Reverse proxying local {}".format(response['X-Accel-Redirect']))
            return response

        elif res.resource_federation_path == userpath:  # this guarantees a "user" resource
            # track download count
            res.update_download_count()
            # invoke X-Accel-Redirect on physical vault file in nginx
            response = HttpResponse(content_type=mtype)
            response['Content-Disposition'] = 'attachment; filename="{name}"'.format(
                name=output_path.split('/')[-1])
            response['Content-Length'] = flen
            response['X-Accel-Redirect'] = os.path.join(
                getattr(settings, 'IRODS_USER_URI', '/irods-user'), output_path)
            if __debug__:
                logger.debug("Reverse proxying user {}".format(response['X-Accel-Redirect']))
            return response

    # if we get here, none of the above conditions are true
    # if reverse proxy is enabled, then this is because the resource is remote and federated
    # OR the user specifically requested a non-proxied download.

    options = ('-',)  # we're redirecting to stdout.
    # this unusual way of calling works for streaming federated or local resources
    if __debug__:
        logger.debug("Locally streaming {}".format(output_path))
    # track download count
    res.update_download_count()
    proc = session.run_safe('iget', None, irods_output_path, *options)
    response = FileResponse(proc.stdout, content_type=mtype)
    response['Content-Disposition'] = 'attachment; filename="{name}"'.format(
        name=output_path.split('/')[-1])
    response['Content-Length'] = flen
    return response
def download(request, path, rest_call=False, use_async=True, use_reverse_proxy=True,
             *args, **kwargs):
    """
    Serve a download of a resource file, a zipped folder/aggregation, or a resource bag.

    The resource id is taken from `path`:

    * `bags/...`: the second path component with its extension removed.
    * `zips/...`: the third component (extension removed) when `path` ends with
      '.zip', otherwise the second component.
    * anything else: the first path component.

    :param request: the request object.
    :param path: the path of the thing to be downloaded.
    :param rest_call: True selects plain-text error bodies and JSON status replies;
        otherwise error bodies are wrapped in <h1> and status is put in the session.
    :param use_async: True creates zips and bags through `apply_async` and returns
        before they exist; False creates them inline.
    :param use_reverse_proxy: True allows an X-Accel-Redirect response when
        settings.SENDFILE_ON is set and the request carries HTTP_X_DJANGO_REVERSE_PROXY.
    :param kwargs: an optional 'environment' key (primary key of a RodsEnvironment)
        selects the iRODS session for non-federated resources.
    :returns: an HttpResponse, HttpResponseRedirect or FileResponse. Status 401 is used
        for unauthorized non-REST requests, 404 for a missing file in the reverse-proxy
        branch, and 403 when the file is larger than FILE_SIZE_LIMIT.
    :raises PermissionDenied: if `rest_call` is True and the request is not authorized.
    :raises KeyError: if the resource is not federated and no iRODS session can be determined.
    """
    split_path_strs = path.split('/')
    is_bag_download = False
    is_zip_download = False
    is_sf_agg_file = False
    if split_path_strs[0] == 'bags':
        res_id = os.path.splitext(split_path_strs[1])[0]
        is_bag_download = True
    elif split_path_strs[0] == 'zips':
        if path.endswith('.zip'):
            res_id = os.path.splitext(split_path_strs[2])[0]
        else:
            res_id = os.path.splitext(split_path_strs[1])[0]
        is_zip_download = True
    else:
        res_id = split_path_strs[0]

    # if the resource does not exist in django, authorized will be false
    res, authorized, _ = authorize(request, res_id,
                                   needed_permission=ACTION_TO_AUTHORIZE.VIEW_RESOURCE,
                                   raises_exception=False)
    if not authorized:
        # REST callers get an exception; other callers get a 401 response with an HTML body
        response = HttpResponse(status=401)
        content_msg = "You do not have permission to download this resource!"
        if rest_call:
            raise PermissionDenied(content_msg)
        else:
            response.content = "<h1>" + content_msg + "</h1>"
            return response

    # a composite-resource file whose logical file is a single-file aggregation is
    # handled by the zip branch below
    if res.resource_type == "CompositeResource" and not path.endswith(".zip"):
        for f in ResourceFile.objects.filter(object_id=res.id):
            if path == f.storage_path:
                if f.has_logical_file and f.logical_file.is_single_file_aggregation:
                    is_sf_agg_file = True

    if res.resource_federation_path:
        # the resource is stored in federated zone
        istorage = IrodsStorage('federated')
        federated_path = res.resource_federation_path
        path = os.path.join(federated_path, path)
        session = icommands.ACTIVE_SESSION
    else:
        # TODO: From Alva: I do not understand the use case for changing the environment.
        # TODO: This seems an enormous potential vulnerability, as arguments are
        # TODO: passed from the URI directly to IRODS without verification.
        istorage = IrodsStorage()
        federated_path = ''
        if 'environment' in kwargs:
            environment = int(kwargs['environment'])
            environment = m.RodsEnvironment.objects.get(pk=environment)
            session = Session("/tmp/django_irods", settings.IRODS_ICOMMANDS_PATH,
                              session_id=uuid4())
            session.create_environment(environment)
            session.run('iinit', None, environment.auth)
        elif getattr(settings, 'IRODS_GLOBAL_SESSION', False):
            session = GLOBAL_SESSION
        elif icommands.ACTIVE_SESSION:
            session = icommands.ACTIVE_SESSION
        else:
            raise KeyError('settings must have IRODS_GLOBAL_SESSION set '
                           'if there is no environment object')

    resource_cls = check_resource_type(res.resource_type)

    # res_root is the collection on which the AVUs below are looked up
    if federated_path:
        res_root = os.path.join(federated_path, res_id)
    else:
        res_root = res_id

    if is_zip_download or is_sf_agg_file:
        if not path.endswith(".zip"):  # requesting folder that needs to be zipped
            # everything after the first occurrence of the resource id in the path
            input_path = path.split(res_id)[1]
            random_hash = random.getrandbits(32)
            daily_date = datetime.datetime.today().strftime('%Y-%m-%d')
            random_hash_path = 'zips/{daily_date}/{res_id}/{rand_folder}'.format(
                daily_date=daily_date, res_id=res_id,
                rand_folder=random_hash)
            output_path = '{random_hash_path}{path}.zip'.format(random_hash_path=random_hash_path,
                                                                path=input_path)

            if res.resource_type == "CompositeResource":
                # NOTE(review): slices by length only; assumes input_path starts with
                # '/data/contents/' -- confirm.
                aggregation_name = input_path[len('/data/contents/'):]
                res.create_aggregation_xml_documents(aggregation_name=aggregation_name)

            if use_async:
                task = create_temp_zip.apply_async((res_id, input_path, output_path,
                                                    is_sf_agg_file), countdown=3)
                delete_zip.apply_async((random_hash_path, ),
                                       countdown=(20 * 60))  # delete after 20 minutes
                if is_sf_agg_file:
                    download_path = request.path.split(res_id)[0] + output_path
                else:
                    download_path = request.path.split("zips")[0] + output_path
                if rest_call:
                    return HttpResponse(json.dumps({'zip_status': 'Not ready',
                                                    'task_id': task.task_id,
                                                    'download_path': download_path}),
                                        content_type="application/json")
                request.session['task_id'] = task.task_id
                request.session['download_path'] = download_path
                return HttpResponseRedirect(res.get_absolute_url())

            # synchronous zip creation
            ret_status = create_temp_zip(res_id, input_path, output_path, is_sf_agg_file)
            delete_zip.apply_async((random_hash_path, ),
                                   countdown=(20 * 60))  # delete after 20 minutes
            if not ret_status:
                # NOTE(review): no status is passed here, so this failure response uses
                # HttpResponse's default status code.
                content_msg = "Zip cannot be created successfully. Check log for details."
                response = HttpResponse()
                if rest_call:
                    response.content = content_msg
                else:
                    response.content = "<h1>" + content_msg + "</h1>"
                return response

            # from here on, the freshly created zip is the thing being downloaded
            path = output_path

    bag_modified = istorage.getAVU(res_root, 'bag_modified')
    # make sure if bag_modified is not set to true, we still recreate the bag if the
    # bag file does not exist for some reason to resolve the error to download a nonexistent
    # bag when bag_modified is false due to the flag being out-of-sync with the real bag status
    if bag_modified is None or bag_modified.lower() == "false":
        # check whether the bag file exists
        bag_file_name = res_id + '.zip'
        if res_root.startswith(res_id):
            bag_full_path = os.path.join('bags', bag_file_name)
        else:
            bag_full_path = os.path.join(federated_path, 'bags', bag_file_name)
        # set bag_modified to 'true' if the bag does not exist so that it can be recreated
        # and the bag_modified AVU will be set correctly as well subsequently
        if not istorage.exists(bag_full_path):
            bag_modified = 'true'

    metadata_dirty = istorage.getAVU(res_root, 'metadata_dirty')
    # do on-demand bag creation
    # needs to check whether res_id collection exists before getting/setting AVU on it
    # to accommodate the case where the very same resource gets deleted by another request
    # when it is getting downloaded
    if is_bag_download:
        # send signal for pre_check_bag_flag
        pre_check_bag_flag.send(sender=resource_cls, resource=res)
        if bag_modified is None or bag_modified.lower() == "true":
            if metadata_dirty is None or metadata_dirty.lower() == 'true':
                create_bag_files(res)
            if use_async:
                # task parameter has to be passed in as a tuple or list, hence (res_id,) is needed
                # Note that since we are using JSON for task parameter serialization, no complex
                # object can be passed as parameters to a celery task
                task = create_bag_by_irods.apply_async((res_id,), countdown=3)
                if rest_call:
                    return HttpResponse(json.dumps({'bag_status': 'Not ready',
                                                    'task_id': task.task_id}),
                                        content_type="application/json")

                request.session['task_id'] = task.task_id
                request.session['download_path'] = request.path
                return HttpResponseRedirect(res.get_absolute_url())
            else:
                ret_status = create_bag_by_irods(res_id)
                if not ret_status:
                    content_msg = "Bag cannot be created successfully. Check log for details."
                    response = HttpResponse()
                    if rest_call:
                        response.content = content_msg
                    else:
                        response.content = "<h1>" + content_msg + "</h1>"
                    return response

    elif metadata_dirty is None or metadata_dirty.lower() == 'true':
        if path.endswith("resourcemap.xml") or path.endswith('resourcemetadata.xml'):
            # we need to regenerate the metadata xml files
            create_bag_files(res)

    # send signal for pre download file
    # (the name comes from the path as originally requested, before any rewriting above)
    download_file_name = split_path_strs[-1]
    pre_download_file.send(sender=resource_cls, resource=res,
                           download_file_name=download_file_name,
                           request=request)

    # obtain mime_type to set content_type
    mtype = 'application-x/octet-stream'
    mime_type = mimetypes.guess_type(path)
    if mime_type[0] is not None:
        mtype = mime_type[0]
    # retrieve file size to set up Content-Length header
    # NOTE(review): this size lookup runs before the existence check in the reverse-proxy
    # branch below; confirm how session.run behaves for a missing path.
    stdout = session.run("ils", None, "-l", path)[0].split()
    flen = int(stdout[3])  # the fourth whitespace-separated field is used as the size

    # If this path is resource_federation_path, then the file is a local user file
    userpath = '/' + os.path.join(
        getattr(settings, 'HS_USER_IRODS_ZONE', 'hydroshareuserZone'),
        'home',
        getattr(settings, 'HS_LOCAL_PROXY_USER_IN_FED_ZONE', 'localHydroProxy'))

    # Allow reverse proxy if request was forwarded by nginx
    # (HTTP_X_DJANGO_REVERSE_PROXY is 'true')
    # and reverse proxy is possible according to configuration.
    if use_reverse_proxy and getattr(settings, 'SENDFILE_ON', False) and \
            'HTTP_X_DJANGO_REVERSE_PROXY' in request.META:

        # The NGINX sendfile abstraction is invoked as follows:
        # 1. The request to download a file enters this routine via the /rest_download or /download
        #    url in ./urls.py. It is redirected here from Django. The URI contains either the
        #    unqualified resource path or the federated resource path, depending upon whether
        #    the request is local or federated.
        # 2. This deals with unfederated resources by redirecting them to the uri
        #    /irods-data/{resource-id}/... on nginx. This URI is configured to read the file
        #    directly from the iRODS vault via NFS, and does not work for direct access to the
        #    vault due to the 'internal;' declaration in NGINX.
        # 3. This deals with federated resources by reading their path, matching local vaults, and
        #    redirecting to URIs that are in turn mapped to read from appropriate iRODS vaults. At
        #    present, the only one of these is /irods-user, which handles files whose federation
        #    path is stored in the variable 'userpath'.
        # 4. If there is no vault available for the resource, the file is transferred without
        #    NGINX, exactly as it was transferred previously.

        # stop NGINX targets that are non-existent from hanging forever.
        if not istorage.exists(path):
            content_msg = "file path {} does not exist in iRODS".format(path)
            response = HttpResponse(status=404)
            if rest_call:
                response.content = content_msg
            else:
                response.content = "<h1>" + content_msg + "</h1>"
            return response

        if not res.is_federated:
            # invoke X-Accel-Redirect on physical vault file in nginx
            response = HttpResponse(content_type=mtype)
            response['Content-Disposition'] = 'attachment; filename="{name}"'.format(
                name=path.split('/')[-1])
            response['Content-Length'] = flen
            response['X-Accel-Redirect'] = '/'.join([
                getattr(settings, 'IRODS_DATA_URI', '/irods-data'), path])
            return response

        elif res.resource_federation_path == userpath:  # this guarantees a "user" resource
            # invoke X-Accel-Redirect on physical vault file in nginx
            # if path is full user path; strip federation prefix
            if path.startswith(userpath):
                path = path[len(userpath)+1:]
            # invoke X-Accel-Redirect on physical vault file in nginx
            response = HttpResponse(content_type=mtype)
            response['Content-Disposition'] = 'attachment; filename="{name}"'.format(
                name=path.split('/')[-1])
            response['Content-Length'] = flen
            response['X-Accel-Redirect'] = os.path.join(
                getattr(settings, 'IRODS_USER_URI', '/irods-user'), path)
            return response

    # if we get here, none of the above conditions are true
    if flen <= FILE_SIZE_LIMIT:
        options = ('-',)  # we're redirecting to stdout.
        # this unusual way of calling works for federated or local resources
        proc = session.run_safe('iget', None, path, *options)
        response = FileResponse(proc.stdout, content_type=mtype)
        response['Content-Disposition'] = 'attachment; filename="{name}"'.format(
            name=path.split('/')[-1])
        response['Content-Length'] = flen
        return response

    else:
        # the file exceeds FILE_SIZE_LIMIT, so direct streaming is refused
        content_msg = "File larger than 1GB cannot be downloaded directly via HTTP. " \
                      "Please download the large file via iRODS clients."
        response = HttpResponse(status=403)
        if rest_call:
            response.content = content_msg
        else:
            response.content = "<h1>" + content_msg + "</h1>"
        return response
def put(self, request, pk):
    """Update a resource's science metadata from an uploaded resourcemetadata.xml.

    Exactly one uploaded file is accepted. It must be named
    ``resourcemetadata.xml`` and have a content type listed in
    ``self.ACCEPT_FORMATS``. The upload and the resource's existing resource
    map are staged in a temporary directory laid out like a bag, parsed with
    ``GenericResourceMeta.read_metadata_from_resource_bag``, written to the
    resource, and the bag metadata files are regenerated.

    :param request: the incoming request; the upload is read from
        ``request.FILES`` and the acting user from ``request.user``.
    :param pk: id of the resource to update.
    :return: a ``Response`` with ``{'resource_id': pk}`` and HTTP status 202.
    :raises PermissionDenied: if the requester may not edit the resource.
    :raises ValidationError: if the upload is missing, duplicated, wrongly
        named or typed, or cannot be deserialized.
    """
    resource, authorized, _ = view_utils.authorize(
        request, pk, needed_permission=ACTION_TO_AUTHORIZE.EDIT_RESOURCE,
        raises_exception=False)
    if not authorized:
        raise PermissionDenied()

    # Materialize the uploads as a list: a dict view cannot be indexed on Python 3.
    files = list(request.FILES.values())
    if not files:
        error_msg = {'file': 'No resourcemetadata.xml file was found to update resource '
                             'metadata.'}
        raise ValidationError(detail=error_msg)
    if len(files) > 1:
        error_msg = {'file': ('More than one file was found. Only one file, named '
                              'resourcemetadata.xml, '
                              'can be used to update resource metadata.')}
        raise ValidationError(detail=error_msg)

    scimeta = files[0]
    if scimeta.content_type not in self.ACCEPT_FORMATS:
        error_msg = {'file': ("Uploaded file has content type {t}, "
                              "but only these types are accepted: {e}.").format(
            t=scimeta.content_type, e=",".join(self.ACCEPT_FORMATS))}
        raise ValidationError(detail=error_msg)
    expect = 'resourcemetadata.xml'
    if scimeta.name != expect:
        error_msg = {'file': "Uploaded file has name {n}, but expected {e}.".format(
            n=scimeta.name, e=expect)}
        raise ValidationError(detail=error_msg)

    # Temp directory to store resourcemetadata.xml
    tmp_dir = tempfile.mkdtemp()
    try:
        # Fake the bag structure so that GenericResourceMeta.read_metadata_from_resource_bag
        # can read and validate the system and science metadata for us.
        bag_data_path = os.path.join(tmp_dir, 'data')
        os.mkdir(bag_data_path)

        # Copy new science metadata to bag data path. Stream the upload in chunks
        # rather than relying on temporary_file_path(), which only exists when the
        # upload was spooled to disk (small uploads may be held in memory).
        scimeta_path = os.path.join(bag_data_path, 'resourcemetadata.xml')
        with open(scimeta_path, 'wb') as scimeta_copy:
            for chunk in scimeta.chunks():
                scimeta_copy.write(chunk)

        # Copy existing resource map to bag data path
        # (use a file-like object as the file may be in iRODS, so we can't
        # just copy it to a local path)
        resmeta_path = os.path.join(bag_data_path, 'resourcemap.xml')
        with open(resmeta_path, 'wb') as resmeta:
            storage = get_file_storage()
            resmeta_irods = storage.open(AbstractResource.sysmeta_path(pk))
            try:
                shutil.copyfileobj(resmeta_irods, resmeta)
            finally:
                # always release the storage handle, even if the copy fails
                resmeta_irods.close()

        try:
            # Read resource system and science metadata
            domain = Site.objects.get_current().domain
            rm = GenericResourceMeta.read_metadata_from_resource_bag(tmp_dir,
                                                                     hydroshare_host=domain)
            # Update resource metadata
            rm.write_metadata_to_resource(resource, update_title=True, update_keywords=True)
            create_bag_files(resource)
        except HsDeserializationDependencyException as e:
            msg = ("HsDeserializationDependencyException encountered when updating "
                   "science metadata for resource {pk}; depedent resource was {dep}.")
            msg = msg.format(pk=pk, dep=e.dependency_resource_id)
            logger.error(msg)
            raise ValidationError(detail=msg)
        except HsDeserializationException as e:
            # .message is not guaranteed on every exception; fall back to str()
            raise ValidationError(detail=getattr(e, 'message', None) or str(e))

        resource_modified(resource, request.user, overwrite_bag=False)
        return Response(data={'resource_id': pk}, status=status.HTTP_202_ACCEPTED)
    finally:
        # the staging directory is removed on success and on every error path
        shutil.rmtree(tmp_dir)
# Collect every immediate sub-directory of the directory given as the first
# command-line argument, except 'bags'. The walk is stopped after its first
# iteration so that only the top level is inspected.
for (dirpath, dirnames, filenames) in os.walk(sys.argv[1]):
    for dirname in dirnames:
        if dirname != 'bags':
            content_path_list.append(os.path.join(sys.argv[1], dirname))
    break

from hs_core.serialization import create_resource_from_bag
from hs_core.hydroshare.hs_bagit import create_bag_files

# First pass: create a resource from each content directory, remembering the
# metadata object (ret[1]) and resource (ret[2]) of every successful creation.
dep_res_meta = []
dep_res = []
for content_path in content_path_list:
    try:
        ret = create_resource_from_bag(content_path)
        if ret:
            dep_res_meta.append(ret[1])
            dep_res.append(ret[2])
    except Exception as ex:
        # best effort: report the failure and move on to the next directory.
        # str(ex) is used instead of ex.message so that the text is never lost.
        print(str(ex))
        continue

# Second pass: write the collected metadata to its resource and regenerate the
# bag metadata files. The two lists are appended to together, so they pair up.
for res_meta, res in zip(dep_res_meta, dep_res):
    try:
        res_meta.write_metadata_to_resource(res)
        create_bag_files(res)
    except Exception as ex:
        print(str(ex))
        continue
def download(request, path, rest_call=False, use_async=True, use_reverse_proxy=True,
             *args, **kwargs):
    """ perform a download request, either asynchronously or synchronously

    :param request: the request object.
    :param path: the path of the thing to be downloaded.
    :param rest_call: True if calling from REST API
    :param use_async: True means to utilize asynchronous creation of objects to download.
    :param use_reverse_proxy: True means to utilize NGINX reverse proxy for streaming.
    :param kwargs: an optional 'environment' key (primary key of a RodsEnvironment) selects
        the iRODS environment to use for a non-federated resource.
    :return: an HttpResponse (error message, JSON task status, or X-Accel-Redirect headers),
        an HttpResponseRedirect (to the resource landing page or a referenced url), or a
        FileResponse streaming the file.
    :raises PermissionDenied: when rest_call is True and the request is not authorized.
    :raises KeyError: when no iRODS session can be determined for a non-federated resource.

    The following variables are computed:

    * `path` is the public path of the thing to be downloaded.
    * `irods_path` is the location of `path` in irods.
    * `output_path` is the output path to be reported in the response object.
    * `irods_output_path` is the location of `output_path` in irods

    The `zipped` query parameter signals that the download should be zipped
        - folders are always zipped regardless of this parameter
        - single file aggregations are zipped with the aggregation metadata files

    and there are six cases. A path may point to:
    1. a single file
    2. a single-file-aggregation object in a composite resource.
    3. a folder
    4. a metadata object that may need updating.
    5. a bag that needs to be updated and then returned.
    6. a previously zipped file that was zipped asynchronously.
    """
    if __debug__:
        logger.debug("request path is {}".format(path))

    # NOTE(review): an empty path (or one made only of slashes) empties the list and
    # makes the [-1] lookup raise IndexError; presumably URL routing prevents this - confirm.
    split_path_strs = path.split('/')
    while split_path_strs[-1] == '':
        split_path_strs.pop()
    path = u'/'.join(split_path_strs)  # no trailing slash

    # initialize case variables
    is_bag_download = False
    is_zip_download = False
    is_zip_request = request.GET.get('zipped', "False").lower() == "true"
    is_sf_agg_file = False
    is_sf_request = False

    # the first path component selects the kind of request and where the resource id lives
    if split_path_strs[0] == 'bags':
        is_bag_download = True
        # format is bags/{rid}.zip
        res_id = os.path.splitext(split_path_strs[1])[0]
    elif split_path_strs[0] == 'zips':
        is_zip_download = True
        # zips prefix means that we are following up on an asynchronous download request
        # format is zips/{date}/{zip-uuid}/{public-path}.zip where {public-path} contains the rid
        res_id = split_path_strs[3]
    else:  # regular download request
        res_id = split_path_strs[0]

    if __debug__:
        logger.debug("resource id is {}".format(res_id))

    # now we have the resource Id and can authorize the request
    # if the resource does not exist in django, authorized will be false
    res, authorized, _ = authorize(request, res_id,
                                   needed_permission=ACTION_TO_AUTHORIZE.VIEW_RESOURCE,
                                   raises_exception=False)
    if not authorized:
        response = HttpResponse(status=401)
        content_msg = "You do not have permission to download this resource!"
        if rest_call:
            raise PermissionDenied(content_msg)
        else:
            response.content = "<h1>" + content_msg + "</h1>"
            return response

    # default values are changed later as needed

    istorage = res.get_irods_storage()
    if res.is_federated:
        irods_path = os.path.join(res.resource_federation_path, path)
    else:
        irods_path = path
    # in many cases, path and output_path are the same.
    output_path = path
    irods_output_path = irods_path

    # folder requests are automatically zipped
    if not is_bag_download and not is_zip_download:  # path points into resource: should I zip it?
        store_path = u'/'.join(split_path_strs[1:])  # data/contents/{path-to-something}
        if res.is_folder(store_path):  # automatically zip folders
            is_zip_request = True
            daily_date = datetime.datetime.today().strftime('%Y-%m-%d')
            output_path = "zips/{}/{}/{}.zip".format(daily_date, uuid4().hex, path)
            if res.is_federated:
                irods_output_path = os.path.join(res.resource_federation_path, output_path)
            else:
                irods_output_path = output_path
            if __debug__:
                logger.debug("automatically zipping folder {} to {}".format(path, output_path))
        elif istorage.exists(irods_path):
            if __debug__:
                logger.debug("request for single file {}".format(path))
            is_sf_request = True

            # check for single file aggregations
            if "data/contents/" in path:  # not a metadata file
                # scan the resource's files for the one stored at the requested path
                for f in ResourceFile.objects.filter(object_id=res.id):
                    if path == f.storage_path:
                        is_sf_agg_file = True
                        if not is_zip_request and f.has_logical_file and \
                                f.logical_file.is_single_file_aggregation:
                            download_url = request.GET.get('url_download', 'false').lower()
                            if download_url == 'false':
                                # redirect to referenced url in the url file instead
                                redirect_url = f.logical_file.redirect_url
                                if redirect_url:
                                    return HttpResponseRedirect(redirect_url)
                        if __debug__:
                            logger.debug(
                                "request for single file aggregation {}".format(path))
                        break

            if is_zip_request:
                # an explicitly requested zip of a single file gets its own zips/ output path
                daily_date = datetime.datetime.today().strftime('%Y-%m-%d')
                output_path = "zips/{}/{}/{}.zip".format(daily_date, uuid4().hex, path)
                if res.is_federated:
                    irods_output_path = os.path.join(res.resource_federation_path, output_path)
                else:
                    irods_output_path = output_path

    # After this point, we have valid path, irods_path, output_path, and irods_output_path
    # * is_zip_request: signals download should be zipped, folders are always zipped
    # * is_sf_agg_file: path is a single-file aggregation in Composite Resource
    # * is_sf_request: path is a single-file
    # flags for download:
    # * is_bag_download: download a bag in format bags/{rid}.zip
    # * is_zip_download: download a zipfile in format zips/{date}/{random guid}/{path}.zip
    # if none of these are set, it's a normal download

    # determine active session
    if res.is_federated:
        # the resource is stored in federated zone
        session = icommands.ACTIVE_SESSION
    else:
        # TODO: From Alva: I do not understand the use case for changing the environment.
        # TODO: This seems an enormous potential vulnerability, as arguments are
        # TODO: passed from the URI directly to IRODS without verification.
        if 'environment' in kwargs:
            logger.warn("setting iRODS from environment")
            environment = int(kwargs['environment'])
            environment = m.RodsEnvironment.objects.get(pk=environment)
            session = Session("/tmp/django_irods", settings.IRODS_ICOMMANDS_PATH,
                              session_id=uuid4())
            session.create_environment(environment)
            session.run('iinit', None, environment.auth)
        elif getattr(settings, 'IRODS_GLOBAL_SESSION', False):
            if __debug__:
                logger.debug("using GLOBAL_SESSION")
            session = GLOBAL_SESSION
        elif icommands.ACTIVE_SESSION:
            if __debug__:
                logger.debug("using ACTIVE_SESSION")
            session = icommands.ACTIVE_SESSION
        else:
            raise KeyError('settings must have IRODS_GLOBAL_SESSION set '
                           'if there is no environment object')

    resource_cls = check_resource_type(res.resource_type)

    if is_zip_request:

        if use_async:
            # queue the zip creation and schedule removal of the zip a day later
            task = create_temp_zip.apply_async((res_id, irods_path, irods_output_path,
                                                is_sf_agg_file, is_sf_request))
            delete_zip.apply_async((irods_output_path, ),
                                   countdown=(60 * 60 * 24))  # delete after 24 hours

            if rest_call:
                return HttpResponse(
                    json.dumps({
                        'zip_status': 'Not ready',
                        'task_id': task.task_id,
                        'download_path': '/django_irods/rest_download/' + output_path}),
                    content_type="application/json")
            else:
                # return status to the UI
                request.session['task_id'] = task.task_id
                # TODO: this is mistaken for a bag download in the UI!
                # TODO: multiple asynchronous downloads don't stack!
                request.session['download_path'] = '/django_irods/download/' + output_path
                # redirect to resource landing page, which interprets session variables.
                return HttpResponseRedirect(res.get_absolute_url())

        else:  # synchronous creation of download
            ret_status = create_temp_zip(res_id, irods_path, irods_output_path,
                                         is_sf_agg_file, is_sf_request)
            delete_zip.apply_async((irods_output_path, ),
                                   countdown=(60 * 60 * 24))  # delete after 24 hours
            if not ret_status:
                # NOTE(review): HttpResponse() is built without a status argument here,
                # so this failure message is not sent with an error status - confirm intended.
                content_msg = "Zip could not be created."
                response = HttpResponse()
                if rest_call:
                    response.content = content_msg
                else:
                    response.content = "<h1>" + content_msg + "</h1>"
                return response
            # At this point, output_path presumably exists and contains a zipfile
            # to be streamed below

    elif is_bag_download:
        # Shorten request if it contains extra junk at the end
        bag_file_name = res_id + '.zip'
        output_path = os.path.join('bags', bag_file_name)
        if not res.is_federated:
            irods_output_path = output_path
        else:
            irods_output_path = os.path.join(res.resource_federation_path, output_path)

        bag_modified = res.getAVU('bag_modified')
        # recreate the bag if it doesn't exist even if bag_modified is "false".
        if __debug__:
            logger.debug(u"irods_output_path is {}".format(irods_output_path))
        if bag_modified is None or not bag_modified:
            if not istorage.exists(irods_output_path):
                bag_modified = True

        # send signal for pre_check_bag_flag
        # this generates metadata other than that generated by create_bag_files.
        pre_check_bag_flag.send(sender=resource_cls, resource=res)

        metadata_dirty = res.getAVU('metadata_dirty')
        if metadata_dirty is None or metadata_dirty:
            create_bag_files(res)  # sets metadata_dirty to False
            # the string "True" is truthy, so the check below treats it like True
            bag_modified = "True"

        if bag_modified is None or bag_modified:
            if use_async:
                # task parameter has to be passed in as a tuple or list, hence (res_id,) is needed
                # Note that since we are using JSON for task parameter serialization, no complex
                # object can be passed as parameters to a celery task
                task = create_bag_by_irods.apply_async((res_id,), countdown=3)
                if rest_call:
                    return HttpResponse(json.dumps({'bag_status': 'Not ready',
                                                    'task_id': task.task_id}),
                                        content_type="application/json")

                request.session['task_id'] = task.task_id
                request.session['download_path'] = request.path
                return HttpResponseRedirect(res.get_absolute_url())
            else:
                ret_status = create_bag_by_irods(res_id)
                if not ret_status:
                    # NOTE(review): as with the zip failure above, no error status is set.
                    content_msg = "Bag cannot be created successfully. Check log for details."
                    response = HttpResponse()
                    if rest_call:
                        response.content = content_msg
                    else:
                        response.content = "<h1>" + content_msg + "</h1>"
                    return response

    else:  # regular file download
        # if fetching main metadata files, then these need to be refreshed.
        if path.endswith("resourcemap.xml") or path.endswith('resourcemetadata.xml'):
            metadata_dirty = res.getAVU("metadata_dirty")
            if metadata_dirty is None or metadata_dirty:
                create_bag_files(res)  # sets metadata_dirty to False

        # send signal for pre download file
        # TODO: does not contain subdirectory information: duplicate refreshes possible
        download_file_name = split_path_strs[-1]  # end of path
        # this logs the download request in the tracking system
        pre_download_file.send(sender=resource_cls, resource=res,
                               download_file_name=download_file_name,
                               request=request)

    # If we get this far,
    # * path and irods_path point to true input
    # * output_path and irods_output_path point to true output.
    # Try to stream the file back to the requester.

    # obtain mime_type to set content_type
    mtype = 'application-x/octet-stream'
    mime_type = mimetypes.guess_type(output_path)
    if mime_type[0] is not None:
        mtype = mime_type[0]
    # retrieve file size to set up Content-Length header
    # TODO: standardize this to make it less brittle
    # the fourth whitespace-separated field of the `ils -l` output is taken as the size
    stdout = session.run("ils", None, "-l", irods_output_path)[0].split()
    flen = int(stdout[3])

    # Allow reverse proxy if request was forwarded by nginx (HTTP_X_DJANGO_REVERSE_PROXY='true')
    # and reverse proxy is possible according to configuration (SENDFILE_ON=True)
    # and reverse proxy isn't overridden by user (use_reverse_proxy=True).

    if use_reverse_proxy and getattr(settings, 'SENDFILE_ON', False) and \
            'HTTP_X_DJANGO_REVERSE_PROXY' in request.META:

        # The NGINX sendfile abstraction is invoked as follows:
        # 1. The request to download a file enters this routine via the /rest_download or /download
        #    url in ./urls.py. It is redirected here from Django. The URI contains either the
        #    unqualified resource path or the federated resource path, depending upon whether
        #    the request is local or federated.
        # 2. This deals with unfederated resources by redirecting them to the uri
        #    /irods-data/{resource-id}/... on nginx. This URI is configured to read the file
        #    directly from the iRODS vault via NFS, and does not work for direct access to the
        #    vault due to the 'internal;' declaration in NGINX.
        # 3. This deals with federated resources by reading their path, matching local vaults, and
        #    redirecting to URIs that are in turn mapped to read from appropriate iRODS vaults. At
        #    present, the only one of these is /irods-user, which handles files whose federation
        #    path is stored in the variable 'userpath'.
        # 4. If there is no vault available for the resource, the file is transferred without
        #    NGINX, exactly as it was transferred previously.

        # If this path is resource_federation_path, then the file is a local user file
        userpath = '/' + os.path.join(
            getattr(settings, 'HS_USER_IRODS_ZONE', 'hydroshareuserZone'),
            'home',
            getattr(settings, 'HS_LOCAL_PROXY_USER_IN_FED_ZONE', 'localHydroProxy'))

        # stop NGINX targets that are non-existent from hanging forever.
        if not istorage.exists(irods_output_path):
            content_msg = "file path {} does not exist in iRODS".format(output_path)
            response = HttpResponse(status=404)
            if rest_call:
                response.content = content_msg
            else:
                response.content = "<h1>" + content_msg + "</h1>"
            return response

        if not res.is_federated:
            # track download count
            res.update_download_count()

            # invoke X-Accel-Redirect on physical vault file in nginx
            response = HttpResponse(content_type=mtype)
            response['Content-Disposition'] = 'attachment; filename="{name}"'.format(
                name=output_path.split('/')[-1])
            response['Content-Length'] = flen
            response['X-Accel-Redirect'] = '/'.join([
                getattr(settings, 'IRODS_DATA_URI', '/irods-data'), output_path])
            if __debug__:
                logger.debug("Reverse proxying local {}".format(response['X-Accel-Redirect']))
            return response

        elif res.resource_federation_path == userpath:  # this guarantees a "user" resource
            # track download count
            res.update_download_count()

            # invoke X-Accel-Redirect on physical vault file in nginx
            response = HttpResponse(content_type=mtype)
            response['Content-Disposition'] = 'attachment; filename="{name}"'.format(
                name=output_path.split('/')[-1])
            response['Content-Length'] = flen
            response['X-Accel-Redirect'] = os.path.join(
                getattr(settings, 'IRODS_USER_URI', '/irods-user'), output_path)
            if __debug__:
                logger.debug("Reverse proxying user {}".format(response['X-Accel-Redirect']))
            return response

    # if we get here, none of the above conditions are true
    # if reverse proxy is enabled, then this is because the resource is remote and federated
    # OR the user specifically requested a non-proxied download.

    options = ('-',)  # we're redirecting to stdout.
    # this unusual way of calling works for streaming federated or local resources
    if __debug__:
        logger.debug("Locally streaming {}".format(output_path))
    # track download count
    res.update_download_count()
    proc = session.run_safe('iget', None, irods_output_path, *options)
    response = FileResponse(proc.stdout, content_type=mtype)
    response['Content-Disposition'] = 'attachment; filename="{name}"'.format(
        name=output_path.split('/')[-1])
    response['Content-Length'] = flen
    return response
def create_bag_by_irods(resource_id):
    """Create a resource bag on iRODS side by running the bagit rule and ibun zip.

    This function runs as a celery task, invoked asynchronously so that it does not
    block the main web thread when it creates bags for very large files which will
    take some time.

    If the 'metadata_dirty' AVU on the resource root is unset or "true", the metadata
    files are regenerated first. On success the 'bag_modified' AVU is set to "false";
    for a published resource the checksum of the bag is also stored on the resource.

    :param resource_id: the resource uuid that is used to look for the resource to
        create the bag for.
    :return: True if bag creation operation succeeds; False if there is an exception
        raised or resource does not exist.
    """
    res = utils.get_resource_by_shortkey(resource_id)
    istorage = res.get_irods_storage()
    bag_path = res.bag_path

    metadata_dirty = istorage.getAVU(res.root_path, 'metadata_dirty')
    # if metadata has been changed, then regenerate metadata xml files
    if metadata_dirty is None or metadata_dirty.lower() == "true":
        try:
            create_bag_files(res)
        except Exception as ex:
            # format the exception itself: ex.message is empty for multi-argument
            # exceptions and does not exist at all on Python 3
            logger.error('Failed to create bag files. Error:{}'.format(ex))
            return False

    irods_bagit_input_path = res.get_irods_path(resource_id, prepend_short_id=False)
    # check to see if bagit readme.txt file exists or not
    bagit_readme_file = res.get_irods_path('readme.txt')
    is_bagit_readme_exist = istorage.exists(bagit_readme_file)

    if irods_bagit_input_path.startswith(resource_id):
        # resource is in data zone, need to append the full path for iRODS bagit rule execution
        irods_dest_prefix = "/" + settings.IRODS_ZONE + "/home/" + settings.IRODS_USERNAME
        irods_bagit_input_path = os.path.join(irods_dest_prefix, resource_id)
        bagit_input_resource = "*DESTRESC='{def_res}'".format(
            def_res=settings.IRODS_DEFAULT_RESOURCE)
    else:
        # this will need to be changed with the default resource in whatever federated zone the
        # resource is stored in when we have such use cases to support
        bagit_input_resource = "*DESTRESC='{def_res}'".format(
            def_res=settings.HS_IRODS_USER_ZONE_DEF_RES)

    bagit_input_path = "*BAGITDATA='{path}'".format(path=irods_bagit_input_path)

    # files that a failed run may leave behind and that must then be removed
    bagit_files = [
        res.get_irods_path('bagit.txt'),
        res.get_irods_path('manifest-md5.txt'),
        res.get_irods_path('tagmanifest-md5.txt'),
        bag_path
    ]

    # only proceed when the resource is not deleted potentially by another request
    # when being downloaded
    if not istorage.exists(irods_bagit_input_path):
        logger.error('Resource does not exist.')
        return False

    # if bagit readme.txt does not exist, add it.
    if not is_bagit_readme_exist:
        from_file_name = getattr(settings, 'HS_BAGIT_README_FILE_WITH_PATH',
                                 'docs/bagit/readme.txt')
        istorage.saveFile(from_file_name, bagit_readme_file, True)

    # call iRODS bagit rule here
    bagit_rule_file = getattr(settings, 'IRODS_BAGIT_RULE',
                              'hydroshare/irods/ruleGenerateBagIt_HS.r')

    try:
        # call iRODS run and ibun command to create and zip the bag, ignore SessionException
        # for now as a workaround which could be raised from potential race conditions when
        # multiple ibun commands try to create the same zip file or the very same resource
        # gets deleted by another request when being downloaded
        istorage.runBagitRule(bagit_rule_file, bagit_input_path, bagit_input_resource)
        istorage.zipup(irods_bagit_input_path, bag_path)
        if res.raccess.published:
            # compute checksum to meet DataONE distribution requirement
            res.bag_checksum = istorage.checksum(bag_path)
        istorage.setAVU(irods_bagit_input_path, 'bag_modified', "false")
        return True
    except SessionException as ex:
        # log the root cause first, so it is recorded even if the cleanup below fails
        logger.error(ex.stderr)
        # if an exception occurs, delete incomplete files potentially being generated by
        # iRODS bagit rule and zipping operations
        for fname in bagit_files:
            if istorage.exists(fname):
                istorage.delete(fname)
        return False
def test_create_bag_files(self):
    """create_bag_files must return an IrodsStorage object for the test resource."""
    # this is the api call we are testing
    irods_storage_obj = hs_bagit.create_bag_files(self.test_res)
    # assertIsInstance reports the actual type on failure, unlike assertTrue(isinstance(...))
    self.assertIsInstance(irods_storage_obj, IrodsStorage)
def create_bag_by_irods(resource_id):
    """Create a resource bag on iRODS side by running the bagit rule and ibun zip.

    This function runs as a celery task, invoked asynchronously so that it does not
    block the main web thread when it creates bags for very large files which will
    take some time.

    If the 'metadata_dirty' AVU on the resource root is unset or "true", the metadata
    files are regenerated first. On success the 'bag_modified' AVU is set to "false".

    :param resource_id: the resource uuid that is used to look for the resource to
        create the bag for.
    :return: True if bag creation operation succeeds; False if there is an exception
        raised or resource does not exist.
    """
    from hs_core.hydroshare.utils import get_resource_by_shortkey

    res = get_resource_by_shortkey(resource_id)
    istorage = res.get_irods_storage()

    metadata_dirty = istorage.getAVU(res.root_path, 'metadata_dirty')
    # if metadata has been changed, then regenerate metadata xml files
    if metadata_dirty is None or metadata_dirty.lower() == "true":
        try:
            create_bag_files(res)
        except Exception as ex:
            # format the exception itself: ex.message is empty for multi-argument
            # exceptions and does not exist at all on Python 3
            logger.error('Failed to create bag files. Error:{}'.format(ex))
            return False

    bag_full_name = 'bags/{res_id}.zip'.format(res_id=resource_id)
    fed_path = res.resource_federation_path
    if fed_path:
        # federated resource: every path is qualified with the federation path
        irods_bagit_input_path = os.path.join(fed_path, resource_id)
        is_exist = istorage.exists(irods_bagit_input_path)
        res_coll = '{fed_path}/{res_id}'.format(fed_path=fed_path, res_id=resource_id)
        bag_zip = '{fed_path}/bags/{res_id}.zip'.format(fed_path=fed_path, res_id=resource_id)
        bagit_input_resource = "*DESTRESC='{def_res}'".format(
            def_res=settings.HS_IRODS_USER_ZONE_DEF_RES)
        bag_full_name = os.path.join(fed_path, bag_full_name)
    else:
        # local resource: storage paths are relative, but the bagit rule needs the full path
        is_exist = istorage.exists(resource_id)
        res_coll = resource_id
        bag_zip = 'bags/{res_id}.zip'.format(res_id=resource_id)
        irods_dest_prefix = "/" + settings.IRODS_ZONE + "/home/" + settings.IRODS_USERNAME
        irods_bagit_input_path = os.path.join(irods_dest_prefix, resource_id)
        bagit_input_resource = "*DESTRESC='{def_res}'".format(
            def_res=settings.IRODS_DEFAULT_RESOURCE)

    # check to see if bagit readme.txt file exists or not
    bagit_readme_file = '{coll}/readme.txt'.format(coll=res_coll)
    is_bagit_readme_exist = istorage.exists(bagit_readme_file)

    bagit_input_path = "*BAGITDATA='{path}'".format(path=irods_bagit_input_path)

    # files that a failed run may leave behind and that must then be removed
    bagit_files = [
        '{coll}/bagit.txt'.format(coll=res_coll),
        '{coll}/manifest-md5.txt'.format(coll=res_coll),
        '{coll}/tagmanifest-md5.txt'.format(coll=res_coll),
        bag_zip
    ]

    # only proceed when the resource is not deleted potentially by another request
    # when being downloaded
    if not is_exist:
        logger.error('Resource does not exist.')
        return False

    # if bagit readme.txt does not exist, add it.
    if not is_bagit_readme_exist:
        from_file_name = getattr(settings, 'HS_BAGIT_README_FILE_WITH_PATH',
                                 'docs/bagit/readme.txt')
        istorage.saveFile(from_file_name, bagit_readme_file, True)

    # call iRODS bagit rule here
    bagit_rule_file = getattr(settings, 'IRODS_BAGIT_RULE',
                              'hydroshare/irods/ruleGenerateBagIt_HS.r')

    try:
        # call iRODS run and ibun command to create and zip the bag, ignore SessionException
        # for now as a workaround which could be raised from potential race conditions when
        # multiple ibun commands try to create the same zip file or the very same resource
        # gets deleted by another request when being downloaded
        istorage.runBagitRule(bagit_rule_file, bagit_input_path, bagit_input_resource)
        istorage.zipup(irods_bagit_input_path, bag_full_name)
        istorage.setAVU(irods_bagit_input_path, 'bag_modified', "false")
        return True
    except SessionException as ex:
        # log the root cause first, so it is recorded even if the cleanup below fails
        logger.error(ex.stderr)
        # if an exception occurs, delete incomplete files potentially being generated by
        # iRODS bagit rule and zipping operations
        for fname in bagit_files:
            if istorage.exists(fname):
                istorage.delete(fname)
        return False
def create_bag_by_irods(resource_id):
    """Create a resource bag on iRODS side by running the bagit rule and ibun zip.

    This function runs as a celery task, invoked asynchronously so that it does not
    block the main web thread when it creates bags for very large files which will
    take some time.

    If the 'metadata_dirty' AVU on the resource root is unset or "true", the metadata
    files are regenerated first. On success the 'bag_modified' AVU is set to "false".

    :param resource_id: the resource uuid that is used to look for the resource to
        create the bag for.
    :return: True if bag creation operation succeeds; False if there is an exception
        raised or resource does not exist.
    """
    from hs_core.hydroshare.utils import get_resource_by_shortkey

    res = get_resource_by_shortkey(resource_id)
    istorage = res.get_irods_storage()

    metadata_dirty = istorage.getAVU(res.root_path, 'metadata_dirty')
    # if metadata has been changed, then regenerate metadata xml files
    if metadata_dirty is None or metadata_dirty.lower() == "true":
        try:
            create_bag_files(res)
        except Exception as ex:
            # format the exception itself: ex.message is empty for multi-argument
            # exceptions and does not exist at all on Python 3
            logger.error('Failed to create bag files. Error:{}'.format(ex))
            return False

    bag_full_name = 'bags/{res_id}.zip'.format(res_id=resource_id)
    fed_path = res.resource_federation_path
    if fed_path:
        # federated resource: every path is qualified with the federation path
        irods_bagit_input_path = os.path.join(fed_path, resource_id)
        is_exist = istorage.exists(irods_bagit_input_path)
        res_coll = '{fed_path}/{res_id}'.format(fed_path=fed_path, res_id=resource_id)
        bag_zip = '{fed_path}/bags/{res_id}.zip'.format(fed_path=fed_path, res_id=resource_id)
        bagit_input_resource = "*DESTRESC='{def_res}'".format(
            def_res=settings.HS_IRODS_LOCAL_ZONE_DEF_RES)
        bag_full_name = os.path.join(fed_path, bag_full_name)
    else:
        # local resource: storage paths are relative, but the bagit rule needs the full path
        is_exist = istorage.exists(resource_id)
        res_coll = resource_id
        bag_zip = 'bags/{res_id}.zip'.format(res_id=resource_id)
        irods_dest_prefix = "/" + settings.IRODS_ZONE + "/home/" + settings.IRODS_USERNAME
        irods_bagit_input_path = os.path.join(irods_dest_prefix, resource_id)
        bagit_input_resource = "*DESTRESC='{def_res}'".format(
            def_res=settings.IRODS_DEFAULT_RESOURCE)

    # check to see if bagit readme.txt file exists or not
    bagit_readme_file = '{coll}/readme.txt'.format(coll=res_coll)
    is_bagit_readme_exist = istorage.exists(bagit_readme_file)

    bagit_input_path = "*BAGITDATA='{path}'".format(path=irods_bagit_input_path)

    # files that a failed run may leave behind and that must then be removed
    bagit_files = [
        '{coll}/bagit.txt'.format(coll=res_coll),
        '{coll}/manifest-md5.txt'.format(coll=res_coll),
        '{coll}/tagmanifest-md5.txt'.format(coll=res_coll),
        bag_zip
    ]

    # only proceed when the resource is not deleted potentially by another request
    # when being downloaded
    if not is_exist:
        logger.error('Resource does not exist.')
        return False

    # if bagit readme.txt does not exist, add it.
    if not is_bagit_readme_exist:
        from_file_name = getattr(settings, 'HS_BAGIT_README_FILE_WITH_PATH',
                                 'docs/bagit/readme.txt')
        istorage.saveFile(from_file_name, bagit_readme_file, True)

    # call iRODS bagit rule here
    bagit_rule_file = getattr(settings, 'IRODS_BAGIT_RULE',
                              'hydroshare/irods/ruleGenerateBagIt_HS.r')

    try:
        # call iRODS run and ibun command to create and zip the bag, ignore SessionException
        # for now as a workaround which could be raised from potential race conditions when
        # multiple ibun commands try to create the same zip file or the very same resource
        # gets deleted by another request when being downloaded
        istorage.runBagitRule(bagit_rule_file, bagit_input_path, bagit_input_resource)
        istorage.zipup(irods_bagit_input_path, bag_full_name)
        istorage.setAVU(irods_bagit_input_path, 'bag_modified', "false")
        return True
    except SessionException as ex:
        # log the root cause first, so it is recorded even if the cleanup below fails
        logger.error(ex.stderr)
        # if an exception occurs, delete incomplete files potentially being generated by
        # iRODS bagit rule and zipping operations
        for fname in bagit_files:
            if istorage.exists(fname):
                istorage.delete(fname)
        return False
def test_create_bag_files(self):
    """create_bag_files must return an IrodsStorage object for the test resource."""
    # this is the api call we are testing
    irods_storage_obj = hs_bagit.create_bag_files(self.test_res)
    # assertIsInstance reports the actual type on failure, unlike assertTrue(isinstance(...))
    self.assertIsInstance(irods_storage_obj, IrodsStorage)
def handle(self, *args, **options):
    """
    Report bag/metadata status for each requested resource and apply one action.

    For every short id in ``options['resource_ids']`` this prints whether the
    science metadata, resource map and bag exist in iRODS, prints the
    ``metadata_dirty`` and ``bag_modified`` AVUs, and then runs the first
    enabled option among ``generate``, ``generate_metadata``, ``generate_bag``,
    ``reset``, ``reset_metadata`` and ``reset_bag`` (checked in that order).
    Resources that are not found in Django are reported and skipped.
    """

    def show_presence(storage, path, found_fmt, missing_fmt):
        # print the message matching whether path exists in iRODS
        if storage.exists(path):
            print(found_fmt.format(path))
        else:
            print(missing_fmt.format(path))

    def remove(storage, path, deleted_fmt):
        # best-effort delete: a SessionException is reported, not raised
        try:
            storage.delete(path)
            print(deleted_fmt.format(path))
        except SessionException as ex:
            print("delete of {} failed: {}".format(path, ex.stderr))

    def regenerate_metadata(res, storage, res_id):
        create_bag_files(res)
        print("metadata generated for {} from Django".format(res_id))
        res.setAVU('metadata_dirty', 'false')
        print("metadata_dirty set to false for {}".format(res_id))

    def regenerate_bag(res, storage, res_id):
        create_bag_by_irods(res_id)
        print("bag generated for {} from iRODs".format(res_id))
        res.setAVU('bag_modified', 'false')
        print("bag_modified set to false for {}".format(res_id))

    def regenerate_all(res, storage, res_id):
        # generate usable bag
        regenerate_metadata(res, storage, res_id)
        regenerate_bag(res, storage, res_id)

    def reset_metadata(res, storage, res_id):
        res.setAVU('metadata_dirty', 'true')
        print("metadata_dirty set to true for {}".format(res_id))
        remove(storage, res.scimeta_path, "metadata {} deleted")
        remove(storage, res.resmap_path, "map {} deleted")

    def reset_bag(res, storage, res_id):
        res.setAVU('bag_modified', 'true')
        print("bag_modified set to true for {}".format(res_id))
        remove(storage, res.bag_path, "bag {} deleted")

    def reset_all(res, storage, res_id):
        # reset all data to pristine
        reset_metadata(res, storage, res_id)
        reset_bag(res, storage, res_id)

    # option name -> action; only the first enabled option is applied
    actions = (
        ('generate', regenerate_all),
        ('generate_metadata', regenerate_metadata),
        ('generate_bag', regenerate_bag),
        ('reset', reset_all),
        ('reset_metadata', reset_metadata),
        ('reset_bag', reset_bag),
    )

    # an array of resource short_id to check.
    requested_ids = options['resource_ids']
    if not len(requested_ids) > 0:
        return

    for rid in requested_ids:
        try:
            resource = BaseResource.objects.get(short_id=rid)
            istorage = resource.get_irods_storage()

            show_presence(
                istorage,
                os.path.join(resource.root_path, 'data', 'resourcemetadata.xml'),
                "found {}", "{} NOT FOUND")
            show_presence(
                istorage,
                os.path.join(resource.root_path, 'data', 'resourcemap.xml'),
                "found {}", "{} NOT FOUND")
            show_presence(istorage, resource.bag_path,
                          "found bag {}", "bag {} NOT FOUND")

            dirty = resource.getAVU('metadata_dirty')
            print("metadata_dirty is {}".format(str(dirty)))
            modified = resource.getAVU('bag_modified')
            print("bag_modified is {}".format(str(modified)))

            for option_name, action in actions:
                if options[option_name]:
                    action(resource, istorage, rid)
                    break
        except BaseResource.DoesNotExist:
            print("Resource with id {} NOT FOUND in Django".format(rid))
def _report_presence(istorage, label, path):
    """Print whether ``path`` exists in iRODS and return the existence flag."""
    present = istorage.exists(path)
    if present:
        print("{} {} found".format(label, path))
    else:
        print("{} {} NOT FOUND".format(label, path))
    return present


def _delete_and_report(istorage, path, failure_format="{} delete failed: {}"):
    """Delete ``path`` from iRODS, printing the outcome instead of raising SessionException."""
    try:
        istorage.delete(path)
        print("{} deleted".format(path))
    except SessionException as ex:
        print(failure_format.format(path, ex.stderr))


def _reset_metadata(resource, istorage, rid,
                    scimeta_failure_format="{} delete failed: {}"):
    """Mark metadata dirty and delete the science metadata and resource map files."""
    resource.setAVU('metadata_dirty', 'true')
    print("{}.metadata_dirty set to true".format(rid))
    _delete_and_report(istorage, resource.scimeta_path, scimeta_failure_format)
    _delete_and_report(istorage, resource.resmap_path)


def _reset_bag(resource, istorage, rid):
    """Mark the bag modified and delete the bag file."""
    resource.setAVU('bag_modified', 'true')
    print("{}.bag_modified set to true".format(rid))
    _delete_and_report(istorage, resource.bag_path)


def _fetch_bag(rid, options, first_chunk_only=False):
    """Request the resource over the REST API and write the response to tmp/check_bag_block.

    With ``first_chunk_only`` set, only the first 128-byte chunk is written.
    """
    server = getattr(settings, 'FQDN_OR_IP', 'www.hydroshare.org')
    uri = "https://{}/hsapi/resource/{}/".format(server, rid)
    print("download uri is {}".format(uri))
    r = hs_requests.get(uri, verify=False, stream=True,
                        auth=requests.auth.HTTPBasicAuth(options['login'],
                                                         options['password']))
    print("download return status is {}".format(str(r.status_code)))
    print("redirects:")
    for thing in r.history:
        print("...url: {}".format(thing.url))
    filename = 'tmp/check_bag_block'
    with open(filename, 'wb') as fd:
        for chunk in r.iter_content(chunk_size=128):
            fd.write(chunk)
            if first_chunk_only:
                break


def check_bag(rid, options):
    """
    Print the bag/metadata status of one resource and apply the requested options.

    The status report covers the existence of the science metadata, resource
    map and bag in iRODS plus the ``metadata_dirty`` and ``bag_modified`` AVUs.
    Afterwards each enabled option is applied in this order: ``reset``,
    ``reset_metadata``, ``reset_bag``, ``generate``, ``generate_metadata``,
    ``generate_bag``, ``download_bag``, ``open_bag``. With ``if_needed`` set,
    generation only happens when the flags or missing files call for it.

    :param rid: short id of the resource to check
    :param options: mapping of option names to values; also supplies ``login``
        and ``password`` for the download options
    :return: None. Problems (resource missing in Django or iRODS, delete
        failures, ValueError during metadata generation) are printed.
    """
    requests.packages.urllib3.disable_warnings()
    try:
        resource = BaseResource.objects.get(short_id=rid)
        istorage = resource.get_irods_storage()

        if not istorage.exists(resource.root_path):
            print("Resource with id {} does not exist in iRODS".format(rid))
            return

        # print status of metadata/bag system
        scimeta_exists = _report_presence(
            istorage, "resource metadata",
            os.path.join(resource.root_path, 'data', 'resourcemetadata.xml'))
        resmap_exists = _report_presence(
            istorage, "resource map",
            os.path.join(resource.root_path, 'data', 'resourcemap.xml'))
        bag_exists = _report_presence(istorage, "bag", resource.bag_path)

        dirty = resource.getAVU('metadata_dirty')
        print("{}.metadata_dirty is {}".format(rid, str(dirty)))
        modified = resource.getAVU('bag_modified')
        print("{}.bag_modified is {}".format(rid, str(modified)))

        # NOTE(review): the flags and existence checks above are read once and
        # are not refreshed after a reset or a metadata regeneration below, so
        # combining those with if_needed decides on stale values - confirm
        # whether that is intended.

        if options['reset']:  # reset all data to pristine
            _reset_metadata(resource, istorage, rid)
            _reset_bag(resource, istorage, rid)

        if options['reset_metadata']:
            # this branch has always reported a failed scimeta delete with
            # different wording; kept so existing output does not change
            _reset_metadata(resource, istorage, rid,
                            scimeta_failure_format="delete of {} failed: {}")

        if options['reset_bag']:
            _reset_bag(resource, istorage, rid)

        if options['generate']:  # generate usable bag
            if not options['if_needed'] or dirty or not scimeta_exists or not resmap_exists:
                try:
                    create_bag_files(resource)
                except ValueError as e:
                    # str(e) rather than e.message: the attribute is gone in Python 3
                    print("{}: value error encountered: {}".format(rid, str(e)))
                    return

                print("{} metadata generated from Django".format(rid))
                resource.setAVU('metadata_dirty', 'false')
                resource.setAVU('bag_modified', 'true')
                print("{}.metadata_dirty set to false".format(rid))

            if not options['if_needed'] or modified or not bag_exists:
                create_bag_by_irods(rid)
                print("{} bag generated from iRODs".format(rid))
                resource.setAVU('bag_modified', 'false')
                print("{}.bag_modified set to false".format(rid))

        if options['generate_metadata']:
            if not options['if_needed'] or dirty or not scimeta_exists or not resmap_exists:
                try:
                    create_bag_files(resource)
                except ValueError as e:
                    print("{}: value error encountered: {}".format(rid, str(e)))
                    return
                print("{}: metadata generated from Django".format(rid))
                resource.setAVU('metadata_dirty', 'false')
                print("{}.metadata_dirty set to false".format(rid))
                # new metadata invalidates the existing bag
                resource.setAVU('bag_modified', 'true')
                print("{}.bag_modified set to true".format(rid))

        if options['generate_bag']:
            if not options['if_needed'] or modified or not bag_exists:
                create_bag_by_irods(rid)
                print("{}: bag generated from iRODs".format(rid))
                resource.setAVU('bag_modified', 'false')
                print("{}.bag_modified set to false".format(rid))

        if options['download_bag']:
            if options['password']:
                _fetch_bag(rid, options)
            else:
                print("cannot download bag without username and password.")

        if options['open_bag']:
            if options['password']:
                # only the first block is read: enough to show the bag opens
                _fetch_bag(rid, options, first_chunk_only=True)
            else:
                print("cannot open bag without username and password.")

    except BaseResource.DoesNotExist:
        print("Resource with id {} NOT FOUND in Django".format(rid))
def put(self, request, pk):
    """
    Update a resource's science metadata from an uploaded resourcemetadata.xml.

    The request must carry exactly one file, named ``resourcemetadata.xml``,
    whose content type is listed in ``self.ACCEPT_FORMATS``. The upload is
    placed next to the resource's existing resource map in a temporary
    bag-like directory, read back through
    ``GenericResourceMeta.read_metadata_from_resource_bag``, written to the
    resource, and the bag metadata files are regenerated.

    :param request: the incoming request; ``request.FILES`` holds the upload
    :param pk: id of the resource to update
    :return: a Response with ``{'resource_id': pk}`` and HTTP status 202
    :raises PermissionDenied: if the requester may not edit the resource
    :raises ValidationError: if the upload is missing, duplicated, misnamed,
        of an unaccepted content type, or cannot be deserialized
    """
    # Update science metadata based on resourcemetadata.xml uploaded
    resource, authorized, _ = view_utils.authorize(
        request, pk,
        needed_permission=ACTION_TO_AUTHORIZE.EDIT_RESOURCE,
        raises_exception=False)
    if not authorized:
        raise PermissionDenied()

    # materialize the uploads: on Python 3 values() is neither sized nor indexable
    files = list(request.FILES.values())
    if len(files) == 0:
        error_msg = {
            'file': 'No resourcemetadata.xml file was found to update resource '
                    'metadata.'
        }
        raise ValidationError(detail=error_msg)
    elif len(files) > 1:
        error_msg = {
            'file': ('More than one file was found. Only one file, named '
                     'resourcemetadata.xml, '
                     'can be used to update resource metadata.')
        }
        raise ValidationError(detail=error_msg)

    scimeta = files[0]
    if scimeta.content_type not in self.ACCEPT_FORMATS:
        error_msg = {
            'file': ("Uploaded file has content type {t}, "
                     "but only these types are accepted: {e}.").format(
                t=scimeta.content_type, e=",".join(self.ACCEPT_FORMATS))
        }
        raise ValidationError(detail=error_msg)
    expect = 'resourcemetadata.xml'
    if scimeta.name != expect:
        error_msg = {
            'file': "Uploaded file has name {n}, but expected {e}.".format(
                n=scimeta.name, e=expect)
        }
        raise ValidationError(detail=error_msg)

    # Temp directory to store resourcemetadata.xml
    tmp_dir = tempfile.mkdtemp()
    try:
        # Fake the bag structure so that GenericResourceMeta.read_metadata_from_resource_bag
        # can read and validate the system and science metadata for us.
        bag_data_path = os.path.join(tmp_dir, 'data')
        os.mkdir(bag_data_path)
        # Copy new science metadata to bag data path
        # NOTE(review): temporary_file_path() presumably requires the upload to
        # have been spooled to disk - confirm the view's upload handlers.
        scimeta_path = os.path.join(bag_data_path, 'resourcemetadata.xml')
        shutil.copy(scimeta.temporary_file_path(), scimeta_path)
        # Copy existing resource map to bag data path
        # (use a file-like object as the file may be in iRODS, so we can't
        #  just copy it to a local path)
        resmeta_path = os.path.join(bag_data_path, 'resourcemap.xml')
        with open(resmeta_path, 'wb') as resmeta:
            storage = get_file_storage()
            resmeta_irods = storage.open(AbstractResource.sysmeta_path(pk))
            try:
                shutil.copyfileobj(resmeta_irods, resmeta)
            finally:
                # release the storage handle even when the copy fails
                resmeta_irods.close()

        try:
            # Read resource system and science metadata
            domain = Site.objects.get_current().domain
            rm = GenericResourceMeta.read_metadata_from_resource_bag(
                tmp_dir, hydroshare_host=domain)
            # Update resource metadata
            rm.write_metadata_to_resource(resource, update_title=True,
                                          update_keywords=True)
            create_bag_files(resource)
        except HsDeserializationDependencyException as e:
            msg = (
                "HsDeserializationDependencyException encountered when updating "
                "science metadata for resource {pk}; depedent resource was {dep}."
            )
            msg = msg.format(pk=pk, dep=e.dependency_resource_id)
            logger.error(msg)
            raise ValidationError(detail=msg)
        except HsDeserializationException as e:
            # exceptions only carry .message on Python 2 (or if the class sets
            # it); fall back to str(e) so this does not raise AttributeError
            raise ValidationError(detail=getattr(e, 'message', str(e)))

        resource_modified(resource, request.user, overwrite_bag=False)
        return Response(data={'resource_id': pk}, status=status.HTTP_202_ACCEPTED)
    finally:
        shutil.rmtree(tmp_dir)
def download(request, path, rest_call=False, use_async=True, use_reverse_proxy=True,
             *args, **kwargs):
    """
    Serve a resource file, a resource bag, or an on-demand zip from iRODS.

    The first component of ``path`` selects the mode: ``bags`` requests the
    resource bag, ``zips`` requests a zipped folder or file, and anything else
    is taken to be the resource id followed by a path inside the resource.
    Bags and zips that are missing or out of date are (re)created before the
    download is served.

    :param request: the incoming HTTP request; its session, path and META are used
    :param path: path of the requested item
    :param rest_call: when True, failures and "not ready" states are reported as
        plain text, JSON or PermissionDenied instead of HTML or a redirect
    :param use_async: when True, zip and bag creation are queued with
        ``apply_async`` and the caller is told the result is not ready; when
        False they are created inline before responding
    :param use_reverse_proxy: when True, an X-Accel-Redirect response may be
        returned if ``settings.SENDFILE_ON`` is set and the request carries
        ``HTTP_X_DJANGO_REVERSE_PROXY``
    :param kwargs: may contain ``environment``, the pk of a RodsEnvironment used
        to build the iRODS session for a non-federated resource
    :return: an HttpResponse, HttpResponseRedirect or FileResponse
    :raises PermissionDenied: if access is not authorized and ``rest_call`` is True
    :raises KeyError: if no iRODS session can be chosen for a non-federated resource
    """
    split_path_strs = path.split('/')
    is_bag_download = False
    is_zip_download = False
    is_sf_agg_file = False
    # work out the resource id from the path layout
    if split_path_strs[0] == 'bags':
        res_id = os.path.splitext(split_path_strs[1])[0]
        is_bag_download = True
    elif split_path_strs[0] == 'zips':
        if path.endswith('.zip'):
            res_id = os.path.splitext(split_path_strs[2])[0]
        else:
            res_id = os.path.splitext(split_path_strs[1])[0]
        is_zip_download = True
    else:
        res_id = split_path_strs[0]

    # if the resource does not exist in django, authorized will be false
    res, authorized, _ = authorize(
        request, res_id, needed_permission=ACTION_TO_AUTHORIZE.VIEW_RESOURCE,
        raises_exception=False)
    if not authorized:
        response = HttpResponse(status=401)
        content_msg = "You do not have permission to download this resource!"
        if rest_call:
            raise PermissionDenied(content_msg)
        else:
            response.content = "<h1>" + content_msg + "</h1>"
            return response

    # a file that is a single-file aggregation is delivered as a zip
    if res.resource_type == "CompositeResource" and not path.endswith(".zip"):
        for f in ResourceFile.objects.filter(object_id=res.id):
            if path == f.storage_path:
                if f.has_logical_file and f.logical_file.is_single_file_aggregation:
                    is_sf_agg_file = True

    if res.resource_federation_path:
        # the resource is stored in federated zone
        istorage = IrodsStorage('federated')
        federated_path = res.resource_federation_path
        path = os.path.join(federated_path, path)
        session = icommands.ACTIVE_SESSION
    else:
        # TODO: From Alva: I do not understand the use case for changing the environment.
        # TODO: This seems an enormous potential vulnerability, as arguments are
        # TODO: passed from the URI directly to IRODS without verification.
        istorage = IrodsStorage()
        federated_path = ''
        if 'environment' in kwargs:
            environment = int(kwargs['environment'])
            environment = m.RodsEnvironment.objects.get(pk=environment)
            session = Session("/tmp/django_irods", settings.IRODS_ICOMMANDS_PATH,
                              session_id=uuid4())
            session.create_environment(environment)
            session.run('iinit', None, environment.auth)
        elif getattr(settings, 'IRODS_GLOBAL_SESSION', False):
            session = GLOBAL_SESSION
        elif icommands.ACTIVE_SESSION:
            session = icommands.ACTIVE_SESSION
        else:
            raise KeyError('settings must have IRODS_GLOBAL_SESSION set '
                           'if there is no environment object')

    resource_cls = check_resource_type(res.resource_type)

    if federated_path:
        res_root = os.path.join(federated_path, res_id)
    else:
        res_root = res_id

    if is_zip_download or is_sf_agg_file:
        if not path.endswith(
                ".zip"):  # requesting folder that needs to be zipped
            input_path = path.split(res_id)[1]
            # zips are written under a dated, randomly named folder
            random_hash = random.getrandbits(32)
            daily_date = datetime.datetime.today().strftime('%Y-%m-%d')
            random_hash_path = 'zips/{daily_date}/{res_id}/{rand_folder}'.format(
                daily_date=daily_date, res_id=res_id, rand_folder=random_hash)
            output_path = '{random_hash_path}{path}.zip'.format(
                random_hash_path=random_hash_path, path=input_path)

            if res.resource_type == "CompositeResource":
                aggregation_name = input_path[len('/data/contents/'):]
                res.create_aggregation_xml_documents(
                    aggregation_name=aggregation_name)

            if use_async:
                task = create_temp_zip.apply_async(
                    (res_id, input_path, output_path, is_sf_agg_file), countdown=3)
                delete_zip.apply_async(
                    (random_hash_path, ),
                    countdown=(20 * 60))  # delete after 20 minutes
                if is_sf_agg_file:
                    download_path = request.path.split(res_id)[0] + output_path
                else:
                    download_path = request.path.split("zips")[0] + output_path
                if rest_call:
                    return HttpResponse(json.dumps({
                        'zip_status': 'Not ready',
                        'task_id': task.task_id,
                        'download_path': download_path
                    }), content_type="application/json")
                # stash the task and target in the session, then redirect to
                # the resource's own URL
                request.session['task_id'] = task.task_id
                request.session['download_path'] = download_path
                return HttpResponseRedirect(res.get_absolute_url())

            # synchronous path: build the zip now and serve it below
            ret_status = create_temp_zip(res_id, input_path, output_path, is_sf_agg_file)
            delete_zip.apply_async(
                (random_hash_path, ),
                countdown=(20 * 60))  # delete after 20 minutes
            if not ret_status:
                content_msg = "Zip cannot be created successfully. Check log for details."
                response = HttpResponse()
                if rest_call:
                    response.content = content_msg
                else:
                    response.content = "<h1>" + content_msg + "</h1>"
                return response

            path = output_path

    bag_modified = istorage.getAVU(res_root, 'bag_modified')
    # make sure if bag_modified is not set to true, we still recreate the bag if the
    # bag file does not exist for some reason to resolve the error to download a nonexistent
    # bag when bag_modified is false due to the flag being out-of-sync with the real bag status
    if bag_modified is None or bag_modified.lower() == "false":
        # check whether the bag file exists
        bag_file_name = res_id + '.zip'
        if res_root.startswith(res_id):
            bag_full_path = os.path.join('bags', bag_file_name)
        else:
            bag_full_path = os.path.join(federated_path, 'bags', bag_file_name)
        # set bag_modified to 'true' if the bag does not exist so that it can be recreated
        # and the bag_modified AVU will be set correctly as well subsequently
        if not istorage.exists(bag_full_path):
            bag_modified = 'true'

    metadata_dirty = istorage.getAVU(res_root, 'metadata_dirty')
    # do on-demand bag creation
    # needs to check whether res_id collection exists before getting/setting AVU on it
    # to accommodate the case where the very same resource gets deleted by another request
    # when it is getting downloaded

    if is_bag_download:
        # send signal for pre_check_bag_flag
        pre_check_bag_flag.send(sender=resource_cls, resource=res)
        if bag_modified is None or bag_modified.lower() == "true":
            # metadata files are regenerated first when they are dirty or the flag is unset
            if metadata_dirty is None or metadata_dirty.lower() == 'true':
                create_bag_files(res)
            if use_async:
                # task parameter has to be passed in as a tuple or list, hence (res_id,) is needed
                # Note that since we are using JSON for task parameter serialization, no complex
                # object can be passed as parameters to a celery task
                task = create_bag_by_irods.apply_async((res_id, ), countdown=3)
                if rest_call:
                    return HttpResponse(json.dumps({
                        'bag_status': 'Not ready',
                        'task_id': task.task_id
                    }), content_type="application/json")

                request.session['task_id'] = task.task_id
                request.session['download_path'] = request.path
                return HttpResponseRedirect(res.get_absolute_url())
            else:
                ret_status = create_bag_by_irods(res_id)
                if not ret_status:
                    content_msg = "Bag cannot be created successfully. Check log for details."
                    response = HttpResponse()
                    if rest_call:
                        response.content = content_msg
                    else:
                        response.content = "<h1>" + content_msg + "</h1>"
                    return response

    elif metadata_dirty is None or metadata_dirty.lower() == 'true':
        if path.endswith("resourcemap.xml") or path.endswith(
                'resourcemetadata.xml'):
            # we need to regenerate the metadata xml files
            create_bag_files(res)

    # send signal for pre download file
    download_file_name = split_path_strs[-1]
    pre_download_file.send(sender=resource_cls, resource=res,
                           download_file_name=download_file_name,
                           request=request)

    # obtain mime_type to set content_type
    mtype = 'application-x/octet-stream'
    mime_type = mimetypes.guess_type(path)
    if mime_type[0] is not None:
        mtype = mime_type[0]
    # retrieve file size to set up Content-Length header
    # (the fourth whitespace-separated field of the `ils -l` output is used as the size)
    stdout = session.run("ils", None, "-l", path)[0].split()
    flen = int(stdout[3])

    # If this path is resource_federation_path, then the file is a local user file
    userpath = '/' + os.path.join(
        getattr(settings, 'HS_USER_IRODS_ZONE', 'hydroshareuserZone'),
        'home',
        getattr(settings, 'HS_LOCAL_PROXY_USER_IN_FED_ZONE', 'localHydroProxy'))

    # Allow reverse proxy if request was forwarded by nginx
    # (HTTP_X_DJANGO_REVERSE_PROXY is 'true')
    # and reverse proxy is possible according to configuration.

    if use_reverse_proxy and getattr(settings, 'SENDFILE_ON', False) and \
            'HTTP_X_DJANGO_REVERSE_PROXY' in request.META:

        # The NGINX sendfile abstraction is invoked as follows:
        # 1. The request to download a file enters this routine via the /rest_download or /download
        #    url in ./urls.py. It is redirected here from Django. The URI contains either the
        #    unqualified resource path or the federated resource path, depending upon whether
        #    the request is local or federated.
        # 2. This deals with unfederated resources by redirecting them to the uri
        #    /irods-data/{resource-id}/... on nginx. This URI is configured to read the file
        #    directly from the iRODS vault via NFS, and does not work for direct access to the
        #    vault due to the 'internal;' declaration in NGINX.
        # 3. This deals with federated resources by reading their path, matching local vaults, and
        #    redirecting to URIs that are in turn mapped to read from appropriate iRODS vaults. At
        #    present, the only one of these is /irods-user, which handles files whose federation
        #    path is stored in the variable 'userpath'.
        # 4. If there is no vault available for the resource, the file is transferred without
        #    NGINX, exactly as it was transferred previously.

        # stop NGINX targets that are non-existent from hanging forever.
        if not istorage.exists(path):
            content_msg = "file path {} does not exist in iRODS".format(path)
            response = HttpResponse(status=404)
            if rest_call:
                response.content = content_msg
            else:
                response.content = "<h1>" + content_msg + "</h1>"
            return response

        if not res.is_federated:
            # invoke X-Accel-Redirect on physical vault file in nginx
            response = HttpResponse(content_type=mtype)
            response[
                'Content-Disposition'] = 'attachment; filename="{name}"'.format(
                    name=path.split('/')[-1])
            response['Content-Length'] = flen
            response['X-Accel-Redirect'] = '/'.join(
                [getattr(settings, 'IRODS_DATA_URI', '/irods-data'), path])
            return response

        elif res.resource_federation_path == userpath:  # this guarantees a "user" resource
            # invoke X-Accel-Redirect on physical vault file in nginx
            # if path is full user path; strip federation prefix
            if path.startswith(userpath):
                path = path[len(userpath) + 1:]
            # invoke X-Accel-Redirect on physical vault file in nginx
            response = HttpResponse(content_type=mtype)
            response[
                'Content-Disposition'] = 'attachment; filename="{name}"'.format(
                    name=path.split('/')[-1])
            response['Content-Length'] = flen
            response['X-Accel-Redirect'] = os.path.join(
                getattr(settings, 'IRODS_USER_URI', '/irods-user'), path)
            return response

    # if we get here, none of the above conditions are true
    if flen <= FILE_SIZE_LIMIT:
        options = ('-', )  # we're redirecting to stdout.
        # this unusual way of calling works for federated or local resources
        proc = session.run_safe('iget', None, path, *options)
        response = FileResponse(proc.stdout, content_type=mtype)
        response[
            'Content-Disposition'] = 'attachment; filename="{name}"'.format(
                name=path.split('/')[-1])
        response['Content-Length'] = flen
        return response

    else:
        content_msg = "File larger than 1GB cannot be downloaded directly via HTTP. " \
                      "Please download the large file via iRODS clients."
        response = HttpResponse(status=403)
        if rest_call:
            response.content = content_msg
        else:
            response.content = "<h1>" + content_msg + "</h1>"
        return response