Exemplo n.º 1
0
    def update_bag(self):
        """
        Update a bag if necessary.

        This uses the Django signal pre_check_bag_flag to prepare collections,
        and then checks the AVUs 'metadata_dirty' and 'bag_modified' to determine
        whether to regenerate the metadata files and/or bag.

        This is a synchronous update. The call waits until the update is finished.
        """
        from hs_core.tasks import create_bag_by_irods
        from hs_core.hydroshare.resource import check_resource_type
        from hs_core.hydroshare.hs_bagit import create_bag_files

        # send signal for pre_check_bag_flag
        resource_cls = check_resource_type(self.resource_type)
        pre_check_bag_flag.send(sender=resource_cls, resource=self)

        metadata_dirty = self.getAVU('metadata_dirty')
        bag_modified = self.getAVU('bag_modified')

        if metadata_dirty:  # automatically cast to Bool
            create_bag_files(self)
            self.setAVU('metadata_dirty', False)

        # the ticket system does synchronous bag creation.
        # async bag creation isn't supported.
        if bag_modified:  # automatically cast to Bool
            create_bag_by_irods(self.short_id)
            self.setAVU('bag_modified', False)
Exemplo n.º 2
0
    def update_bag(self):
        """
        Update a bag if necessary.

        This uses the Django signal pre_check_bag_flag to prepare collections,
        and then checks the AVUs 'metadata_dirty' and 'bag_modified' to determine
        whether to regenerate the metadata files and/or bag.

        This is a synchronous update. The call waits until the update is finished.
        """
        from hs_core.tasks import create_bag_by_irods
        from hs_core.hydroshare.resource import check_resource_type
        from hs_core.hydroshare.hs_bagit import create_bag_files

        # send signal for pre_check_bag_flag
        resource_cls = check_resource_type(self.resource_type)
        pre_check_bag_flag.send(sender=resource_cls, resource=self)

        metadata_dirty = self.getAVU('metadata_dirty')
        bag_modified = self.getAVU('bag_modified')

        if metadata_dirty:  # automatically cast to Bool
            create_bag_files(self)
            self.setAVU('metadata_dirty', False)

        # the ticket system does synchronous bag creation.
        # async bag creation isn't supported.
        if bag_modified:  # automatically cast to Bool
            create_bag_by_irods(self.short_id)
            self.setAVU('bag_modified', False)
Exemplo n.º 3
0
def resource_modified(resource, by_user=None, overwrite_bag=True):
    """
    Set an AVU flag that forces the bag to be recreated before fetch.

    This indicates that some content of the bag has been edited.

    """

    resource.last_changed_by = by_user

    resource.updated = now().isoformat()
    # seems this is the best place to sync resource title with metadata title
    resource.title = resource.metadata.title.value
    resource.save()
    if resource.metadata.dates.all().filter(type='modified'):
        res_modified_date = resource.metadata.dates.all().filter(
            type='modified')[0]
        resource.metadata.update_element('date', res_modified_date.id)

    if overwrite_bag:
        create_bag_files(resource)

    # set bag_modified-true AVU pair for the modified resource in iRODS to indicate
    # the resource is modified for on-demand bagging.
    set_dirty_bag_flag(resource)
Exemplo n.º 4
0
    def update_metadata_files(self):
        """
        Make the metadata files resourcemetadata.xml and resourcemap.xml up to date.

        This checks the "metadata dirty" AVU before updating files if necessary.
        """
        from hs_core.hydroshare.hs_bagit import create_bag_files

        metadata_dirty = self.getAVU('metadata_dirty')
        if metadata_dirty:
            create_bag_files(self)
            self.setAVU('metadata_dirty', False)
Exemplo n.º 5
0
    def update_metadata_files(self):
        """
        Make the metadata files resourcemetadata.xml and resourcemap.xml up to date.

        This checks the "metadata dirty" AVU before updating files if necessary.
        """
        from hs_core.hydroshare.hs_bagit import create_bag_files

        metadata_dirty = self.getAVU('metadata_dirty')
        if metadata_dirty:
            create_bag_files(self)
            self.setAVU('metadata_dirty', False)
Exemplo n.º 6
0
def resource_modified(resource, by_user=None, overwrite_bag=True):
    resource.last_changed_by = by_user

    resource.updated = now().isoformat()
    resource.save()
    if resource.metadata.dates.all().filter(type='modified'):
        res_modified_date = resource.metadata.dates.all().filter(type='modified')[0]
        resource.metadata.update_element('date', res_modified_date.id)

    if overwrite_bag:
        create_bag_files(resource)

    istorage = IrodsStorage()
    # set bag_modified-true AVU pair for the modified resource in iRODS to indicate
    # the resource is modified for on-demand bagging.
    istorage.setAVU(resource.short_id, "bag_modified", "true")
Exemplo n.º 7
0
def create_or_update_from_package(resource, term, **kwargs):
    terms_dict = dict(StressPeriod='stress_period',
                      GroundWaterFlow='ground_water_flow',
                      BoundaryCondition='boundary_condition',
                      ModelCalibration='model_calibration',
                      GeneralElements='general_elements',
                      GridDimensions='grid_dimensions',
                      StudyArea='study_area')
    t = terms_dict[term.term]
    metadata_term_obj = getattr(resource.metadata, t)
    if not metadata_term_obj:
        resource.metadata.create_element(term.term, **kwargs)
    else:
        resource.metadata.update_element(term.term, metadata_term_obj.id,
                                         **kwargs)
    create_bag_files(resource)
Exemplo n.º 8
0
def resource_modified(resource, by_user=None, overwrite_bag=True):
    resource.last_changed_by = by_user

    resource.updated = now().isoformat()
    resource.save()
    if resource.metadata.dates.all().filter(type='modified'):
        res_modified_date = resource.metadata.dates.all().filter(
            type='modified')[0]
        resource.metadata.update_element('date', res_modified_date.id)

    if overwrite_bag:
        create_bag_files(resource)

    istorage = IrodsStorage()
    # set bag_modified-true AVU pair for the modified resource in iRODS to indicate
    # the resource is modified for on-demand bagging.
    istorage.setAVU(resource.short_id, "bag_modified", "true")
Exemplo n.º 9
0
def resource_modified(resource, by_user=None, overwrite_bag=True):
    """
    Set an AVU flag that forces the bag to be recreated before fetch.

    This indicates that some content of the bag has been edited.

    """

    resource.last_changed_by = by_user

    resource.updated = now().isoformat()
    # seems this is the best place to sync resource title with metadata title
    resource.title = resource.metadata.title.value
    resource.save()
    if resource.metadata.dates.all().filter(type='modified'):
        res_modified_date = resource.metadata.dates.all().filter(type='modified')[0]
        resource.metadata.update_element('date', res_modified_date.id)

    if overwrite_bag:
        create_bag_files(resource)

    # set bag_modified-true AVU pair for the modified resource in iRODS to indicate
    # the resource is modified for on-demand bagging.
    set_dirty_bag_flag(resource)
Exemplo n.º 10
0
def create_or_update_from_package(resource, term, **kwargs):
    terms_dict = dict(
            StressPeriod='stress_period',
            GroundWaterFlow='ground_water_flow',
            BoundaryCondition='boundary_condition',
            ModelCalibration='model_calibration',
            GeneralElements='general_elements',
            GridDimensions='grid_dimensions',
            StudyArea='study_area'
            )
    t = terms_dict[term.term]
    metadata_term_obj = getattr(resource.metadata, t)
    if not metadata_term_obj:
        resource.metadata.create_element(
            term.term,
            **kwargs
        )
    else:
        resource.metadata.update_element(
            term.term,
            metadata_term_obj.id,
            **kwargs
        )
    create_bag_files(resource)
Exemplo n.º 11
0
def download(request, path, rest_call=False, use_async=True, use_reverse_proxy=True,
             *args, **kwargs):
    """ perform a download request, either asynchronously or synchronously

    :param request: the request object.
    :param path: the path of the thing to be downloaded.
    :param rest_call: True if calling from REST API
    :param use_async: True means to utilize asynchronous creation of objects to download.
    :param use_reverse_proxy: True means to utilize NGINX reverse proxy for streaming.

    The following variables are computed:

    * `path` is the public path of the thing to be downloaded.
    * `irods_path` is the location of `path` in irods.
    * `output_path` is the output path to be reported in the response object.
    * `irods_output_path` is the location of `output_path` in irods

    and there are six cases:

    Zipped query param signal the download should be zipped
        - folders are always zipped regardless of this paramter
        - single file aggregations are zipped with the aggregation metadata files

    A path may point to:
    1. a single file
    2. a single-file-aggregation object in a composite resource.
    3. a folder
    3. a metadata object that may need updating.
    4. a bag that needs to be updated and then returned.
    6. a previously zipped file that was zipped asynchronously.

    """
    if __debug__:
        logger.debug("request path is {}".format(path))

    split_path_strs = path.split('/')
    while split_path_strs[-1] == '':
        split_path_strs.pop()
    path = u'/'.join(split_path_strs)  # no trailing slash

    # initialize case variables
    is_bag_download = False
    is_zip_download = False
    is_zip_request = request.GET.get('zipped', "False").lower() == "true"
    is_sf_agg_file = False
    is_sf_request = False

    if split_path_strs[0] == 'bags':
        is_bag_download = True
        # format is bags/{rid}.zip
        res_id = os.path.splitext(split_path_strs[1])[0]
    elif split_path_strs[0] == 'zips':
        is_zip_download = True
        # zips prefix means that we are following up on an asynchronous download request
        # format is zips/{date}/{zip-uuid}/{public-path}.zip where {public-path} contains the rid
        res_id = split_path_strs[3]
    else:  # regular download request
        res_id = split_path_strs[0]

    if __debug__:
        logger.debug("resource id is {}".format(res_id))

    # now we have the resource Id and can authorize the request
    # if the resource does not exist in django, authorized will be false
    res, authorized, _ = authorize(request, res_id,
                                   needed_permission=ACTION_TO_AUTHORIZE.VIEW_RESOURCE,
                                   raises_exception=False)
    if not authorized:
        response = HttpResponse(status=401)
        content_msg = "You do not have permission to download this resource!"
        if rest_call:
            raise PermissionDenied(content_msg)
        else:
            response.content = "<h1>" + content_msg + "</h1>"
            return response

    # default values are changed later as needed

    istorage = res.get_irods_storage()
    if res.is_federated:
        irods_path = os.path.join(res.resource_federation_path, path)
    else:
        irods_path = path
    # in many cases, path and output_path are the same.
    output_path = path

    irods_output_path = irods_path
    # folder requests are automatically zipped
    if not is_bag_download and not is_zip_download:  # path points into resource: should I zip it?
        store_path = u'/'.join(split_path_strs[1:])  # data/contents/{path-to-something}
        if res.is_folder(store_path):  # automatically zip folders
            is_zip_request = True
            daily_date = datetime.datetime.today().strftime('%Y-%m-%d')
            output_path = "zips/{}/{}/{}.zip".format(daily_date, uuid4().hex, path)
            if res.is_federated:
                irods_output_path = os.path.join(res.resource_federation_path, output_path)
            else:
                irods_output_path = output_path
            if __debug__:
                logger.debug("automatically zipping folder {} to {}".format(path, output_path))
        elif istorage.exists(irods_path):
            if __debug__:
                logger.debug("request for single file {}".format(path))
            is_sf_request = True

            # check for single file aggregations
            if "data/contents/" in path:  # not a metadata file
                for f in ResourceFile.objects.filter(object_id=res.id):
                    if path == f.storage_path:
                        is_sf_agg_file = True
                        if not is_zip_request and f.has_logical_file and \
                                f.logical_file.is_single_file_aggregation:
                            download_url = request.GET.get('url_download', 'false').lower()
                            if download_url == 'false':
                                # redirect to referenced url in the url file instead
                                redirect_url = f.logical_file.redirect_url
                                if redirect_url:
                                    return HttpResponseRedirect(redirect_url)
                        if __debug__:
                            logger.debug(
                                "request for single file aggregation {}".format(path))
                        break

            if is_zip_request:
                daily_date = datetime.datetime.today().strftime('%Y-%m-%d')
                output_path = "zips/{}/{}/{}.zip".format(daily_date, uuid4().hex, path)
                if res.is_federated:
                    irods_output_path = os.path.join(res.resource_federation_path, output_path)
                else:
                    irods_output_path = output_path

    # After this point, we have valid path, irods_path, output_path, and irods_output_path
    # * is_zip_request: signals download should be zipped, folders are always zipped
    # * is_sf_agg_file: path is a single-file aggregation in Composite Resource
    # * is_sf_request: path is a single-file
    # flags for download:
    # * is_bag_download: download a bag in format bags/{rid}.zip
    # * is_zip_download: download a zipfile in format zips/{date}/{random guid}/{path}.zip
    # if none of these are set, it's a normal download

    # determine active session
    if res.is_federated:
        # the resource is stored in federated zone
        session = icommands.ACTIVE_SESSION
    else:
        # TODO: From Alva: I do not understand the use case for changing the environment.
        # TODO: This seems an enormous potential vulnerability, as arguments are
        # TODO: passed from the URI directly to IRODS without verification.
        if 'environment' in kwargs:
            logger.warn("setting iRODS from environment")
            environment = int(kwargs['environment'])
            environment = m.RodsEnvironment.objects.get(pk=environment)
            session = Session("/tmp/django_irods", settings.IRODS_ICOMMANDS_PATH,
                              session_id=uuid4())
            session.create_environment(environment)
            session.run('iinit', None, environment.auth)
        elif getattr(settings, 'IRODS_GLOBAL_SESSION', False):
            if __debug__:
                logger.debug("using GLOBAL_SESSION")
            session = GLOBAL_SESSION
        elif icommands.ACTIVE_SESSION:
            if __debug__:
                logger.debug("using ACTIVE_SESSION")
            session = icommands.ACTIVE_SESSION
        else:
            raise KeyError('settings must have IRODS_GLOBAL_SESSION set '
                           'if there is no environment object')

    resource_cls = check_resource_type(res.resource_type)

    if is_zip_request:

        if use_async:
            task = create_temp_zip.apply_async((res_id, irods_path, irods_output_path,
                                                is_sf_agg_file, is_sf_request))
            delete_zip.apply_async((irods_output_path, ),
                                   countdown=(60 * 60 * 24))  # delete after 24 hours

            if rest_call:
                return HttpResponse(
                    json.dumps({
                        'zip_status': 'Not ready',
                        'task_id': task.task_id,
                        'download_path': '/django_irods/rest_download/' + output_path}),
                    content_type="application/json")
            else:
                # return status to the UI
                request.session['task_id'] = task.task_id
                # TODO: this is mistaken for a bag download in the UI!
                # TODO: multiple asynchronous downloads don't stack!
                request.session['download_path'] = '/django_irods/download/' + output_path
                # redirect to resource landing page, which interprets session variables.
                return HttpResponseRedirect(res.get_absolute_url())

        else:  # synchronous creation of download
            ret_status = create_temp_zip(res_id, irods_path, irods_output_path,
                                         is_sf_agg_file, is_sf_request)
            delete_zip.apply_async((irods_output_path, ),
                                   countdown=(60 * 60 * 24))  # delete after 24 hours
            if not ret_status:
                content_msg = "Zip could not be created."
                response = HttpResponse()
                if rest_call:
                    response.content = content_msg
                else:
                    response.content = "<h1>" + content_msg + "</h1>"
                return response
            # At this point, output_path presumably exists and contains a zipfile
            # to be streamed below

    elif is_bag_download:
        # Shorten request if it contains extra junk at the end
        bag_file_name = res_id + '.zip'
        output_path = os.path.join('bags', bag_file_name)
        if not res.is_federated:
            irods_output_path = output_path
        else:
            irods_output_path = os.path.join(res.resource_federation_path, output_path)

        bag_modified = res.getAVU('bag_modified')
        # recreate the bag if it doesn't exist even if bag_modified is "false".
        if __debug__:
            logger.debug(u"irods_output_path is {}".format(irods_output_path))
        if bag_modified is None or not bag_modified:
            if not istorage.exists(irods_output_path):
                bag_modified = True

        # send signal for pre_check_bag_flag
        # this generates metadata other than that generated by create_bag_files.
        pre_check_bag_flag.send(sender=resource_cls, resource=res)

        metadata_dirty = res.getAVU('metadata_dirty')
        if metadata_dirty is None or metadata_dirty:
            create_bag_files(res)  # sets metadata_dirty to False
            bag_modified = "True"

        if bag_modified is None or bag_modified:
            if use_async:
                # task parameter has to be passed in as a tuple or list, hence (res_id,) is needed
                # Note that since we are using JSON for task parameter serialization, no complex
                # object can be passed as parameters to a celery task
                task = create_bag_by_irods.apply_async((res_id,), countdown=3)
                if rest_call:
                    return HttpResponse(json.dumps({'bag_status': 'Not ready',
                                                    'task_id': task.task_id}),
                                        content_type="application/json")

                request.session['task_id'] = task.task_id
                request.session['download_path'] = request.path
                return HttpResponseRedirect(res.get_absolute_url())
            else:
                ret_status = create_bag_by_irods(res_id)
                if not ret_status:
                    content_msg = "Bag cannot be created successfully. Check log for details."
                    response = HttpResponse()
                    if rest_call:
                        response.content = content_msg
                    else:
                        response.content = "<h1>" + content_msg + "</h1>"
                    return response

    else:  # regular file download
        # if fetching main metadata files, then these need to be refreshed.
        if path.endswith("resourcemap.xml") or path.endswith('resourcemetadata.xml'):
            metadata_dirty = res.getAVU("metadata_dirty")
            if metadata_dirty is None or metadata_dirty:
                create_bag_files(res)  # sets metadata_dirty to False

        # send signal for pre download file
        # TODO: does not contain subdirectory information: duplicate refreshes possible
        download_file_name = split_path_strs[-1]  # end of path
        # this logs the download request in the tracking system
        pre_download_file.send(sender=resource_cls, resource=res,
                               download_file_name=download_file_name,
                               request=request)

    # If we get this far,
    # * path and irods_path point to true input
    # * output_path and irods_output_path point to true output.
    # Try to stream the file back to the requester.

    # obtain mime_type to set content_type
    mtype = 'application-x/octet-stream'
    mime_type = mimetypes.guess_type(output_path)
    if mime_type[0] is not None:
        mtype = mime_type[0]
    # retrieve file size to set up Content-Length header
    # TODO: standardize this to make it less brittle
    stdout = session.run("ils", None, "-l", irods_output_path)[0].split()
    flen = int(stdout[3])

    # Allow reverse proxy if request was forwarded by nginx (HTTP_X_DJANGO_REVERSE_PROXY='true')
    # and reverse proxy is possible according to configuration (SENDFILE_ON=True)
    # and reverse proxy isn't overridden by user (use_reverse_proxy=True).

    if use_reverse_proxy and getattr(settings, 'SENDFILE_ON', False) and \
       'HTTP_X_DJANGO_REVERSE_PROXY' in request.META:

        # The NGINX sendfile abstraction is invoked as follows:
        # 1. The request to download a file enters this routine via the /rest_download or /download
        #    url in ./urls.py. It is redirected here from Django. The URI contains either the
        #    unqualified resource path or the federated resource path, depending upon whether
        #    the request is local or federated.
        # 2. This deals with unfederated resources by redirecting them to the uri
        #    /irods-data/{resource-id}/... on nginx. This URI is configured to read the file
        #    directly from the iRODS vault via NFS, and does not work for direct access to the
        #    vault due to the 'internal;' declaration in NGINX.
        # 3. This deals with federated resources by reading their path, matching local vaults, and
        #    redirecting to URIs that are in turn mapped to read from appropriate iRODS vaults. At
        #    present, the only one of these is /irods-user, which handles files whose federation
        #    path is stored in the variable 'userpath'.
        # 4. If there is no vault available for the resource, the file is transferred without
        #    NGINX, exactly as it was transferred previously.

        # If this path is resource_federation_path, then the file is a local user file
        userpath = '/' + os.path.join(
            getattr(settings, 'HS_USER_IRODS_ZONE', 'hydroshareuserZone'),
            'home',
            getattr(settings, 'HS_IRODS_PROXY_USER_IN_USER_ZONE', 'localHydroProxy'))

        # stop NGINX targets that are non-existent from hanging forever.
        if not istorage.exists(irods_output_path):
            content_msg = "file path {} does not exist in iRODS".format(output_path)
            response = HttpResponse(status=404)
            if rest_call:
                response.content = content_msg
            else:
                response.content = "<h1>" + content_msg + "</h1>"
            return response

        if not res.is_federated:
            # track download count
            res.update_download_count()

            # invoke X-Accel-Redirect on physical vault file in nginx
            response = HttpResponse(content_type=mtype)
            response['Content-Disposition'] = 'attachment; filename="{name}"'.format(
                name=output_path.split('/')[-1])
            response['Content-Length'] = flen
            response['X-Accel-Redirect'] = '/'.join([
                getattr(settings, 'IRODS_DATA_URI', '/irods-data'), output_path])
            if __debug__:
                logger.debug("Reverse proxying local {}".format(response['X-Accel-Redirect']))
            return response

        elif res.resource_federation_path == userpath:  # this guarantees a "user" resource
            # track download count
            res.update_download_count()

            # invoke X-Accel-Redirect on physical vault file in nginx
            response = HttpResponse(content_type=mtype)
            response['Content-Disposition'] = 'attachment; filename="{name}"'.format(
                name=output_path.split('/')[-1])
            response['Content-Length'] = flen
            response['X-Accel-Redirect'] = os.path.join(
                getattr(settings, 'IRODS_USER_URI', '/irods-user'), output_path)
            if __debug__:
                logger.debug("Reverse proxying user {}".format(response['X-Accel-Redirect']))
            return response

    # if we get here, none of the above conditions are true
    # if reverse proxy is enabled, then this is because the resource is remote and federated
    # OR the user specifically requested a non-proxied download.

    options = ('-',)  # we're redirecting to stdout.
    # this unusual way of calling works for streaming federated or local resources
    if __debug__:
        logger.debug("Locally streaming {}".format(output_path))
    # track download count
    res.update_download_count()
    proc = session.run_safe('iget', None, irods_output_path, *options)
    response = FileResponse(proc.stdout, content_type=mtype)
    response['Content-Disposition'] = 'attachment; filename="{name}"'.format(
        name=output_path.split('/')[-1])
    response['Content-Length'] = flen
    return response
Exemplo n.º 12
0
def download(request, path, rest_call=False, use_async=True, use_reverse_proxy=True,
             *args, **kwargs):
    split_path_strs = path.split('/')
    is_bag_download = False
    is_zip_download = False
    is_sf_agg_file = False
    if split_path_strs[0] == 'bags':
        res_id = os.path.splitext(split_path_strs[1])[0]
        is_bag_download = True
    elif split_path_strs[0] == 'zips':
        if path.endswith('.zip'):
            res_id = os.path.splitext(split_path_strs[2])[0]
        else:
            res_id = os.path.splitext(split_path_strs[1])[0]
        is_zip_download = True
    else:
        res_id = split_path_strs[0]

    # if the resource does not exist in django, authorized will be false
    res, authorized, _ = authorize(request, res_id,
                                   needed_permission=ACTION_TO_AUTHORIZE.VIEW_RESOURCE,
                                   raises_exception=False)
    if not authorized:
        response = HttpResponse(status=401)
        content_msg = "You do not have permission to download this resource!"
        if rest_call:
            raise PermissionDenied(content_msg)
        else:
            response.content = "<h1>" + content_msg + "</h1>"
            return response

    if res.resource_type == "CompositeResource" and not path.endswith(".zip"):
        for f in ResourceFile.objects.filter(object_id=res.id):
            if path == f.storage_path:
                if f.has_logical_file and f.logical_file.is_single_file_aggregation:
                    is_sf_agg_file = True

    if res.resource_federation_path:
        # the resource is stored in federated zone
        istorage = IrodsStorage('federated')
        federated_path = res.resource_federation_path
        path = os.path.join(federated_path, path)
        session = icommands.ACTIVE_SESSION
    else:
        # TODO: From Alva: I do not understand the use case for changing the environment.
        # TODO: This seems an enormous potential vulnerability, as arguments are
        # TODO: passed from the URI directly to IRODS without verification.
        istorage = IrodsStorage()
        federated_path = ''
        if 'environment' in kwargs:
            environment = int(kwargs['environment'])
            environment = m.RodsEnvironment.objects.get(pk=environment)
            session = Session("/tmp/django_irods", settings.IRODS_ICOMMANDS_PATH,
                              session_id=uuid4())
            session.create_environment(environment)
            session.run('iinit', None, environment.auth)
        elif getattr(settings, 'IRODS_GLOBAL_SESSION', False):
            session = GLOBAL_SESSION
        elif icommands.ACTIVE_SESSION:
            session = icommands.ACTIVE_SESSION
        else:
            raise KeyError('settings must have IRODS_GLOBAL_SESSION set '
                           'if there is no environment object')

    resource_cls = check_resource_type(res.resource_type)

    if federated_path:
        res_root = os.path.join(federated_path, res_id)
    else:
        res_root = res_id

    if is_zip_download or is_sf_agg_file:
        if not path.endswith(".zip"):  # requesting folder that needs to be zipped
            input_path = path.split(res_id)[1]
            random_hash = random.getrandbits(32)
            daily_date = datetime.datetime.today().strftime('%Y-%m-%d')
            random_hash_path = 'zips/{daily_date}/{res_id}/{rand_folder}'.format(
                daily_date=daily_date, res_id=res_id,
                rand_folder=random_hash)
            output_path = '{random_hash_path}{path}.zip'.format(random_hash_path=random_hash_path,
                                                                path=input_path)

            if res.resource_type == "CompositeResource":
                aggregation_name = input_path[len('/data/contents/'):]
                res.create_aggregation_xml_documents(aggregation_name=aggregation_name)

            if use_async:
                task = create_temp_zip.apply_async((res_id, input_path, output_path,
                                                    is_sf_agg_file), countdown=3)
                delete_zip.apply_async((random_hash_path, ),
                                       countdown=(20 * 60))  # delete after 20 minutes
                if is_sf_agg_file:
                    download_path = request.path.split(res_id)[0] + output_path
                else:
                    download_path = request.path.split("zips")[0] + output_path
                if rest_call:
                    return HttpResponse(json.dumps({'zip_status': 'Not ready',
                                                    'task_id': task.task_id,
                                                    'download_path': download_path}),
                                        content_type="application/json")
                request.session['task_id'] = task.task_id
                request.session['download_path'] = download_path
                return HttpResponseRedirect(res.get_absolute_url())

            ret_status = create_temp_zip(res_id, input_path, output_path, is_sf_agg_file)
            delete_zip.apply_async((random_hash_path, ),
                                   countdown=(20 * 60))  # delete after 20 minutes
            if not ret_status:
                content_msg = "Zip cannot be created successfully. Check log for details."
                response = HttpResponse()
                if rest_call:
                    response.content = content_msg
                else:
                    response.content = "<h1>" + content_msg + "</h1>"
                return response

            path = output_path

    bag_modified = istorage.getAVU(res_root, 'bag_modified')
    # make sure if bag_modified is not set to true, we still recreate the bag if the
    # bag file does not exist for some reason to resolve the error to download a nonexistent
    # bag when bag_modified is false due to the flag being out-of-sync with the real bag status
    if bag_modified is None or bag_modified.lower() == "false":
        # check whether the bag file exists
        bag_file_name = res_id + '.zip'
        if res_root.startswith(res_id):
            bag_full_path = os.path.join('bags', bag_file_name)
        else:
            bag_full_path = os.path.join(federated_path, 'bags', bag_file_name)
        # set bag_modified to 'true' if the bag does not exist so that it can be recreated
        # and the bag_modified AVU will be set correctly as well subsequently
        if not istorage.exists(bag_full_path):
            bag_modified = 'true'

    metadata_dirty = istorage.getAVU(res_root, 'metadata_dirty')
    # do on-demand bag creation
    # needs to check whether res_id collection exists before getting/setting AVU on it
    # to accommodate the case where the very same resource gets deleted by another request
    # when it is getting downloaded

    if is_bag_download:
        # send signal for pre_check_bag_flag
        pre_check_bag_flag.send(sender=resource_cls, resource=res)
        if bag_modified is None or bag_modified.lower() == "true":
            if metadata_dirty is None or metadata_dirty.lower() == 'true':
                create_bag_files(res)
            if use_async:
                # task parameter has to be passed in as a tuple or list, hence (res_id,) is needed
                # Note that since we are using JSON for task parameter serialization, no complex
                # object can be passed as parameters to a celery task
                task = create_bag_by_irods.apply_async((res_id,), countdown=3)
                if rest_call:
                    return HttpResponse(json.dumps({'bag_status': 'Not ready',
                                                    'task_id': task.task_id}),
                                        content_type="application/json")

                request.session['task_id'] = task.task_id
                request.session['download_path'] = request.path
                return HttpResponseRedirect(res.get_absolute_url())
            else:
                ret_status = create_bag_by_irods(res_id)
                if not ret_status:
                    content_msg = "Bag cannot be created successfully. Check log for details."
                    response = HttpResponse()
                    if rest_call:
                        response.content = content_msg
                    else:
                        response.content = "<h1>" + content_msg + "</h1>"
                    return response

    elif metadata_dirty is None or metadata_dirty.lower() == 'true':
        if path.endswith("resourcemap.xml") or path.endswith('resourcemetadata.xml'):
            # we need to regenerate the metadata xml files
            create_bag_files(res)

    # send signal for pre download file
    download_file_name = split_path_strs[-1]
    pre_download_file.send(sender=resource_cls, resource=res,
                           download_file_name=download_file_name,
                           request=request)

    # obtain mime_type to set content_type
    mtype = 'application-x/octet-stream'
    mime_type = mimetypes.guess_type(path)
    if mime_type[0] is not None:
        mtype = mime_type[0]
    # retrieve file size to set up Content-Length header
    stdout = session.run("ils", None, "-l", path)[0].split()
    flen = int(stdout[3])

    # If this path is resource_federation_path, then the file is a local user file
    userpath = '/' + os.path.join(
        getattr(settings, 'HS_USER_IRODS_ZONE', 'hydroshareuserZone'),
        'home',
        getattr(settings, 'HS_LOCAL_PROXY_USER_IN_FED_ZONE', 'localHydroProxy'))

    # Allow reverse proxy if request was forwarded by nginx
    # (HTTP_X_DJANGO_REVERSE_PROXY is 'true')
    # and reverse proxy is possible according to configuration.

    if use_reverse_proxy and getattr(settings, 'SENDFILE_ON', False) and \
       'HTTP_X_DJANGO_REVERSE_PROXY' in request.META:

        # The NGINX sendfile abstraction is invoked as follows:
        # 1. The request to download a file enters this routine via the /rest_download or /download
        #    url in ./urls.py. It is redirected here from Django. The URI contains either the
        #    unqualified resource path or the federated resource path, depending upon whether
        #    the request is local or federated.
        # 2. This deals with unfederated resources by redirecting them to the uri
        #    /irods-data/{resource-id}/... on nginx. This URI is configured to read the file
        #    directly from the iRODS vault via NFS, and does not work for direct access to the
        #    vault due to the 'internal;' declaration in NGINX.
        # 3. This deals with federated resources by reading their path, matching local vaults, and
        #    redirecting to URIs that are in turn mapped to read from appropriate iRODS vaults. At
        #    present, the only one of these is /irods-user, which handles files whose federation
        #    path is stored in the variable 'userpath'.
        # 4. If there is no vault available for the resource, the file is transferred without
        #    NGINX, exactly as it was transferred previously.

        # stop NGINX targets that are non-existent from hanging forever.
        if not istorage.exists(path):
            content_msg = "file path {} does not exist in iRODS".format(path)
            response = HttpResponse(status=404)
            if rest_call:
                response.content = content_msg
            else:
                response.content = "<h1>" + content_msg + "</h1>"
            return response

        if not res.is_federated:
            # invoke X-Accel-Redirect on physical vault file in nginx
            response = HttpResponse(content_type=mtype)
            response['Content-Disposition'] = 'attachment; filename="{name}"'.format(
                name=path.split('/')[-1])
            response['Content-Length'] = flen
            response['X-Accel-Redirect'] = '/'.join([
                getattr(settings, 'IRODS_DATA_URI', '/irods-data'), path])
            return response

        elif res.resource_federation_path == userpath:  # this guarantees a "user" resource
            # invoke X-Accel-Redirect on physical vault file in nginx
            # if path is full user path; strip federation prefix
            if path.startswith(userpath):
                path = path[len(userpath)+1:]
            # invoke X-Accel-Redirect on physical vault file in nginx
            response = HttpResponse(content_type=mtype)
            response['Content-Disposition'] = 'attachment; filename="{name}"'.format(
                name=path.split('/')[-1])
            response['Content-Length'] = flen
            response['X-Accel-Redirect'] = os.path.join(
                getattr(settings, 'IRODS_USER_URI', '/irods-user'), path)
            return response

    # if we get here, none of the above conditions are true
    if flen <= FILE_SIZE_LIMIT:
        options = ('-',)  # we're redirecting to stdout.
        # this unusual way of calling works for federated or local resources
        proc = session.run_safe('iget', None, path, *options)
        response = FileResponse(proc.stdout, content_type=mtype)
        response['Content-Disposition'] = 'attachment; filename="{name}"'.format(
            name=path.split('/')[-1])
        response['Content-Length'] = flen
        return response

    else:
        content_msg = "File larger than 1GB cannot be downloaded directly via HTTP. " \
                      "Please download the large file via iRODS clients."
        response = HttpResponse(status=403)
        if rest_call:
            response.content = content_msg
        else:
            response.content = "<h1>" + content_msg + "</h1>"
        return response
Exemplo n.º 13
0
    def put(self, request, pk):
        # Update science metadata based on resourcemetadata.xml uploaded
        resource, authorized, user = view_utils.authorize(
            request, pk,
            needed_permission=ACTION_TO_AUTHORIZE.EDIT_RESOURCE,
            raises_exception=False)
        if not authorized:
            raise PermissionDenied()

        files = request.FILES.values()
        if len(files) == 0:
            error_msg = {'file': 'No resourcemetadata.xml file was found to update resource '
                                 'metadata.'}
            raise ValidationError(detail=error_msg)
        elif len(files) > 1:
            error_msg = {'file': ('More than one file was found. Only one file, named '
                                  'resourcemetadata.xml, '
                                  'can be used to update resource metadata.')}
            raise ValidationError(detail=error_msg)

        scimeta = files[0]
        if scimeta.content_type not in self.ACCEPT_FORMATS:
            error_msg = {'file': ("Uploaded file has content type {t}, "
                                  "but only these types are accepted: {e}.").format(
                t=scimeta.content_type, e=",".join(self.ACCEPT_FORMATS))}
            raise ValidationError(detail=error_msg)
        expect = 'resourcemetadata.xml'
        if scimeta.name != expect:
            error_msg = {'file': "Uploaded file has name {n}, but expected {e}.".format(
                n=scimeta.name, e=expect)}
            raise ValidationError(detail=error_msg)

        # Temp directory to store resourcemetadata.xml
        tmp_dir = tempfile.mkdtemp()
        try:
            # Fake the bag structure so that GenericResourceMeta.read_metadata_from_resource_bag
            # can read and validate the system and science metadata for us.
            bag_data_path = os.path.join(tmp_dir, 'data')
            os.mkdir(bag_data_path)
            # Copy new science metadata to bag data path
            scimeta_path = os.path.join(bag_data_path, 'resourcemetadata.xml')
            shutil.copy(scimeta.temporary_file_path(), scimeta_path)
            # Copy existing resource map to bag data path
            # (use a file-like object as the file may be in iRODS, so we can't
            #  just copy it to a local path)
            resmeta_path = os.path.join(bag_data_path, 'resourcemap.xml')
            with open(resmeta_path, 'wb') as resmeta:
                storage = get_file_storage()
                resmeta_irods = storage.open(AbstractResource.sysmeta_path(pk))
                shutil.copyfileobj(resmeta_irods, resmeta)

            resmeta_irods.close()

            try:
                # Read resource system and science metadata
                domain = Site.objects.get_current().domain
                rm = GenericResourceMeta.read_metadata_from_resource_bag(tmp_dir,
                                                                         hydroshare_host=domain)
                # Update resource metadata
                rm.write_metadata_to_resource(resource, update_title=True, update_keywords=True)
                create_bag_files(resource)
            except HsDeserializationDependencyException as e:
                msg = ("HsDeserializationDependencyException encountered when updating "
                       "science metadata for resource {pk}; depedent resource was {dep}.")
                msg = msg.format(pk=pk, dep=e.dependency_resource_id)
                logger.error(msg)
                raise ValidationError(detail=msg)
            except HsDeserializationException as e:
                raise ValidationError(detail=e.message)

            resource_modified(resource, request.user, overwrite_bag=False)
            return Response(data={'resource_id': pk}, status=status.HTTP_202_ACCEPTED)
        finally:
            shutil.rmtree(tmp_dir)
Exemplo n.º 14
0
for(dirpath, dirnames, filenames) in os.walk(sys.argv[1]):
    for dirname in dirnames:
        if dirname != 'bags':
            content_path_list.append(os.path.join(sys.argv[1], dirname))
    break

# print content_path_list

from hs_core.serialization import create_resource_from_bag
from hs_core.hydroshare.hs_bagit import create_bag_files

dep_res_meta = []
dep_res = []
for content_path in content_path_list:
    try:
        ret = create_resource_from_bag(content_path)
        if ret:
            dep_res_meta.append(ret[1])
            dep_res.append(ret[2])
    except Exception as ex:
        print ex.message
        continue

for i in range(0, len(dep_res_meta)):
    try:
        dep_res_meta[i].write_metadata_to_resource(dep_res[i])
        create_bag_files(dep_res[i])
    except Exception as ex:
        print ex.message
        continue
Exemplo n.º 15
0
def download(request, path, rest_call=False, use_async=True, use_reverse_proxy=True,
             *args, **kwargs):
    """ perform a download request, either asynchronously or synchronously

    :param request: the request object.
    :param path: the path of the thing to be downloaded.
    :param rest_call: True if calling from REST API
    :param use_async: True means to utilize asynchronous creation of objects to download.
    :param use_reverse_proxy: True means to utilize NGINX reverse proxy for streaming.

    The following variables are computed:

    * `path` is the public path of the thing to be downloaded.
    * `irods_path` is the location of `path` in irods.
    * `output_path` is the output path to be reported in the response object.
    * `irods_output_path` is the location of `output_path` in irods

    and there are six cases:

    Zipped query param signal the download should be zipped
        - folders are always zipped regardless of this paramter
        - single file aggregations are zipped with the aggregation metadata files

    A path may point to:
    1. a single file
    2. a single-file-aggregation object in a composite resource.
    3. a folder
    3. a metadata object that may need updating.
    4. a bag that needs to be updated and then returned.
    6. a previously zipped file that was zipped asynchronously.

    """
    if __debug__:
        logger.debug("request path is {}".format(path))

    split_path_strs = path.split('/')
    while split_path_strs[-1] == '':
        split_path_strs.pop()
    path = u'/'.join(split_path_strs)  # no trailing slash

    # initialize case variables
    is_bag_download = False
    is_zip_download = False
    is_zip_request = request.GET.get('zipped', "False").lower() == "true"
    is_sf_agg_file = False
    is_sf_request = False

    if split_path_strs[0] == 'bags':
        is_bag_download = True
        # format is bags/{rid}.zip
        res_id = os.path.splitext(split_path_strs[1])[0]
    elif split_path_strs[0] == 'zips':
        is_zip_download = True
        # zips prefix means that we are following up on an asynchronous download request
        # format is zips/{date}/{zip-uuid}/{public-path}.zip where {public-path} contains the rid
        res_id = split_path_strs[3]
    else:  # regular download request
        res_id = split_path_strs[0]

    if __debug__:
        logger.debug("resource id is {}".format(res_id))

    # now we have the resource Id and can authorize the request
    # if the resource does not exist in django, authorized will be false
    res, authorized, _ = authorize(request, res_id,
                                   needed_permission=ACTION_TO_AUTHORIZE.VIEW_RESOURCE,
                                   raises_exception=False)
    if not authorized:
        response = HttpResponse(status=401)
        content_msg = "You do not have permission to download this resource!"
        if rest_call:
            raise PermissionDenied(content_msg)
        else:
            response.content = "<h1>" + content_msg + "</h1>"
            return response

    # default values are changed later as needed

    istorage = res.get_irods_storage()
    if res.is_federated:
        irods_path = os.path.join(res.resource_federation_path, path)
    else:
        irods_path = path
    # in many cases, path and output_path are the same.
    output_path = path

    irods_output_path = irods_path
    # folder requests are automatically zipped
    if not is_bag_download and not is_zip_download:  # path points into resource: should I zip it?
        store_path = u'/'.join(split_path_strs[1:])  # data/contents/{path-to-something}
        if res.is_folder(store_path):  # automatically zip folders
            is_zip_request = True
            daily_date = datetime.datetime.today().strftime('%Y-%m-%d')
            output_path = "zips/{}/{}/{}.zip".format(daily_date, uuid4().hex, path)
            if res.is_federated:
                irods_output_path = os.path.join(res.resource_federation_path, output_path)
            else:
                irods_output_path = output_path
            if __debug__:
                logger.debug("automatically zipping folder {} to {}".format(path, output_path))
        elif istorage.exists(irods_path):
            if __debug__:
                logger.debug("request for single file {}".format(path))
            is_sf_request = True

            # check for single file aggregations
            if "data/contents/" in path:  # not a metadata file
                for f in ResourceFile.objects.filter(object_id=res.id):
                    if path == f.storage_path:
                        is_sf_agg_file = True
                        if not is_zip_request and f.has_logical_file and \
                                f.logical_file.is_single_file_aggregation:
                            download_url = request.GET.get('url_download', 'false').lower()
                            if download_url == 'false':
                                # redirect to referenced url in the url file instead
                                redirect_url = f.logical_file.redirect_url
                                if redirect_url:
                                    return HttpResponseRedirect(redirect_url)
                        if __debug__:
                            logger.debug(
                                "request for single file aggregation {}".format(path))
                        break

            if is_zip_request:
                daily_date = datetime.datetime.today().strftime('%Y-%m-%d')
                output_path = "zips/{}/{}/{}.zip".format(daily_date, uuid4().hex, path)
                if res.is_federated:
                    irods_output_path = os.path.join(res.resource_federation_path, output_path)
                else:
                    irods_output_path = output_path

    # After this point, we have valid path, irods_path, output_path, and irods_output_path
    # * is_zip_request: signals download should be zipped, folders are always zipped
    # * is_sf_agg_file: path is a single-file aggregation in Composite Resource
    # * is_sf_request: path is a single-file
    # flags for download:
    # * is_bag_download: download a bag in format bags/{rid}.zip
    # * is_zip_download: download a zipfile in format zips/{date}/{random guid}/{path}.zip
    # if none of these are set, it's a normal download

    # determine active session
    if res.is_federated:
        # the resource is stored in federated zone
        session = icommands.ACTIVE_SESSION
    else:
        # TODO: From Alva: I do not understand the use case for changing the environment.
        # TODO: This seems an enormous potential vulnerability, as arguments are
        # TODO: passed from the URI directly to IRODS without verification.
        if 'environment' in kwargs:
            logger.warn("setting iRODS from environment")
            environment = int(kwargs['environment'])
            environment = m.RodsEnvironment.objects.get(pk=environment)
            session = Session("/tmp/django_irods", settings.IRODS_ICOMMANDS_PATH,
                              session_id=uuid4())
            session.create_environment(environment)
            session.run('iinit', None, environment.auth)
        elif getattr(settings, 'IRODS_GLOBAL_SESSION', False):
            if __debug__:
                logger.debug("using GLOBAL_SESSION")
            session = GLOBAL_SESSION
        elif icommands.ACTIVE_SESSION:
            if __debug__:
                logger.debug("using ACTIVE_SESSION")
            session = icommands.ACTIVE_SESSION
        else:
            raise KeyError('settings must have IRODS_GLOBAL_SESSION set '
                           'if there is no environment object')

    resource_cls = check_resource_type(res.resource_type)

    if is_zip_request:

        if use_async:
            task = create_temp_zip.apply_async((res_id, irods_path, irods_output_path,
                                                is_sf_agg_file, is_sf_request))
            delete_zip.apply_async((irods_output_path, ),
                                   countdown=(60 * 60 * 24))  # delete after 24 hours

            if rest_call:
                return HttpResponse(
                    json.dumps({
                        'zip_status': 'Not ready',
                        'task_id': task.task_id,
                        'download_path': '/django_irods/rest_download/' + output_path}),
                    content_type="application/json")
            else:
                # return status to the UI
                request.session['task_id'] = task.task_id
                # TODO: this is mistaken for a bag download in the UI!
                # TODO: multiple asynchronous downloads don't stack!
                request.session['download_path'] = '/django_irods/download/' + output_path
                # redirect to resource landing page, which interprets session variables.
                return HttpResponseRedirect(res.get_absolute_url())

        else:  # synchronous creation of download
            ret_status = create_temp_zip(res_id, irods_path, irods_output_path,
                                         is_sf_agg_file, is_sf_request)
            delete_zip.apply_async((irods_output_path, ),
                                   countdown=(60 * 60 * 24))  # delete after 24 hours
            if not ret_status:
                content_msg = "Zip could not be created."
                response = HttpResponse()
                if rest_call:
                    response.content = content_msg
                else:
                    response.content = "<h1>" + content_msg + "</h1>"
                return response
            # At this point, output_path presumably exists and contains a zipfile
            # to be streamed below

    elif is_bag_download:
        # Shorten request if it contains extra junk at the end
        bag_file_name = res_id + '.zip'
        output_path = os.path.join('bags', bag_file_name)
        if not res.is_federated:
            irods_output_path = output_path
        else:
            irods_output_path = os.path.join(res.resource_federation_path, output_path)

        bag_modified = res.getAVU('bag_modified')
        # recreate the bag if it doesn't exist even if bag_modified is "false".
        if __debug__:
            logger.debug(u"irods_output_path is {}".format(irods_output_path))
        if bag_modified is None or not bag_modified:
            if not istorage.exists(irods_output_path):
                bag_modified = True

        # send signal for pre_check_bag_flag
        # this generates metadata other than that generated by create_bag_files.
        pre_check_bag_flag.send(sender=resource_cls, resource=res)

        metadata_dirty = res.getAVU('metadata_dirty')
        if metadata_dirty is None or metadata_dirty:
            create_bag_files(res)  # sets metadata_dirty to False
            bag_modified = "True"

        if bag_modified is None or bag_modified:
            if use_async:
                # task parameter has to be passed in as a tuple or list, hence (res_id,) is needed
                # Note that since we are using JSON for task parameter serialization, no complex
                # object can be passed as parameters to a celery task
                task = create_bag_by_irods.apply_async((res_id,), countdown=3)
                if rest_call:
                    return HttpResponse(json.dumps({'bag_status': 'Not ready',
                                                    'task_id': task.task_id}),
                                        content_type="application/json")

                request.session['task_id'] = task.task_id
                request.session['download_path'] = request.path
                return HttpResponseRedirect(res.get_absolute_url())
            else:
                ret_status = create_bag_by_irods(res_id)
                if not ret_status:
                    content_msg = "Bag cannot be created successfully. Check log for details."
                    response = HttpResponse()
                    if rest_call:
                        response.content = content_msg
                    else:
                        response.content = "<h1>" + content_msg + "</h1>"
                    return response

    else:  # regular file download
        # if fetching main metadata files, then these need to be refreshed.
        if path.endswith("resourcemap.xml") or path.endswith('resourcemetadata.xml'):
            metadata_dirty = res.getAVU("metadata_dirty")
            if metadata_dirty is None or metadata_dirty:
                create_bag_files(res)  # sets metadata_dirty to False

        # send signal for pre download file
        # TODO: does not contain subdirectory information: duplicate refreshes possible
        download_file_name = split_path_strs[-1]  # end of path
        # this logs the download request in the tracking system
        pre_download_file.send(sender=resource_cls, resource=res,
                               download_file_name=download_file_name,
                               request=request)

    # If we get this far,
    # * path and irods_path point to true input
    # * output_path and irods_output_path point to true output.
    # Try to stream the file back to the requester.

    # obtain mime_type to set content_type
    mtype = 'application-x/octet-stream'
    mime_type = mimetypes.guess_type(output_path)
    if mime_type[0] is not None:
        mtype = mime_type[0]
    # retrieve file size to set up Content-Length header
    # TODO: standardize this to make it less brittle
    stdout = session.run("ils", None, "-l", irods_output_path)[0].split()
    flen = int(stdout[3])

    # Allow reverse proxy if request was forwarded by nginx (HTTP_X_DJANGO_REVERSE_PROXY='true')
    # and reverse proxy is possible according to configuration (SENDFILE_ON=True)
    # and reverse proxy isn't overridden by user (use_reverse_proxy=True).

    if use_reverse_proxy and getattr(settings, 'SENDFILE_ON', False) and \
       'HTTP_X_DJANGO_REVERSE_PROXY' in request.META:

        # The NGINX sendfile abstraction is invoked as follows:
        # 1. The request to download a file enters this routine via the /rest_download or /download
        #    url in ./urls.py. It is redirected here from Django. The URI contains either the
        #    unqualified resource path or the federated resource path, depending upon whether
        #    the request is local or federated.
        # 2. This deals with unfederated resources by redirecting them to the uri
        #    /irods-data/{resource-id}/... on nginx. This URI is configured to read the file
        #    directly from the iRODS vault via NFS, and does not work for direct access to the
        #    vault due to the 'internal;' declaration in NGINX.
        # 3. This deals with federated resources by reading their path, matching local vaults, and
        #    redirecting to URIs that are in turn mapped to read from appropriate iRODS vaults. At
        #    present, the only one of these is /irods-user, which handles files whose federation
        #    path is stored in the variable 'userpath'.
        # 4. If there is no vault available for the resource, the file is transferred without
        #    NGINX, exactly as it was transferred previously.

        # If this path is resource_federation_path, then the file is a local user file
        userpath = '/' + os.path.join(
            getattr(settings, 'HS_USER_IRODS_ZONE', 'hydroshareuserZone'),
            'home',
            getattr(settings, 'HS_LOCAL_PROXY_USER_IN_FED_ZONE', 'localHydroProxy'))

        # stop NGINX targets that are non-existent from hanging forever.
        if not istorage.exists(irods_output_path):
            content_msg = "file path {} does not exist in iRODS".format(output_path)
            response = HttpResponse(status=404)
            if rest_call:
                response.content = content_msg
            else:
                response.content = "<h1>" + content_msg + "</h1>"
            return response

        if not res.is_federated:
            # track download count
            res.update_download_count()

            # invoke X-Accel-Redirect on physical vault file in nginx
            response = HttpResponse(content_type=mtype)
            response['Content-Disposition'] = 'attachment; filename="{name}"'.format(
                name=output_path.split('/')[-1])
            response['Content-Length'] = flen
            response['X-Accel-Redirect'] = '/'.join([
                getattr(settings, 'IRODS_DATA_URI', '/irods-data'), output_path])
            if __debug__:
                logger.debug("Reverse proxying local {}".format(response['X-Accel-Redirect']))
            return response

        elif res.resource_federation_path == userpath:  # this guarantees a "user" resource
            # track download count
            res.update_download_count()

            # invoke X-Accel-Redirect on physical vault file in nginx
            response = HttpResponse(content_type=mtype)
            response['Content-Disposition'] = 'attachment; filename="{name}"'.format(
                name=output_path.split('/')[-1])
            response['Content-Length'] = flen
            response['X-Accel-Redirect'] = os.path.join(
                getattr(settings, 'IRODS_USER_URI', '/irods-user'), output_path)
            if __debug__:
                logger.debug("Reverse proxying user {}".format(response['X-Accel-Redirect']))
            return response

    # if we get here, none of the above conditions are true
    # if reverse proxy is enabled, then this is because the resource is remote and federated
    # OR the user specifically requested a non-proxied download.

    options = ('-',)  # we're redirecting to stdout.
    # this unusual way of calling works for streaming federated or local resources
    if __debug__:
        logger.debug("Locally streaming {}".format(output_path))
    # track download count
    res.update_download_count()
    proc = session.run_safe('iget', None, irods_output_path, *options)
    response = FileResponse(proc.stdout, content_type=mtype)
    response['Content-Disposition'] = 'attachment; filename="{name}"'.format(
        name=output_path.split('/')[-1])
    response['Content-Length'] = flen
    return response
Exemplo n.º 16
0
def create_bag_by_irods(resource_id):
    """Create a resource bag on iRODS side by running the bagit rule and ibun zip.

    This function runs as a celery task, invoked asynchronously so that it does not
    block the main web thread when it creates bags for very large files which will take some time.
    :param
    resource_id: the resource uuid that is used to look for the resource to create the bag for.

    :return: True if bag creation operation succeeds;
             False if there is an exception raised or resource does not exist.
    """
    res = utils.get_resource_by_shortkey(resource_id)

    istorage = res.get_irods_storage()

    bag_path = res.bag_path

    metadata_dirty = istorage.getAVU(res.root_path, 'metadata_dirty')
    # if metadata has been changed, then regenerate metadata xml files
    if metadata_dirty is None or metadata_dirty.lower() == "true":
        try:
            create_bag_files(res)
        except Exception as ex:
            logger.error('Failed to create bag files. Error:{}'.format(
                ex.message))
            # release the lock before returning bag creation failure
            return False

    irods_bagit_input_path = res.get_irods_path(resource_id,
                                                prepend_short_id=False)
    # check to see if bagit readme.txt file exists or not
    bagit_readme_file = res.get_irods_path('readme.txt')
    is_bagit_readme_exist = istorage.exists(bagit_readme_file)
    if irods_bagit_input_path.startswith(resource_id):
        # resource is in data zone, need to append the full path for iRODS bagit rule execution
        irods_dest_prefix = "/" + settings.IRODS_ZONE + "/home/" + settings.IRODS_USERNAME
        irods_bagit_input_path = os.path.join(irods_dest_prefix, resource_id)
        bagit_input_resource = "*DESTRESC='{def_res}'".format(
            def_res=settings.IRODS_DEFAULT_RESOURCE)
    else:
        # this will need to be changed with the default resource in whatever federated zone the
        # resource is stored in when we have such use cases to support
        bagit_input_resource = "*DESTRESC='{def_res}'".format(
            def_res=settings.HS_IRODS_USER_ZONE_DEF_RES)

    bagit_input_path = "*BAGITDATA='{path}'".format(
        path=irods_bagit_input_path)

    bagit_files = [
        res.get_irods_path('bagit.txt'),
        res.get_irods_path('manifest-md5.txt'),
        res.get_irods_path('tagmanifest-md5.txt'), bag_path
    ]

    # only proceed when the resource is not deleted potentially by another request
    # when being downloaded
    is_exist = istorage.exists(irods_bagit_input_path)
    if is_exist:
        # if bagit readme.txt does not exist, add it.
        if not is_bagit_readme_exist:
            from_file_name = getattr(settings,
                                     'HS_BAGIT_README_FILE_WITH_PATH',
                                     'docs/bagit/readme.txt')
            istorage.saveFile(from_file_name, bagit_readme_file, True)

        # call iRODS bagit rule here
        bagit_rule_file = getattr(settings, 'IRODS_BAGIT_RULE',
                                  'hydroshare/irods/ruleGenerateBagIt_HS.r')

        try:
            # call iRODS run and ibun command to create and zip the bag, ignore SessionException
            # for now as a workaround which could be raised from potential race conditions when
            # multiple ibun commands try to create the same zip file or the very same resource
            # gets deleted by another request when being downloaded
            istorage.runBagitRule(bagit_rule_file, bagit_input_path,
                                  bagit_input_resource)
            istorage.zipup(irods_bagit_input_path, bag_path)
            if res.raccess.published:
                # compute checksum to meet DataONE distribution requirement
                chksum = istorage.checksum(bag_path)
                res.bag_checksum = chksum
            istorage.setAVU(irods_bagit_input_path, 'bag_modified', "false")
            return True
        except SessionException as ex:
            # if an exception occurs, delete incomplete files potentially being generated by
            # iRODS bagit rule and zipping operations
            for fname in bagit_files:
                if istorage.exists(fname):
                    istorage.delete(fname)
            logger.error(ex.stderr)
            return False
    else:
        logger.error('Resource does not exist.')
        return False
Exemplo n.º 17
0
 def test_create_bag_files(self):
     # this is the api call we are testing
     irods_storage_obj = hs_bagit.create_bag_files(self.test_res)
     self.assertTrue(isinstance(irods_storage_obj, IrodsStorage))
Exemplo n.º 18
0
def create_bag_by_irods(resource_id):
    """Create a resource bag on iRODS side by running the bagit rule and ibun zip.

    This function runs as a celery task, invoked asynchronously so that it does not
    block the main web thread when it creates bags for very large files which will take some time.
    :param
    resource_id: the resource uuid that is used to look for the resource to create the bag for.

    :return: True if bag creation operation succeeds;
             False if there is an exception raised or resource does not exist.
    """
    from hs_core.hydroshare.utils import get_resource_by_shortkey

    res = get_resource_by_shortkey(resource_id)
    istorage = res.get_irods_storage()

    metadata_dirty = istorage.getAVU(res.root_path, 'metadata_dirty')
    # if metadata has been changed, then regenerate metadata xml files
    if metadata_dirty is None or metadata_dirty.lower() == "true":
        try:
            create_bag_files(res)
        except Exception as ex:
            logger.error('Failed to create bag files. Error:{}'.format(
                ex.message))
            return False

    bag_full_name = 'bags/{res_id}.zip'.format(res_id=resource_id)
    if res.resource_federation_path:
        irods_bagit_input_path = os.path.join(res.resource_federation_path,
                                              resource_id)
        is_exist = istorage.exists(irods_bagit_input_path)
        # check to see if bagit readme.txt file exists or not
        bagit_readme_file = '{fed_path}/{res_id}/readme.txt'.format(
            fed_path=res.resource_federation_path, res_id=resource_id)
        is_bagit_readme_exist = istorage.exists(bagit_readme_file)
        bagit_input_path = "*BAGITDATA='{path}'".format(
            path=irods_bagit_input_path)
        bagit_input_resource = "*DESTRESC='{def_res}'".format(
            def_res=settings.HS_IRODS_USER_ZONE_DEF_RES)
        bag_full_name = os.path.join(res.resource_federation_path,
                                     bag_full_name)
        bagit_files = [
            '{fed_path}/{res_id}/bagit.txt'.format(
                fed_path=res.resource_federation_path, res_id=resource_id),
            '{fed_path}/{res_id}/manifest-md5.txt'.format(
                fed_path=res.resource_federation_path, res_id=resource_id),
            '{fed_path}/{res_id}/tagmanifest-md5.txt'.format(
                fed_path=res.resource_federation_path,
                res_id=resource_id), '{fed_path}/bags/{res_id}.zip'.format(
                    fed_path=res.resource_federation_path, res_id=resource_id)
        ]
    else:
        is_exist = istorage.exists(resource_id)
        # check to see if bagit readme.txt file exists or not
        bagit_readme_file = '{res_id}/readme.txt'.format(res_id=resource_id)
        is_bagit_readme_exist = istorage.exists(bagit_readme_file)
        irods_dest_prefix = "/" + settings.IRODS_ZONE + "/home/" + settings.IRODS_USERNAME
        irods_bagit_input_path = os.path.join(irods_dest_prefix, resource_id)
        bagit_input_path = "*BAGITDATA='{path}'".format(
            path=irods_bagit_input_path)
        bagit_input_resource = "*DESTRESC='{def_res}'".format(
            def_res=settings.IRODS_DEFAULT_RESOURCE)
        bagit_files = [
            '{res_id}/bagit.txt'.format(res_id=resource_id),
            '{res_id}/manifest-md5.txt'.format(res_id=resource_id),
            '{res_id}/tagmanifest-md5.txt'.format(res_id=resource_id),
            'bags/{res_id}.zip'.format(res_id=resource_id)
        ]

    # only proceed when the resource is not deleted potentially by another request
    # when being downloaded
    if is_exist:
        # if bagit readme.txt does not exist, add it.
        if not is_bagit_readme_exist:
            from_file_name = getattr(settings,
                                     'HS_BAGIT_README_FILE_WITH_PATH',
                                     'docs/bagit/readme.txt')
            istorage.saveFile(from_file_name, bagit_readme_file, True)

        # call iRODS bagit rule here
        bagit_rule_file = getattr(settings, 'IRODS_BAGIT_RULE',
                                  'hydroshare/irods/ruleGenerateBagIt_HS.r')

        try:
            # call iRODS run and ibun command to create and zip the bag, ignore SessionException
            # for now as a workaround which could be raised from potential race conditions when
            # multiple ibun commands try to create the same zip file or the very same resource
            # gets deleted by another request when being downloaded
            istorage.runBagitRule(bagit_rule_file, bagit_input_path,
                                  bagit_input_resource)
            istorage.zipup(irods_bagit_input_path, bag_full_name)
            istorage.setAVU(irods_bagit_input_path, 'bag_modified', "false")
            return True
        except SessionException as ex:
            # if an exception occurs, delete incomplete files potentially being generated by
            # iRODS bagit rule and zipping operations
            for fname in bagit_files:
                if istorage.exists(fname):
                    istorage.delete(fname)
            logger.error(ex.stderr)
            return False
    else:
        logger.error('Resource does not exist.')
        return False
Exemplo n.º 19
0
def create_bag_by_irods(resource_id):
    """Create a resource bag on iRODS side by running the bagit rule and ibun zip.

    This function runs as a celery task, invoked asynchronously so that it does not
    block the main web thread when it creates bags for very large files which will take some time.
    :param
    resource_id: the resource uuid that is used to look for the resource to create the bag for.

    :return: True if bag creation operation succeeds;
             False if there is an exception raised or resource does not exist.
    """
    from hs_core.hydroshare.utils import get_resource_by_shortkey

    res = get_resource_by_shortkey(resource_id)
    istorage = res.get_irods_storage()

    metadata_dirty = istorage.getAVU(res.root_path, 'metadata_dirty')
    # if metadata has been changed, then regenerate metadata xml files
    if metadata_dirty is None or metadata_dirty.lower() == "true":
        try:
            create_bag_files(res)
        except Exception as ex:
            logger.error('Failed to create bag files. Error:{}'.format(ex.message))
            return False

    bag_full_name = 'bags/{res_id}.zip'.format(res_id=resource_id)
    if res.resource_federation_path:
        irods_bagit_input_path = os.path.join(res.resource_federation_path, resource_id)
        is_exist = istorage.exists(irods_bagit_input_path)
        # check to see if bagit readme.txt file exists or not
        bagit_readme_file = '{fed_path}/{res_id}/readme.txt'.format(
            fed_path=res.resource_federation_path,
            res_id=resource_id)
        is_bagit_readme_exist = istorage.exists(bagit_readme_file)
        bagit_input_path = "*BAGITDATA='{path}'".format(path=irods_bagit_input_path)
        bagit_input_resource = "*DESTRESC='{def_res}'".format(
            def_res=settings.HS_IRODS_LOCAL_ZONE_DEF_RES)
        bag_full_name = os.path.join(res.resource_federation_path, bag_full_name)
        bagit_files = [
            '{fed_path}/{res_id}/bagit.txt'.format(fed_path=res.resource_federation_path,
                                                   res_id=resource_id),
            '{fed_path}/{res_id}/manifest-md5.txt'.format(
                fed_path=res.resource_federation_path, res_id=resource_id),
            '{fed_path}/{res_id}/tagmanifest-md5.txt'.format(
                fed_path=res.resource_federation_path, res_id=resource_id),
            '{fed_path}/bags/{res_id}.zip'.format(fed_path=res.resource_federation_path,
                                                  res_id=resource_id)
        ]
    else:
        is_exist = istorage.exists(resource_id)
        # check to see if bagit readme.txt file exists or not
        bagit_readme_file = '{res_id}/readme.txt'.format(res_id=resource_id)
        is_bagit_readme_exist = istorage.exists(bagit_readme_file)
        irods_dest_prefix = "/" + settings.IRODS_ZONE + "/home/" + settings.IRODS_USERNAME
        irods_bagit_input_path = os.path.join(irods_dest_prefix, resource_id)
        bagit_input_path = "*BAGITDATA='{path}'".format(path=irods_bagit_input_path)
        bagit_input_resource = "*DESTRESC='{def_res}'".format(
            def_res=settings.IRODS_DEFAULT_RESOURCE)
        bagit_files = [
            '{res_id}/bagit.txt'.format(res_id=resource_id),
            '{res_id}/manifest-md5.txt'.format(res_id=resource_id),
            '{res_id}/tagmanifest-md5.txt'.format(res_id=resource_id),
            'bags/{res_id}.zip'.format(res_id=resource_id)
        ]

    # only proceed when the resource is not deleted potentially by another request
    # when being downloaded
    if is_exist:
        # if bagit readme.txt does not exist, add it.
        if not is_bagit_readme_exist:
            from_file_name = getattr(settings, 'HS_BAGIT_README_FILE_WITH_PATH',
                                     'docs/bagit/readme.txt')
            istorage.saveFile(from_file_name, bagit_readme_file, True)

        # call iRODS bagit rule here
        bagit_rule_file = getattr(settings, 'IRODS_BAGIT_RULE',
                                  'hydroshare/irods/ruleGenerateBagIt_HS.r')

        try:
            # call iRODS run and ibun command to create and zip the bag, ignore SessionException
            # for now as a workaround which could be raised from potential race conditions when
            # multiple ibun commands try to create the same zip file or the very same resource
            # gets deleted by another request when being downloaded
            istorage.runBagitRule(bagit_rule_file, bagit_input_path, bagit_input_resource)
            istorage.zipup(irods_bagit_input_path, bag_full_name)
            istorage.setAVU(irods_bagit_input_path, 'bag_modified', "false")
            return True
        except SessionException as ex:
            # if an exception occurs, delete incomplete files potentially being generated by
            # iRODS bagit rule and zipping operations
            for fname in bagit_files:
                if istorage.exists(fname):
                    istorage.delete(fname)
            logger.error(ex.stderr)
            return False
    else:
        logger.error('Resource does not exist.')
        return False
Exemplo n.º 20
0
 def test_create_bag_files(self):
     # this is the api call we are testing
     irods_storage_obj = hs_bagit.create_bag_files(self.test_res)
     self.assertTrue(isinstance(irods_storage_obj, IrodsStorage))
Exemplo n.º 21
0
    def handle(self, *args, **options):

        if len(options['resource_ids']) > 0:  # an array of resource short_id to check.
            for rid in options['resource_ids']:
                try:
                    resource = BaseResource.objects.get(short_id=rid)
                    istorage = resource.get_irods_storage()

                    scimeta_path = os.path.join(resource.root_path, 'data',
                                                'resourcemetadata.xml')
                    if istorage.exists(scimeta_path):
                        print("found {}".format(scimeta_path))
                    else:
                        print("{} NOT FOUND".format(scimeta_path))

                    resmap_path = os.path.join(resource.root_path, 'data',
                                               'resourcemap.xml')
                    if istorage.exists(resmap_path):
                        print("found {}".format(resmap_path))
                    else:
                        print("{} NOT FOUND".format(resmap_path))

                    if istorage.exists(resource.bag_path):
                        print("found bag {}".format(resource.bag_path))
                    else:
                        print("bag {} NOT FOUND".format(resource.bag_path))

                    dirty = resource.getAVU('metadata_dirty')
                    print("metadata_dirty is {}".format(str(dirty)))

                    modified = resource.getAVU('bag_modified')
                    print("bag_modified is {}".format(str(modified)))

                    if options['generate']:  # generate usable bag

                        create_bag_files(resource)
                        print("metadata generated for {} from Django".format(rid))
                        resource.setAVU('metadata_dirty', 'false')
                        print("metadata_dirty set to false for {}".format(rid))

                        create_bag_by_irods(rid)
                        print("bag generated for {} from iRODs".format(rid))
                        resource.setAVU('bag_modified', 'false')
                        print("bag_modified set to false for {}".format(rid))

                    elif options['generate_metadata']:

                        create_bag_files(resource)
                        print("metadata generated for {} from Django".format(rid))
                        resource.setAVU('metadata_dirty', 'false')
                        print("metadata_dirty set to false for {}".format(rid))

                    elif options['generate_bag']:

                        create_bag_by_irods(rid)
                        print("bag generated for {} from iRODs".format(rid))
                        resource.setAVU('bag_modified', 'false')
                        print("bag_modified set to false for {}".format(rid))

                    elif options['reset']:  # reset all data to pristine

                        resource.setAVU('metadata_dirty', 'true')
                        print("metadata_dirty set to true for {}".format(rid))
                        try:
                            istorage.delete(resource.scimeta_path)
                            print("metadata {} deleted".format(resource.scimeta_path))
                        except SessionException as ex:
                            print("delete of {} failed: {}"
                                  .format(resource.scimeta_path,
                                          ex.stderr))
                        try:
                            istorage.delete(resource.resmap_path)
                            print("map {} deleted".format(resource.resmap_path))
                        except SessionException as ex:
                            print("delete of {} failed: {}"
                                  .format(resource.resmap_path,
                                          ex.stderr))

                        resource.setAVU('bag_modified', 'true')
                        print("bag_modified set to true for {}".format(rid))
                        try:
                            istorage.delete(resource.bag_path)
                            print("bag {} deleted".format(resource.bag_path))
                        except SessionException as ex:
                            print("delete of {} failed: {}"
                                  .format(resource.bag_path,
                                          ex.stderr))

                    elif options['reset_metadata']:

                        resource.setAVU('metadata_dirty', 'true')
                        print("metadata_dirty set to true for {}".format(rid))
                        try:
                            istorage.delete(resource.scimeta_path)
                            print("metadata {} deleted".format(resource.scimeta_path))
                        except SessionException as ex:
                            print("delete of {} failed: {}"
                                  .format(resource.scimeta_path,
                                          ex.stderr))
                        try:
                            istorage.delete(resource.resmap_path)
                            print("map {} deleted".format(resource.resmap_path))
                        except SessionException as ex:
                            print("delete of {} failed: {}"
                                  .format(resource.resmap_path,
                                          ex.stderr))

                    elif options['reset_bag']:
                        resource.setAVU('bag_modified', 'true')
                        print("bag_modified set to true for {}".format(rid))
                        try:
                            istorage.delete(resource.bag_path)
                            print("bag {} deleted".format(resource.bag_path))
                        except SessionException as ex:
                            print("delete of {} failed: {}"
                                  .format(resource.bag_path,
                                          ex.stderr))

                except BaseResource.DoesNotExist:
                    print("Resource with id {} NOT FOUND in Django".format(rid))
Exemplo n.º 22
0
def check_bag(rid, options):
    requests.packages.urllib3.disable_warnings()
    try:
        resource = BaseResource.objects.get(short_id=rid)
        istorage = resource.get_irods_storage()

        root_exists = istorage.exists(resource.root_path)

        if root_exists:
            # print status of metadata/bag system
            scimeta_path = os.path.join(resource.root_path, 'data',
                                        'resourcemetadata.xml')
            scimeta_exists = istorage.exists(scimeta_path)
            if scimeta_exists:
                print("resource metadata {} found".format(scimeta_path))
            else:
                print("resource metadata {} NOT FOUND".format(scimeta_path))

            resmap_path = os.path.join(resource.root_path, 'data', 'resourcemap.xml')
            resmap_exists = istorage.exists(resmap_path)
            if resmap_exists:
                print("resource map {} found".format(resmap_path))
            else:
                print("resource map {} NOT FOUND".format(resmap_path))

            bag_exists = istorage.exists(resource.bag_path)
            if bag_exists:
                print("bag {} found".format(resource.bag_path))
            else:
                print("bag {} NOT FOUND".format(resource.bag_path))

            dirty = resource.getAVU('metadata_dirty')
            print("{}.metadata_dirty is {}".format(rid, str(dirty)))

            modified = resource.getAVU('bag_modified')
            print("{}.bag_modified is {}".format(rid, str(modified)))

            if options['reset']:  # reset all data to pristine
                resource.setAVU('metadata_dirty', 'true')
                print("{}.metadata_dirty set to true".format(rid))
                try:
                    istorage.delete(resource.scimeta_path)
                    print("{} deleted".format(resource.scimeta_path))
                except SessionException as ex:
                    print("{} delete failed: {}"
                          .format(resource.scimeta_path,
                                  ex.stderr))
                try:
                    istorage.delete(resource.resmap_path)
                    print("{} deleted".format(resource.resmap_path))
                except SessionException as ex:
                    print("{} delete failed: {}"
                          .format(resource.resmap_path,
                                  ex.stderr))

                resource.setAVU('bag_modified', 'true')
                print("{}.bag_modified set to true".format(rid))
                try:
                    istorage.delete(resource.bag_path)
                    print("{} deleted".format(resource.bag_path))
                except SessionException as ex:
                    print("{} delete failed: {}"
                          .format(resource.bag_path,
                                  ex.stderr))

            if options['reset_metadata']:
                resource.setAVU('metadata_dirty', 'true')
                print("{}.metadata_dirty set to true".format(rid))
                try:
                    istorage.delete(resource.scimeta_path)
                    print("{} deleted".format(resource.scimeta_path))
                except SessionException as ex:
                    print("delete of {} failed: {}"
                          .format(resource.scimeta_path,
                                  ex.stderr))
                try:
                    istorage.delete(resource.resmap_path)
                    print("{} deleted".format(resource.resmap_path))
                except SessionException as ex:
                    print("{} delete failed: {}"
                          .format(resource.resmap_path,
                                  ex.stderr))

            if options['reset_bag']:
                resource.setAVU('bag_modified', 'true')
                print("{}.bag_modified set to true".format(rid))
                try:
                    istorage.delete(resource.bag_path)
                    print("{} deleted".format(resource.bag_path))
                except SessionException as ex:
                    print("{} delete failed: {}"
                          .format(resource.bag_path,
                                  ex.stderr))

            if options['generate']:  # generate usable bag
                if not options['if_needed'] or dirty or not scimeta_exists or not resmap_exists:
                    try:
                        create_bag_files(resource)
                    except ValueError as e:
                        print("{}: value error encountered: {}".format(rid, e.message))
                        return

                    print("{} metadata generated from Django".format(rid))
                    resource.setAVU('metadata_dirty', 'false')
                    resource.setAVU('bag_modified', 'true')
                    print("{}.metadata_dirty set to false".format(rid))

                if not options['if_needed'] or modified or not bag_exists:
                    create_bag_by_irods(rid)
                    print("{} bag generated from iRODs".format(rid))
                    resource.setAVU('bag_modified', 'false')
                    print("{}.bag_modified set to false".format(rid))

            if options['generate_metadata']:
                if not options['if_needed'] or dirty or not scimeta_exists or not resmap_exists:
                    try:
                        create_bag_files(resource)
                    except ValueError as e:
                        print("{}: value error encountered: {}".format(rid, e.message))
                        return
                    print("{}: metadata generated from Django".format(rid))
                    resource.setAVU('metadata_dirty', 'false')
                    print("{}.metadata_dirty set to false".format(rid))
                    resource.setAVU('bag_modified', 'true')
                    print("{}.bag_modified set to false".format(rid))

            if options['generate_bag']:
                if not options['if_needed'] or modified or not bag_exists:
                    create_bag_by_irods(rid)
                    print("{}: bag generated from iRODs".format(rid))
                    resource.setAVU('bag_modified', 'false')
                    print("{}.bag_modified set to false".format(rid))

            if options['download_bag']:
                if options['password']:
                    server = getattr(settings, 'FQDN_OR_IP', 'www.hydroshare.org')
                    uri = "https://{}/hsapi/resource/{}/".format(server, rid)
                    print("download uri is {}".format(uri))
                    r = hs_requests.get(uri, verify=False, stream=True,
                                        auth=requests.auth.HTTPBasicAuth(options['login'],
                                                                         options['password']))
                    print("download return status is {}".format(str(r.status_code)))
                    print("redirects:")
                    for thing in r.history:
                        print("...url: {}".format(thing.url))
                    filename = 'tmp/check_bag_block'
                    with open(filename, 'wb') as fd:
                        for chunk in r.iter_content(chunk_size=128):
                            fd.write(chunk)
                else:
                    print("cannot download bag without username and password.")

            if options['open_bag']:
                if options['password']:
                    server = getattr(settings, 'FQDN_OR_IP', 'www.hydroshare.org')
                    uri = "https://{}/hsapi/resource/{}/".format(server, rid)
                    print("download uri is {}".format(uri))
                    r = hs_requests.get(uri, verify=False, stream=True,
                                        auth=requests.auth.HTTPBasicAuth(options['login'],
                                                                         options['password']))
                    print("download return status is {}".format(str(r.status_code)))
                    print("redirects:")
                    for thing in r.history:
                        print("...url: {}".format(thing.url))
                    filename = 'tmp/check_bag_block'
                    with open(filename, 'wb') as fd:
                        for chunk in r.iter_content(chunk_size=128):
                            fd.write(chunk)
                            break
                else:
                    print("cannot open bag without username and password.")
        else:
            print("Resource with id {} does not exist in iRODS".format(rid))
    except BaseResource.DoesNotExist:
        print("Resource with id {} NOT FOUND in Django".format(rid))
Exemplo n.º 23
0
    def put(self, request, pk):
        # Update science metadata based on resourcemetadata.xml uploaded
        resource, authorized, user = view_utils.authorize(
            request,
            pk,
            needed_permission=ACTION_TO_AUTHORIZE.EDIT_RESOURCE,
            raises_exception=False)
        if not authorized:
            raise PermissionDenied()

        files = request.FILES.values()
        if len(files) == 0:
            error_msg = {
                'file':
                'No resourcemetadata.xml file was found to update resource '
                'metadata.'
            }
            raise ValidationError(detail=error_msg)
        elif len(files) > 1:
            error_msg = {
                'file': ('More than one file was found. Only one file, named '
                         'resourcemetadata.xml, '
                         'can be used to update resource metadata.')
            }
            raise ValidationError(detail=error_msg)

        scimeta = files[0]
        if scimeta.content_type not in self.ACCEPT_FORMATS:
            error_msg = {
                'file': ("Uploaded file has content type {t}, "
                         "but only these types are accepted: {e}.").format(
                             t=scimeta.content_type,
                             e=",".join(self.ACCEPT_FORMATS))
            }
            raise ValidationError(detail=error_msg)
        expect = 'resourcemetadata.xml'
        if scimeta.name != expect:
            error_msg = {
                'file':
                "Uploaded file has name {n}, but expected {e}.".format(
                    n=scimeta.name, e=expect)
            }
            raise ValidationError(detail=error_msg)

        # Temp directory to store resourcemetadata.xml
        tmp_dir = tempfile.mkdtemp()
        try:
            # Fake the bag structure so that GenericResourceMeta.read_metadata_from_resource_bag
            # can read and validate the system and science metadata for us.
            bag_data_path = os.path.join(tmp_dir, 'data')
            os.mkdir(bag_data_path)
            # Copy new science metadata to bag data path
            scimeta_path = os.path.join(bag_data_path, 'resourcemetadata.xml')
            shutil.copy(scimeta.temporary_file_path(), scimeta_path)
            # Copy existing resource map to bag data path
            # (use a file-like object as the file may be in iRODS, so we can't
            #  just copy it to a local path)
            resmeta_path = os.path.join(bag_data_path, 'resourcemap.xml')
            with open(resmeta_path, 'wb') as resmeta:
                storage = get_file_storage()
                resmeta_irods = storage.open(AbstractResource.sysmeta_path(pk))
                shutil.copyfileobj(resmeta_irods, resmeta)

            resmeta_irods.close()

            try:
                # Read resource system and science metadata
                domain = Site.objects.get_current().domain
                rm = GenericResourceMeta.read_metadata_from_resource_bag(
                    tmp_dir, hydroshare_host=domain)
                # Update resource metadata
                rm.write_metadata_to_resource(resource,
                                              update_title=True,
                                              update_keywords=True)
                create_bag_files(resource)
            except HsDeserializationDependencyException as e:
                msg = (
                    "HsDeserializationDependencyException encountered when updating "
                    "science metadata for resource {pk}; depedent resource was {dep}."
                )
                msg = msg.format(pk=pk, dep=e.dependency_resource_id)
                logger.error(msg)
                raise ValidationError(detail=msg)
            except HsDeserializationException as e:
                raise ValidationError(detail=e.message)

            resource_modified(resource, request.user, overwrite_bag=False)
            return Response(data={'resource_id': pk},
                            status=status.HTTP_202_ACCEPTED)
        finally:
            shutil.rmtree(tmp_dir)
Exemplo n.º 24
0
def download(request,
             path,
             rest_call=False,
             use_async=True,
             use_reverse_proxy=True,
             *args,
             **kwargs):
    split_path_strs = path.split('/')
    is_bag_download = False
    is_zip_download = False
    is_sf_agg_file = False
    if split_path_strs[0] == 'bags':
        res_id = os.path.splitext(split_path_strs[1])[0]
        is_bag_download = True
    elif split_path_strs[0] == 'zips':
        if path.endswith('.zip'):
            res_id = os.path.splitext(split_path_strs[2])[0]
        else:
            res_id = os.path.splitext(split_path_strs[1])[0]
        is_zip_download = True
    else:
        res_id = split_path_strs[0]

    # if the resource does not exist in django, authorized will be false
    res, authorized, _ = authorize(
        request,
        res_id,
        needed_permission=ACTION_TO_AUTHORIZE.VIEW_RESOURCE,
        raises_exception=False)
    if not authorized:
        response = HttpResponse(status=401)
        content_msg = "You do not have permission to download this resource!"
        if rest_call:
            raise PermissionDenied(content_msg)
        else:
            response.content = "<h1>" + content_msg + "</h1>"
            return response

    if res.resource_type == "CompositeResource" and not path.endswith(".zip"):
        for f in ResourceFile.objects.filter(object_id=res.id):
            if path == f.storage_path:
                if f.has_logical_file and f.logical_file.is_single_file_aggregation:
                    is_sf_agg_file = True

    if res.resource_federation_path:
        # the resource is stored in federated zone
        istorage = IrodsStorage('federated')
        federated_path = res.resource_federation_path
        path = os.path.join(federated_path, path)
        session = icommands.ACTIVE_SESSION
    else:
        # TODO: From Alva: I do not understand the use case for changing the environment.
        # TODO: This seems an enormous potential vulnerability, as arguments are
        # TODO: passed from the URI directly to IRODS without verification.
        istorage = IrodsStorage()
        federated_path = ''
        if 'environment' in kwargs:
            environment = int(kwargs['environment'])
            environment = m.RodsEnvironment.objects.get(pk=environment)
            session = Session("/tmp/django_irods",
                              settings.IRODS_ICOMMANDS_PATH,
                              session_id=uuid4())
            session.create_environment(environment)
            session.run('iinit', None, environment.auth)
        elif getattr(settings, 'IRODS_GLOBAL_SESSION', False):
            session = GLOBAL_SESSION
        elif icommands.ACTIVE_SESSION:
            session = icommands.ACTIVE_SESSION
        else:
            raise KeyError('settings must have IRODS_GLOBAL_SESSION set '
                           'if there is no environment object')

    resource_cls = check_resource_type(res.resource_type)

    if federated_path:
        res_root = os.path.join(federated_path, res_id)
    else:
        res_root = res_id

    if is_zip_download or is_sf_agg_file:
        if not path.endswith(
                ".zip"):  # requesting folder that needs to be zipped
            input_path = path.split(res_id)[1]
            random_hash = random.getrandbits(32)
            daily_date = datetime.datetime.today().strftime('%Y-%m-%d')
            random_hash_path = 'zips/{daily_date}/{res_id}/{rand_folder}'.format(
                daily_date=daily_date, res_id=res_id, rand_folder=random_hash)
            output_path = '{random_hash_path}{path}.zip'.format(
                random_hash_path=random_hash_path, path=input_path)

            if res.resource_type == "CompositeResource":
                aggregation_name = input_path[len('/data/contents/'):]
                res.create_aggregation_xml_documents(
                    aggregation_name=aggregation_name)

            if use_async:
                task = create_temp_zip.apply_async(
                    (res_id, input_path, output_path, is_sf_agg_file),
                    countdown=3)
                delete_zip.apply_async(
                    (random_hash_path, ),
                    countdown=(20 * 60))  # delete after 20 minutes
                if is_sf_agg_file:
                    download_path = request.path.split(res_id)[0] + output_path
                else:
                    download_path = request.path.split("zips")[0] + output_path
                if rest_call:
                    return HttpResponse(json.dumps({
                        'zip_status':
                        'Not ready',
                        'task_id':
                        task.task_id,
                        'download_path':
                        download_path
                    }),
                                        content_type="application/json")
                request.session['task_id'] = task.task_id
                request.session['download_path'] = download_path
                return HttpResponseRedirect(res.get_absolute_url())

            ret_status = create_temp_zip(res_id, input_path, output_path,
                                         is_sf_agg_file)
            delete_zip.apply_async(
                (random_hash_path, ),
                countdown=(20 * 60))  # delete after 20 minutes
            if not ret_status:
                content_msg = "Zip cannot be created successfully. Check log for details."
                response = HttpResponse()
                if rest_call:
                    response.content = content_msg
                else:
                    response.content = "<h1>" + content_msg + "</h1>"
                return response

            path = output_path

    bag_modified = istorage.getAVU(res_root, 'bag_modified')
    # make sure if bag_modified is not set to true, we still recreate the bag if the
    # bag file does not exist for some reason to resolve the error to download a nonexistent
    # bag when bag_modified is false due to the flag being out-of-sync with the real bag status
    if bag_modified is None or bag_modified.lower() == "false":
        # check whether the bag file exists
        bag_file_name = res_id + '.zip'
        if res_root.startswith(res_id):
            bag_full_path = os.path.join('bags', bag_file_name)
        else:
            bag_full_path = os.path.join(federated_path, 'bags', bag_file_name)
        # set bag_modified to 'true' if the bag does not exist so that it can be recreated
        # and the bag_modified AVU will be set correctly as well subsequently
        if not istorage.exists(bag_full_path):
            bag_modified = 'true'

    metadata_dirty = istorage.getAVU(res_root, 'metadata_dirty')
    # do on-demand bag creation
    # needs to check whether res_id collection exists before getting/setting AVU on it
    # to accommodate the case where the very same resource gets deleted by another request
    # when it is getting downloaded

    if is_bag_download:
        # send signal for pre_check_bag_flag
        pre_check_bag_flag.send(sender=resource_cls, resource=res)
        if bag_modified is None or bag_modified.lower() == "true":
            if metadata_dirty is None or metadata_dirty.lower() == 'true':
                create_bag_files(res)
            if use_async:
                # task parameter has to be passed in as a tuple or list, hence (res_id,) is needed
                # Note that since we are using JSON for task parameter serialization, no complex
                # object can be passed as parameters to a celery task
                task = create_bag_by_irods.apply_async((res_id, ), countdown=3)
                if rest_call:
                    return HttpResponse(json.dumps({
                        'bag_status': 'Not ready',
                        'task_id': task.task_id
                    }),
                                        content_type="application/json")

                request.session['task_id'] = task.task_id
                request.session['download_path'] = request.path
                return HttpResponseRedirect(res.get_absolute_url())
            else:
                ret_status = create_bag_by_irods(res_id)
                if not ret_status:
                    content_msg = "Bag cannot be created successfully. Check log for details."
                    response = HttpResponse()
                    if rest_call:
                        response.content = content_msg
                    else:
                        response.content = "<h1>" + content_msg + "</h1>"
                    return response

    elif metadata_dirty is None or metadata_dirty.lower() == 'true':
        if path.endswith("resourcemap.xml") or path.endswith(
                'resourcemetadata.xml'):
            # we need to regenerate the metadata xml files
            create_bag_files(res)

    # send signal for pre download file
    download_file_name = split_path_strs[-1]
    pre_download_file.send(sender=resource_cls,
                           resource=res,
                           download_file_name=download_file_name,
                           request=request)

    # obtain mime_type to set content_type
    mtype = 'application-x/octet-stream'
    mime_type = mimetypes.guess_type(path)
    if mime_type[0] is not None:
        mtype = mime_type[0]
    # retrieve file size to set up Content-Length header
    stdout = session.run("ils", None, "-l", path)[0].split()
    flen = int(stdout[3])

    # If this path is resource_federation_path, then the file is a local user file
    userpath = '/' + os.path.join(
        getattr(settings, 'HS_USER_IRODS_ZONE', 'hydroshareuserZone'), 'home',
        getattr(settings, 'HS_LOCAL_PROXY_USER_IN_FED_ZONE',
                'localHydroProxy'))

    # Allow reverse proxy if request was forwarded by nginx
    # (HTTP_X_DJANGO_REVERSE_PROXY is 'true')
    # and reverse proxy is possible according to configuration.

    if use_reverse_proxy and getattr(settings, 'SENDFILE_ON', False) and \
       'HTTP_X_DJANGO_REVERSE_PROXY' in request.META:

        # The NGINX sendfile abstraction is invoked as follows:
        # 1. The request to download a file enters this routine via the /rest_download or /download
        #    url in ./urls.py. It is redirected here from Django. The URI contains either the
        #    unqualified resource path or the federated resource path, depending upon whether
        #    the request is local or federated.
        # 2. This deals with unfederated resources by redirecting them to the uri
        #    /irods-data/{resource-id}/... on nginx. This URI is configured to read the file
        #    directly from the iRODS vault via NFS, and does not work for direct access to the
        #    vault due to the 'internal;' declaration in NGINX.
        # 3. This deals with federated resources by reading their path, matching local vaults, and
        #    redirecting to URIs that are in turn mapped to read from appropriate iRODS vaults. At
        #    present, the only one of these is /irods-user, which handles files whose federation
        #    path is stored in the variable 'userpath'.
        # 4. If there is no vault available for the resource, the file is transferred without
        #    NGINX, exactly as it was transferred previously.

        # stop NGINX targets that are non-existent from hanging forever.
        if not istorage.exists(path):
            content_msg = "file path {} does not exist in iRODS".format(path)
            response = HttpResponse(status=404)
            if rest_call:
                response.content = content_msg
            else:
                response.content = "<h1>" + content_msg + "</h1>"
            return response

        if not res.is_federated:
            # invoke X-Accel-Redirect on physical vault file in nginx
            response = HttpResponse(content_type=mtype)
            response[
                'Content-Disposition'] = 'attachment; filename="{name}"'.format(
                    name=path.split('/')[-1])
            response['Content-Length'] = flen
            response['X-Accel-Redirect'] = '/'.join(
                [getattr(settings, 'IRODS_DATA_URI', '/irods-data'), path])
            return response

        elif res.resource_federation_path == userpath:  # this guarantees a "user" resource
            # invoke X-Accel-Redirect on physical vault file in nginx
            # if path is full user path; strip federation prefix
            if path.startswith(userpath):
                path = path[len(userpath) + 1:]
            # invoke X-Accel-Redirect on physical vault file in nginx
            response = HttpResponse(content_type=mtype)
            response[
                'Content-Disposition'] = 'attachment; filename="{name}"'.format(
                    name=path.split('/')[-1])
            response['Content-Length'] = flen
            response['X-Accel-Redirect'] = os.path.join(
                getattr(settings, 'IRODS_USER_URI', '/irods-user'), path)
            return response

    # if we get here, none of the above conditions are true
    if flen <= FILE_SIZE_LIMIT:
        options = ('-', )  # we're redirecting to stdout.
        # this unusual way of calling works for federated or local resources
        proc = session.run_safe('iget', None, path, *options)
        response = FileResponse(proc.stdout, content_type=mtype)
        response[
            'Content-Disposition'] = 'attachment; filename="{name}"'.format(
                name=path.split('/')[-1])
        response['Content-Length'] = flen
        return response

    else:
        content_msg = "File larger than 1GB cannot be downloaded directly via HTTP. " \
                      "Please download the large file via iRODS clients."
        response = HttpResponse(status=403)
        if rest_call:
            response.content = content_msg
        else:
            response.content = "<h1>" + content_msg + "</h1>"
        return response