Пример #1
0
def pipe(content, ctype, enrichments, wsgi_header):
    """
    Run ``content`` through a chain of enrichment services.

    content - JSON-serializable object; it is serialized once and the result
              is the payload of the first POST
    ctype - value sent as the content-type header of every POST
    enrichments - iterable of service URIs; falsy entries are skipped and
              non-absolute entries are resolved against the current request
    wsgi_header - header name excluded when the request headers are copied

    Returns a tuple (error, body): body is the response content of the last
    service that answered with a 2xx status (or the serialized input when none
    did), error is the message for the most recent failing service or None.
    """
    payload = json.dumps(content)
    last_error = None
    for step in enrichments:
        if not step:
            continue  # in case there's no pipeline

        if is_absolute(step):
            target = step
        else:
            env = request.environ
            origin = env["wsgi.url_scheme"] + "://"
            origin += env["HTTP_HOST"] if env.get("HTTP_HOST") else env["SERVER_NAME"]
            # Join the origin and the pipeline module path, ensuring the
            # path starts with "/".
            target = origin + re.sub(r"^(?!/)", "/", step)

        outgoing = copy_headers_to_dict(request.environ, exclude=[wsgi_header])
        outgoing["content-type"] = ctype
        logger.debug("Calling url: %s " % target)
        resp, reply = H.request(target, "POST", body=payload, headers=outgoing)

        if str(resp.status).startswith("2"):
            # Feed this service's output to the next one
            payload = reply
        else:
            # A failing service is skipped; its input is passed on unchanged
            last_error = "Error in enrichment pipeline at %s" % target
            logger.error(last_error)

    return last_error, payload
Пример #2
0
def is_shown_at_transform(d):
    """
    Build the "isShownAt" structure for record ``d``.

    The "@id" is the first absolute entry of d["handle"] (a single string is
    treated as a one-element list), or "" when there is none.  "format" and
    "rights" are copied from ``d`` and are None when absent.
    """
    handles = d["handle"]
    if isinstance(handles, basestring):
        handles = [handles]
    shown_at = next((h for h in handles if is_absolute(h)), "")
    return {
        "isShownAt": {
            "@id": shown_at,
            "format": d.get("format"),
            "rights": d.get("rights"),
        }
    }
Пример #3
0
def source_transform(d):
    """
    Return {"source": <first absolute entry of d["handle"]>}, or an empty
    dict when no entry is absolute.
    """
    first_absolute = next((h for h in d["handle"] if is_absolute(h)), None)
    if first_absolute:
        return {"source": first_absolute}
    return {}
Пример #4
0
def source_transform(d):
    """
    Pick the first absolute entry of d["handle"] as the record's source.

    Returns {"source": entry} for that entry, or {} if none is absolute.
    """
    for candidate in d["handle"]:
        if is_absolute(candidate):
            # Only the first absolute entry is considered
            return {"source": candidate} if candidate else {}
    return {}
Пример #5
0
    def __new__(cls, arg, uri=None, encoding=None, resolver=None, sourcetype=0):
        """
        Build an input source from one of several kinds of argument.

        arg - a string, Unicode object (only if you really know what you're doing),
              file-like object (stream), file path or URI.  You can also pass an
              InputSource object, in which case that same object is returned
              unchanged (none of the other arguments are applied to it), or a
              urllib2.Request, which is opened through the resolver
        uri - optional override URI.  The base URI for the IS will be set to this
              value.  It is only honored for streams and XML strings (a random
              UUID URN is used when it is not given); for a urllib2.Request, an
              absolute URI or a file path the URI is derived from arg instead
        encoding - passed through unchanged to the underlying InputSource.__new__
        resolver - object whose resolve() method opens a stream for a URI or a
              request; defaults to amara.lib.irihelpers.DEFAULT_RESOLVER
        sourcetype - when equal to XMLSTRING, arg is treated as XML text without
              consulting the isxml() heuristic

        Returns an input source which can be passed to Amara APIs.

        Raises ValueError if arg is an empty string, or if arg matched none of
        the earlier cases and its length is MAX_URI_LENGTH_FOR_HEURISTIC or more.
        """
        #do the imports within the function to avoid circular imports
        #from amara._xmlstring import IsXml as isxml
        #NOTE(review): isxml is used below although the import above is commented
        #out, so it presumably comes from module level -- confirm

        #These imports are tucked in here because amara.lib.iri is an expensive import
        from amara.lib.iri import is_absolute, os_path_to_uri
        from amara.lib.irihelpers import DEFAULT_RESOLVER
        resolver = resolver or DEFAULT_RESOLVER

        #An existing input source is handed back untouched
        if isinstance(arg, InputSource):
            return arg

        #if arg == (u'', ''): -> UnicodeWarning: Unicode equal comparison failed to convert both arguments to Unicode - interpreting them as being unequal
        if arg == '':
            #FIXME L10N
            raise ValueError("Cannot parse an empty string as XML")
        
        #The checks below are ordered: request object, stream, XML text,
        #absolute URI, then (for short strings only) OS file path
        if isinstance(arg, urllib2.Request):
            uri = arg.get_full_url() #One of the rightly labeled "lame" helper methods in urllib2 ;)
            stream = resolver.resolve(arg)
        elif hasattr(arg, 'read'):
            #Create dummy Uri to use as base
            uri = uri or uuid4().urn
            stream = arg
        #XXX: Should we at this point refuse to proceed unless it's a basestring?
        elif sourcetype == XMLSTRING or isxml(arg):
            #See this article about XML detection heuristics
            #http://www.xml.com/pub/a/2007/02/28/what-does-xml-smell-like.html
            uri = uri or uuid4().urn
            stream = StringIO(arg)
        #An absolute URI, unless a file with exactly that name exists on disk
        elif is_absolute(arg) and not os.path.isfile(arg):
            uri = arg
            stream = resolver.resolve(uri)
        #If the arg is beyond a certain length, don't even try it as a URI
        elif len(arg) < MAX_URI_LENGTH_FOR_HEURISTIC:
            uri = os_path_to_uri(arg)
            stream = resolver.resolve(uri)
        else:
            #FIXME L10N
            raise ValueError("Does not appear to be well-formed XML")

        #We might add the ability to load zips, gzips & bzip2s
        #http://docs.python.org/lib/module-zlib.html
        #http://docs.python.org/lib/module-gzip.html
        #http://docs.python.org/lib/module-bz2.html
        #http://docs.python.org/lib/zipfile-objects.html

        #import inspect; print inspect.stack()
        #InputSource.__new__ is in C: expat/input_source.c:inputsource_new
        return InputSource.__new__(cls, stream, uri, encoding)
Пример #6
0
def pipe(content, ctype, enrichments, wsgi_header):
    """
    POST ``content`` (serialized as JSON) to each URI in ``enrichments`` in
    order, feeding each successful (2xx) response body to the next service.

    Relative URIs are resolved against the scheme and host of the current
    request.  Request headers, minus ``wsgi_header``, are forwarded with the
    content-type forced to ``ctype``.

    Returns (error, body), where error is None or the message logged for the
    most recent service that did not answer with a 2xx status.
    """
    body = json.dumps(content)
    error = None
    for uri in enrichments:
        if not uri:
            # in case there's no pipeline
            continue

        if not is_absolute(uri):
            environ = request.environ
            host_key = "HTTP_HOST" if environ.get("HTTP_HOST") else "SERVER_NAME"
            # Join the prefix and given pipeline module path, ensuring the
            # path starts with "/".
            uri = "".join([
                environ["wsgi.url_scheme"],
                "://",
                environ[host_key],
                re.sub(r"^(?!/)", "/", uri),
            ])

        headers = copy_headers_to_dict(request.environ, exclude=[wsgi_header])
        headers["content-type"] = ctype
        logger.debug("Calling url: %s " % uri)
        resp, cont = H.request(uri, "POST", body=body, headers=headers)

        succeeded = str(resp.status).startswith("2")
        if succeeded:
            body = cont
            continue

        # Failed step: record it and keep going with the unchanged body
        error = "Error in enrichment pipeline at %s" % uri
        logger.error(error)

    return error, body
def cdl_identify_object(body, ctype):
    """
    Responsible for: adding a field to a document with the URL where we
    should expect to find the thumbnail.

    body - the document as a JSON string
    ctype - not used by this function

    Returns the updated document serialized as JSON.  If body cannot be
    parsed, the response code is set to 500 and a plain-text error message
    is returned instead.
    """

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    # Take the first absolute URL from "object"; an absolute URL found in
    # "originalRecord/doc/isShownBy" replaces it.  Either value may be a
    # single string or an iterable of strings.
    url = None
    if exists(data, "object"):
        handle = getprop(data, "object")
        for h in (handle if not isinstance(handle, basestring) else [handle]):
            if is_absolute(h):
                url = h
                break
    if exists(data, "originalRecord/doc/isShownBy"):
        handle = getprop(data, "originalRecord/doc/isShownBy")
        for h in (handle if not isinstance(handle, basestring) else [handle]):
            if is_absolute(h):
                url = h
                break

    if url:
        if 'content.cdlib.org' in url:
            # The last two path segments are read as <object id>/<object type>
            base_url, obj_id, object_type = url.rsplit("/", 2)
            # NOTE(review): assumes "isShownAt" is present and is a string
            # containing at least one "/" -- confirm against callers
            is_shown_at = getprop(data, "isShownAt")
            is_shown_at_base, is_shown_at_id = is_shown_at.rsplit("/", 1)
            # The id taken from isShownAt wins over the one in the object URL
            if obj_id != is_shown_at_id:
                logger.warn(
                    "Object url for %s has ARK value (%s) that does not match isShownAt (%s)"
                    % (data["_id"], obj_id, is_shown_at_id))
                obj_id = is_shown_at_id
            url = "/".join([base_url, obj_id, object_type])
            # A hi-res URL is kept as "hasView"; "object" then receives the
            # same URL with every 'hi-res' replaced by 'thumbnail'
            if object_type == "hi-res":
                setprop(data, "hasView", {"@id": url})
                url = url.replace('hi-res', 'thumbnail')

        setprop(data, "object", url)
    else:
        logger.warn("No url found for object in id %s" % data["_id"])
        # NOTE(review): the third argument presumably makes a missing
        # "object" key a non-error -- confirm against delprop
        delprop(data, "object", True)
    return json.dumps(data)
Пример #8
0
def is_shown_at_transform(d):
    """
    Return {"isShownAt": url}, where url is the first absolute entry of
    d["handle"] (a bare string counts as a single entry) or None.
    """
    handle = d["handle"]
    candidates = [handle] if isinstance(handle, basestring) else handle
    for candidate in candidates:
        if is_absolute(candidate):
            return {"isShownAt": candidate}
    return {"isShownAt": None}
Пример #9
0
def cdl_identify_object(body, ctype):
    """
    Set the "object" field of a document to the URL where its thumbnail is
    expected to be found.

    body is the document as a JSON string; ctype is accepted but not used.
    Returns the updated document as JSON, or a plain-text error message
    (with the response code set to 500) when body is not valid JSON.
    """

    try:
        record = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    # Paths are tried in order and a later hit overrides an earlier one
    found = None
    for path in ("object", "originalRecord/doc/isShownBy"):
        if not exists(record, path):
            continue
        value = getprop(record, path)
        entries = [value] if isinstance(value, basestring) else value
        for entry in entries:
            if is_absolute(entry):
                found = entry
                break

    if not found:
        logger.warn("No url found for object in id %s" % record["_id"])
        delprop(record, "object", True)
        return json.dumps(record)

    if 'content.cdlib.org' in found:
        prefix, ark_id, kind = found.rsplit("/", 2)
        shown_at = getprop(record, "isShownAt")
        _shown_at_prefix, shown_at_id = shown_at.rsplit("/", 1)
        if ark_id != shown_at_id:
            # The id from isShownAt is trusted over the one in the object URL
            logger.warn("Object url for %s has ARK value (%s) that does not match isShownAt (%s)" % (record["_id"], ark_id, shown_at_id))
            ark_id = shown_at_id
        found = "/".join([prefix, ark_id, kind])
        if kind == "hi-res":
            # Keep the hi-res image as a view and point "object" at the thumbnail
            setprop(record, "hasView", {"@id": found})
            found = found.replace('hi-res', 'thumbnail')

    setprop(record, "object", found)
    return json.dumps(record)
Пример #10
0
def is_shown_at_transform(d):
    """
    Map d["handle"] to {"isShownAt": <first absolute handle or None>}.

    d["handle"] may be one string or an iterable of strings.
    """
    handles = d["handle"]
    if isinstance(handles, basestring):
        handles = [handles]
    return {"isShownAt": next((h for h in handles if is_absolute(h)), None)}
Пример #11
0
    def evaluate_as_nodeset(self, context):
        """
        Resolve the URI argument(s) to parsed documents and return them as a
        node-set.

        NOTE(review): presumably the XSLT document() function, judging by the
        DOC_FUNC_EMPTY_NODESET error code -- confirm against the enclosing class.

        The first argument yields either a node-set (one URI per node) or a
        single value converted to a string URI.  The optional second argument
        is a node-set whose first node supplies the base URI.

        Raises XsltRuntimeError if the second argument is given but evaluates
        to an empty node-set.
        """
        arg0, arg1 = self._args
        if arg1 is None:
            # Default base: the base URI of the calling instruction
            base_uri = context.instruction.baseUri
        else:
            # Only the first node of the second argument is consulted
            for node in arg1.evaluate_as_nodeset(context):
                base_uri = node.xml_base
                break
            else:
                raise XsltRuntimeError(XsltError.DOC_FUNC_EMPTY_NODESET,
                                       context.instruction)
        arg0 = arg0.evaluate(context)
        if isinstance(arg0, datatypes.nodeset):
            # A set, so each distinct URI is loaded once
            uris = set()
            for node in arg0:
                uri = datatypes.string(node)
                # Without an explicit base, each node's own base is used
                if arg1 is None:
                    base_uri = node.xml_base
                assert base_uri or iri.is_absolute(uri)
                uris.add(iri.absolutize(uri, base_uri))
        else:
            uri = datatypes.string(arg0)
            assert base_uri or iri.is_absolute(uri)
            uris = [iri.absolutize(uri, base_uri)]

        # context.documents caches parsed documents by URI;
        # context.transform.root.sources holds text keyed by URI, which is
        # parsed in preference to fetching the URI
        documents = context.documents
        sources = context.transform.root.sources
        result = []
        for uri in uris:
            if uri in documents:
                doc = documents[uri]
            else:
                if uri in sources:
                    doc = amara.parse(StringIO(sources[uri]), uri)
                else:
                    doc = amara.parse(uri)
                documents[uri] = doc
            result.append(doc)
        return datatypes.nodeset(result)
Пример #12
0
    def evaluate_as_nodeset(self, context):
        """
        Turn the URI argument(s) into parsed documents, returned as a node-set.

        The first argument gives one URI (any non-node-set value) or several
        (one per node of a node-set).  The second, optional argument is a
        node-set whose first node provides the base URI; without it the base
        is the instruction's base URI, or each node's own base for a node-set
        first argument.  Documents are cached in context.documents.

        Raises XsltRuntimeError when the second argument is an empty node-set.
        """
        uri_expr, base_expr = self._args
        no_explicit_base = base_expr is None
        if no_explicit_base:
            base_uri = context.instruction.baseUri
        else:
            for base_node in base_expr.evaluate_as_nodeset(context):
                base_uri = base_node.xml_base
                break
            else:
                raise XsltRuntimeError(XsltError.DOC_FUNC_EMPTY_NODESET,
                                       context.instruction)

        value = uri_expr.evaluate(context)
        if not isinstance(value, datatypes.nodeset):
            ref = datatypes.string(value)
            assert base_uri or iri.is_absolute(ref)
            uris = [iri.absolutize(ref, base_uri)]
        else:
            uris = set()
            for node in value:
                ref = datatypes.string(node)
                if no_explicit_base:
                    base_uri = node.xml_base
                assert base_uri or iri.is_absolute(ref)
                uris.add(iri.absolutize(ref, base_uri))

        documents = context.documents
        sources = context.transform.root.sources
        loaded = []
        for uri in uris:
            if uri in documents:
                # Already parsed earlier
                loaded.append(documents[uri])
                continue
            if uri in sources:
                # Text registered for this URI is parsed instead of fetching it
                doc = amara.parse(StringIO(sources[uri]), uri)
            else:
                doc = amara.parse(uri)
            documents[uri] = doc
            loaded.append(doc)
        return datatypes.nodeset(loaded)
Пример #13
0
def selid(body, ctype, prop='handle', use_source='yes'):
    '''
    Service that accepts a JSON document and sets its "_id" and "id"
    properties from the value of the property named by the "prop" parameter.

    A string value is used as is.  For any other non-empty value the last
    absolute entry is used, falling back to the first entry.  When use_source
    is 'yes' (case-insensitive) the "Source" request header is combined with
    the value to form "_id"; otherwise the cleaned value alone is used.  "id"
    is the MD5 hex digest of "_id".

    Returns the updated document as JSON, or a plain-text error message with
    the response code set to 500.
    '''

    def _reject(message):
        # Every failure is reported the same way: HTTP 500, plain-text body
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return message

    if not prop:
        # Remove this document
        return _reject("No id property has been selected")

    try:
        data = json.loads(body)
    except:
        return _reject("Unable to parse body as JSON")

    source_name = copy_headers_to_dict(request.environ).get('Source')

    record_id = None
    if exists(data, prop):
        value = getprop(data, prop)
        if isinstance(value, basestring):
            record_id = value
        elif value:
            candidates = value if isinstance(value, list) else [value]
            for candidate in candidates:
                if is_absolute(candidate):
                    # No break: the last absolute entry wins
                    record_id = candidate
            if not record_id:
                record_id = value[0]

    if not record_id:
        return _reject("No id property was found")

    # If the use_source parameter is 'yes' (default) then prepend the source
    # to the id and use that value when hashing for the DPLA id
    if use_source.lower() == 'yes':
        couch_id = couch_rec_id_builder(source_name, record_id)
    else:
        couch_id = clean_id(record_id)

    data[u'_id'] = couch_id
    data[u'id'] = hashlib.md5(couch_id).hexdigest()

    return json.dumps(data)
Пример #14
0
def oaisetname(body, ctype, sets_service=None):
    '''
    Service that accepts a JSON document and sets its "title" (and, when the
    set has one, its "description") from the OAI set matching the document.

    body - the document as a JSON string; its "_id" names the set, either as
           the whole value or as the part after the first "--"
    ctype - not used by this function
    sets_service - URL of a service returning a JSON array of objects with
           "setSpec", "setName" and optionally "setDescription" keys.  A
           relative URL is resolved against the current request's scheme
           and host.

    Returns the (possibly updated) document as JSON, or a plain-text error
    message with the response code set to 500.
    '''

    if not sets_service:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "No set service has been selected"

    try:
        data = json.loads(body)
    except (ValueError, TypeError):
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if not is_absolute(sets_service):
        prefix = request.environ['wsgi.url_scheme'] + '://'
        prefix += (request.environ.get('HTTP_HOST')
                   or request.environ['SERVER_NAME'])
        # Without a leading "/" the path would be glued onto the host name
        if not sets_service.startswith('/'):
            sets_service = '/' + sets_service
        sets_service = prefix + sets_service

    H = httplib2.Http('/tmp/.cache')
    H.force_exception_as_status_code = True
    resp, content = H.request(sets_service)
    if not resp[u'status'].startswith('2'):
        # Reported only; the JSON parse below decides whether to give up
        sys.stderr.write('  HTTP error (' + resp[u'status'] +
                         ') resolving URL: ' + sets_service + '\n')

    try:
        sets = json.loads(content)
    except (ValueError, TypeError):
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse sets service result as JSON: " + repr(content)

    setpos = data['_id'].find('--')
    match = data['_id'][setpos + 2:] if setpos > -1 else data['_id']

    for s in sets:
        if match == s['setSpec']:
            data[u'title'] = s['setName']
            # A set may carry no description at all
            description = s.get('setDescription')
            if description:
                data[u'description'] = description.strip()
            break

    return json.dumps(data)
Пример #15
0
def selid(body,
          ctype,
          prop='descriptiveNonRepeating/record_link',
          alternative_prop='descriptiveNonRepeating/record_ID'):
    '''
    Service that accepts a JSON document and sets its "_id" and "id"
    properties from the value of the property named by the "prop" parameter.

    body - the document as a JSON string
    ctype - not used by this function
    prop - path of the property holding the record link
    alternative_prop - path of a record id used, through a collections.si.edu
           search URL template, when "prop" yields nothing

    "_id" is built from the "Source" request header and the selected value;
    "id" is the MD5 hex digest of "_id".

    Returns the updated document as JSON, or a plain-text error message with
    the response code set to 500.
    '''
    tmpl = "http://collections.si.edu/search/results.htm?q=record_ID%%3A%s&repo=DPLA"

    if not prop:
        # Nothing to build an id from.  Report it like the other failures
        # rather than falling through to serialize a document that was
        # never parsed.
        logger.error("Prop param in None in %s" % __name__)
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "No id property has been selected"

    try:
        data = json.loads(body)
    except (ValueError, TypeError):
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    request_headers = copy_headers_to_dict(request.environ)
    source_name = request_headers.get('Source')

    record_id = None

    if exists(data, prop) or exists(data, alternative_prop):
        v = getprop(data, prop, True)
        if not v:
            # NOTE(review): if "prop" exists but is empty and
            # "alternative_prop" is absent, this lookup is unguarded
            v = getprop(data, alternative_prop)
            v = tmpl % v
        if isinstance(v, basestring):
            record_id = v
        elif v:
            for h in v:
                if is_absolute(h):
                    # No break: the last absolute entry wins
                    record_id = h
            if not record_id:
                record_id = v[0]

    if not record_id:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "No id property was found"

    data[u'_id'] = COUCH_REC_ID_BUILDER(source_name, record_id)
    data[u'id'] = hashlib.md5(data[u'_id']).hexdigest()

    return json.dumps(data)
Пример #16
0
def oaisetname(body,ctype,sets_service=None):
    '''
    Service that accepts a JSON document and sets its "title" property, plus
    "description" when the set provides one, by looking up the document's set
    in the result of the service passed in the 'sets_service' parameter.

    The set is identified by the document "_id": the part after the first
    "--", or the whole "_id" when there is no "--".  The service is expected
    to return a JSON array of objects with "setSpec", "setName" and
    optionally "setDescription" keys.  A relative 'sets_service' is resolved
    against the scheme and host of the current request.  'ctype' is not used.

    Returns the (possibly updated) document as JSON, or a plain-text error
    message with the response code set to 500.
    '''

    def _fail(message):
        # Common failure reply: HTTP 500 with a plain-text message
        response.code = 500
        response.add_header('content-type','text/plain')
        return message

    if not sets_service:
        return _fail("No set service has been selected")

    try:
        data = json.loads(body)
    except (ValueError, TypeError):
        return _fail("Unable to parse body as JSON")

    if not is_absolute(sets_service):
        environ = request.environ
        host = environ.get('HTTP_HOST') or environ['SERVER_NAME']
        # Make sure host and path are separated by a "/"
        path = sets_service if sets_service.startswith('/') else '/' + sets_service
        sets_service = environ['wsgi.url_scheme'] + '://' + host + path

    H = httplib2.Http('/tmp/.cache')
    H.force_exception_as_status_code = True
    resp, content = H.request(sets_service)
    if not resp[u'status'].startswith('2'):
        # Only reported; parsing the content below decides success
        sys.stderr.write('  HTTP error ('+resp[u'status']+') resolving URL: '+sets_service+'\n')

    try:
        sets = json.loads(content)
    except (ValueError, TypeError):
        return _fail("Unable to parse sets service result as JSON: " + repr(content))

    setpos = data['_id'].find('--')
    match = data['_id'][setpos+2:] if setpos > -1 else data['_id']

    for s in sets:
        if match == s['setSpec']:
            data[u'title'] = s['setName']
            # The description is optional; a missing key must not abort the lookup
            description = s.get('setDescription')
            if description:
                data[u'description'] = description.strip()
            break

    return json.dumps(data)
Пример #17
0
    def map_is_shown_at(self, index=None):
        """
        Set "isShownAt" in self.mapped_data from the provider's "handle".

        Only absolute entries of the handle are considered.  ``index``, when
        given and usable, selects one of them by position; otherwise the
        first one is taken.  Nothing is mapped when there is no "handle" or
        none of its entries is absolute.
        """
        if not exists(self.provider_data, "handle"):
            return

        identifiers = [handle for handle in
                       iterify(self.provider_data["handle"]) if
                       is_absolute(handle)]
        if not identifiers:
            # No absolute handle: leave mapped_data untouched
            return

        is_shown_at = None
        if index:
            try:
                is_shown_at = identifiers[int(index)]
            except (IndexError, TypeError, ValueError, OverflowError):
                # Unusable index: fall back to the first identifier below
                pass
        if not is_shown_at:
            is_shown_at = identifiers[0]

        if is_shown_at:
            self.mapped_data.update({"isShownAt": is_shown_at})
Пример #18
0
    def map_is_shown_at(self, index=None):
        """
        Map the provider's "handle" to "isShownAt" in self.mapped_data.

        index - optional position (anything int() accepts) into the list of
                absolute handle entries; an unusable or out-of-range index
                falls back to the first absolute entry

        Does nothing when the provider data has no "handle" or when none of
        its entries is absolute.
        """
        if exists(self.provider_data, "handle"):
            identifiers = [
                handle for handle in iterify(self.provider_data["handle"])
                if is_absolute(handle)
            ]

            is_shown_at = None
            if index and identifiers:
                try:
                    is_shown_at = identifiers[int(index)]
                except (IndexError, TypeError, ValueError, OverflowError):
                    # Ignore a bad index and use the default below
                    pass
            if not is_shown_at and identifiers:
                # Guarded so an empty list no longer raises IndexError
                is_shown_at = identifiers[0]

            if is_shown_at:
                self.mapped_data.update({"isShownAt": is_shown_at})
Пример #19
0
def selid(body,ctype,prop='descriptiveNonRepeating/record_link', alternative_prop='descriptiveNonRepeating/record_ID'):
    '''
    Service that accepts a JSON document and adds or sets the "_id" and "id"
    properties based on the value of the property named by the "prop"
    parameter.

    When "prop" yields no value, the value of "alternative_prop" is inserted
    into a collections.si.edu search URL and used instead.  A string value is
    used directly; for a sequence the last absolute entry is used, falling
    back to its first entry.  "_id" combines the "Source" request header with
    the selected value and "id" is the MD5 hex digest of "_id".  'ctype' is
    not used.

    Returns the updated document as JSON, or a plain-text error message with
    the response code set to 500 (no "prop", unparsable body, or no id found).
    '''
    tmpl="http://collections.si.edu/search/results.htm?q=record_ID%%3A%s&repo=DPLA"

    def _fail(message):
        # Common failure reply: HTTP 500 with a plain-text message
        response.code = 500
        response.add_header('content-type','text/plain')
        return message

    if not prop:
        # The document has not been parsed at this point, so there is
        # nothing valid to serialize; answer with an error instead
        logger.error("Prop param in None in %s" % __name__)
        return _fail("No id property has been selected")

    try:
        data = json.loads(body)
    except (ValueError, TypeError):
        return _fail("Unable to parse body as JSON")

    request_headers = copy_headers_to_dict(request.environ)
    source_name = request_headers.get('Source')

    record_id = None

    if exists(data, prop) or exists(data, alternative_prop):
        v = getprop(data, prop, True)
        if not v:
            # NOTE(review): unguarded lookup -- assumes alternative_prop is
            # present whenever prop has no value
            v = getprop(data, alternative_prop)
            v = tmpl % v
        if isinstance(v, basestring):
            record_id = v
        elif v:
            for h in v:
                if is_absolute(h):
                    # No break: the last absolute entry wins
                    record_id = h
            if not record_id:
                record_id = v[0]

    if not record_id:
        return _fail("No id property was found")

    data[u'_id'] = COUCH_REC_ID_BUILDER(source_name, record_id)
    data[u'id'] = hashlib.md5(data[u'_id']).hexdigest()

    return json.dumps(data)
def selid(body, ctype, prop='handle'):
    '''
    Service that accepts a JSON document and sets its "_id" property from
    the value of the property named by the "prop" parameter.

    A string value is used as is; for any other non-empty value the last
    absolute entry is used, falling back to the first entry.  The "Source"
    request header is combined with that value to build "_id".  No "id"
    hash is stored by this variant.

    Returns the updated document as JSON, or a plain-text error message with
    the response code set to 500.
    '''

    def _reject(message):
        # Shared failure reply: HTTP 500 with a plain-text message
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return message

    if not prop:
        # Remove this document
        return _reject("No id property has been selected")

    try:
        data = json.loads(body)
    except:
        return _reject("Unable to parse body as JSON")

    source_name = copy_headers_to_dict(request.environ).get('Source')

    record_id = None
    if exists(data, prop):
        value = getprop(data, prop)
        if isinstance(value, basestring):
            record_id = value
        elif value:
            entries = value if isinstance(value, list) else [value]
            for entry in entries:
                if is_absolute(entry):
                    # Keeps overwriting, so the last absolute entry is kept
                    record_id = entry
            if not record_id:
                record_id = value[0]

    if not record_id:
        return _reject("No id property was found")

    data[u'_id'] = COUCH_REC_ID_BUILDER(source_name, record_id)

    return json.dumps(data)
Пример #21
0
def pipe(content, ctype, enrichments, wsgi_header):
    """
    POST ``content`` through each enrichment service in turn.

    content - JSON-serializable object, serialized once as the first payload
    ctype - value sent as the content-type header of every POST
    enrichments - iterable of service URIs; falsy entries are skipped and
              relative entries are resolved against the scheme and host of
              the current request
    wsgi_header - header name excluded when the request headers are copied

    Returns the response body of the last service that answered with a 2xx
    status, or the serialized input when none did.  A failing service is
    logged and skipped.
    """
    body = json.dumps(content)
    for uri in enrichments:
        if not uri:
            continue  # in case there's no pipeline
        if not is_absolute(uri):
            prefix = request.environ['wsgi.url_scheme'] + '://'
            prefix += (request.environ.get('HTTP_HOST')
                       or request.environ['SERVER_NAME'])
            # Ensure the path starts with "/"; otherwise it would be glued
            # directly onto the host name
            if not uri.startswith('/'):
                uri = '/' + uri
            uri = prefix + uri
        headers = copy_headers_to_dict(request.environ, exclude=[wsgi_header])
        headers['content-type'] = ctype
        logger.debug("Calling url: %s " % uri)
        resp, cont = H.request(uri, 'POST', body=body, headers=headers)
        if not str(resp.status).startswith('2'):
            logger.warn("Error in enrichment pipeline at %s: %s"%(uri,repr(resp)))
            continue

        # Output of this step is the input of the next one
        body = cont
    return body
Пример #22
0
def selid(body, ctype, prop='handle'):
    '''
    Service that accepts a JSON document and sets its "_id" and "id"
    properties from the value of the property named by the "prop" parameter.

    "_id" is built from the "Source" request header and the selected value
    (a string as is; otherwise the last absolute entry, falling back to the
    first entry).  "id" is the MD5 hex digest of "_id".

    Returns the updated document as JSON, or a plain-text error message with
    the response code set to 500.
    '''

    def _error(text):
        # All failures answer with HTTP 500 and a plain-text explanation
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return text

    if not prop:
        # Remove this document
        return _error("No id property has been selected")

    try:
        data = json.loads(body)
    except:
        return _error("Unable to parse body as JSON")

    headers = copy_headers_to_dict(request.environ)
    source_name = headers.get('Source')

    chosen = None
    if exists(data, prop):
        found = getprop(data, prop)
        if isinstance(found, basestring):
            chosen = found
        elif found:
            for item in (found if isinstance(found, list) else [found]):
                if is_absolute(item):
                    # No break here: a later absolute item replaces an earlier one
                    chosen = item
            if not chosen:
                chosen = found[0]

    if not chosen:
        return _error("No id property was found")

    couch_id = COUCH_REC_ID_BUILDER(source_name, chosen)
    data[u'_id'] = couch_id
    data[u'id'] = hashlib.md5(couch_id).hexdigest()

    return json.dumps(data)
Пример #23
0
    def update_resource(self, path=None):
        '''
        Update a resource based on WSGI environment or a uri path

        path - optional document id or absolute URI; an absolute URI is made
               relative to self.remotedb.  Without it the id is taken from
               the part of PATH_INFO after the last self.space_tag.

        The request body is read from wsgi.input and must be JSON.  If it
        carries no "_rev", the current revision is fetched with a GET and
        added before the document is PUT to the remote database.

        Returns the response content of the PUT, or '' when the resulting
        status starts with neither "2" nor "304".
        '''
        if path:
            docid = path
            if is_absolute(path):
                docid = relativize(path, self.remotedb)
        else:
            docid = self.environ['PATH_INFO'].lstrip('/').rsplit(self.space_tag, 1)[1].lstrip('/') #e.g. '/mydb/MyDoc' -> 'MyDoc'

        # Same target for the lookup and for the update
        target = join(self.remotedb, docid)

        if logger: logger.debug('query ' + repr((self.remotedb, docid, target)))

        body = self.environ['wsgi.input'].read()

        # If the document already exists, we need to determine its current rev and add it to the
        # input body, skipping the process if rev is provided in the PUT request body
        body_js = json.loads(body)
        rev = body_js.get('_rev', None)
        if not rev:
            # Need to GET the rev
            resp, content = self.h.request(target, "GET")
            if str(resp.status).startswith('2'):
                rev = json.loads(content).get('_rev', None)

            if logger: logger.debug('update_resource: found existing rev = ' + repr(rev))

        if rev:
            body_js['_rev'] = rev
            body = json.dumps(body_js)

        headers = {'content-type': self.environ['CONTENT_TYPE']}
        resp, content = self.h.request(target, "PUT", body=body, headers=headers)

        if logger: logger.debug('resp ' + repr((content[:100], resp)))

        self.prep_slave_response(resp)

        if not (self.resp_status.startswith('2') or self.resp_status.startswith('304')):
            if logger: logger.debug("Error looking up resource: %s: %s\n" % (content, self.resp_status))
            return '' #No resource could be retrieved

        return content
Пример #24
0
def wiki_uri(original_base, wrapped_base, link, relative_to=None, raw=False):
    '''
    Given a link found on a wiki page, build the absolute URL of the target
    on the original wiki and on its REST-wrapped proxy.

    original_base - The base URI of the actual Moin instance
    wrapped_base - The base URI of the REST-wrapped proxy of the Moin instance
    link - the relative link, generally from one wiki page to another
    relative_to - the REST-wrapped version of the page from which the relative
                  link came (accepted but not used by this implementation)
    raw - the link is a full hierarchical path, rather than relative to the wiki base

    Returns a tuple (wrapped_uri, abs_link), or (None, None) when the link
    does not point inside the wiki

    wrapped_uri - the URI wrapped for REST ops
    abs_link - the full, original wiki URL

    >>> from akara.util.moin import wiki_uri
    >>> wiki_uri('http://example.com/mywiki/', 'http://localhost:8880/moin/w/', '/spam')
    ('http://localhost:8880/moin/w/spam', 'http://example.com/mywiki/spam')
    >>> wiki_uri('http://example.com/mywiki/', 'http://localhost:8880/moin/w/', 'http://google.com/spam')
    (None, None)
    >>> wiki_uri('http://example.com/mywiki/', 'http://localhost:8880/moin/w/', 'http://google.com/spam', raw=True)
    (None, None)
    >>> wiki_uri('http://example.com/mywiki/', 'http://localhost:8880/moin/w/', '/mywiki/spam', raw=True)
    ('http://localhost:8880/moin/w/spam', 'http://example.com/mywiki/spam')
    >>> wiki_uri('http://example.com/mywiki/', 'http://localhost:8880/moin/w/', '/mywiki/spam')
    ('http://localhost:8880/moin/w/mywiki/spam', 'http://example.com/mywiki/mywiki/spam')
    '''
    #e.g. original wiki base is http://myhost:8080/mywiki/ and link is /a/b
    #abs_link is http://myhost:8080/mywiki/a/b note the need to strip the leading / to get that
    if raw and not is_absolute(link):
        #A raw link repeats the path of the wiki base; cut that many characters off its front
        _scheme, _authority, base_path, _query, _fragment = split_uri_ref(original_base)
        link = link[len(base_path):]

    #Both bases are normalized to end in exactly one trailing slash
    wiki_root = original_base.rstrip('/') + '/'
    abs_link = absolutize(link.lstrip('/'), wiki_root)
    rel_to_wikibase = relativize(abs_link, wiki_root)
    if rel_to_wikibase:
        proxy_root = wrapped_base.rstrip('/') + '/'
        return absolutize(rel_to_wikibase, proxy_root), abs_link

    #It's not a relative wiki link
    return None, None
Пример #25
0
Файл: moin.py Проект: dpla/akara
def wiki_uri(original_base, wrapped_base, link, relative_to=None, raw=False):
    '''
    Constructs absolute URLs to the original and REST-wrapper for a page, given a link from another page
    
    original_base - The base URI of the actual Moin instance
    wrapped_base - The base URI of the REST-wrapped proxy of the Moin instance
    link - the relative link, generally from one wiki page to another
    relative_to - the REST-wrapped version of the page from which the relative link came, defaults to same as wrapped_base
                  (NOTE: currently not used by the function body)
    raw - the link is a full hierarchical path, rather than relative to the wiki base

    Returns a tuple (wrapped_uri, abs_link), or (None, None) if the link
    cannot be expressed relative to the wiki base
    
    wrapped_uri - the URI wrapped for REST ops
    abs_link - the full, original wiki URL
    
    >>> from akara.util.moin import wiki_uri
    >>> wiki_uri('http://example.com/mywiki/', 'http://localhost:8880/moin/w/', '/spam')
    ('http://localhost:8880/moin/w/spam', 'http://example.com/mywiki/spam')
    >>> wiki_uri('http://example.com/mywiki/', 'http://localhost:8880/moin/w/', 'http://google.com/spam')
    (None, None)
    >>> wiki_uri('http://example.com/mywiki/', 'http://localhost:8880/moin/w/', 'http://google.com/spam', raw=True)
    (None, None)
    >>> wiki_uri('http://example.com/mywiki/', 'http://localhost:8880/moin/w/', '/mywiki/spam', raw=True)
    ('http://localhost:8880/moin/w/spam', 'http://example.com/mywiki/spam')
    >>> wiki_uri('http://example.com/mywiki/', 'http://localhost:8880/moin/w/', '/mywiki/spam')
    ('http://localhost:8880/moin/w/mywiki/spam', 'http://example.com/mywiki/mywiki/spam')
    '''
    #rel_link = relativize(abs_link, original_wiki_base)
    #e.g. original wiki base is http://myhost:8080/mywiki/ and link is /a/b
    #abs_link is http://myhost:8080/mywiki/a/b note the need to strip the leading / to get that
    #from akara import logger; logger.debug('wiki_uri' + repr((original_base, wrapped_base, link, relative_to, absolutize(link, original_base.rstrip('/')+'/'))))
    #A raw link repeats the path of the wiki base, so that many leading characters are cut off
    if raw and not is_absolute(link):
        (scheme, authority, path, query, fragment) = split_uri_ref(original_base)
        link = link[len(path):]
    link = link.lstrip('/')
    #Both bases are normalized to end in exactly one slash before resolving against them
    abs_link = absolutize(link, original_base.rstrip('/')+'/')
    rel_to_wikibase = relativize(abs_link, original_base.rstrip('/')+'/')
    if not rel_to_wikibase:
        #It's not a relative wiki link
        return None, None
    rest_uri = absolutize(rel_to_wikibase, wrapped_base.rstrip('/')+'/')
    return rest_uri, abs_link
def selectid(body, ctype):
    '''
    Service that accepts a JSON document, derives an object id from its
    "identifier" property and records it on the document.

    body  - the JSON document, as a string
    ctype - content type of the body (not used here)

    On success the document gains "_id" (built by COUCH_REC_ID_BUILDER from
    the "Source" request header and the object id), "id" (MD5 hex digest of
    "_id"), "isShownAt" (the object id) and "isShownBy" (the object id plus
    "/thumbnail"), and is returned serialized as JSON.

    If the body is not valid JSON, or no object id can be determined, the
    response code is set to 500 and a plain-text message is returned.
    '''
    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    # The source name is looked up under "Source" in the copied request headers
    source_name = copy_headers_to_dict(request.environ).get('Source')

    identifier = getprop(data, 'identifier')
    objid = None
    if isinstance(identifier, basestring):
        # A plain string identifier is used as is
        objid = identifier
    elif identifier:
        entries = identifier if isinstance(identifier, list) else [identifier]
        for entry in entries:
            text = entry['text']
            if text.startswith('http://ark.cdlib.org/ark:') and is_absolute(text):
                objid = text
            # NOTE(review): this fallback runs inside the loop and indexes the
            # raw identifier value rather than the current entry -- confirm
            # that this is intended.
            if not objid:
                objid = identifier[0]

    if not objid:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "No id property was found"

    record_id = COUCH_REC_ID_BUILDER(source_name, objid)
    data[u'_id'] = record_id
    data[u'id'] = hashlib.md5(record_id).hexdigest()
    data[u'isShownAt'] = objid
    data[u'isShownBy'] = objid + '/thumbnail'

    return json.dumps(data)
Пример #27
0
def pipe(content, ctype, enrichments, wsgi_header):
    """
    Run `content` through a chain of enrichment services.

    content     - JSON-serializable document sent to the first service
    ctype       - value sent as the content-type header of every request
    enrichments - iterable of service URIs; empty entries are skipped and
                  non-absolute entries are resolved against the scheme and
                  host of the current request
    wsgi_header - header name passed in the `exclude` list when the headers
                  of the current request are copied onto outgoing requests

    Every service receives the body returned by the previous successful one.
    A service answering with a non-2xx status is logged and skipped.

    Returns the body produced by the last successful service, or the JSON
    serialization of `content` when no service succeeded.
    """
    body = json.dumps(content)
    for uri in enrichments:
        if not uri:
            continue  # in case there's no pipeline
        if not is_absolute(uri):
            environ = request.environ
            host = environ.get('HTTP_HOST') or environ['SERVER_NAME']
            # Join the prefix and the given pipeline module path, ensuring the
            # path starts with "/"; otherwise "host" + "path" would run
            # together into a bogus authority.
            if not uri.startswith('/'):
                uri = '/' + uri
            uri = environ['wsgi.url_scheme'] + '://' + host + uri
        headers = copy_headers_to_dict(request.environ, exclude=[wsgi_header])
        headers['content-type'] = ctype
        logger.debug("Calling url: %s " % uri)
        resp, cont = H.request(uri, 'POST', body=body, headers=headers)
        if not str(resp.status).startswith('2'):
            logger.warn("Error in enrichment pipeline at %s: %s" %
                        (uri, repr(resp)))
            continue
        # Feed this service's output to the next one in the chain
        body = cont

    return body
Пример #28
0
    def delete_resource(self, path=None):
        '''
        Issue a DELETE for a document in the remote database.

        path - optional document id or URI.  An absolute URI is made relative
               to self.remotedb; when no path is given, the id is the part of
               PATH_INFO that follows self.space_tag.

        Returns the response content, or '' when the resulting status is
        neither 2xx nor 304.
        '''
        if not path:
            #e.g. '/mydb/MyDoc' -> 'MyDoc'
            request_path = self.environ['PATH_INFO'].lstrip('/')
            docid = request_path.rsplit(self.space_tag, 1)[1].lstrip('/')
        elif is_absolute(path):
            docid = relativize(path, self.remotedb)
        else:
            docid = path

        if logger:
            logger.debug('query ' + repr((self.remotedb, docid, join(self.remotedb, docid))))
        resp, content = self.h.request(join(self.remotedb, docid), "DELETE")

        if logger:
            logger.debug('resp ' + repr((content[:100], resp)))

        #Presumably this is what populates self.resp_status -- TODO confirm
        self.prep_slave_response(resp)

        status = self.resp_status
        if status.startswith('2') or status.startswith('304'):
            return content

        if logger:
            logger.debug("Error looking up resource: %s: %s\n" % (content, self.resp_status))
        return '' #No resource could be retrieved
Пример #29
0
    def resource_factory(self, path=None):
        '''
        Fetch a document from the remote database and wrap it as a resource.

        path - optional document id or URI.  An absolute URI is made relative
               to self.remotedb; when no path is given, the id is the part of
               PATH_INFO that follows self.space_tag.

        Returns whatever resource.factory builds from the parsed JSON content,
        or '' when the resulting status is neither 2xx nor 304.
        '''
        if not path:
            #e.g. '/mydb/MyDoc' -> 'MyDoc'
            request_path = self.environ['PATH_INFO'].lstrip('/')
            docid = request_path.rsplit(self.space_tag, 1)[1].lstrip('/')
        elif is_absolute(path):
            docid = relativize(path, self.remotedb)
        else:
            docid = path

        if logger:
            logger.debug('query ' + repr((self.remotedb, docid, join(self.remotedb, docid))))
        #The id is URL-quoted for the actual request (but not in the log line)
        resp, content = self.h.request(join(self.remotedb, urllib.quote_plus(docid)))

        if logger:
            logger.debug('resp ' + repr((content[:100], resp)))

        #Presumably this is what populates self.resp_status -- TODO confirm
        self.prep_slave_response(resp)

        status = self.resp_status
        if not status.startswith('2') and not status.startswith('304'):
            if logger:
                logger.debug("Error looking up resource: %s: %s\n" % (content, self.resp_status))
            return '' #No resource could be retrieved

        return resource.factory(self, docid, json.loads(content))
def contentdm_identify_object(body, ctype, download="True"):
    """
    Responsible for: adding a field to a document with the URL where we
    should expect to find the thumbnail.

    There are two methods of creating the thumbnail URL:
    1. Replacing "cdm/ref" with "utils/getthumbnail" in the handle field
       Example:
           handle: http://test.provider/cdm/ref/collection/1/id/1
           thumbnail: http://test.provider/utils/getthumbnail/collection/1/id/1

    2. Splitting the handle field on "u?" and using the parts to compose the
       thumbnail URL.
       Example:
            handle: http://test.provider/u?/ctm,101
            thumbnail: http://test.provider/cgi-bin/thumbnail.exe?CISOROOT=/ctm&CISOPTR=101

    Arguments:
    body     - the JSON document, as a string
    ctype    - content type of the body (not used here)
    download - the string "True" sets the object status to PENDING; any
               other value sets it to IGNORE

    Returns the JSON-serialized document with "object" set to the thumbnail
    URL and "admin"/"object_status" set to the status.  When the handle field
    is missing, holds no absolute URL, or the URL fits neither scheme, the
    problem is logged and `body` is returned unchanged.  When `body` is not
    valid JSON the response code is set to 500 and a plain-text message is
    returned.
    """

    try:
        data = json.loads(body)
    except Exception:
        response.code = 500
        response.add_header("content-type", "text/plain")
        return "Unable to parse body as JSON"

    handle_field = "originalRecord/handle"
    if not exists(data, handle_field):
        logger.error("Field %s does not exist" % handle_field)
        return body

    # The handle may be a single string or a collection of strings; the first
    # absolute one is used.
    handle = getprop(data, handle_field)
    url = None
    for h in ([handle] if isinstance(handle, basestring) else handle):
        if is_absolute(h):
            url = h
            break
    if not url:
        logger.error("There is no URL in %s." % handle_field)
        return body

    if "cdm/ref" in url:
        thumbnail_url = url.replace("cdm/ref", "utils/getthumbnail")
    else:
        p = url.split("u?")
        if len(p) != 2:
            logger.error("Bad URL %s. It should have just one 'u?' part." % url)
            return body

        (base_url, rest) = p

        if base_url == "" or rest == "":
            logger.error("Bad URL: %s. There is no 'u?' part." % url)
            return body

        p = rest.split(",")

        if len(p) != 2:
            # Adjacent literals are joined before "%" is applied.  Joining
            # them with "+" formatted only the second literal, which has no
            # placeholder, and raised TypeError.
            logger.error(
                "Bad URL %s. Expected two parts at the end, used "
                "in thumbnail URL for CISOROOT and CISOPTR." % url
            )
            return body

        # Thumb url field.
        thumbnail_url = "%scgi-bin/thumbnail.exe?CISOROOT=%s&CISOPTR=%s" % (base_url, p[0], p[1])

    data["object"] = thumbnail_url

    status = PENDING if download == "True" else IGNORE

    if "admin" in data:
        data["admin"]["object_status"] = status
    else:
        data["admin"] = {"object_status": status}

    return json.dumps(data)
Пример #31
0
def source_transform(d):
    """
    Pick the source URI out of the "handle" values of `d`.

    Returns {"source": handle} where handle is the last entry of d["handle"]
    accepted by is_absolute(), or "" when no entry is accepted.
    """
    absolute_handles = [handle for handle in d["handle"] if is_absolute(handle)]
    return {"source": absolute_handles[-1] if absolute_handles else ""}
Пример #32
0
def enrichformat(body,
                 ctype,
                 action="enrich-format",
                 prop="sourceResource/format",
                 type_field="sourceResource/type"):
    """
    Service that accepts a JSON document and enriches the "format" field of
    that document by:

    a) Setting the format to be all lowercase
    b) Running through a set of cleanup regex's (e.g. image/jpg -> image/jpeg)
    c) Checking to see if the field is a valid IMT
       See http://www.iana.org/assignments/media-types for list of valid
       media-types. We require that a subtype is defined.
    d) Removing any extra text after the IMT
    e) Moving valid IMT values to hasView/format if hasView exists and
       its format is not set
    f) Setting type field from format field, if it is not set. The first
       part of every IMT value found is mapped through
       FORMAT_2_TYPE_MAPPINGS.

    By default works on the 'sourceResource/format' field but can be overridden
    by passing the name of the field to use as the 'prop' parameter.

    Arguments:
    body       - the JSON document, as a string
    ctype      - content type of the body (not used here)
    action     - not used here
    prop       - path of the format field
    type_field - path of the type field filled in by step f)

    Returns the enriched document serialized as JSON.  When `body` is not
    valid JSON the response code is set to 500 and a plain-text message is
    returned.
    """

    FORMAT_2_TYPE_MAPPINGS = {
        "audio": "sound",
        "image": "image",
        "video": "moving image",
        "text": "text"
    }

    REGEXPS = ('audio/mp3', 'audio/mpeg'), ('images/jpeg', 'image/jpeg'), \
              ('image/jpg', 'image/jpeg'), ('image/jp$', 'image/jpeg'), \
              ('img/jpg', 'image/jpeg'), ('^jpeg$', 'image/jpeg'), \
              ('^jpg$', 'image/jpeg'), (r'\W$', '')
    IMT_TYPES = [
        'application', 'audio', 'image', 'message', 'model', 'multipart',
        'text', 'video'
    ]
    # A value counts as an IMT when it starts with "<type>/"
    IMT_PREFIXES = tuple(x + '/' for x in IMT_TYPES)

    def get_ext(s):
        # splitext yields "" or ".ext"; drop the leading dot
        return os.path.splitext(s)[1][1:]

    def cleanup(s):
        s = s.lower().strip()
        for pattern, replace in REGEXPS:
            s = re.sub(pattern, replace, s)
            # Drop whatever follows the first whitespace after the leading
            # token.  Done after every replacement so that the anchored
            # patterns (e.g. '^jpg$') see the trimmed value.
            s = re.sub(r"^([a-z0-9/]+)\s.*", r"\1", s)
        return s

    def is_imt(s):
        logger.debug("Checking: " + s)
        return s.startswith(IMT_PREFIXES)

    try:
        data = json.loads(body)
    except Exception as e:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON\n" + str(e)

    imt_values = []
    if exists(data, prop):
        v = getprop(data, prop)
        retained = []
        hasview_format = []

        # Skip None/empty entries of a multi-valued field; they carry no
        # format information and would fail on the string methods below.
        values = [v] if isinstance(v, basestring) else [x for x in v if x]

        for s in values:
            if s.startswith("http") and is_absolute(s):
                # For a URL only the file extension is considered
                s = get_ext(s)
            cleaned = cleanup(s)
            if is_imt(cleaned):
                # Append to imt_values for use in type
                imt_values.append(cleaned)
                # Move IMT values to hasView/format else discard
                if exists(data, "hasView") and not \
                        exists(data, "hasView/format") and \
                        cleaned not in hasview_format:
                    hasview_format.append(cleaned)
            elif s not in retained:
                # Retain non-IMT values in sourceResource/format, non-cleaned
                retained.append(s)

        if retained:
            # A single value is stored as a scalar rather than a list
            setprop(data, prop,
                    retained[0] if len(retained) == 1 else retained)
        else:
            delprop(data, prop)

        if hasview_format:
            if len(hasview_format) == 1:
                hasview_format = hasview_format[0]
            setprop(data, "hasView/format", hasview_format)

    # Setting the type if it is empty.
    if not exists(data, type_field) and imt_values:
        types = []
        for imt in imt_values:
            t = getprop(FORMAT_2_TYPE_MAPPINGS, imt.split("/")[0], True)
            if t and t not in types:
                types.append(t)

        if types:
            setprop(data, type_field, types[0] if len(types) == 1 else types)

    return json.dumps(data)
Пример #33
0
 def map_is_shown_at(self):
     """Store the first absolute provider "handle" value as isShownAt."""
     handles = iterify(self.provider_data.get("handle"))
     for handle in handles:
         if not is_absolute(handle):
             continue
         self.mapped_data.update({"isShownAt": handle})
         break
Пример #34
0
    def __new__(cls,
                arg,
                uri=None,
                encoding=None,
                resolver=None,
                sourcetype=0):
        """
        Build an input source from one of several kinds of argument.

        arg - a string, Unicode object (only if you really know what you're doing),
              file-like object (stream), file path or URI.  You can also pass an
              InputSource object, in which case the return value is just the same
              object (this implementation returns it as is; uri is not applied)
        uri - optional override URI.  The base URI for the IS will be set to this
              value.  Only honored for file-like objects and XML strings; for
              requests, URIs and file paths it is derived from arg
        encoding - passed through unchanged to InputSource.__new__
        resolver - object whose resolve() is used to open requests and URIs;
              defaults to DEFAULT_RESOLVER
        sourcetype - when equal to XMLSTRING, arg is taken to be XML text
              without consulting the isxml() heuristic

        Returns an input source which can be passed to Amara APIs.

        Raises ValueError if arg is an empty string, or if arg matches none of
        the recognized kinds and is too long to be tried as a file path.
        """
        #do the imports within the function to avoid circular crap
        #from amara._xmlstring import IsXml as isxml

        #These imports are tucked in here because amara.lib.iri is an expensive import
        from amara.lib.iri import is_absolute, os_path_to_uri
        from amara.lib.irihelpers import DEFAULT_RESOLVER
        resolver = resolver or DEFAULT_RESOLVER

        #An existing input source is handed back untouched
        if isinstance(arg, InputSource):
            return arg

        #if arg == (u'', ''): -> UnicodeWarning: Unicode equal comparison failed to convert both arguments to Unicode - interpreting them as being unequal
        if arg == '':
            #FIXME L10N
            raise ValueError("Cannot parse an empty string as XML")

        #The checks below are ordered from most to least specific; the first
        #one that matches decides how the stream and its base URI are obtained
        if isinstance(arg, urllib2.Request):
            uri = arg.get_full_url(
            )  #One of the rightly labeled "lame" helper methods in urllib2 ;)
            stream = resolver.resolve(arg)
        elif hasattr(arg, 'read'):
            #Create dummy Uri to use as base
            uri = uri or uuid4().urn
            stream = arg
        #XXX: Should we at this point refuse to proceed unless it's a basestring?
        elif sourcetype == XMLSTRING or isxml(arg):
            #See this article about XML detection heuristics
            #http://www.xml.com/pub/a/2007/02/28/what-does-xml-smell-like.html
            uri = uri or uuid4().urn
            stream = StringIO(arg)
        #An absolute URI is resolved as such, unless it also names an existing local file
        elif is_absolute(arg) and not os.path.isfile(arg):
            uri = arg
            stream = resolver.resolve(uri)
        #If the arg is beyond a certain length, don't even try it as a URI
        elif len(arg) < MAX_URI_LENGTH_FOR_HEURISTIC:
            #Short enough: treat it as an OS file path and resolve its URI form
            uri = os_path_to_uri(arg)
            stream = resolver.resolve(uri)
        else:
            #FIXME L10N
            raise ValueError("Does not appear to be well-formed XML")

        #We might add the ability to load zips, gzips & bzip2s
        #http://docs.python.org/lib/module-zlib.html
        #http://docs.python.org/lib/module-gzip.html
        #http://docs.python.org/lib/module-bz2.html
        #http://docs.python.org/lib/zipfile-objects.html

        #import inspect; print inspect.stack()
        #InputSource.__new__ is in C: expat/input_source.c:inputsource_new
        return InputSource.__new__(cls, stream, uri, encoding)
def contentdm_identify_object(body, ctype, download="True"):
    """
    Responsible for: adding a field to a document with the URL where we
    should expect to find the thumbnail.

    There are two methods of creating the thumbnail URL:
    1. Replacing "cdm/ref" with "utils/getthumbnail" in the handle field
       Example:
           handle: http://test.provider/cdm/ref/collection/1/id/1
           thumbnail: http://test.provider/utils/getthumbnail/collection/1/id/1

    2. Splitting the handle field on "u?" and using the parts to compose the
       thumbnail URL.
       Example:
            handle: http://test.provider/u?/ctm,101
            thumbnail: http://test.provider/cgi-bin/thumbnail.exe?CISOROOT=/ctm&CISOPTR=101

    Arguments:
    body     - the JSON document, as a string
    ctype    - content type of the body (not used here)
    download - the string "True" sets the object status to PENDING; any
               other value sets it to IGNORE

    Returns the JSON-serialized document with "object" set to the thumbnail
    URL and "admin"/"object_status" set to the status.  When the handle field
    is missing, holds no absolute URL, or the URL fits neither scheme, the
    problem is logged and `body` is returned unchanged.  When `body` is not
    valid JSON the response code is set to 500 and a plain-text message is
    returned.
    """

    try:
        data = json.loads(body)
    except Exception:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    handle_field = "originalRecord/handle"
    if not exists(data, handle_field):
        logger.error("Field %s does not exist" % handle_field)
        return body

    # The handle may be a single string or a collection of strings; the first
    # absolute one is used.
    handle = getprop(data, handle_field)
    candidates = [handle] if isinstance(handle, basestring) else handle
    url = None
    for h in candidates:
        if is_absolute(h):
            url = h
            break
    if not url:
        logger.error("There is no URL in %s." % handle_field)
        return body

    if "cdm/ref" in url:
        thumbnail_url = url.replace("cdm/ref", "utils/getthumbnail")
    else:
        p = url.split("u?")
        if len(p) != 2:
            logger.error("Bad URL %s. It should have just one 'u?' part." %
                         url)
            return body

        (base_url, rest) = p

        if base_url == "" or rest == "":
            logger.error("Bad URL: %s. There is no 'u?' part." % url)
            return body

        p = rest.split(",")

        if len(p) != 2:
            # The two literals are joined by adjacency so that "%" formats
            # the whole message.  With "+" only the second literal (which has
            # no placeholder) was formatted, raising TypeError.
            logger.error("Bad URL %s. Expected two parts at the end, used "
                         "in thumbnail URL for CISOROOT and CISOPTR." % url)
            return body

        # Thumb url field.
        thumbnail_url = "%scgi-bin/thumbnail.exe?CISOROOT=%s&CISOPTR=%s" % \
                        (base_url, p[0], p[1])

    data["object"] = thumbnail_url

    status = PENDING if download == "True" else IGNORE

    if "admin" in data:
        data["admin"]["object_status"] = status
    else:
        data["admin"] = {"object_status": status}

    return json.dumps(data)
 def map_is_shown_at(self):
     """Store the first non-empty, absolute "handle" value as isShownAt."""
     handles = iterify(self.provider_data_source.get("handle"))
     for handle in handles:
         if not handle or not is_absolute(handle):
             continue
         self.mapped_data.update({"isShownAt": handle})
         break
Пример #37
0
def enrichformat(body, ctype, action="enrich-format",
                 prop="sourceResource/format",
                 type_field="sourceResource/type"):
    """
    Enrichment service for the "format" field of a JSON document.

    Each non-empty value of the field goes through these steps:

    a) It is lowercased and stripped
    b) It is run through the cleanup regexes (e.g. image/jpg -> image/jpeg)
    c) It is checked for being an IMT, i.e. starting with "<type>/" for one
       of the types listed at http://www.iana.org/assignments/media-types
       (a subtype is required)
    d) Any extra text after the IMT is removed
    e) IMT values are moved to hasView/format if hasView exists and has no
       format yet; otherwise they are dropped from the format field
    f) If the type field is not set, it is derived from the first part of
       the IMT values found

    Values that are absolute "http" URLs are reduced to their file extension
    before step a).  Non-IMT values stay in the format field.

    body       - the JSON document, as a string
    ctype      - content type of the body (not used here)
    action     - not used here
    prop       - path of the format field (default 'sourceResource/format')
    type_field - path of the type field (default 'sourceResource/type')

    Returns the enriched document serialized as JSON, or a plain-text message
    with response code 500 when the body cannot be parsed.
    """

    FORMAT_2_TYPE_MAPPINGS = {
        "audio": "sound", "image": "image",
        "video": "moving image", "text": "text",
    }

    REGEXPS = (
        ('audio/mp3', 'audio/mpeg'),
        ('images/jpeg', 'image/jpeg'),
        ('image/jpg', 'image/jpeg'),
        ('image/jp$', 'image/jpeg'),
        ('img/jpg', 'image/jpeg'),
        ('^jpeg$', 'image/jpeg'),
        ('^jpg$', 'image/jpeg'),
        ('\W$', ''),
    )
    IMT_TYPES = [
        'application', 'audio', 'image', 'message',
        'model', 'multipart', 'text', 'video',
    ]

    def get_ext(s):
        # splitext gives "" or ".ext"; ".ext" splits into ['', 'ext']
        pieces = os.path.splitext(s)[1].split('.')
        if len(pieces) != 2:
            return ""
        return pieces[1]

    def cleanup(s):
        result = s.lower().strip()
        for pattern, replace in REGEXPS:
            result = re.sub(pattern, replace, result)
            # Cut everything after the first whitespace following the
            # leading token; repeated after every replacement
            result = re.sub(r"^([a-z0-9/]+)\s.*", r"\1", result)
        return result

    def is_imt(s):
        for imt_type in IMT_TYPES:
            if re.match('^' + imt_type + '(/)', s):
                return True
        return False

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    imt_values = []
    if exists(data, prop):
        raw = getprop(data, prop)
        kept = []
        hasview_format = []

        # A single string is handled as a one-element list; empty entries of
        # a multi-valued field are ignored
        if isinstance(raw, basestring):
            candidates = [raw]
        else:
            candidates = [item for item in raw if item]

        for item in candidates:
            if item.startswith("http") and is_absolute(item):
                item = get_ext(item)
            cleaned = cleanup(item)

            if not is_imt(cleaned):
                # Retain non-IMT values in the format field, non-cleaned
                if item not in kept:
                    kept.append(item)
                continue

            # Remember IMT values for use in type
            imt_values.append(cleaned)
            # Move IMT values to hasView/format else discard
            if exists(data, "hasView") and \
                    not exists(data, "hasView/format") and \
                    cleaned not in hasview_format:
                hasview_format.append(cleaned)

        if not kept:
            delprop(data, prop)
        else:
            # A single value is stored as a scalar rather than a list
            setprop(data, prop, kept[0] if len(kept) == 1 else kept)

        if hasview_format:
            setprop(data, "hasView/format",
                    hasview_format[0] if len(hasview_format) == 1
                    else hasview_format)

    # Setting the type if it is empty.
    if not exists(data, type_field) and imt_values:
        mapped = []
        for imt in imt_values:
            major = imt.split("/")[0]
            label = getprop(FORMAT_2_TYPE_MAPPINGS, major, True)
            if label and label not in mapped:
                mapped.append(label)

        if mapped:
            setprop(data, type_field,
                    mapped[0] if len(mapped) == 1 else mapped)

    return json.dumps(data)