Example #1
    def handle(self, req):
        path = req.uri[5:]
        form = FieldStorage(req)

        strict = form.get('strict', True)
        if strict in ['false', 'False', '0', None, '']:
            strict = False

        mt = form.get('mimeType', '')
        mt = mt.replace(' ', '+')

        if not mt:
            xtn = form.get('extension', '')
            if xtn:
                if not srlzHash.has_key(xtn):
                    # can't continue
                    raise ValueError(xtn)
                else:
                    mt = srlzHash[xtn].mimeType
        
        if not mt:
            try:
                wanted = req.headers_in['Accept']
                mts = conneg.parse(wanted)
                mt = conneg.best(mts, mimeList)
            except:
                mt = ''

        if mt:
            xtn = mimeHash[str(mt)]
        else:
            # default to rdf/xml
            xtn = "rdf.xml"

        srlz = srlzHash[xtn]

        if form.has_key('aggregation'):
            uri = form.get('aggregation')
        else:
            uri = path

        if not uri:
            data = '<html><body>Instructions etc. goes here</body></html>'
            self.send(data, req, ct="text/html")
            return
        elif not protoUriRe.match(uri):
            self.error("Resource Map URI must be a protocol based URI", req)
            return

        try:
            # fetch
            
            rd = ReMDocument(uri)
        except Exception, e:
            self.error("Could not retrieve Resource Map from '%s': %s" % (uri, e.message), req)
            return
Example #2
    def __init__(self,
                 uri,
                 data='',
                 filename='',
                 mimeType='',
                 format='',
                 accept=''):
        self.uri = uri
        if data:
            self.data = data
        elif filename:
            if os.path.exists(filename):
                fh = open(filename)
                self.data = fh.read()
                fh.close()
        else:
            # try to fetch uri
            try:
                req = urllib.request.Request(uri)
                if accept:
                    # add custom accept header
                    req.add_header('Accept', accept)
                else:
                    # otherwise add default
                    req.add_header('Accept', accept_header)
                fh = urllib.request.urlopen(req)
                self.data = fh.read()
                self.info = fh.info()
                mimeType = self.info.get('content-type', mimeType)
                self.uri = fh.geturl()
                fh.close()
            except:
                raise OreException(
                    'ReMDocument must either have data or filename')

            if not format:
                try:
                    mt = conneg.parse(mimeType)
                    if mt:
                        mimeType = mt[0].mimetype1 + '/' + mt[0].mimetype2
                except:
                    pass
                mimeHash = {
                    'application/atom+xml': 'atom',
                    'application/xhtml+xml': 'rdfa',
                    'application/rdf+xml': 'xml',
                    'text/plain': 'nt',  # yes, really
                    'text/rdf+n3': 'n3',
                    'application/x-turtle': 'turtle',
                    'application/rdf+nt': 'nt'
                }
                format = mimeHash.get(mimeType, '')

        self.mimeType = mimeType
        self.format = format
        StringIO.__init__(self, self.data)
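
A minimal usage sketch for the constructor above (the URIs and file name are placeholders, and the module-level accept_header, conneg, and OreException names from the same package are assumed to be importable): pass data to wrap an in-memory serialization, filename to read from disk, or neither to dereference the URI with an optional Accept header.

# Hypothetical usage of ReMDocument; the URIs and file name are placeholders.
rd = ReMDocument('http://example.org/rem.rdf',
                 accept='application/rdf+xml')   # dereferenced over HTTP
print(rd.mimeType, rd.format)                    # e.g. 'application/rdf+xml', 'xml'

rd2 = ReMDocument('http://example.org/rem.atom',
                  filename='rem.atom',           # read from local disk instead
                  format='atom')                 # format given explicitly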
Example #3
def crawl(uri, src):
    if uri not in pageHash:
        pid = len(pageHash)
        pageHash[uri] = pid
    else:
        pid = pageHash[uri]
    linkHash = webGraphs[-1]
    if pid not in linkHash:
        linkHash[pid] = []
    else:
        return

    print("processing %s->%s: %s" % (src, pid, uri))

    if src != -1:
        linkHash[src].append(pid)

    #fetch, find links, record, crawl
    try:
        fh = urllib.request.urlopen(uri)
    except:
        print("... BROKEN")
        return


    ar = AggregatedResource(uri)

    ct = fh.headers.get('content-type', '')
    # in Python 3 a missing header returns None instead of raising KeyError,
    # so test the values rather than relying on try/except
    cl = fh.headers.get('content-length')
    if cl:
        ar._dc.extent = Literal(cl)
    lm = fh.headers.get('last-modified')
    if lm:
        ar._dcterms.modified = Literal(lm)

    mt = conneg.parse(ct)
    if mt:
        ct = mt[0].mimetype1 + '/' + mt[0].mimetype2
    ar._dc.format = Literal(ct)
    if ct != 'text/html':
        aggr.add_resource(ar)
        try:
            contentTypes[ct] += 1
        except KeyError:
            contentTypes[ct] = 1
        return

    data = fh.read()
    fh.close()

    # hash page for redirects/duplicates etc
    md5 = hashlib.new('md5')
    md5.update(data)
    hd = md5.hexdigest()
    if hd in md5Hash:
        print("%s == %s" % (pid, md5Hash[hd]))
        return
    else:
        md5Hash[hd] = pid
        # only add it here
        aggr.add_resource(ar)

    try:
        # data is bytes in Python 3, so wrap it in BytesIO rather than StringIO
        dom = etree.parse(io.BytesIO(data), parser)
    except:
        print(" --- failed to parse")
        return

    title = dom.xpath('//title/text()')
    if title:
        ar._dc.title = Literal(title[0])

    links = dom.xpath('//a/@href')
    frames = dom.xpath('//frame/@src')
    links.extend(frames)

    imgs = dom.xpath('//img/@src')
    links.extend(imgs)
    css = dom.xpath('//link/@href')
    links.extend(css)

    for l in links:

        l = l.strip()
        if l.find('#') > -1:
            l = l[:l.find('#')]
        if not l:
            # was just a hash URL
            continue

        if l[0] == "/":
            l = urllib.parse.urljoin(uri, l)
        elif l[:7].lower() != "http://" and l[:8].lower() != "https://":
            # check other protocols
            if nonHttpRe.search(l):
                continue
            # put in current directory
            l = urllib.parse.urljoin(uri, l)

        # check if we really want to crawl...
        if nonHtmlRe.search(l):
            # ignore common stuff
            # print "Skipping: %s" % chk
            pass
        elif l in pageHash:
            # ignore already done
            # print "Skipping: %s" % chk
            pass
        else:
            match = 1
            for t in restrictTemplates:
                if not t.match(l):
                    match = 0
                    break
            if match:
                stack.append((l, pid))
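
The crawl function above leans on module-level state and helpers that the snippet does not define (pageHash, webGraphs, md5Hash, contentTypes, stack, the regexes, parser, and aggr). One plausible setup and driver loop, sketched on the assumption that Aggregation/AggregatedResource come from the foresite library, etree is lxml's, and the seed URI and restriction pattern are placeholders:

# Hypothetical module-level state assumed by crawl(); names mirror the snippet.
import re

pageHash = {}                  # uri -> numeric page id
webGraphs = [{}]               # current link graph: page id -> [linked page ids]
md5Hash = {}                   # content digest -> page id, to collapse duplicates
contentTypes = {}              # mime type -> count of non-HTML resources seen
stack = []                     # (uri, id of the page that linked to it)

nonHttpRe = re.compile(r'^(mailto:|javascript:|ftp:)', re.I)
nonHtmlRe = re.compile(r'\.(pdf|zip|jpe?g|png|gif|mp3)(\?.*)?$', re.I)
restrictTemplates = [re.compile(r'^http://example\.org/')]  # stay on one site

parser = etree.HTMLParser()    # lxml parser handed to etree.parse() above
aggr = Aggregation('http://example.org/crawl#aggregation')  # placeholder URI

stack.append(('http://example.org/', -1))  # -1 marks the seed page
while stack:
    nexturi, src = stack.pop(0)
    crawl(nexturi, src)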
Example #4
def crawl(uri, src):
    if not pageHash.has_key(uri):
        pid = len(pageHash)
        pageHash[uri] = pid
    else:
        pid = pageHash[uri]
    linkHash = webGraphs[-1]
    if not linkHash.has_key(pid):
        linkHash[pid] = []
    else:
        return

    print "processing %s->%s: %s" % (src, pid, uri)

    if src != -1:
        linkHash[src].append(pid)

    #fetch, find links, record, crawl
    try:
        fh = urllib.urlopen(uri)
    except:
        print "... BROKEN"
        return


    ar = AggregatedResource(uri)

    ct = fh.headers['content-type']
    try:
        cl = fh.headers['content-length']
        ar._dc.extent = Literal(cl)
    except:
        pass
    try:
        lm = fh.headers['last-modified']
        ar._dcterms.modified = Literal(lm)
    except:
        pass

    mt = conneg.parse(ct)
    if mt:
        ct = mt[0].mimetype1 + '/' + mt[0].mimetype2
    ar._dc.format = Literal(ct)
    if ct != 'text/html':
        aggr.add_resource(ar)
        try:
            contentTypes[ct] += 1
        except KeyError:
            contentTypes[ct] = 1
        return

    data = fh.read()
    fh.close()

    # hash page for redirects/duplicates etc
    md5 = hashlib.new('md5')
    md5.update(data)
    hd = md5.hexdigest()
    if md5Hash.has_key(hd):
        print "%s == %s" % (pid, md5Hash[hd])
        return
    else:
        md5Hash[hd] = pid
        # only add it here
        aggr.add_resource(ar)

    try:
        dom = etree.parse(StringIO.StringIO(data), parser)
    except:
        print " --- failed to parse"
        return

    title = dom.xpath('//title/text()')
    if title:
        ar._dc.title = Literal(title[0])

    links = dom.xpath('//a/@href')
    frames = dom.xpath('//frame/@src')
    links.extend(frames)

    imgs = dom.xpath('//img/@src')
    links.extend(imgs)
    css = dom.xpath('//link/@href')
    links.extend(css)

    for l in links:

        l = l.strip()
        if l.find('#') > -1:
            l = l[:l.find('#')]
        if not l:
            # was just a hash URL
            continue

        if l[0] == "/":
            l = urlparse.urljoin(uri, l)
        elif l[:7].lower() != "http://" and l[:8].lower() != "https://":
            # check other protocols
            if nonHttpRe.search(l):
                continue
            # put in current directory
            l = urlparse.urljoin(uri, l)

        # check if we really want to crawl...
        if nonHtmlRe.search(l):
            # ignore common stuff
            # print "Skipping: %s" % chk
            pass
        elif pageHash.has_key(l):
            # ignore already done
            # print "Skipping: %s" % chk
            pass
        else:
            match = 1
            for t in restrictTemplates:
                if not t.match(l):
                    match = 0
                    break
            if match:
                stack.append((l, pid))
Example #5
srlzHash['old-atom.xml'].mimeType = "application/atom+xml;version=0.9"
srlzHash['pretty.xml'].mimeType += ";format=pretty"

p = RdfLibParser()
p.strict = True
ap = AtomParser()
ap.strict = True
rdfap = RdfAParser()
rdfap.strict = True

mimeHash = {}
for (k,v) in srlzHash.items():
    mimeHash[v.mimeType] = k
mimestr = ', '.join(mimeHash.keys())
mimeList = conneg.parse(mimestr)

protoUriRe = re.compile("^([s]?http[s]?://|[t]?ftp:/|z39.50r:|gopher:|imap://|news:|nfs:|nntp:|rtsp:)")

class validateHandler:
    def send(self, text, req, code=200, ct="text/xml"):
        req.content_type = ct
        req.content_length = len(text)
        req.send_http_header()
        if isinstance(text, unicode):
            req.write(text.encode('utf-8'))
        else:
            req.write(text)

    def error(self, msg, req):
        text = "<html><body><h3>Error</h3><p>%s</p></body></html>" % msg
Example #6
    def handle(self, req):
        path = req.uri[5:]
        form = FieldStorage(req)

        strict = form.get('strict', True)
        if strict in ['false', 'False', '0', None, '']:
            strict = False

        mt = form.get('mimeType', '')
        mt = mt.replace(' ', '+')

        if not mt:
            xtn = form.get('extension', '')
            if xtn:
                if xtn not in srlzHash:
                    # can't continue
                    raise ValueError(xtn)
                else:
                    mt = srlzHash[xtn].mimeType

        if not mt:
            try:
                wanted = req.headers_in['Accept']
                mts = conneg.parse(wanted)
                mt = conneg.best(mts, mimeList)
            except:
                mt = ''

        if mt:
            xtn = mimeHash[str(mt)]
        else:
            # default to rdf/xml
            xtn = "rdf.xml"

        srlz = srlzHash[xtn]

        if 'aggregation' in form:
            uri = form.get('aggregation')
        else:
            uri = path

        if not uri:
            data = '<html><body>Instructions etc. goes here</body></html>'
            self.send(data, req, ct="text/html")
            return
        elif not protoUriRe.match(uri):
            self.error("Resource Map URI must be a protocol based URI", req)
            return

        try:
            # fetch

            rd = ReMDocument(uri)
        except Exception as e:
            # Python 3 exceptions have no .message attribute;
            # format the exception itself instead
            self.error(
                "Could not retrieve Resource Map from '%s': %s" % (uri, e), req)
            return

        try:
            # parse
            if rd.format == 'atom':
                parser = ap
            elif rd.format == 'rdfa':
                parser = rdfap
            else:
                parser = p
            if not strict:
                parser.strict = False
            try:
                rem = parser.parse(rd)
                parser.strict = True
            except:
                parser.strict = True
                raise

        except OreException as e:
            # get exception message
            self.error("Resource Map Invalid: %s" % e, req)
            return
        except SAXParseException as e:
            self.error(
                "Could not parse XML: %s (line %s, column %s)" %
                (e.getMessage(), e.getLineNumber(), e.getColumnNumber()), req)
            return
        except:
            raise

        try:
            # serialize
            rem2 = rem._aggregation_.register_serialization(
                srlz, 'http://foresite.cheshire3.org/%s#rem' % req.uri)
            rd = rem2.get_serialization()
            data = rd.data
            if srlz == srlzHash['rdfa.html']:
                data = '<xhtml xmlns="http://www.w3.org/1999/xhtml"><body><i>Invisible RDFa resource map follows, it must have validated okay. [view source] :)</i>' + data + "</body></xhtml>"

        except Exception as e:
            self.error(
                "Could not serialize Aggregation to Resource Map: %s" % e, req)
            return

        self.send(data, req, ct=srlz.mimeType)
Example #7
srlzHash['old-atom.xml'].mimeType = "application/atom+xml;version=0.9"
srlzHash['pretty.xml'].mimeType += ";format=pretty"

p = RdfLibParser()
p.strict = True
ap = AtomParser()
ap.strict = True
rdfap = RdfAParser()
rdfap.strict = True

mimeHash = {}
for (k, v) in srlzHash.items():
    mimeHash[v.mimeType] = k
mimestr = ', '.join(mimeHash.keys())
mimeList = conneg.parse(mimestr)

protoUriRe = re.compile(
    "^([s]?http[s]?://|[t]?ftp:/|z39.50r:|gopher:|imap://|news:|nfs:|nntp:|rtsp:)"
)


class validateHandler:
    def send(self, text, req, code=200, ct="text/xml"):
        req.content_type = ct
        req.content_length = len(text)
        req.send_http_header()
        if isinstance(text, str):
            req.write(text.encode('utf-8'))
        else:
            req.write(text)
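
For reference, handle() in Example #6 negotiates among these serializers using the structures built here; a short sketch of that step under an illustrative Accept header:

# Illustrative Accept header; the conneg calls mirror those used in handle().
wanted = 'application/rdf+xml;q=0.9, application/atom+xml;q=0.5'
mts = conneg.parse(wanted)           # parsed client preferences
best = conneg.best(mts, mimeList)    # best match among the types we can serve
xtn = mimeHash[str(best)] if best else 'rdf.xml'   # fall back to rdf/xml
srlz = srlzHash[xtn]                 # serializer that will produce the output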