Example #1
File: crawl.py  Project: AwelEshetu/cwm
def doCommand():
    """Command line RDF/N3 crawler
        
 crawl <uriref>

options:
 
See http://www.w3.org/2000/10/swap/doc/cwm  for more documentation.
"""
    global agenda
    global already
    uriref = sys.argv[1]
    uri = join(base(), uriref)  # resolve the argument against the current base URI
    r = symbol(uri)
    diag.setVerbosity(0)
    print "@prefix : <http://www.w3.org/2000/10/swap/util/semweb#>."
    print "# Generated by crawl.py ", cvsRevision[1:-1]
    agenda = [r]
    while agenda != []:  # worklist loop: crawl() may append new symbols to agenda
        r = agenda[0]
        agenda = agenda[1:]
        already.append(r)
        crawl(r)
    print "# ", len(already), "attempts,", successes, "successes."
Example #2
    def crawlFrom(self, addr, prefix, max):
        fmla = self._fmla

        iter = 1
        queue = [addr]
        seen = []
        while queue:
            head = queue.pop()  # pop() takes the last item, so the walk is depth-first

            progress("crawling at: ", head, " iter ", iter, " of ", max)
            iter = iter + 1
            if iter > max:
                progress("max limit reached.")
                break

            seen.append(head)

            try:
                rep = urllib2.urlopen(head)
                content = rep.read()
            except IOError:
                progress("can't GET", head)
                continue
                #@@ makeStatement(head type NoGood)

            # try to find a short label for
            # a diagram or some such.
            # try the last path segment,
            # or the 2nd last in case of an empty last segment...
            slash = head[:-1].rfind('/')
            label = head[slash + 1:]

            ct = rep.info().getheader('content-type') or ''  # header may be absent
            progress("... got content of type ", ct)
            isHTML = ct.find('text/html') == 0

            fmla.add(symbol(head), symbol(DC('type')), literal(ct))

            # note that we're not peeking into the URI
            # to find out if it's HTML; we're just
            # eliding the extension in the case we
            # know (from the HTTP headers) that it's HTML.
            if isHTML and label[-5:] == '.html':
                label = label[:-5]

            fmla.add(symbol(head), symbol(RDFS('label')), literal(label))

            if not isHTML: continue

            progress("... parsing text/html content")
            doc = libxml2.htmlParseDoc(content, 'us-ascii')
            try:
                titles = doc.xpathNewContext().xpathEval('//title')
                title = titles[0].getContent()
            except:  #@@figure out the right exceptions
                pass
            else:
                progress("... found title:", title)
                fmla.add(symbol(head), symbol(DC('title')),
                         literal(str(title)))

            hrefs = doc.xpathNewContext().xpathEval('//a/@href')
            progress("... found ", len(hrefs), " links")

            for h in hrefs:
                h = h.getContent()
                progress("... found href", h)
                i = uripath.join(head, h)
                i = uripath.splitFrag(i)[0]
                progress("... found link", head, ' -> ', i)
                fmla.add(symbol(head), symbol(DC('relation')), symbol(i))
                if i[:len(prefix)] == prefix and i not in seen and i not in queue:
                    queue.append(i)  # stay under prefix; don't enqueue duplicates
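
The label heuristic above takes the last path segment of the URI (rfind runs on head[:-1], so a trailing slash does not hide the segment) and strips a trailing ".html" only when the HTTP headers already said the body is HTML. A small standalone sketch of just that step; short_label is an illustrative name, and the body mirrors the code above exactly, including keeping the trailing slash of directory-style URIs:

    def short_label(uri, is_html):
        # rfind on uri[:-1] ignores a trailing '/', so the separator found
        # is the one before the last segment; the slice keeps any trailing '/'.
        slash = uri[:-1].rfind('/')
        label = uri[slash + 1:]
        if is_html and label[-5:] == '.html':
            label = label[:-5]
        return label

    assert short_label('http://example.org/docs/a.html', True) == 'a'
    assert short_label('http://example.org/docs/a.html', False) == 'a.html'
    assert short_label('http://example.org/docs/', True) == 'docs/'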