Example #1
def postprocess(self):
    # build the list of urls that were set up with add_internal() that
    # do not have a parent (they form the base for the site)
    for url in self._internal_urls:
        link = self.linkMap[url].follow_link()
        if link is None:
            debugio.warn('base link %s redirects to nowhere' % url)
            continue
        # add the link to bases
        debugio.debug('crawler.postprocess(): adding %s to bases' % link.url)
        self.bases.append(link)
    # if we got no bases, just use the first internal one
    if len(self.bases) == 0:
        debugio.debug('crawler.postprocess(): fallback to adding %s to bases' % self._internal_urls[0])
        self.bases.append(self.linkMap[self._internal_urls[0]])
    # do a breadth first traversal of the website to determine depth and
    # figure out page children
    tocheck = []
    for link in self.bases:
        link.depth = 0
        tocheck.append(link)
    # repeat until we have nothing more to check
    while len(tocheck) > 0:
        debugio.debug('crawler.postprocess(): items left to examine: %d' % len(tocheck))
        # choose a link from the tocheck list
        link = tocheck.pop(0)
        # figure out page children
        for child in link._pagechildren():
            # skip children that are already in our list or at the wrong depth
            if child in tocheck or child.depth != link.depth + 1:
                continue
            tocheck.append(child)
    # set some compatibility properties
    # TODO: figure out a better way to get this to the plugins
    self.base = self.bases[0].url
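The method above leans on a plain breadth-first traversal to give every page the depth of the shallowest base it can be reached from. A minimal, self-contained sketch of that traversal, with a hypothetical Node class standing in for the crawler's Link objects and their _pagechildren() relation:

from collections import deque

class Node:
    def __init__(self, name, children=()):
        self.name = name
        self.children = list(children)
        self.depth = None

def assign_depths(bases):
    """Breadth-first walk that gives each node the depth of the
    shallowest base it is reachable from."""
    queue = deque()
    for node in bases:
        node.depth = 0
        queue.append(node)
    while queue:
        node = queue.popleft()
        for child in node.children:
            if child.depth is None:  # not reached yet
                child.depth = node.depth + 1
                queue.append(child)

leaf = Node('leaf')
root = Node('root', [Node('page', [leaf]), leaf])
assign_depths([root])
print(leaf.depth)  # 1: the leaf is linked directly from the root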
Example #2
def _maketxt(txt, encoding):
    """Return an unicode text of the specified string do correct character
    conversions and replacing html entities with normal characters."""
    # try to decode with the given encoding
    if encoding:
        try:
            return htmlunescape(unicode(txt, encoding, 'replace'))
        except (LookupError, TypeError, ValueError), e:
            debugio.warn('page has unknown encoding: %s' % str(encoding))
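For reference, the same decode-and-unescape pattern in modern Python (an illustrative sketch only: Python 3 bytes.decode and html.unescape stand in for the Python 2 unicode() call and the project's htmlunescape helper, and the latin-1 fallback is this sketch's own choice, not necessarily what the project does):

import html

def make_text(raw, encoding=None):
    """Decode raw page bytes with the given encoding, replacing undecodable
    bytes, and translate HTML entities back to normal characters."""
    if encoding:
        try:
            return html.unescape(raw.decode(encoding, 'replace'))
        except LookupError:
            # unknown encoding name; fall through to the latin-1 fallback
            pass
    # iso-8859-1 maps every byte to a character, so this never fails
    return html.unescape(raw.decode('iso-8859-1'))

print(make_text(b'caf\xe9 &amp; more', 'latin-1'))  # café & more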
Example #3
def install_file(source, text=False):
    """Install the given file in the output directory.
    If the text flag is set to true it is assumed the file is text,
    translating line endings."""
    import shutil
    import urlparse
    # figure out mode to open the file with
    mode = 'r'
    if text:
        mode += 'U'
    # check with what kind of argument we are called
    scheme = urlparse.urlsplit(source)[0]
    if scheme == 'file':
        # this is a file:/// url, translate to normal path and open
        import urllib
        source = urllib.url2pathname(urlparse.urlsplit(source)[2])
    elif scheme == '' and os.path.isabs(source):
        # this is an absolute path, just open it as is
        pass
    elif scheme == '':
        # this is a relative path, try to fetch it from the python path
        for directory in sys.path:
            tst = os.path.join(directory, source)
            if os.path.isfile(tst):
                source = tst
                break
    # TODO: support more schemes here
    # figure out the destination name
    target = os.path.join(config.OUTPUT_DIR, os.path.basename(source))
    # test if source and target are the same
    source = os.path.realpath(source)
    if source == os.path.realpath(target):
        debugio.warn('attempt to overwrite %(fname)s with itself' %
                     {'fname': source})
        return
    # open the input file
    sfp = None
    try:
        sfp = open(source, mode)
    except IOError, (errno, strerror):
        debugio.error('%(fname)s: %(strerror)s' % {
            'fname': source,
            'strerror': strerror
        })
        sys.exit(1)
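The scheme handling above is the interesting part: a file:/// URL, an absolute path and a name relative to the Python path all end up as a local filesystem path. A sketch of just that resolution step in modern Python (assumption: Python 3, where the Python 2 urlparse and urllib modules become urllib.parse and urllib.request):

import os
import sys
import urllib.parse
import urllib.request

def resolve_source(source):
    """Map a file:// URL, an absolute path or a Python-path-relative name
    to a local filesystem path."""
    parts = urllib.parse.urlsplit(source)
    if parts.scheme == 'file':
        # translate the file:/// url to a normal path
        return urllib.request.url2pathname(parts.path)
    if parts.scheme == '' and not os.path.isabs(source):
        # relative name: look it up along the python path
        for directory in sys.path:
            candidate = os.path.join(directory, source)
            if os.path.isfile(candidate):
                return candidate
    # absolute paths (and anything unresolved) are returned as-is
    return source

print(resolve_source('file:///tmp/example.txt'))  # /tmp/example.txt (on POSIX)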
Example #4
File: crawler.py  Project: hryshtk/python
def crawl(self):
    """Crawl the website based on the urls specified with
    add_internal()."""
    # TODO: have some different scheme to crawl a site (e.g. separate
    #       internal and external queues, threading, etc)
    tocheck = []
    for u in self._internal_urls:
        tocheck.append(self._get_link(u))
    # repeat until we have nothing more to check
    while len(tocheck) > 0:
        debugio.debug('crawler.crawl(): items left to check: %d' % len(tocheck))
        # choose a link from the tocheck list
        link = tocheck.pop(0)
        # skip the link if there is nothing to check
        if link.isyanked or link.isfetched:
            continue
        # fetch the link's contents
        link.fetch()
        # add children to tocheck
        for child in link.children:
            if not child.isyanked and not child.isfetched and child not in tocheck:
                tocheck.append(child)
        # add embedded content
        for embed in link.embedded:
            if not embed.isyanked and not embed.isfetched and embed not in tocheck:
                tocheck.append(embed)
        # sleep between requests if configured
        if config.WAIT_BETWEEN_REQUESTS > 0:
            debugio.debug('sleeping %s seconds' % config.WAIT_BETWEEN_REQUESTS)
            time.sleep(config.WAIT_BETWEEN_REQUESTS)
    # build the list of urls that were set up with add_internal() that
    # do not have a parent (they form the base for the site)
    bases = []
    for u in self._internal_urls:
        l = self.linkMap[u].follow_link()
        if l is None:
            debugio.warn('base link %s redirects to nowhere' % u)
            continue
        # add the link if it has no parent (but always add the first one found)
        if len(l.parents) == 0 or len(bases) == 0:
            debugio.debug('crawler.crawl(): adding %s to bases' % l.url)
            bases.append(l)
    # if we got no bases, just use the first internal one
    if len(bases) == 0:
        debugio.debug('crawler.crawl(): fallback to adding %s to bases' % self._internal_urls[0])
        bases.append(self.linkMap[self._internal_urls[0]])
    # do a breadth first traversal of the website to determine depth and
    # figure out page children
    tocheck = []
    for link in bases:
        link.depth = 0
        tocheck.append(link)
    # repeat until we have nothing more to check
    while len(tocheck) > 0:
        debugio.debug('crawler.crawl(): items left to examine: %d' % len(tocheck))
        # choose a link from the tocheck list
        link = tocheck.pop(0)
        # figure out page children
        for child in link._pagechildren():
            # skip children that are already in our list or at the wrong depth
            if child in tocheck or child.depth != link.depth + 1:
                continue
            tocheck.append(child)
    # set some compatibility properties
    # TODO: figure out a better way to get this to the plugins
    self.base = bases[0].url
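At its core, the first loop above is a breadth-first fetch queue that visits each link at most once. A stripped-down sketch of that queue (a hypothetical fetch_children(url) callable stands in for link.fetch() and the children bookkeeping; the yank rules and embedded-content handling are left out):

import time

def crawl(start_urls, fetch_children, wait=0):
    """Fetch pages breadth-first, visiting each url at most once."""
    tocheck = list(start_urls)
    fetched = set()
    while tocheck:
        url = tocheck.pop(0)
        if url in fetched:
            continue
        fetched.add(url)
        # enqueue newly discovered links that are not already queued or fetched
        for child in fetch_children(url):
            if child not in fetched and child not in tocheck:
                tocheck.append(child)
        # optionally sleep between requests
        if wait > 0:
            time.sleep(wait)
    return fetched

site = {'/': ['/a', '/b'], '/a': ['/b'], '/b': ['/']}
print(sorted(crawl(['/'], site.__getitem__)))  # ['/', '/a', '/b']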