Example #1
 def fetch(self):
     """Attempt to fetch the url (if isyanked is not True) and fill in link
     attributes (based on isinternal)."""
     # fully ignore links that should not be fetched
     if self.isyanked:
         debugio.info('  %s' % self.url)
         debugio.info('    ' + self.isyanked)
         return
     # see if we can import the proper module for this scheme
     schememodule = schemes.get_schememodule(self.scheme)
     if schememodule is None:
         self.isyanked = 'unsupported scheme (' + self.scheme + ')'
         self._ischanged = True
         debugio.info('  %s' % self.url)
         debugio.info('    ' + self.isyanked)
         return
     debugio.info('  %s' % self.url)
     content = schememodule.fetch(self, parsers.get_mimetypes())
     self.isfetched = True
     self._ischanged = True
     # skip parsing of content if we were returned nothing
     if content is None:
         return
     # find a parser for the content-type
     parsermodule = parsers.get_parsermodule(self.mimetype)
     if parsermodule is None:
         debugio.debug('crawler.Link.fetch(): unsupported content-type: %s' % self.mimetype)
         return
     # parse the content
     parsermodule.parse(content, self)
Example #2
 def check_for_whole_start_tag(self, i):
     """Override to catch assertion exception."""
     try:
         return HTMLParser.HTMLParser.check_for_whole_start_tag(self, i)
     except AssertionError:
         debugio.debug('parsers.html._MyHTMLParser.check_for_whole_start_tag(): caught assertion error')
         return None
Example #3
 def _get_robotparser(self, link):
     """Return the proper robots parser for the given url or None if one
     cannot be constructed. Robot parsers are cached per scheme and
     netloc."""
     # only some schemes have a meaningful robots.txt file
     if link.scheme != 'http' and link.scheme != 'https':
         debugio.debug(
             'crawler._get_robotparser() called with unsupported scheme (%s)'
             % link.scheme)
         return None
     # split out the key part of the url
     location = urlparse.urlunsplit((link.scheme, link.netloc, '', '', ''))
     # try to create a new robotparser if we don't already have one
     if not self._robotparsers.has_key(location):
         import httplib
         debugio.info('  getting robots.txt for %s' % location)
         self._robotparsers[location] = None
         try:
             rp = robotparser.RobotFileParser()
             rp.set_url(
                 urlparse.urlunsplit(
                     (link.scheme, link.netloc, '/robots.txt', '', '')))
             rp.read()
             self._robotparsers[location] = rp
         except (TypeError, IOError, httplib.HTTPException):
             # ignore any problems setting up robot parser
             pass
     return self._robotparsers[location]
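
The parser returned by this helper is normally consulted through the standard library's RobotFileParser.can_fetch(). A minimal sketch of such a check, assuming a site object that exposes the _get_robotparser() method above; the AGENT string, helper name and returned reason text are illustrative, not webcheck's actual code:

AGENT = 'webcheck'

def _robots_reason(site, link):
    """Return a reason string if robots.txt disallows fetching the link,
    or None if fetching is allowed (hypothetical helper)."""
    rp = site._get_robotparser(link)
    if rp is not None and not rp.can_fetch(AGENT, link.url):
        return 'disallowed by robots.txt'
    return None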
Example #4
 def _get_robotparser(self, link):
     """Return the proper robots parser for the given url or None if one
     cannot be constructed. Robot parsers are cached per scheme and
     netloc."""
     # only some schemes have a meaningful robots.txt file
     if link.scheme != 'http' and link.scheme != 'https':
         debugio.debug('crawler._get_robotparser() called with unsupported scheme (%s)' % link.scheme)
         return None
     # split out the key part of the url
     location = urlparse.urlunsplit((link.scheme, link.netloc, '', '', ''))
     # try to create a new robotparser if we don't already have one
     if not self._robotparsers.has_key(location):
         import httplib
         debugio.info('  getting robots.txt for %s' % location)
         self._robotparsers[location] = None
         try:
             rp = robotparser.RobotFileParser()
             rp.set_url(urlparse.urlunsplit(
               (link.scheme, link.netloc, '/robots.txt', '', '') ))
             rp.read()
             self._robotparsers[location] = rp
         except (TypeError, IOError, httplib.HTTPException):
             # ignore any problems setting up robot parser
             pass
     return self._robotparsers[location]
Example #5
 def fetch(self):
     """Attempt to fetch the url (if isyanked is not True) and fill in link
     attributes (based on isinternal)."""
     # fully ignore links that should not be fetched
     if self.isyanked:
         debugio.info('  %s' % self.url)
         debugio.info('    ' + self.isyanked)
         return
     # see if we can import the proper module for this scheme
     schememodule = schemes.get_schememodule(self.scheme)
     if schememodule is None:
         self.isyanked = 'unsupported scheme (' + self.scheme + ')'
         self._ischanged = True
         debugio.info('  %s' % self.url)
         debugio.info('    ' + self.isyanked)
         return
     debugio.info('  %s' % self.url)
     content = schememodule.fetch(self, parsers.get_mimetypes())
     self.isfetched = True
     self._ischanged = True
     # skip parsing of content if we were returned nothing
     if content is None:
         return
     # find a parser for the content-type
     parsermodule = parsers.get_parsermodule(self.mimetype)
     if parsermodule is None:
         debugio.debug(
             'crawler.Link.fetch(): unsupported content-type: %s' %
             self.mimetype)
         return
     # parse the content
     parsermodule.parse(content, self)
Example #6
 def crawl(self, serfp=None):
     """Crawl the website based on the urls specified with
     add_internal(). If the serialization file pointer
     is specified the crawler writes out updated links to
     the file while crawling the site."""
     # TODO: have some different scheme to crawl a site (e.g. separate
     #       internal and external queues, threading, etc)
     tocheck = []
     # add all unfetched site urls
     for link in self.linkMap.values():
         if not link.isyanked and not link.isfetched:
             tocheck.append(link)
     # add all internal urls
     for url in self._internal_urls:
         tocheck.append(self.get_link(url))
     # repeat until we have nothing more to check
     fetchedlinks = 0
     while len(tocheck) > 0:
         debugio.debug('crawler.crawl(): items left to check: %d' %
                       len(tocheck))
         # choose a link from the tocheck list
         link = tocheck.pop(0)
         # skip link if there is nothing to check
         if link.isyanked or link.isfetched:
             continue
         # fetch the link's contents
         link.fetch()
         # add children to tocheck
         for child in link.children:
             if not child.isyanked and not child.isfetched and not child in tocheck:
                 tocheck.append(child)
         # add embedded content
         for embed in link.embedded:
             if not embed.isyanked and not embed.isfetched and not embed in tocheck:
                 tocheck.append(embed)
         # serialize all as of yet unserialized links
         fetchedlinks += 1
         # TODO: make this configurable
         if serfp and fetchedlinks >= 5:
             fetchedlinks = 0
             import serialize
             for link in self.linkMap.values():
                 if link._ischanged:
                     serialize.serialize_link(serfp, link)
                     link._ischanged = False
             serfp.flush()
         # sleep between requests if configured
         if config.WAIT_BETWEEN_REQUESTS > 0:
             debugio.debug('crawler.crawl(): sleeping %s seconds' %
                           config.WAIT_BETWEEN_REQUESTS)
             time.sleep(config.WAIT_BETWEEN_REQUESTS)
     # serialize remaining changed links
     if serfp:
         import serialize
         for link in self.linkMap.values():
             if link._ischanged:
                 serialize.serialize_link(serfp, link)
                 link._ischanged = False
         serfp.flush()
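
A minimal sketch of how this crawl() might be driven with a serialization file; the Site class name, the URL and the file name are assumptions (add_internal() is named in the docstring above):

import crawler

site = crawler.Site()
site.add_internal('http://www.example.com/')
serfp = open('webcheck.dat', 'w')
try:
    # updated links are written to serfp periodically while crawling
    site.crawl(serfp=serfp)
finally:
    serfp.close()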
Example #7
 def crawl(self, serfp=None):
     """Crawl the website based on the urls specified with
     add_internal(). If the serialization file pointer
     is specified the crawler writes out updated links to
     the file while crawling the site."""
     # TODO: have some different scheme to crawl a site (e.g. separate
     #       internal and external queues, threading, etc)
     tocheck = []
     # add all unfetched site urls
     for link in self.linkMap.values():
         if not link.isyanked and not link.isfetched:
             tocheck.append(link)
     # add all internal urls
     for url in self._internal_urls:
         tocheck.append(self.get_link(url))
     # repeat until we have nothing more to check
     fetchedlinks = 0
     while len(tocheck) > 0:
         debugio.debug('crawler.crawl(): items left to check: %d' % len(tocheck))
         # choose a link from the tocheck list
         link = tocheck.pop(0)
         # skip link if there is nothing to check
         if link.isyanked or link.isfetched:
             continue
         # fetch the link's contents
         link.fetch()
         # add children to tocheck
         for child in link.children:
             if not child.isyanked and not child.isfetched and not child in tocheck:
                 tocheck.append(child)
         # add embedded content
         for embed in link.embedded:
             if not embed.isyanked and not embed.isfetched and not embed in tocheck:
                 tocheck.append(embed)
         # serialize all as of yet unserialized links
         fetchedlinks += 1
         # TODO: make this configurable
         if serfp and fetchedlinks >= 5:
             fetchedlinks = 0
             import serialize
             for link in self.linkMap.values():
                 if link._ischanged:
                     serialize.serialize_link(serfp, link)
                     link._ischanged = False
             serfp.flush()
         # sleep between requests if configured
         if config.WAIT_BETWEEN_REQUESTS > 0:
             debugio.debug('crawler.crawl(): sleeping %s seconds' % config.WAIT_BETWEEN_REQUESTS)
             time.sleep(config.WAIT_BETWEEN_REQUESTS)
     # serialize remaining changed links
     if serfp:
         import serialize
         for link in self.linkMap.values():
             if link._ischanged:
                 serialize.serialize_link(serfp, link)
                 link._ischanged = False
         serfp.flush()
Example #8
 def check_for_whole_start_tag(self, i):
     """Override to catch assertion exception."""
     try:
         return HTMLParser.HTMLParser.check_for_whole_start_tag(self, i)
     except AssertionError:
         debugio.debug(
             'parsers.html._MyHTMLParser.check_for_whole_start_tag(): caught assertion error'
         )
         return None
Example #9
def parse(content, link):
    """Parse the specified content and extract an url list, a list of images a
    title and an author. The content is assumed to contain HMTL."""
    # create parser and feed it the content
    parser = _MyHTMLParser(link)
    try:
        parser.feed(content)
        parser.close()
    except Exception, e:
        # ignore (but log) all errors
        debugio.debug('parsers.html.parse(): caught exception: '+str(e))
Example #10
def parse(content, link):
    """Parse the specified content and extract an url list, a list of images a
    title and an author. The content is assumed to contain HMTL."""
    # create parser and feed it the content
    parser = _MyHTMLParser(link)
    try:
        parser.feed(content)
        parser.close()
    except Exception, e:
        # ignore (but log) all errors
        debugio.debug('parsers.html.parse(): caught exception: ' + str(e))
Example #11
 def error(self, message):
     """Override superclass' error() method to ignore errors."""
     # construct error message
     message += ', ' + self._location()
     # store error message
     debugio.debug('parsers.html._MyHTMLParser.error(): problem parsing html: '+message)
     if self.errmsg is None:
         self.errmsg = message
     # increment error count
     self.errcount += 1
     if self.errcount > 10:
         raise HTMLParser.HTMLParseError(message, self.getpos())
Example #12
def _deserialize_site(site, key, value):
    """The data in the key value pair is fed into the site."""
    debugio.debug("%s=%s" % (key, value))
    if key == 'internal_url':
        site.add_internal(_readstring(value, False))
    elif key == 'internal_re':
        site.add_internal_re(_readstring(value))
    elif key == 'external_re':
        site.add_external_re(_readstring(value))
    elif key == 'yanked_re':
        site.add_yanked_re(_readstring(value))
    else:
        raise DeSerializeException('parse error')
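
A minimal sketch of a reader loop that could feed key/value pairs into this function; the key=value line format, the comment handling and the helper name are assumptions, not webcheck's actual serialization format:

def _deserialize_site_from(site, fp):
    """Feed key=value lines from fp into the site (hypothetical helper)."""
    for line in fp:
        line = line.strip()
        # skip blank lines and comment lines
        if not line or line.startswith('#'):
            continue
        (key, value) = line.split('=', 1)
        _deserialize_site(site, key, value)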
Example #13
 def error(self, message):
     """Override superclass' error() method to ignore errors."""
     # construct error message
     message += ', ' + self._location()
     # store error message
     debugio.debug(
         'parsers.html._MyHTMLParser.error(): problem parsing html: ' +
         message)
     if self.errmsg is None:
         self.errmsg = message
     # increment error count
     self.errcount += 1
     if self.errcount > 10:
         raise HTMLParser.HTMLParseError(message, self.getpos())
Example #14
def _cwd(ftp, path):
    """Go down the path on the ftp server returning the part that cannot be
    changed into."""
    # split the path into directories
    dirs = path.split('/')
    try:
        # descend down the tree
        while len(dirs) > 0:
            d = dirs[0]
            if d != '':
                debugio.debug('schemes.ftp._cwd(): CWD '+d+': '+ftp.cwd(d))
            dirs.pop(0)
        return None
    except ftplib.error_perm, e:
        debugio.debug('schemes.ftp._cwd(): CWD '+d+': '+str(e))
        return '/'.join(dirs)
Example #15
def _getconnection(netloc):
    """Return a FTP connection object to the specified server."""
    # NOTE: this method is not thread safe
    if _ftpconnections.has_key(netloc):
        return _ftpconnections[netloc]
    # split url into useful parts
    (userpass, host) = urllib.splituser(netloc)
    if userpass is not None:
        (user, passwd) = urllib.splitpasswd(userpass)
    else:
        (user, passwd) = ('anonymous', '')
    (host, port) = urllib.splitnport(host, ftplib.FTP_PORT)
    # initialize a new connection
    ftp = ftplib.FTP()
    debugio.debug('schemes.ftp._getconnection(): CONNECT: '+ftp.connect(host, port))
    debugio.debug('schemes.ftp._getconnection(): LOGIN: '+ftp.login(user, passwd))
    _ftpconnections[netloc] = ftp
    return ftp
Example #16
def fetch(link, acceptedtypes):
    """Fetch the specified link."""
    # try to fetch the document
    try:
        ftp = _getconnection(link.netloc)
        debugio.debug('schemes.ftp.fetch(): CWD /: '+ftp.cwd('/'))
        # descend down the directory tree as far as we can go
        path = urllib.unquote(link.path)
        path = _cwd(ftp, path)
        # check if we are dealing with an (existing) directory
        if path is None:
            return _fetch_directory(link, ftp, acceptedtypes)
        else:
            return _fetch_file(link, ftp, path, acceptedtypes)
    except ftplib.all_errors, e:
        debugio.debug('schemes.ftp.fetch(): CAUGHT '+str(e))
        link.add_linkproblem(str(e))
        return None
Example #17
def _fetch_directory(link, path, acceptedtypes):
    """Retrieve some basic information about the directory.
    This checks that the directory has a trailing slash and
    returns a list of files in the directory, unless a configured
    filename is found in the directory (in which case this function
    acts as if the file was fetched)."""
    # if the name does not end with a slash, redirect
    if path[-1:] != os.path.sep:
        debugio.debug('directory referenced without trailing slash')
        link.redirect(urlparse.urljoin(link.url, link.path+'/'))
        return None
    # check contents of directory for some common files
    for fname in config.FILE_INDEXES:
        if os.path.isfile(os.path.join(path, fname)):
            debugio.debug('pick up %s from directory' % fname)
            # if the directory contains an index.html, use that
            return _fetch_file(link, os.path.join(path, fname), acceptedtypes)
    # otherwise add the directory's files as children
    debugio.debug('add files as children of this page')
    try:
        link.ispage = True
        for f in os.listdir(path):
            link.add_child(urlparse.urljoin(link.url, urllib.pathname2url(f)))
    except os.error, e:
        link.add_linkproblem(str(e))
Example #18
 def postprocess(self):
     # build the list of urls that were set up with add_internal() that
     # do not have a parent (they form the base for the site)
     for url in self._internal_urls:
         link = self.linkMap[url].follow_link()
         if link == None:
             debugio.warn('base link %s redirects to nowhere' % url)
             continue
         # add the link to bases
         debugio.debug('crawler.postprocess(): adding %s to bases' % link.url)
         self.bases.append(link)
     # if we got no bases, just use the first internal one
     if len(self.bases) == 0:
         debugio.debug('crawler.postprocess(): fallback to adding %s to bases' % self._internal_urls[0])
         self.bases.append(self.linkMap[self._internal_urls[0]])
     # do a breadth first traversal of the website to determine depth and
     # figure out page children
     tocheck = []
     for link in self.bases:
         link.depth = 0
         tocheck.append(link)
     # repeat until we have nothing more to check
     while len(tocheck) > 0:
         debugio.debug('crawler.postprocess(): items left to examine: %d' % len(tocheck))
         # choose a link from the tocheck list
         link = tocheck.pop(0)
         # figure out page children
         for child in link._pagechildren():
             # skip children already in our list or the wrong depth
             if child in tocheck or child.depth != link.depth+1:
                 continue
             tocheck.append(child)
     # set some compatibility properties
     # TODO: figure out a better way to get this to the plugins
     self.base = self.bases[0].url
Example #19
def _fetch_directory(link, ftp, acceptedtypes):
    """Handle the ftp directory."""
    # check that the url ends with a slash
    if link.path[-1:] != '/':
        debugio.debug('schemes.ftp._fetch_directory(): directory referenced without trailing slash')
        link.redirect(urlparse.urljoin(link.url, link.path+'/'))
        return None
    # retrieve the contents of the directory
    # FIXME: this raises an exception for empty directories, probably replace with own command
    contents = ftp.nlst()
    # check contents of directory for some common files
    for f in config.FTP_INDEXES:
        if f in contents:
            debugio.debug('schemes.ftp._fetch_directory(): pick up %s from directory' % f)
            # if the directory contains an index.html, use that
            return _fetch_file(link, ftp, f, acceptedtypes)
    # just add files in directory as children
    debugio.debug('schemes.ftp._fetch_directory(): add files as children of this page')
    link.ispage = True
    debugio.debug('schemes.ftp._fetch_directory(): TYPE A: '+ftp.voidcmd('TYPE A'))
    # FIXME: this raises an exception for empty directories
    for f in contents:
        link.add_child(urlparse.urljoin(link.url, urllib.quote(f)))
    return None
Example #20
def _fetch_file(link, ftp, path, acceptedtypes):
    """Try to download the file in path that should be in the current
    directory of the ftp instance. The path can also point to a non-existent
    file or directory."""
    # figure out the size of the document
    link.size = ftp.size(path)
    debugio.debug('schemes.ftp.fetch(): size='+str(link.size))
    # guess the mimetype of the document
    if link.mimetype is None:
        link.mimetype = mimetypes.guess_type(path)[0]
    # try to fetch file
    if link.mimetype in acceptedtypes:
        debugio.debug('schemes.ftp.fetch(): TYPE I: '+ftp.voidcmd('TYPE I'))
        (conn, size) = ftp.ntransfercmd('RETR ' + path)
        if size:
            content = conn.makefile().read(size)
        else:
            content = conn.makefile().read()
        debugio.debug('schemes.ftp.fetch(): fetched, size=%d' % len(content))
        return content
    return None
Example #21
 def postprocess(self):
     # build the list of urls that were set up with add_internal() that
     # do not have a parent (they form the base for the site)
     for url in self._internal_urls:
         link = self.linkMap[url].follow_link()
         if link == None:
             debugio.warn('base link %s redirects to nowhere' % url)
             continue
         # add the link to bases
         debugio.debug('crawler.postprocess(): adding %s to bases' %
                       link.url)
         self.bases.append(link)
     # if we got no bases, just use the first internal one
     if len(self.bases) == 0:
         debugio.debug(
             'crawler.postprocess(): fallback to adding %s to bases' %
             self._internal_urls[0])
         self.bases.append(self.linkMap[self._internal_urls[0]])
     # do a breadth first traversal of the website to determine depth and
     # figure out page children
     tocheck = []
     for link in self.bases:
         link.depth = 0
         tocheck.append(link)
     # repeat until we have nothing more to check
     while len(tocheck) > 0:
         debugio.debug('crawler.postprocess(): items left to examine: %d' %
                       len(tocheck))
         # choose a link from the tocheck list
         link = tocheck.pop(0)
         # figure out page children
         for child in link._pagechildren():
             # skip children already in our list or the wrong depth
             if child in tocheck or child.depth != link.depth + 1:
                 continue
             tocheck.append(child)
     # set some compatibility properties
     # TODO: figure out a better way to get this to the plugins
     self.base = self.bases[0].url
Example #22
def fetch(link, acceptedtypes):
    """Open connection to url and report information given by GET command."""
    # TODO: HTTP connection pooling?
    # TODO: implement proxy requests for https
    # split netloc in user:pass part and host:port part
    (userpass, netloc) = urllib.splituser(link.netloc)
    proxyuserpass = None
    scheme = link.scheme
    # check which host to connect to (if using proxies)
    if config.PROXIES and config.PROXIES.has_key(link.scheme):
        # pass the complete url in the request, connecting to the proxy
        path = urlparse.urlunsplit((link.scheme, netloc, link.path, link.query, ''))
        (scheme, netloc) = urlparse.urlsplit(config.PROXIES[link.scheme])[0:2]
        (proxyuserpass, netloc) = urllib.splituser(netloc)
    else:
        # otherwise direct connect to the server with partial url
        path = urlparse.urlunsplit(('', '', link.path, link.query, ''))
    # remove trailing : from netloc
    if netloc[-1] == ':':
        netloc = netloc[:-1]
    conn = None
    try:
        try:
            # create the connection
            debugio.debug('schemes.http.fetch: connecting to %s' % netloc)
            if scheme == 'http':
                conn = httplib.HTTPConnection(netloc)
            elif scheme == 'https':
                conn = httplib.HTTPSConnection(netloc)
            # putrequest() adds a correct Host header for us
            conn.putrequest('GET', path)
            if userpass is not None:
                (user, passwd) = urllib.splitpasswd(userpass)
                conn.putheader(
                  'Authorization',
                  'Basic '+base64.encodestring(str(user)+':'+str(passwd)).strip() )
            if proxyuserpass is not None:
                (user, passwd) = urllib.splitpasswd(proxyuserpass)
                conn.putheader(
                  'Proxy-Authorization',
                  'Basic '+base64.encodestring(str(user)+':'+str(passwd)).strip() )
            # bypass proxy cache
            if config.BYPASSHTTPCACHE:
                conn.putheader('Cache-control', 'no-cache')
                conn.putheader('Pragma', 'no-cache')
            conn.putheader('User-Agent','webcheck %s' % config.VERSION)
            conn.endheaders()
            # wait for the response
            response = conn.getresponse()
            link.status = '%s %s' % (response.status, response.reason)
            debugio.debug('schemes.http.fetch(): HTTP response: %s' % link.status)
            # dump proxy hit/miss debugging info
            if config.PROXIES and config.PROXIES.has_key(link.scheme):
                try:
                    debugio.debug('schemes.http.fetch(): X-Cache: %s' % str(response.getheader('X-Cache')))
                except AttributeError:
                    pass
            # retrieve some information from the headers
            try:
                link.mimetype = response.msg.gettype()
                debugio.debug('schemes.http.fetch(): mimetype: %s' % str(link.mimetype))
            except AttributeError:
                pass
            try:
                link.set_encoding(_charsetpattern.search(response.getheader('Content-type')).group(1))
            except (AttributeError, TypeError):
                pass
            try:
                link.size = int(response.getheader('Content-length'))
                debugio.debug('schemes.http.fetch(): size: %s' % str(link.size))
            except (KeyError, TypeError):
                pass
            try:
                link.mtime = time.mktime(response.msg.getdate('Last-Modified'))
                debugio.debug('schemes.http.fetch(): mtime: %s' % time.strftime('%c', time.localtime(link.mtime)))
            except (OverflowError, TypeError, ValueError):
                pass
            # handle redirects
            # 301=moved permanently, 302=found, 303=see other, 307=temporary redirect
            if response.status in (301, 302, 303, 307):
                # consider a 301 (moved permanently) a problem
                if response.status == 301:
                    link.add_linkproblem(str(response.status)+': '+response.reason)
                # find url that is redirected to
                location = urlparse.urljoin(link.url, response.getheader('Location', ''))
                # create the redirect
                link.redirect(location)
                return None
            elif response.status != 200:
                # handle error responses
                link.add_linkproblem(str(response.status)+': '+response.reason)
                return None
            elif link.mimetype in acceptedtypes:
                # return successful responses
                # TODO: support gzipped content
                # TODO: add checking for size
                return response.read()
        except httplib.HTTPException, e:
            debugio.debug('error reading HTTP response: '+str(e))
            link.add_linkproblem('error reading HTTP response: '+str(e))
            return None
        except (socket.error, socket.sslerror), e:
            if hasattr(e, 'args') and len(e.args) == 2:
                debugio.debug("error reading HTTP response: "+str(e.args[1]))
                link.add_linkproblem("error reading HTTP response: "+str(e.args[1]))
            else:
                debugio.debug("error reading HTTP response: "+str(e))
                link.add_linkproblem("error reading HTTP response: "+str(e))
            return None
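
The _charsetpattern used above to pull the charset parameter out of the Content-Type header is not shown in this snippet; a regular expression along the following lines would do the job (an illustrative assumption, not necessarily webcheck's actual pattern):

import re

# e.g. 'text/html; charset=UTF-8' -> group(1) == 'UTF-8'
_charsetpattern = re.compile(r'charset=([^\s;]+)', re.IGNORECASE)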
Example #23
def _fetch_file(link, path, acceptedtypes):
    """Retrieve some basic information of the specified file and return
    the contents of the file."""
    # get stats of file
    try:
        stats = os.stat(path)
        link.size = stats.st_size
        link.mtime = stats.st_mtime
    except os.error, e:
        link.add_linkproblem(str(e))
        return None
    # guess mimetype
    if link.mimetype is None:
        link.mimetype = mimetypes.guess_type(path)[0]
    debugio.debug('mimetype='+str(link.mimetype))
    debugio.debug('acceptedtypes='+str(acceptedtypes))
    # fetch the document if there is any point
    if link.mimetype in acceptedtypes:
        debugio.debug('FETCH')
        try:
            # TODO: add size checking
            return open(path, 'r').read()
        except IOError, e:
            debugio.debug('PROBLEM: '+str(e))
            link.add_linkproblem(str(e))
    return None

def fetch(link, acceptedtypes):
    """Retreive some basic information about the file.
    Store the results in the link object."""
Example #24

def parse(content, link):
    """Parse the specified content and extract an url list, a list of images a
    title and an author. The content is assumed to contain HMTL."""
    # create parser and feed it the content
    parser = _MyHTMLParser(link)
    try:
        parser.feed(content)
        parser.close()
    except Exception, e:
        # ignore (but log) all errors
        debugio.debug('parsers.html.parse(): caught exception: ' + str(e))
    # check for parser errors
    if parser.errmsg is not None:
        debugio.debug('parsers.html.parse(): problem parsing html: ' +
                      parser.errmsg)
        link.add_pageproblem('problem parsing html: %s' % parser.errmsg)
    # dump encoding
    debugio.debug('parsers.html.parse(): html encoding: %s' %
                  str(link.encoding))
    # flag that the link contains a valid page
    link.ispage = True
    # save the title
    if parser.title is not None:
        link.title = _maketxt(parser.title, link.encoding).strip()
    # save the author
    if parser.author is not None:
        link.author = _maketxt(parser.author, link.encoding).strip()
    # figure out the base of the document (for building the other urls)
    base = link.url
    if parser.base is not None:
Example #25
def fetch(link, acceptedtypes):
    """Open connection to url and report information given by GET command."""
    # TODO: HTTP connection pooling?
    # TODO: implement proxy requests for https
    # split netloc in user:pass part and host:port part
    (userpass, netloc) = urllib.splituser(link.netloc)
    proxyuserpass = None
    scheme = link.scheme
    # check which host to connect to (if using proxies)
    if config.PROXIES and config.PROXIES.has_key(link.scheme):
        # pass the complete url in the request, connecting to the proxy
        path = urlparse.urlunsplit(
            (link.scheme, netloc, link.path, link.query, ''))
        (scheme, netloc) = urlparse.urlsplit(config.PROXIES[link.scheme])[0:2]
        (proxyuserpass, netloc) = urllib.splituser(netloc)
    else:
        # otherwise direct connect to the server with partial url
        path = urlparse.urlunsplit(('', '', link.path, link.query, ''))
    # remove trailing : from netloc
    if netloc[-1] == ':':
        netloc = netloc[:-1]
    conn = None
    try:
        try:
            # create the connection
            debugio.debug('schemes.http.fetch: connecting to %s' % netloc)
            if scheme == 'http':
                conn = httplib.HTTPConnection(netloc)
            elif scheme == 'https':
                conn = httplib.HTTPSConnection(netloc)
            # putrequest() adds a correct Host header for us
            conn.putrequest('GET', path)
            if userpass is not None:
                (user, passwd) = urllib.splitpasswd(userpass)
                conn.putheader(
                    'Authorization', 'Basic ' +
                    base64.encodestring(str(user) + ':' + str(passwd)).strip())
            if proxyuserpass is not None:
                (user, passwd) = urllib.splitpasswd(proxyuserpass)
                conn.putheader(
                    'Proxy-Authorization', 'Basic ' +
                    base64.encodestring(str(user) + ':' + str(passwd)).strip())
            # bypass proxy cache
            if config.BYPASSHTTPCACHE:
                conn.putheader('Cache-control', 'no-cache')
                conn.putheader('Pragma', 'no-cache')
            conn.putheader('User-Agent', 'webcheck %s' % config.VERSION)
            conn.endheaders()
            # wait for the response
            response = conn.getresponse()
            link.status = '%s %s' % (response.status, response.reason)
            debugio.debug('schemes.http.fetch(): HTTP response: %s' %
                          link.status)
            # dump proxy hit/miss debugging info
            if config.PROXIES and config.PROXIES.has_key(link.scheme):
                try:
                    debugio.debug('schemes.http.fetch(): X-Cache: %s' %
                                  str(response.getheader('X-Cache')))
                except AttributeError:
                    pass
            # retrieve some information from the headers
            try:
                link.mimetype = response.msg.gettype()
                debugio.debug('schemes.http.fetch(): mimetype: %s' %
                              str(link.mimetype))
            except AttributeError:
                pass
            try:
                link.set_encoding(
                    _charsetpattern.search(
                        response.getheader('Content-type')).group(1))
            except (AttributeError, TypeError):
                pass
            try:
                link.size = int(response.getheader('Content-length'))
                debugio.debug('schemes.http.fetch(): size: %s' %
                              str(link.size))
            except (KeyError, TypeError):
                pass
            try:
                link.mtime = time.mktime(response.msg.getdate('Last-Modified'))
                debugio.debug('schemes.http.fetch(): mtime: %s' %
                              time.strftime('%c', time.localtime(link.mtime)))
            except (OverflowError, TypeError, ValueError):
                pass
            # handle redirects
            # 301=moved permanently, 302=found, 303=see other, 307=temporary redirect
            if response.status in (301, 302, 303, 307):
                # consider a 301 (moved permanently) a problem
                if response.status == 301:
                    link.add_linkproblem(
                        str(response.status) + ': ' + response.reason)
                # find url that is redirected to
                location = urlparse.urljoin(link.url,
                                            response.getheader('Location', ''))
                # create the redirect
                link.redirect(location)
                return None
            elif response.status != 200:
                # handle error responses
                link.add_linkproblem(
                    str(response.status) + ': ' + response.reason)
                return None
            elif link.mimetype in acceptedtypes:
                # return successful responses
                # TODO: support gzipped content
                # TODO: add checking for size
                return response.read()
        except httplib.HTTPException, e:
            debugio.debug('error reading HTTP response: ' + str(e))
            link.add_linkproblem('error reading HTTP response: ' + str(e))
            return None
        except (socket.error, socket.sslerror), e:
            if hasattr(e, 'args') and len(e.args) == 2:
                debugio.debug("error reading HTTP response: " + str(e.args[1]))
                link.add_linkproblem("error reading HTTP response: " +
                                     str(e.args[1]))
            else:
                debugio.debug("error reading HTTP response: " + str(e))
                link.add_linkproblem("error reading HTTP response: " + str(e))
            return None
Example #26
                link.add_linkproblem(str(response.status)+': '+response.reason)
                return None
            elif link.mimetype in acceptedtypes:
                # return successful responses
                # TODO: support gzipped content
                # TODO: add checking for size
                return response.read()
        except httplib.HTTPException, e:
            debugio.debug('error reading HTTP response: '+str(e))
            link.add_linkproblem('error reading HTTP response: '+str(e))
            return None
        except (socket.error, socket.sslerror), e:
            if hasattr(e, 'args') and len(e.args) == 2:
                debugio.debug("error reading HTTP response: "+str(e.args[1]))
                link.add_linkproblem("error reading HTTP response: "+str(e.args[1]))
            else:
                debugio.debug("error reading HTTP response: "+str(e))
                link.add_linkproblem("error reading HTTP response: "+str(e))
            return None
        except Exception, e:
            # handle all other exceptions
            debugio.debug('unknown exception caught: '+str(e))
            link.add_linkproblem('error reading HTTP response: '+str(e))
            import traceback
            traceback.print_exc()
            return None
    finally:
        # close the connection before returning
        if conn is not None:
            conn.close()
Example #27
    return htmlunescape(unicode(txt, errors='replace'))

def parse(content, link):
    """Parse the specified content and extract an url list, a list of images a
    title and an author. The content is assumed to contain HMTL."""
    # create parser and feed it the content
    parser = _MyHTMLParser(link)
    try:
        parser.feed(content)
        parser.close()
    except Exception, e:
        # ignore (but log) all errors
        debugio.debug('parsers.html.parse(): caught exception: '+str(e))
    # check for parser errors
    if parser.errmsg is not None:
        debugio.debug('parsers.html.parse(): problem parsing html: '+parser.errmsg)
        link.add_pageproblem('problem parsing html: %s' % parser.errmsg)
    # dump encoding
    debugio.debug('parsers.html.parse(): html encoding: %s' % str(link.encoding))
    # flag that the link contains a valid page
    link.ispage = True
    # save the title
    if parser.title is not None:
        link.title = _maketxt(parser.title, link.encoding).strip()
    # save the author
    if parser.author is not None:
        link.author = _maketxt(parser.author, link.encoding).strip()
    # figure out the base of the document (for building the other urls)
    base = link.url
    if parser.base is not None:
        base = parser.base
Example #28
                return None
            elif link.mimetype in acceptedtypes:
                # return successful responses
                # TODO: support gzipped content
                # TODO: add checking for size
                return response.read()
        except httplib.HTTPException, e:
            debugio.debug('error reading HTTP response: ' + str(e))
            link.add_linkproblem('error reading HTTP response: ' + str(e))
            return None
        except (socket.error, socket.sslerror), e:
            if hasattr(e, 'args') and len(e.args) == 2:
                debugio.debug("error reading HTTP response: " + str(e.args[1]))
                link.add_linkproblem("error reading HTTP response: " +
                                     str(e.args[1]))
            else:
                debugio.debug("error reading HTTP response: " + str(e))
                link.add_linkproblem("error reading HTTP response: " + str(e))
            return None
        except Exception, e:
            # handle all other exceptions
            debugio.debug('unknown exception caught: ' + str(e))
            link.add_linkproblem('error reading HTTP response: ' + str(e))
            import traceback
            traceback.print_exc()
            return None
    finally:
        # close the connection before returning
        if conn is not None:
            conn.close()
Example #29
 def crawl(self):
     """Crawl the website based on the urls specified with
     add_internal()."""
     # TODO: have some different scheme to crawl a site (e.g. separate
     #       internal and external queues, threading, etc)
     tocheck = []
     for u in self._internal_urls:
         tocheck.append(self._get_link(u))
     # repeat until we have nothing more to check
     while len(tocheck) > 0:
         debugio.debug("crawler.crawl(): items left to check: %d" % len(tocheck))
         # choose a link from the tocheck list
         link = tocheck.pop(0)
         # skip link if there is nothing to check
         if link.isyanked or link.isfetched:
             continue
         # fetch the link's contents
         link.fetch()
         # add children to tocheck
         for child in link.children:
             if not child.isyanked and not child.isfetched and not child in tocheck:
                 tocheck.append(child)
         # add embedded content
         for embed in link.embedded:
             if not embed.isyanked and not embed.isfetched and not embed in tocheck:
                 tocheck.append(embed)
         # sleep between requests if configured
         if config.WAIT_BETWEEN_REQUESTS > 0:
             debugio.debug('sleeping %s seconds' % config.WAIT_BETWEEN_REQUESTS)
             time.sleep(config.WAIT_BETWEEN_REQUESTS)
     # build the list of urls that were set up with add_internal() that
     # do not have a parent (they form the base for the site)
     bases = [ ]
     for u in self._internal_urls:
         l = self.linkMap[u].follow_link()
         if l == None:
             debugio.warn('base link %s redirects to nowhere' % u)
             continue
         # if the link has no parent (or it is the first one) add it to the result list
         if len(l.parents) == 0 or len(bases) == 0:
             debugio.debug('crawler.crawl(): adding %s to bases' % l.url)
             bases.append(l)
     # if we got no bases, just use the first internal one
     if len(bases) == 0:
         debugio.debug('crawler.crawl(): fallback to adding %s to bases' % self._internal_urls[0])
         bases.append(self.linkMap[self._internal_urls[0]])
     # do a breadth first traversal of the website to determine depth and
     # figure out page children
     tocheck = []
     for link in bases:
         link.depth = 0
         tocheck.append(link)
     # repeat until we have nothing more to check
     while len(tocheck) > 0:
         debugio.debug("crawler.crawl(): items left to examine: %d" % len(tocheck))
         # choose a link from the tocheck list
         link = tocheck.pop(0)
         # figure out page children
         for child in link._pagechildren():
             # skip children already in our list or the wrong depth
             if child in tocheck or child.depth != link.depth+1:
                 continue
             tocheck.append(child)
     # set some compatibility properties
     # TODO: figure out a better way to get this to the plugins
     self.base = bases[0].url