def fetch(self): """Attempt to fetch the url (if isyanked is not True) and fill in link attributes (based on isinternal).""" # fully ignore links that should not be feteched if self.isyanked: debugio.info(' %s' % self.url) debugio.info(' ' + self.isyanked) return # see if we can import the proper module for this scheme schememodule = schemes.get_schememodule(self.scheme) if schememodule is None: self.isyanked = 'unsupported scheme (' + self.scheme + ')' self._ischanged = True debugio.info(' %s' % self.url) debugio.info(' ' + self.isyanked) return debugio.info(' %s' % self.url) content = schememodule.fetch(self, parsers.get_mimetypes()) self.isfetched = True self._ischanged = True # skip parsing of content if we were returned nothing if content is None: return # find a parser for the content-type parsermodule = parsers.get_parsermodule(self.mimetype) if parsermodule is None: debugio.debug('crawler.Link.fetch(): unsupported content-type: %s' % self.mimetype) return # parse the content parsermodule.parse(content, self)
def check_for_whole_start_tag(self, i):
    """Override to catch assertion exception."""
    try:
        return HTMLParser.HTMLParser.check_for_whole_start_tag(self, i)
    except AssertionError:
        debugio.debug('parsers.html._MyHTMLParser.check_for_whole_start_tag(): caught assertion error')
        return None
def _get_robotparser(self, link):
    """Return the proper robots parser for the given url or None if one
    cannot be constructed. Robot parsers are cached per scheme and
    netloc."""
    # only some schemes have a meaningful robots.txt file
    if link.scheme != 'http' and link.scheme != 'https':
        debugio.debug('crawler._get_robotparser() called with unsupported scheme (%s)' % link.scheme)
        return None
    # split out the key part of the url
    location = urlparse.urlunsplit((link.scheme, link.netloc, '', '', ''))
    # try to create a new robotparser if we don't already have one
    if not self._robotparsers.has_key(location):
        import httplib
        debugio.info(' getting robots.txt for %s' % location)
        self._robotparsers[location] = None
        try:
            rp = robotparser.RobotFileParser()
            rp.set_url(urlparse.urlunsplit((link.scheme, link.netloc, '/robots.txt', '', '')))
            rp.read()
            self._robotparsers[location] = rp
        except (TypeError, IOError, httplib.HTTPException):
            # ignore any problems setting up robot parser
            pass
    return self._robotparsers[location]
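# The cached parser above is only useful if it is consulted before fetching.
# A minimal sketch of such a check follows; the helper name _is_allowed() and
# the 'webcheck' user-agent string are assumptions for illustration and are
# not part of the code above, but RobotFileParser.can_fetch() is the standard
# library call for this.
def _is_allowed(self, link):
    """Hypothetical helper: return True if robots.txt (when available)
    does not disallow fetching the given link."""
    rp = self._get_robotparser(link)
    if rp is None:
        # no robots.txt information available, assume fetching is allowed
        return True
    return rp.can_fetch('webcheck', link.url)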
def crawl(self, serfp=None):
    """Crawl the website based on the urls specified with add_internal().
    If the serialization file pointer is specified the crawler writes out
    updated links to the file while crawling the site."""
    # TODO: have some different scheme to crawl a site (e.g. separate
    # internal and external queues, threading, etc)
    tocheck = []
    # add all unfetched site urls
    for link in self.linkMap.values():
        if not link.isyanked and not link.isfetched:
            tocheck.append(link)
    # add all internal urls
    for url in self._internal_urls:
        tocheck.append(self.get_link(url))
    # repeat until we have nothing more to check
    fetchedlinks = 0
    while len(tocheck) > 0:
        debugio.debug('crawler.crawl(): items left to check: %d' % len(tocheck))
        # choose a link from the tocheck list
        link = tocheck.pop(0)
        # skip link if there is nothing to check
        if link.isyanked or link.isfetched:
            continue
        # fetch the link's contents
        link.fetch()
        # add children to tocheck
        for child in link.children:
            if not child.isyanked and not child.isfetched and not child in tocheck:
                tocheck.append(child)
        # add embedded content
        for embed in link.embedded:
            if not embed.isyanked and not embed.isfetched and not embed in tocheck:
                tocheck.append(embed)
        # serialize all as of yet unserialized links
        fetchedlinks += 1
        # TODO: make this configurable
        if serfp and fetchedlinks >= 5:
            fetchedlinks = 0
            import serialize
            for link in self.linkMap.values():
                if link._ischanged:
                    serialize.serialize_link(serfp, link)
                    link._ischanged = False
            serfp.flush()
        # sleep between requests if configured
        if config.WAIT_BETWEEN_REQUESTS > 0:
            debugio.debug('crawler.crawl(): sleeping %s seconds' % config.WAIT_BETWEEN_REQUESTS)
            time.sleep(config.WAIT_BETWEEN_REQUESTS)
    # serialize remaining changed links
    if serfp:
        import serialize
        for link in self.linkMap.values():
            if link._ischanged:
                serialize.serialize_link(serfp, link)
                link._ischanged = False
        serfp.flush()
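# Hedged usage sketch for crawl() above: how a caller might drive a crawl
# while periodically serializing progress. The Site class name, the
# add_internal() call and the 'webcheck.dat' filename are assumptions for
# illustration; crawl(serfp=...) and postprocess() are defined in the
# surrounding code.
def _example_crawl_with_serialization():
    site = Site()
    site.add_internal('http://www.example.com/')
    serfp = open('webcheck.dat', 'w')
    try:
        # crawl() writes out changed links every few fetches when serfp is given
        site.crawl(serfp=serfp)
    finally:
        serfp.close()
    site.postprocess()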
def error(self, message):
    """Override superclass' error() method to ignore errors."""
    # construct error message
    message += ', ' + self._location()
    # store error message
    debugio.debug('parsers.html._MyHTMLParser.error(): problem parsing html: ' + message)
    if self.errmsg is None:
        self.errmsg = message
    # increment error count
    self.errcount += 1
    if self.errcount > 10:
        raise HTMLParser.HTMLParseError(message, self.getpos())
def _deserialize_site(site, key, value):
    """The data in the key value pair is fed into the site."""
    debugio.debug("%s=%s" % (key, value))
    if key == 'internal_url':
        site.add_internal(_readstring(value, False))
    elif key == 'internal_re':
        site.add_internal_re(_readstring(value))
    elif key == 'external_re':
        site.add_external_re(_readstring(value))
    elif key == 'yanked_re':
        site.add_yanked_re(_readstring(value))
    else:
        raise DeSerializeException('parse error')
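# Hedged sketch of how _deserialize_site() might be driven from a saved file.
# The 'key=value' line format, the comment handling and the helper name
# _example_deserialize() are assumptions for illustration; only
# _deserialize_site() itself is defined above.
def _example_deserialize(site, fp):
    for line in fp:
        line = line.strip()
        if line == '' or line.startswith('#'):
            # skip blank lines and comments (assumed format)
            continue
        # split the line into a key/value pair and feed it to the site
        (key, value) = line.split('=', 1)
        _deserialize_site(site, key.strip(), value.strip())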
def _cwd(ftp, path):
    """Go down the path on the ftp server returning the part that cannot
    be changed into."""
    # split the path into directories
    dirs = path.split('/')
    try:
        # descend down the tree
        while len(dirs) > 0:
            d = dirs[0]
            if d != '':
                debugio.debug('schemes.ftp._cwd(): CWD '+d+': '+ftp.cwd(d))
            dirs.pop(0)
        return None
    except ftplib.error_perm, e:
        debugio.debug('schemes.ftp._cwd(): CWD '+d+': '+str(e))
        return '/'.join(dirs)
def _getconnection(netloc):
    """Return an FTP connection object to the specified server."""
    # NOTE: this method is not thread safe
    if _ftpconnections.has_key(netloc):
        return _ftpconnections[netloc]
    # split url into useful parts
    (userpass, host) = urllib.splituser(netloc)
    if userpass is not None:
        (user, passwd) = urllib.splitpasswd(userpass)
    else:
        (user, passwd) = ('anonymous', '')
    (host, port) = urllib.splitnport(host, ftplib.FTP_PORT)
    # initialize a new connection
    ftp = ftplib.FTP()
    debugio.debug('schemes.ftp._getconnection(): CONNECT: '+ftp.connect(host, port))
    debugio.debug('schemes.ftp._getconnection(): LOGIN: '+ftp.login(user, passwd))
    _ftpconnections[netloc] = ftp
    return ftp
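# The connection cache above is never emptied in the code shown here. A
# hypothetical cleanup helper is sketched below; it is an assumption for
# illustration, not part of the original module, but ftplib.FTP.quit() is
# the standard way to end a session politely.
def _example_close_connections():
    for (netloc, ftp) in _ftpconnections.items():
        try:
            debugio.debug('schemes.ftp: QUIT '+netloc+': '+ftp.quit())
        except ftplib.all_errors:
            # ignore errors while shutting down
            pass
    _ftpconnections.clear()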
def fetch(link, acceptedtypes):
    """Fetch the specified link."""
    # try to fetch the document
    try:
        ftp = _getconnection(link.netloc)
        debugio.debug('schemes.ftp.fetch(): CWD /: '+ftp.cwd('/'))
        # descend down the directory tree as far as we can go
        path = urllib.unquote(link.path)
        path = _cwd(ftp, path)
        # check if we are dealing with an (existing) directory
        if path is None:
            return _fetch_directory(link, ftp, acceptedtypes)
        else:
            return _fetch_file(link, ftp, path, acceptedtypes)
    except ftplib.all_errors, e:
        debugio.debug('schemes.ftp.fetch(): CAUGHT '+str(e))
        link.add_linkproblem(str(e))
        return None
def _fetch_directory(link, path, acceptedtypes):
    """Retrieve some basic information about the directory. This checks
    that the directory has a trailing slash and returns a list of files in
    the directory, unless a configured filename is found in the directory
    (in which case this function acts as if the file was fetched)."""
    # if the name does not end with a slash, redirect
    if path[-1:] != os.path.sep:
        debugio.debug('directory referenced without trailing slash')
        link.redirect(urlparse.urljoin(link.url, link.path+'/'))
        return None
    # check contents of directory for some common files
    for fname in config.FILE_INDEXES:
        if os.path.isfile(os.path.join(path, fname)):
            debugio.debug('pick up %s from directory' % fname)
            # if the directory contains an index file, use that
            return _fetch_file(link, os.path.join(path, fname), acceptedtypes)
    # otherwise add the directory's files as children
    debugio.debug('add files as children of this page')
    try:
        link.ispage = True
        for f in os.listdir(path):
            link.add_child(urlparse.urljoin(link.url, urllib.pathname2url(f)))
    except os.error, e:
        link.add_linkproblem(str(e))
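# _fetch_directory() above consults config.FILE_INDEXES, which is not part of
# this excerpt. The value below is only a plausible example of such a setting
# (an ordered sequence of index filenames to look for), not the project's
# actual default.
FILE_INDEXES = ('index.html', 'index.htm')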
def postprocess(self):
    # build the list of urls that were set up with add_internal() that
    # do not have a parent (they form the base for the site)
    for url in self._internal_urls:
        link = self.linkMap[url].follow_link()
        if link is None:
            debugio.warn('base link %s redirects to nowhere' % url)
            continue
        # add the link to bases
        debugio.debug('crawler.postprocess(): adding %s to bases' % link.url)
        self.bases.append(link)
    # if we got no bases, just use the first internal one
    if len(self.bases) == 0:
        debugio.debug('crawler.postprocess(): fallback to adding %s to bases' % self._internal_urls[0])
        self.bases.append(self.linkMap[self._internal_urls[0]])
    # do a breadth first traversal of the website to determine depth and
    # figure out page children
    tocheck = []
    for link in self.bases:
        link.depth = 0
        tocheck.append(link)
    # repeat until we have nothing more to check
    while len(tocheck) > 0:
        debugio.debug('crawler.postprocess(): items left to examine: %d' % len(tocheck))
        # choose a link from the tocheck list
        link = tocheck.pop(0)
        # figure out page children
        for child in link._pagechildren():
            # skip children already in our list or the wrong depth
            if child in tocheck or child.depth != link.depth+1:
                continue
            tocheck.append(child)
    # set some compatibility properties
    # TODO: figure out a better way to get this to the plugins
    self.base = self.bases[0].url
def _fetch_directory(link, ftp, acceptedtypes):
    """Handle the ftp directory."""
    # check that the url ends with a slash
    if link.path[-1:] != '/':
        debugio.debug('schemes.ftp._fetch_directory(): directory referenced without trailing slash')
        link.redirect(urlparse.urljoin(link.url, link.path+'/'))
        return None
    # retrieve the contents of the directory
    # FIXME: this raises an exception for empty directories, probably replace with own command
    contents = ftp.nlst()
    # check contents of directory for some common files
    for f in config.FTP_INDEXES:
        if f in contents:
            debugio.debug('schemes.ftp._fetch_directory(): pick up %s from directory' % f)
            # if the directory contains an index file, use that
            return _fetch_file(link, ftp, f, acceptedtypes)
    # just add files in directory as children
    debugio.debug('schemes.ftp._fetch_directory(): add files as children of this page')
    link.ispage = True
    debugio.debug('schemes.ftp._fetch_directory(): TYPE A: '+ftp.voidcmd('TYPE A'))
    # FIXME: this raises an exception for empty directories
    for f in contents:
        link.add_child(urlparse.urljoin(link.url, urllib.quote(f)))
    return None
def _fetch_file(link, ftp, path, acceptedtypes):
    """Try to download the file in path that should be in the current
    directory of the ftp instance. The path can also point to a
    non-existent file or directory."""
    # figure out the size of the document
    link.size = ftp.size(path)
    debugio.debug('schemes.ftp.fetch(): size='+str(link.size))
    # guess the mimetype of the document
    if link.mimetype is None:
        link.mimetype = mimetypes.guess_type(path)[0]
    # try to fetch file
    if link.mimetype in acceptedtypes:
        debugio.debug('schemes.ftp.fetch(): TYPE I: '+ftp.voidcmd('TYPE I'))
        (conn, size) = ftp.ntransfercmd('RETR ' + path)
        if size:
            content = conn.makefile().read(size)
        else:
            content = conn.makefile().read()
        debugio.debug('schemes.ftp.fetch(): fetched, size=%d' % len(content))
        return content
    return None
def fetch(link, acceptedtypes):
    """Open connection to url and report information given by GET command."""
    # TODO: HTTP connection pooling?
    # TODO: implement proxy requests for https
    # split netloc in user:pass part and host:port part
    (userpass, netloc) = urllib.splituser(link.netloc)
    proxyuserpass = None
    scheme = link.scheme
    # check which host to connect to (if using proxies)
    if config.PROXIES and config.PROXIES.has_key(link.scheme):
        # pass the complete url in the request, connecting to the proxy
        path = urlparse.urlunsplit((link.scheme, netloc, link.path, link.query, ''))
        (scheme, netloc) = urlparse.urlsplit(config.PROXIES[link.scheme])[0:2]
        (proxyuserpass, netloc) = urllib.splituser(netloc)
    else:
        # otherwise direct connect to the server with partial url
        path = urlparse.urlunsplit(('', '', link.path, link.query, ''))
    # remove trailing : from netloc
    if netloc[-1] == ':':
        netloc = netloc[:-1]
    conn = None
    try:
        try:
            # create the connection
            debugio.debug('schemes.http.fetch: connecting to %s' % netloc)
            if scheme == 'http':
                conn = httplib.HTTPConnection(netloc)
            elif scheme == 'https':
                conn = httplib.HTTPSConnection(netloc)
            # the request adds a correct Host header for us
            conn.putrequest('GET', path)
            if userpass is not None:
                (user, passwd) = urllib.splitpasswd(userpass)
                conn.putheader('Authorization',
                    'Basic '+base64.encodestring(str(user)+':'+str(passwd)).strip())
            if proxyuserpass is not None:
                (user, passwd) = urllib.splitpasswd(proxyuserpass)
                conn.putheader('Proxy-Authorization',
                    'Basic '+base64.encodestring(str(user)+':'+str(passwd)).strip())
            # bypass proxy cache
            if config.BYPASSHTTPCACHE:
                conn.putheader('Cache-control', 'no-cache')
                conn.putheader('Pragma', 'no-cache')
            conn.putheader('User-Agent', 'webcheck %s' % config.VERSION)
            conn.endheaders()
            # wait for the response
            response = conn.getresponse()
            link.status = '%s %s' % (response.status, response.reason)
            debugio.debug('schemes.http.fetch(): HTTP response: %s' % link.status)
            # dump proxy hit/miss debugging info
            if config.PROXIES and config.PROXIES.has_key(link.scheme):
                try:
                    debugio.debug('schemes.http.fetch(): X-Cache: %s' % str(response.getheader('X-Cache')))
                except AttributeError:
                    pass
            # retrieve some information from the headers
            try:
                link.mimetype = response.msg.gettype()
                debugio.debug('schemes.http.fetch(): mimetype: %s' % str(link.mimetype))
            except AttributeError:
                pass
            try:
                link.set_encoding(_charsetpattern.search(response.getheader('Content-type')).group(1))
            except (AttributeError, TypeError):
                pass
            try:
                link.size = int(response.getheader('Content-length'))
                debugio.debug('schemes.http.fetch(): size: %s' % str(link.size))
            except (KeyError, TypeError):
                pass
            try:
                link.mtime = time.mktime(response.msg.getdate('Last-Modified'))
                debugio.debug('schemes.http.fetch(): mtime: %s' % time.strftime('%c', time.localtime(link.mtime)))
            except (OverflowError, TypeError, ValueError):
                pass
            # handle redirects
            # 301=moved permanently, 302=found, 303=see other, 307=temporary redirect
            if response.status in (301, 302, 303, 307):
                # consider a 301 (moved permanently) a problem
                if response.status == 301:
                    link.add_linkproblem(str(response.status)+': '+response.reason)
                # find url that is redirected to
                location = urlparse.urljoin(link.url, response.getheader('Location', ''))
                # create the redirect
                link.redirect(location)
                return None
            elif response.status != 200:
                # handle error responses
                link.add_linkproblem(str(response.status)+': '+response.reason)
                return None
            elif link.mimetype in acceptedtypes:
                # return successful responses
                # TODO: support gzipped content
                # TODO: add checking for size
                return response.read()
        except httplib.HTTPException, e:
            debugio.debug('error reading HTTP response: '+str(e))
            link.add_linkproblem('error reading HTTP response: '+str(e))
            return None
        except (socket.error, socket.sslerror), e:
            if hasattr(e, 'args') and len(e.args) == 2:
                debugio.debug("error reading HTTP response: "+str(e.args[1]))
                link.add_linkproblem("error reading HTTP response: "+str(e.args[1]))
            else:
                debugio.debug("error reading HTTP response: "+str(e))
                link.add_linkproblem("error reading HTTP response: "+str(e))
            return None
        except Exception, e:
            # handle all other exceptions
            debugio.debug('unknown exception caught: '+str(e))
            link.add_linkproblem('error reading HTTP response: '+str(e))
            import traceback
            traceback.print_exc()
            return None
    finally:
        # close the connection before returning
        if conn is not None:
            conn.close()
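# fetch() above references a module-level _charsetpattern that is not defined
# in this excerpt. A minimal sketch of what such a pattern could look like is
# given below; the exact expression is an assumption, the only requirement
# implied by the code above is that group(1) yields the charset token from a
# Content-Type header.
import re
_charsetpattern = re.compile(r'charset=["\']?([^"\'\s;]+)', re.IGNORECASE)
# e.g. _charsetpattern.search('text/html; charset=UTF-8').group(1) == 'UTF-8'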
def _fetch_file(link, path, acceptedtypes):
    """Retrieve some basic information of the specified file and return
    the contents of the file."""
    # get stats of file
    try:
        stats = os.stat(path)
        link.size = stats.st_size
        link.mtime = stats.st_mtime
    except os.error, e:
        link.add_linkproblem(str(e))
        return None
    # guess mimetype
    if link.mimetype is None:
        link.mimetype = mimetypes.guess_type(path)[0]
    debugio.debug('mimetype='+str(link.mimetype))
    debugio.debug('acceptedtypes='+str(acceptedtypes))
    # fetch the document if there is any point
    if link.mimetype in acceptedtypes:
        debugio.debug('FETCH')
        try:
            # TODO: add size checking
            return open(path, 'r').read()
        except IOError, e:
            debugio.debug('PROBLEM: '+str(e))
            link.add_linkproblem(str(e))
            return None

def fetch(link, acceptedtypes):
    """Retrieve some basic information about the file. Store the results
    in the link object."""
def parse(content, link): """Parse the specified content and extract an url list, a list of images a title and an author. The content is assumed to contain HMTL.""" # create parser and feed it the content parser = _MyHTMLParser(link) try: parser.feed(content) parser.close() except Exception, e: # ignore (but log) all errors debugio.debug('parsers.html.parse(): caught exception: ' + str(e)) # check for parser errors if parser.errmsg is not None: debugio.debug('parsers.html.parse(): problem parsing html: ' + parser.errmsg) link.add_pageproblem('problem parsing html: %s' % parser.errmsg) # dump encoding debugio.debug('parsers.html.parse(): html encoding: %s' % str(link.encoding)) # flag that the link contains a valid page link.ispage = True # save the title if parser.title is not None: link.title = _maketxt(parser.title, link.encoding).strip() # save the author if parser.author is not None: link.author = _maketxt(parser.author, link.encoding).strip() # figure out the base of the document (for building the other urls) base = link.url if parser.base is not None:
def crawl(self): """Crawl the website based on the urls specified with add_internal().""" # TODO: have some different scheme to crawl a site (e.g. separate # internal and external queues, threading, etc) tocheck = [] for u in self._internal_urls: tocheck.append(self._get_link(u)) # repeat until we have nothing more to check while len(tocheck) > 0: debugio.debug("crawler.crawl(): items left to check: %d" % len(tocheck)) # choose a link from the tocheck list link=tocheck.pop(0) # skip link it there is nothing to check if link.isyanked or link.isfetched: continue # fetch the link's contents link.fetch() # add children to tocheck for child in link.children: if not child.isyanked and not child.isfetched and not child in tocheck: tocheck.append(child) # add embedded content for embed in link.embedded: if not embed.isyanked and not embed.isfetched and not embed in tocheck: tocheck.append(embed) # sleep between requests if configured if config.WAIT_BETWEEN_REQUESTS > 0: debugio.debug('sleeping %s seconds' % config.WAIT_BETWEEN_REQUESTS) time.sleep(config.WAIT_BETWEEN_REQUESTS) # build the list of urls that were set up with add_internal() that # do not have a parent (they form the base for the site) bases = [ ] for u in self._internal_urls: l = self.linkMap[u].follow_link() if l == None: debugio.warn('base link %s redirects to nowhere' % u) continue # if the link has no parent add it to the result list unless it is the first one if len(l.parents) == 0 or len(bases) == 0: debugio.debug('crawler.crawl(): adding %s to bases' % l.url) bases.append(l) # if we got no bases, just use the first internal one if len(bases) == 0: debugio.debug('crawler.crawl(): fallback to adding %s to bases' % self._internal_urls[0]) bases.append(self.linkMap[self._internal_urls[0]]) # do a breadth first traversal of the website to determin depth and # figure out page children tocheck = [] for link in bases: link.depth = 0 tocheck.append(link) # repeat until we have nothing more to check while len(tocheck) > 0: debugio.debug("crawler.crawl(): items left to examine: %d" % len(tocheck)) # choose a link from the tocheck list link = tocheck.pop(0) # figure out page children for child in link._pagechildren(): # skip children already in our list or the wrong depth if child in tocheck or child.depth != link.depth+1: continue tocheck.append(child) # set some compatibility properties # TODO: figure out a better way to get to this to the plugins self.base = bases[0].url