def fetch(self):
    """Attempt to fetch the url (if isyanked is not True) and fill in link
    attributes (based on isinternal)."""
    # fully ignore links that should not be fetched
    if self.isyanked:
        debugio.info(' %s' % self.url)
        debugio.info(' ' + self.isyanked)
        return
    # see if we can import the proper module for this scheme
    schememodule = schemes.get_schememodule(self.scheme)
    if schememodule is None:
        self.isyanked = 'unsupported scheme (' + self.scheme + ')'
        self._ischanged = True
        debugio.info(' %s' % self.url)
        debugio.info(' ' + self.isyanked)
        return
    debugio.info(' %s' % self.url)
    content = schememodule.fetch(self, parsers.get_mimetypes())
    self.isfetched = True
    self._ischanged = True
    # skip parsing of content if we were returned nothing
    if content is None:
        return
    # find a parser for the content-type
    parsermodule = parsers.get_parsermodule(self.mimetype)
    if parsermodule is None:
        debugio.debug('crawler.Link.fetch(): unsupported content-type: %s'
                      % self.mimetype)
        return
    # parse the content
    parsermodule.parse(content, self)
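# A minimal sketch of the interface fetch() above expects from a scheme
# module, as implied by the call schememodule.fetch(self, mimetypes): the
# module returns the page content (or None on failure) and fills in link
# attributes such as link.mimetype.  Everything below is an illustrative
# assumption (a file-like scheme handled via urllib), not the actual
# webcheck scheme module; acceptedmimetypes would feed an Accept header for
# http and is simply ignored here.
import urllib

def fetch(link, acceptedmimetypes):
    """Fetch the link's url and return its content, or None on failure."""
    try:
        fp = urllib.urlopen(link.url)
        # mimetools.Message.gettype() yields e.g. 'text/html'
        link.mimetype = fp.info().gettype()
        content = fp.read()
        fp.close()
        return content
    except IOError, e:
        # record the problem on the link (Link objects carry a linkproblems list)
        link.linkproblems.append(str(e))
        return None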
def _get_robotparser(self, link):
    """Return the proper robots parser for the given url or None if one
    cannot be constructed. Robot parsers are cached per scheme and netloc."""
    # only some schemes have a meaningful robots.txt file
    if link.scheme != 'http' and link.scheme != 'https':
        debugio.debug('crawler._get_robotparser() called with unsupported scheme (%s)'
                      % link.scheme)
        return None
    # split out the key part of the url
    location = urlparse.urlunsplit((link.scheme, link.netloc, '', '', ''))
    # try to create a new robotparser if we don't already have one
    if not self._robotparsers.has_key(location):
        import httplib
        debugio.info(' getting robots.txt for %s' % location)
        self._robotparsers[location] = None
        try:
            rp = robotparser.RobotFileParser()
            rp.set_url(urlparse.urlunsplit(
                (link.scheme, link.netloc, '/robots.txt', '', '')))
            rp.read()
            self._robotparsers[location] = rp
        except (TypeError, IOError, httplib.HTTPException):
            # ignore any problems setting up robot parser
            pass
    return self._robotparsers[location]
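# A minimal sketch of how the cached parser returned above might be consulted
# before fetching a link; the method name and the 'webcheck' useragent string
# are assumptions for illustration, not part of the original code.
def _is_allowed_by_robots(self, link, useragent='webcheck'):
    """Return whether robots.txt permits fetching the given link."""
    rp = self._get_robotparser(link)
    # no parser could be constructed: err on the side of allowing the fetch
    if rp is None:
        return True
    # RobotFileParser.can_fetch() applies the parsed robots.txt rules
    return rp.can_fetch(useragent, link.url)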
def generate(site):
    """Generate pages for plugins."""
    for p in config.PLUGINS:
        debugio.info(' ' + p)
        # import the plugin
        plugin = __import__('plugins.' + p, globals(), locals(), [p])
        # run the plugin
        plugin.generate(site)
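# A minimal sketch of the interface the loop above expects from each plugin:
# a module under plugins/ named in config.PLUGINS that exposes generate(site).
# The module name and report body below are illustrative assumptions (this
# would live in its own file such as plugins/example.py), not an actual
# webcheck plugin.

"""Hypothetical example plugin."""

import plugins

def generate(site):
    """Write a placeholder report page (ignores site for simplicity)."""
    # plugins.open_file() is the same helper used for webcheck.dat in main()
    fp = plugins.open_file('example.html')
    fp.write('<p>hypothetical example report</p>\n')
    fp.close()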
def deserialize(fp):
    """Read data from the file and construct objects from it. A new site
    instance is returned. After the site has been deserialized the crawl()
    and postprocess() functions should be called to regenerate the other
    link attributes."""
    import crawler
    site = crawler.Site()
    link = None
    while True:
        line = fp.readline()
        # check for end-of-file
        if not line:
            break
        # skip comments
        if _commentpattern.search(line):
            continue
        # skip empty lines
        if line.rstrip() == '':
            continue
        # find section header
        match = _sectionpattern.search(line)
        if match:
            link = site.get_link(match.group(1))
            debugio.info(' %s' % link.url)
            # clear some data that is annoying if we have duplicates
            link.anchors = []
            link.linkproblems = []
            link.pageproblems = []
            continue
        # check for key-value pair
        match = _keyvaluepattern.search(line)
        if match:
            key = match.group(1)
            value = match.group(2)
            if link is None:
                _deserialize_site(site, key, value)
            else:
                _deserialize_link(link, key, value)
            continue
        # fallthrough
        raise DeSerializeException('parse error')
    return site
def main():
    """Main program."""
    site = crawler.Site()
    # parse command-line arguments
    parse_args(site)
    # read serialized file
    if config.CONTINUE:
        fname = os.path.join(config.OUTPUT_DIR, 'webcheck.dat')
        debugio.info('reading stored crawler data....')
        try:
            fp = open(fname, 'r')
            site = serialize.deserialize(fp)
            fp.close()
        except IOError, (errno, strerror):
            debugio.error('%(fname)s: %(strerror)s' %
                          {'fname': fname, 'strerror': strerror})
            sys.exit(1)
        debugio.info('done.')
    # create serialized file
    fp = plugins.open_file('webcheck.dat', makebackup=True)
    serialize.serialize_site(fp, site)
    # crawl through the website
    debugio.info('checking site....')
    site.crawl(fp)  # this will take a while
    debugio.info('done.')
    fp.close()
    # serialize the final state again
    fp = plugins.open_file('webcheck.dat', makebackup=True)
    serialize.serialize_site(fp, site)
    serialize.serialize_links(fp, site)
    fp.close()
    # do postprocessing (building site structure, etc)
    debugio.info('postprocessing....')
    site.postprocess()
    debugio.info('done.')
    # now we can write out the files
    # start with the frame-description page
    debugio.info('generating reports...')