class MyChecker(Checker):
    """Checker subclass that opens pages through a CachingURLopener.

    On top of the base Checker state it records, per crawl:

    * ``infos``      -- final url -> header object returned by ``f.info()``
    * ``files``      -- final url -> still-open response for non-HTML content
    * ``redirected`` -- requested url -> final url, when they differ
    """

    # store link->[name]; defined on the class, so shared by all instances
    link_names = {}

    def __init__(self, site_url, cache):
        """Store the site root URL and the cache, then initialise all state."""
        self.cache = cache
        self.site_url = site_url
        self.reset()

    def message(self, format, *args):
        """Swallow messages instead of printing them."""
        pass

    def reset(self):
        """Clear the crawl bookkeeping and build a fresh caching opener."""
        self.infos = {}
        self.files = {}
        self.redirected = {}
        self.alias_bases = {}
        self.sortorder = {}
        self.counter = 0
        Checker.reset(self)
        # Assigned after Checker.reset() so the caching opener is the one
        # left in place when reset() returns.
        self.urlopener = CachingURLopener(cache=self.cache,
                                          site_url=self.site_url)

    def resetRun(self):
        """Empty the per-run collections (roots, todo, done, bad)."""
        self.roots = []
        self.todo = {}
        self.done = {}
        self.bad = {}

    def readhtml(self, url_pair):
        """Delegate unchanged to ``Checker.readhtml``."""
        return Checker.readhtml(self, url_pair)

    def openhtml(self, url_pair):
        """Open ``url_pair`` and return ``(file_or_None, final_url)``.

        The file object is returned only when ``checkforhtml`` accepts the
        response.  Otherwise the open response is kept in ``self.files``
        (it is deliberately not closed here) and ``None`` is returned in
        its place.  When the page cannot be opened the requested url is
        returned as the final url.
        """
        oldurl, fragment = url_pair
        f = self.openpage(url_pair)
        if not f:
            return f, oldurl

        url = f.geturl()
        if url != oldurl:
            self.redirected[oldurl] = url
        self.infos[url] = info = f.info()
        if not self.checkforhtml(info, url):
            # Keep the open response around rather than reading/closing it.
            self.files[url] = f
            f = None
        return f, url

    def openpage(self, url_pair):
        """Open the url of ``url_pair`` via the caching opener.

        Returns the opened object, or ``None`` after recording the pair as
        bad when the opener raises ``OSError``/``IOError``.
        """
        url, fragment = url_pair
        old_url = url

        # Map an alias base back onto the real site base.  The rewritten
        # ``url`` is only used in the error report below; the request itself
        # is made with ``old_url``.
        realbase = self.site_url
        if realbase.endswith('/'):
            realbase = realbase[:-1]
        # (Previously ``realbase`` was left undefined when site_url had no
        # trailing slash, raising NameError on the first matching alias.)
        for a in self.alias_bases:
            if a.endswith('/'):
                a = a[:-1]
            if a and url.startswith(a):
                url = realbase + url[len(a):]
                break

        # XXX: Hack to handle "bad urls".
        # Some ISPRA site urls must end with '/'.
        if not old_url.endswith(('/', '.html', '.html1')):
            old_url = "%s/" % old_url
        # XXX: hack for the concorsi section: no trailing slash after '?Id='.
        if '?Id=' in old_url:
            old_url = old_url.rstrip('/')

        try:
            return self.urlopener.open(old_url)
        except (OSError, IOError) as msg:
            msg = self.sanitize(msg)
            self.note(0, "Error %s", msg)
            if self.verbose > 0:
                self.show(" HREF ", url, " from", self.todo[url_pair])
            self.setbad(url_pair, msg)
            return None
class MyChecker(Checker):
    """Checker subclass that opens pages through a CachingURLopener.

    On top of the base Checker state it records, per crawl:

    * ``infos``      -- final url -> header object returned by ``f.info()``
    * ``files``      -- final url -> still-open response for non-HTML content
    * ``redirected`` -- requested url -> final url, when they differ
    """

    # store link->[name]; defined on the class, so shared by all instances
    link_names = {}

    def __init__(self, site_url, cache):
        """Store the site root URL and the cache, then initialise all state."""
        self.cache = cache
        self.site_url = site_url
        self.reset()

    def message(self, format, *args):
        """Swallow messages instead of printing them."""
        pass

    def reset(self):
        """Clear the crawl bookkeeping and build a fresh caching opener."""
        self.infos = {}
        self.files = {}
        self.redirected = {}
        self.alias_bases = {}
        self.sortorder = {}
        self.counter = 0
        Checker.reset(self)
        # Assigned after Checker.reset() so the caching opener is the one
        # left in place when reset() returns.
        self.urlopener = CachingURLopener(cache=self.cache,
                                          site_url=self.site_url)

    def resetRun(self):
        """Empty the per-run collections; ``todo`` keeps insertion order."""
        self.roots = []
        self.todo = OrderedDict()
        self.done = {}
        self.bad = {}

    def readhtml(self, url_pair):
        """Delegate unchanged to ``Checker.readhtml``."""
        return Checker.readhtml(self, url_pair)

    def openhtml(self, url_pair):
        """Open ``url_pair`` and return ``(file_or_None, final_url)``.

        The file object is returned only when ``checkforhtml`` accepts the
        response.  Otherwise the open response is kept in ``self.files``
        (it is deliberately not closed here) and ``None`` is returned in
        its place.  When the page cannot be opened the requested url is
        returned as the final url.
        """
        oldurl, fragment = url_pair
        f = self.openpage(url_pair)
        if not f:
            return f, oldurl

        url = f.geturl()
        if url != oldurl:
            self.redirected[oldurl] = url
        self.infos[url] = info = f.info()
        if not self.checkforhtml(info, url):
            # Keep the open response around rather than reading/closing it.
            self.files[url] = f
            f = None
        return f, url

    def openpage(self, url_pair):
        """Open the url of ``url_pair`` via the caching opener.

        Returns the opened object, or ``None`` after recording the pair as
        bad when the opener raises ``OSError``/``IOError``.
        """
        url, fragment = url_pair
        # NOTE: rewriting alias bases onto site_url is disabled in this
        # class; the url is opened exactly as given.
        try:
            return self.urlopener.open(url)
        except (OSError, IOError) as msg:
            msg = self.sanitize(msg)
            self.note(0, "Error %s", msg)
            if self.verbose > 0:
                self.show(" HREF ", url, " from", self.todo[url_pair])
            self.setbad(url_pair, msg)
            return None
class MyChecker(Checker):
    """Checker subclass that opens pages through a CachingURLopener.

    On top of the base Checker state it records, per crawl:

    * ``infos``      -- final url -> header object returned by ``f.info()``
    * ``files``      -- final url -> still-open response for non-HTML content
    * ``redirected`` -- requested url -> final url, when they differ
    """

    # store link->[name]; defined on the class, so shared by all instances
    link_names = {}

    def __init__(self, site_url, cache):
        """Store the site root URL and the cache, then initialise all state."""
        self.cache = cache
        self.site_url = site_url
        self.reset()

    def message(self, format, *args):
        """Swallow messages instead of printing them."""
        pass

    def reset(self):
        """Clear the crawl bookkeeping and build a fresh caching opener."""
        self.infos = {}
        self.files = {}
        self.redirected = {}
        self.alias_bases = {}
        self.sortorder = {}
        self.counter = 0
        Checker.reset(self)
        # Assigned after Checker.reset() so the caching opener is the one
        # left in place when reset() returns.
        self.urlopener = CachingURLopener(cache=self.cache,
                                          site_url=self.site_url)

    def resetRun(self):
        """Empty the per-run collections; ``todo`` keeps insertion order."""
        self.roots = []
        self.todo = OrderedDict()
        self.done = {}
        self.bad = {}

    def readhtml(self, url_pair):
        """Delegate unchanged to ``Checker.readhtml``."""
        return Checker.readhtml(self, url_pair)

    def openhtml(self, url_pair):
        """Open ``url_pair`` and return ``(file_or_None, final_url)``.

        The file object is returned only when ``checkforhtml`` accepts the
        response.  Otherwise the open response is kept in ``self.files``
        (it is deliberately not closed here) and ``None`` is returned in
        its place.  When the page cannot be opened the requested url is
        returned as the final url.
        """
        oldurl, fragment = url_pair
        f = self.openpage(url_pair)
        if not f:
            return f, oldurl

        url = f.geturl()
        if url != oldurl:
            self.redirected[oldurl] = url
        self.infos[url] = info = f.info()
        if not self.checkforhtml(info, url):
            # Keep the open response around rather than reading/closing it.
            self.files[url] = f
            f = None
        return f, url

    def openpage(self, url_pair):
        """Open the url of ``url_pair`` via the caching opener.

        Returns the opened object, or ``None`` after recording the pair as
        bad when the opener raises ``OSError``/``IOError``.
        """
        url, fragment = url_pair
        # NOTE: rewriting alias bases onto site_url is disabled in this
        # class; the url is opened exactly as given.
        try:
            return self.urlopener.open(url)
        except (OSError, IOError) as msg:
            msg = self.sanitize(msg)
            self.note(0, "Error %s", msg)
            if self.verbose > 0:
                self.show(" HREF ", url, " from", self.todo[url_pair])
            self.setbad(url_pair, msg)
            return None
class MyChecker(Checker):
    """Checker subclass that opens pages through a CachingURLopener.

    On top of the base Checker state it records, per crawl:

    * ``infos``      -- final url -> header object returned by ``f.info()``
    * ``files``      -- final url -> still-open response for non-HTML content
    * ``redirected`` -- requested url -> final url, when they differ
    """

    # store link->[name]; defined on the class, so shared by all instances
    link_names = {}

    def __init__(self, site_url, cache):
        """Store the site root URL and the cache, then initialise all state."""
        self.cache = cache
        self.site_url = site_url
        self.reset()

    def message(self, format, *args):
        """Swallow messages instead of printing them."""
        pass

    def reset(self):
        """Clear the crawl bookkeeping and build a fresh caching opener."""
        self.infos = {}
        self.files = {}
        self.redirected = {}
        self.alias_bases = {}
        self.sortorder = {}
        self.counter = 0
        Checker.reset(self)
        # Assigned after Checker.reset() so the caching opener is the one
        # left in place when reset() returns.
        self.urlopener = CachingURLopener(cache=self.cache,
                                          site_url=self.site_url)

    def resetRun(self):
        """Empty the per-run collections (roots, todo, done, bad)."""
        self.roots = []
        self.todo = {}
        self.done = {}
        self.bad = {}

    def readhtml(self, url_pair):
        """Delegate unchanged to ``Checker.readhtml``."""
        return Checker.readhtml(self, url_pair)

    def openhtml(self, url_pair):
        """Open ``url_pair`` and return ``(file_or_None, final_url)``.

        The file object is returned only when ``checkforhtml`` accepts the
        response.  Otherwise the open response is kept in ``self.files``
        (it is deliberately not closed here) and ``None`` is returned in
        its place.  When the page cannot be opened the requested url is
        returned as the final url.
        """
        oldurl, fragment = url_pair
        f = self.openpage(url_pair)
        if not f:
            return f, oldurl

        url = f.geturl()
        if url != oldurl:
            self.redirected[oldurl] = url
        self.infos[url] = info = f.info()
        if not self.checkforhtml(info, url):
            # Keep the open response around rather than reading/closing it.
            self.files[url] = f
            f = None
        return f, url

    def openpage(self, url_pair):
        """Open the url of ``url_pair`` via the caching opener.

        Returns the opened object, or ``None`` after recording the pair as
        bad when the opener raises ``OSError``/``IOError``.
        """
        url, fragment = url_pair
        old_url = url

        # Map an alias base back onto the real site base.  The rewritten
        # ``url`` is only used in the error report below; the request itself
        # is made with ``old_url``.
        realbase = self.site_url
        if realbase.endswith('/'):
            realbase = realbase[:-1]
        # (Previously ``realbase`` was left undefined when site_url had no
        # trailing slash, raising NameError on the first matching alias.)
        for a in self.alias_bases:
            if a.endswith('/'):
                a = a[:-1]
            if a and url.startswith(a):
                url = realbase + url[len(a):]
                break

        # XXX: Hack to handle "bad urls".
        # Some ISPRA site urls must end with '/'.
        if not old_url.endswith(('/', '.html', '.html1')):
            old_url = "%s/" % old_url
        # XXX: hack for the concorsi section: no trailing slash after '?Id='.
        if '?Id=' in old_url:
            old_url = old_url.rstrip('/')

        try:
            return self.urlopener.open(old_url)
        except (OSError, IOError) as msg:
            msg = self.sanitize(msg)
            self.note(0, "Error %s", msg)
            if self.verbose > 0:
                self.show(" HREF ", url, " from", self.todo[url_pair])
            self.setbad(url_pair, msg)
            return None