def loadHansard(hansard=None, url=None, session=None):
    """Return the HansardCache for a Hansard, downloading it if necessary.

    Call with either ``hansard`` or with both ``url`` and ``session``.
    When ``hansard`` is truthy, ``url`` and ``session`` are ignored.

    Given ``hansard``: return its existing HansardCache. If there is none
    and the hansard has a url, retry using that url and the hansard's
    session; if it has no url, return None.

    Given ``url`` and ``session``: normalize the url, return the cache
    already stored for it if there is one, otherwise download the page,
    find or create the matching Hansard (by session and number) and store
    a new HansardCache for it.

    Raises:
        Exception: if neither ``hansard`` nor both ``url`` and ``session``
            are supplied; if a cached Hansard for the url belongs to a
            different session; or if a Hansard with the same session and
            number already exists under a different url.

    Errors raised by the download or by _getHansardNumber are not caught
    here and propagate to the caller.
    """
    # NOTE(review): this file appears to define loadHansard a second time
    # further down; if so, that later definition is the one bound at import
    # time and this one is dead code.
    if hansard:
        try:
            return HansardCache.objects.get(hansard=hansard)
        except HansardCache.DoesNotExist:
            if hansard.url:
                return loadHansard(url=hansard.url, session=hansard.session)
            # Nothing cached and no url to download from.
            return None

    if not (url and session):
        raise Exception("Either url/session or hansard are required")

    normurl = parsetools.normalizeHansardURL(url)
    if normurl != url:
        print("WARNING: Normalized URL %s to %s" % (url, normurl))

    try:
        cached = HansardCache.objects.get(hansard__url=normurl)
    except HansardCache.DoesNotExist:
        pass  # Not cached yet; fall through to the download below.
    else:
        if cached.hansard.session != session:
            raise Exception(
                "Found cached Hansard, but session doesn't match...")
        return cached

    print("Downloading Hansard from %s" % normurl)
    response = urllib2.urlopen(urllib2.Request(normurl))
    try:
        page = response.read()
    finally:
        # Always release the connection, even if the read fails part-way.
        response.close()

    number = _getHansardNumber(page)

    try:
        hansard = Hansard.objects.get(session=session, number=number)
    except Hansard.DoesNotExist:
        hansard = Hansard(session=session, number=number, url=normurl)
        hansard.save()
    else:
        # An existing record must point at the url we just downloaded.
        if hansard.url != normurl:
            raise Exception(
                "Hansard exists, with a different url: %s %s"
                % (normurl, hansard.url))

    cache = HansardCache(hansard=hansard)
    cache.saveHTML(page)
    cache.save()
    return cache
def loadHansard(hansard=None, url=None, session=None):
    """Return the HansardCache for a Hansard, downloading it if necessary.

    Call with either ``hansard`` or with both ``url`` and ``session``.
    When ``hansard`` is truthy, ``url`` and ``session`` are ignored.

    Given ``hansard``: return its existing HansardCache. If there is none
    and the hansard has a url, retry using that url and the hansard's
    session; if it has no url, return None.

    Given ``url`` and ``session``: normalize the url, return the cache
    already stored for it if there is one, otherwise download the page,
    find or create the matching Hansard (by session and number) and store
    a new HansardCache for it.

    Raises:
        Exception: if neither ``hansard`` nor both ``url`` and ``session``
            are supplied; if a cached Hansard for the url belongs to a
            different session; or if a Hansard with the same session and
            number already exists under a different url.

    Errors raised by the download or by _getHansardNumber are not caught
    here and propagate to the caller.
    """
    # NOTE(review): this file appears to contain an earlier definition of
    # loadHansard as well; this later one replaces it at import time.
    if hansard:
        try:
            return HansardCache.objects.get(hansard=hansard)
        except HansardCache.DoesNotExist:
            if hansard.url:
                return loadHansard(url=hansard.url, session=hansard.session)
            # Nothing cached and no url to download from.
            return None

    if not (url and session):
        raise Exception("Either url/session or hansard are required")

    normurl = parsetools.normalizeHansardURL(url)
    if normurl != url:
        print("WARNING: Normalized URL %s to %s" % (url, normurl))

    try:
        cached = HansardCache.objects.get(hansard__url=normurl)
    except HansardCache.DoesNotExist:
        pass  # Not cached yet; fall through to the download below.
    else:
        if cached.hansard.session != session:
            raise Exception(
                "Found cached Hansard, but session doesn't match...")
        return cached

    print("Downloading Hansard from %s" % normurl)
    response = urllib2.urlopen(urllib2.Request(normurl))
    try:
        page = response.read()
    finally:
        # Always release the connection, even if the read fails part-way.
        response.close()

    number = _getHansardNumber(page)

    try:
        hansard = Hansard.objects.get(session=session, number=number)
    except Hansard.DoesNotExist:
        hansard = Hansard(session=session, number=number, url=normurl)
        hansard.save()
    else:
        # An existing record must point at the url we just downloaded.
        if hansard.url != normurl:
            raise Exception(
                "Hansard exists, with a different url: %s %s"
                % (normurl, hansard.url))

    cache = HansardCache(hansard=hansard)
    cache.saveHTML(page)
    cache.save()
    return cache
def normalize_hansard_urls():
    """Rewrite every stored Hansard url to its normalized form.

    Each Hansard whose url differs from what
    parsetools.normalizeHansardURL returns for it is updated and saved;
    records whose url is already normalized are left untouched.
    """
    for record in Hansard.objects.all():
        current = record.url
        clean = parsetools.normalizeHansardURL(current)
        if clean == current:
            continue
        record.url = clean
        record.save()