def getURLString(self, surt=False, public_suffix=False):
    """Assemble the URL components held on this object into one string.

    If ``self.opaque`` is set it is returned verbatim and nothing else is
    consulted. Otherwise the result is built, in order, from the scheme,
    optional ``user[:password]@`` credentials, host, optional port, path,
    query, fragment (``self.hash``) and ``self.last_delimiter``.

    Parameters:
        surt: when true, a "(" is inserted right after the scheme
            separator, the host is passed through ``hostToSURT`` and a
            ")" is appended after the host/port.
        public_suffix: when true, the value of ``self.getPublicSuffix()``
            is used in place of ``self.host``.

    Returns:
        The assembled URL string.
    """
    if self.opaque is not None:
        return self.opaque

    if self.scheme == 'dns':
        # The Java version adds "://" regardless of scheme.
        s = self.scheme + ':'
    else:
        # The Java version uses the opaque type for dns urls, but this
        # version supports dns urls.
        s = self.scheme + '://'

    if surt:
        s += "("

    if self.authUser:
        s += self.authUser
        if self.authPass:
            # The user name and password must be separated by ':';
            # without it "user" + "pass" collapses into "userpass@".
            s += ':' + self.authPass
        s += '@'

    hostSrc = self.host
    if public_suffix:
        hostSrc = self.getPublicSuffix()
    if surt:
        hostSrc = hostToSURT(hostSrc)
    s += hostSrc

    if self.port != self.DEFAULT_PORT:
        s += ":%d" % self.port

    if surt:
        s += ')'

    if self.path:
        s += self.path
    elif self.query is not None or self.hash is not None:
        # A query or fragment must be preceded by a '/' path.
        s += '/'

    if self.query is not None:
        s += '?' + self.query
    if self.hash is not None:
        s += '#' + self.hash

    if self.last_delimiter is not None:
        s += self.last_delimiter

    return s
def surt(url):
    """Return the SURT-form key for ``url``.

    A falsy ``url`` yields ``'-'``. URLs starting with ``filedesc``,
    ``warcinfo`` or ``whois://`` are returned unchanged. For ``dns:`` URLs
    the text after the prefix is passed through ``hostToSURT`` and a
    closing ``)`` is appended. Anything else is parsed with
    ``handyurl.parse``, canonicalized, rendered with
    ``getURLString(surt=True)``, and everything after the first ``(`` is
    returned. If the rendered string has no ``(`` the input is returned
    as-is.

    These doctests are from WaybackURLKeyMakerTest.java

    >>> surt(None)
    '-'

    >>> surt('')
    '-'

    >>> surt("filedesc:foo.arc.gz")
    'filedesc:foo.arc.gz'

    >>> surt("filedesc:/foo.arc.gz")
    'filedesc:/foo.arc.gz'

    >>> surt("filedesc://foo.arc.gz")
    'filedesc://foo.arc.gz'

    >>> surt("warcinfo:foo.warc.gz")
    'warcinfo:foo.warc.gz'

    >>> surt("dns:alexa.com")
    'com,alexa)'

    >>> surt("dns:archive.org")
    'org,archive)'

    >>> surt("http://www.archive.org/")
    'org,archive)/'

    >>> surt("http://archive.org/")
    'org,archive)/'

    >>> surt("http://archive.org/goo/")
    'org,archive)/goo'

    >>> surt("http://archive.org/goo/?")
    'org,archive)/goo'

    >>> surt("http://archive.org/goo/?b&a")
    'org,archive)/goo?a&b'

    >>> surt("http://archive.org/goo/?a=2&b&a=1")
    'org,archive)/goo?a=1&a=2&b'

    PHP session id:
    >>> surt("http://archive.org/index.php?PHPSESSID=0123456789abcdefghijklemopqrstuv&action=profile;u=4221")
    'org,archive)/index.php?action=profile;u=4221'

    WHOIS url:
    >>> surt("whois://whois.isoc.org.il/shaveh.co.il")
    'whois://whois.isoc.org.il/shaveh.co.il'

    Yahoo web bug. See https://github.com/internetarchive/surt/issues/1
    >>> surt('http://visit.webhosting.yahoo.com/visit.gif?&r=http%3A//web.archive.org/web/20090517140029/http%3A//anthonystewarthead.electric-chi.com/&b=Netscape%205.0%20%28Windows%3B%20en-US%29&s=1366x768&o=Win32&c=24&j=true&v=1.2')
    'com,yahoo,webhosting,visit)/visit.gif?&b=netscape%205.0%20(windows;%20en-us)&c=24&j=true&o=win32&r=http://web.archive.org/web/20090517140029/http://anthonystewarthead.electric-chi.com/&s=1366x768&v=1.2'
    """
    if not url:
        return "-"

    # These prefixes are handed back untouched.
    if url.startswith(("filedesc", "warcinfo", "whois://")):
        return url

    if url.startswith("dns:"):
        return hostToSURT(url[4:]) + ')'

    canonical = canonicalize(handyurl.parse(url))
    rendered = canonical.getURLString(surt=True)

    # Keep only what follows the first '('; the scheme prefix is dropped.
    _, paren, remainder = rendered.partition('(')
    if not paren:
        return url  # something very wrong
    return remainder
def surt(url):
    """Compute the SURT key string for ``url``.

    Behaviour by input:

    * empty or ``None``: returns ``'-'``;
    * starts with ``filedesc``, ``warcinfo`` or ``whois://``: returned
      unchanged;
    * starts with ``dns:``: the remainder goes through ``hostToSURT`` and
      ``)`` is appended;
    * otherwise: the URL is parsed by ``handyurl.parse``, canonicalized,
      rendered via ``getURLString(surt=True)``, and the text following the
      first ``(`` is returned (or the original ``url`` if no ``(`` is
      present).

    These doctests are from WaybackURLKeyMakerTest.java

    >>> surt(None)
    '-'

    >>> surt('')
    '-'

    >>> surt("filedesc:foo.arc.gz")
    'filedesc:foo.arc.gz'

    >>> surt("filedesc:/foo.arc.gz")
    'filedesc:/foo.arc.gz'

    >>> surt("filedesc://foo.arc.gz")
    'filedesc://foo.arc.gz'

    >>> surt("warcinfo:foo.warc.gz")
    'warcinfo:foo.warc.gz'

    >>> surt("dns:alexa.com")
    'com,alexa)'

    >>> surt("dns:archive.org")
    'org,archive)'

    >>> surt("http://www.archive.org/")
    'org,archive)/'

    >>> surt("http://archive.org/")
    'org,archive)/'

    >>> surt("http://archive.org/goo/")
    'org,archive)/goo'

    >>> surt("http://archive.org/goo/?")
    'org,archive)/goo'

    >>> surt("http://archive.org/goo/?b&a")
    'org,archive)/goo?a&b'

    >>> surt("http://archive.org/goo/?a=2&b&a=1")
    'org,archive)/goo?a=1&a=2&b'

    PHP session id:
    >>> surt("http://archive.org/index.php?PHPSESSID=0123456789abcdefghijklemopqrstuv&action=profile;u=4221")
    'org,archive)/index.php?action=profile;u=4221'

    WHOIS url:
    >>> surt("whois://whois.isoc.org.il/shaveh.co.il")
    'whois://whois.isoc.org.il/shaveh.co.il'

    Yahoo web bug. See https://github.com/internetarchive/surt/issues/1
    >>> surt('http://visit.webhosting.yahoo.com/visit.gif?&r=http%3A//web.archive.org/web/20090517140029/http%3A//anthonystewarthead.electric-chi.com/&b=Netscape%205.0%20%28Windows%3B%20en-US%29&s=1366x768&o=Win32&c=24&j=true&v=1.2')
    'com,yahoo,webhosting,visit)/visit.gif?&b=netscape%205.0%20(windows;%20en-us)&c=24&j=true&o=win32&r=http://web.archive.org/web/20090517140029/http://anthonystewarthead.electric-chi.com/&s=1366x768&v=1.2'
    """
    if not url:
        return "-"

    passthrough_prefixes = ("filedesc", "warcinfo", "whois://")
    if any(url.startswith(prefix) for prefix in passthrough_prefixes):
        return url

    if url.startswith("dns:"):
        host_part = url[4:]
        return hostToSURT(host_part) + ')'

    parsed = handyurl.parse(url)
    surt_url = canonicalize(parsed).getURLString(surt=True)

    # Split once on the first '('; a single piece means no '(' was found.
    pieces = surt_url.split('(', 1)
    if len(pieces) < 2:
        return url  # something very wrong
    return pieces[1]