import os
import urllib2
import urlparse

from flask import request  # request.args below implies a Flask request context


def _check_cache(subpath, purge):
    # Parse the URL.
    parsed = urlparse.urlparse(subpath)
    if parsed.path != "/" and parsed.path != "":
        local_path = cacheUrl(parsed.path, parsed.netloc)[1:]
    else:
        local_path = cacheUrl("/index", parsed.netloc)[1:]

    # Re-append any query-string parameters to the cache path.
    args = ""
    for a in request.args:
        if len(args) == 0:
            args = "?" + a + "=" + request.args.get(a)
        else:
            args += "&" + a + "=" + request.args.get(a)
    local_path += args

    # Check whether it's already in the cache; download it if not (or if a
    # purge was requested), creating the directory tree as needed.
    found = True
    try:
        if not os.path.exists(local_path) or purge:
            folders = "/".join(local_path.split("/")[:-1])
            if not os.path.exists(folders):
                os.makedirs(folders)
            # Download the file so it can be served from the cache.
            dlfile(subpath + args, local_path)
            found = False
    except urllib2.URLError:
        print "Could not download file", subpath
    print "serving: " + local_path
    return local_path, found
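# `dlfile` is not defined in this excerpt. A minimal sketch of what
# _check_cache appears to assume, using urllib2 so that urllib2.URLError
# propagates to the except clause above; the "http://" prefix is an
# assumption, since _check_cache passes bare "host/path" strings.
def dlfile(url, local_path):
    if not url.startswith("http"):
        url = "http://" + url  # assumed default scheme
    response = urllib2.urlopen(url)
    with open(local_path, "wb") as f:
        f.write(response.read())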
def test_cacheUrl_uses_existing_domain(self):
    bad_domain = "www.thisisthedomain.com"
    good_domain = "https://www.thisisthecorrectdomain.com"
    self.assertEqual(cacheUrl(good_domain + "/hey", bad_domain),
                     "/cache/" + good_domain + "/hey")
    good_domain = "http://www.thisisthecorrectdomain.com"
    self.assertEqual(cacheUrl(good_domain + "/hey", bad_domain),
                     "/cache/" + good_domain + "/hey")
    good_domain = "//www.thisisthecorrectdomain.com"
    url = cacheUrl(good_domain + "/hey", bad_domain)
    self.assertEqual(url, "/cache/http://www.thisisthecorrectdomain.com/hey")
import re

from bs4 import BeautifulSoup


def view_page(subpath):
    htmlfile, found = _check_cache(subpath, True)
    with open(htmlfile, 'r') as myfile:
        html = myfile.read()
    if found:
        return html

    # Redirect CSS url() links to the cache.
    html = re.sub(r'url\(([^\)]*)\)',
                  lambda m: "url(" + cacheUrl(m.group(1), subpath) + ")",
                  html)
    soup = BeautifulSoup(html, "lxml")

    # Redirect image links (src and srcset) to the cache.
    for link in soup.findAll('img'):
        src = link.get('src')
        if src:
            link['src'] = cacheUrl(src, subpath)
        srcset = link.get('srcset')
        if srcset:
            links = srcset.split(",")
            new = []
            for l in links:
                split = l.strip().split(" ")
                split[0] = cacheUrl(split[0], subpath)
                new.append(" ".join(split))
            link['srcset'] = ", ".join(new)

    # Redirect script links to the cache.
    for link in soup.findAll('script'):
        src = link.get('src')
        if src:
            link['src'] = cacheUrl(src, subpath)

    # Redirect CSS and other <link> tags to the cache.
    for link in soup.findAll('link'):
        href = link.get('href')
        if href:
            link['href'] = cacheUrl(href, subpath)

    # Inject JS at the top of <head>.
    script = soup.new_tag("script")
    with open("injected.js") as f:
        script.string = f.read().replace("#!<DOMAIN>!#", subpath)
    soup.head.insert(0, script)

    # Write the rewritten page back to the cache.
    with open(htmlfile, 'w') as myfile:
        myfile.write(str(soup))
    return str(soup)
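# The use of request.args and the (subpath) parameter suggest this runs as a
# Flask view. A hypothetical wiring of view_page onto a catch-all route (the
# rule string here is an assumption, not taken from the project):
from flask import Flask

app = Flask(__name__)
app.add_url_rule('/<path:subpath>', 'view_page', view_page)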
def test_cacheUrl_ignores_bogus_input(self):
    self.assertEqual(cacheUrl("", ""), None)
    self.assertEqual(cacheUrl("some_url", ""), None)
    self.assertEqual(cacheUrl(None, None), None)
    self.assertEqual(cacheUrl(None, "www.adomain.org"), None)
    self.assertEqual(cacheUrl("some_url", None), None)
def test_cacheUrl_removes_extra_slashes(self):
    domain = "www.thisisthedomain.com//"
    self.assertEqual(cacheUrl("/hey//", domain),
                     "/cache/www.thisisthedomain.com/hey/")
def test_cacheUrl_starts_with_cache(self):
    domain = "www.thisisthedomain.com"
    self.assertEqual(cacheUrl("/hey", domain), "/cache/" + domain + "/hey")
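# cacheUrl itself is not shown in this excerpt. A minimal sketch inferred
# from the assertions above (not the project's actual implementation): it
# rejects empty/None arguments, keeps a domain already present in the URL,
# upgrades scheme-relative URLs to http:, and collapses doubled slashes when
# joining a relative path onto the page's domain.
def cacheUrl(url, domain):
    if not url or not domain:
        return None
    if url.startswith("http://") or url.startswith("https://"):
        return "/cache/" + url
    if url.startswith("//"):
        return "/cache/http:" + url
    path = "/cache/" + domain + "/" + url
    while "//" in path:
        path = path.replace("//", "/")
    return path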