def download(item, db, tries):
    if item["href"] in db:
        # already seen: reuse the path recorded in the database
        path = db.get(item["href"])
    else:
        f = common.retrieve_m(config["clearspace-root"] + item["href"], tries=tries)
        doc = WikiDoc(f.read())
        f.close()

        path = doc.path + "/" + doc.filename

        if want(path):
            # skip the download if the local copy's mtime already matches
            skip = False
            if os.path.exists(path):
                st = os.stat(path)
                if st.st_mtime == doc.mtime:
                    skip = True

            if not skip:
                common.mkdirs(doc.path)
                common.retrieve(config["clearspace-root"] + doc.filehref, path,
                                force=True, tries=tries)
                common.mkro(path)
                os.utime(path, (doc.mtime, doc.mtime))

    updatedbs(db, keep, item["href"], path)
def download(url, dest):
    if url.startswith("data:"):
        return

    if fileset.tas(dest):
        return

    common.mkdirs(os.path.split(dest)[0])

    try:
        common.retrieve(url, dest)
        if args["type"] == "html-single" and dest.endswith(".html"):
            get_deps_html(url, dest)
        if args["type"] == "html-single" and dest.endswith(".css"):
            get_deps_css(url, dest)
        common.mkro(dest)
    except urllib.error.HTTPError as e:
        if e.code == 403 or e.code == 404:
            warn("WARNING: %s on %s, continuing..." % (e, url))
        else:
            raise
def download(url, dest, username, password):
    pm = urllib2.HTTPPasswordMgrWithDefaultRealm()
    pm.add_password(None, url, username, password)
    opener = urllib2.build_opener(urllib2.HTTPBasicAuthHandler(pm))

    common.mkdirs(os.path.split(dest)[0])
    common.retrieve(url, dest, opener = opener, tries = 10, force = True)
    common.mkro(dest)
def download(url, dest, username, password):
    pm = urllib.request.HTTPPasswordMgrWithDefaultRealm()
    pm.add_password(None, url, username, password)
    opener = urllib.request.build_opener(
        urllib.request.HTTPBasicAuthHandler(pm))

    common.mkdirs(os.path.split(dest)[0])
    common.retrieve(url, dest, opener=opener, tries=10, force=True)
    common.mkro(dest)
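# A minimal, hypothetical usage sketch for the authenticated helper above.
# The URL, the destination path and the credentials are placeholders, not
# values taken from this repository's configuration.
if __name__ == "__main__":
    download("https://example.com/private/report.pdf",  # placeholder URL
             "reports/report.pdf",                       # placeholder local path
             "someuser", "somepassword")                 # placeholder credentials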
def download_item(item, extension, tries = 1):
    dstfile = os.path.join(item.type_, item.pageurl.split("/")[-1]) + extension
    common.mkdirs(item.type_)

    try:
        print("\r[%u]" % item.number, end = "", file = sys.stderr)
        common.retrieve(item.dlurl, dstfile, tries = tries)
        common.mkro(dstfile)
    except urllib.error.HTTPError as e:
        warn("can't download item at %s (#%u, %s, %s) (%s), continuing..." % \
             (item.dlurl, item.number, item.title, item.type_, e))
def download_item(item, extension, tries=1):
    dstfile = os.path.join(item.type_, item.pageurl.split("/")[-1]) + extension
    common.mkdirs(item.type_)

    try:
        print("\r[%u]" % item.number, end="", file=sys.stderr)
        common.retrieve(item.dlurl, dstfile, tries=tries)
        common.mkro(dstfile)
    except urllib.error.HTTPError as e:
        warn("can't download item at %s (#%u, %s, %s) (%s), continuing..." % \
             (item.dlurl, item.number, item.title, item.type_, e))
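# Hypothetical usage sketch for download_item() above: the "Item" namedtuple
# and all of its field values are invented here purely for illustration; the
# real item objects are constructed elsewhere in this tool.
import collections

Item = collections.namedtuple("Item", "type_ pageurl dlurl number title")
example = Item(type_="video",                                  # placeholder category
               pageurl="https://example.com/items/42",         # placeholder page URL
               dlurl="https://example.com/items/42/file.mp4",  # placeholder download URL
               number=42, title="example item")
download_item(example, ".mp4", tries=3)   # saves to video/42.mp4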
def sync(query, keep): xml = common.retrieve_m(config["gsa-url"] + "?client=internal&output=xml&num=1000&filter=0&q=" + query, tries = 10) xml = lxml.etree.parse(xml) if int(xml.xpath("//M/text()")[0]) == 1000: raise Exception("search returned too many results") for result in xml.xpath("//U/text()"): dest = result.split("//")[1] dest = dest.replace("~", "") common.mkdirs(os.path.split(dest)[0]) common.retrieve(result, dest, tries = 10) common.mkro(dest) keep.add(dest)
def sync(query, keep):
    xml = common.retrieve_m(
        config["gsa-url"] + "?client=internal&output=xml&num=1000&filter=0&q=" + query,
        tries=10)
    xml = lxml.etree.parse(xml)

    if int(xml.xpath("//M/text()")[0]) == 1000:
        raise Exception("search returned too many results")

    for result in xml.xpath("//U/text()"):
        dest = result.split("//")[1]
        dest = dest.replace("~", "")

        common.mkdirs(os.path.split(dest)[0])
        common.retrieve(result, dest, tries=10)
        common.mkro(dest)
        keep.add(dest)
def save(args):
    jnlpurl = args.url

    if re.search("/internal/", jnlpurl):
        login(args)

    if re.search("/(mr|p).jnlp\?", jnlpurl):
        jnlpurl = getjnlpurl(jnlpurl)

    vcrfile = urllib.parse.parse_qs(jnlpurl[jnlpurl.index("?") + 1:])["psid"][0]
    jnlpfile = vcrfile + ".jnlp"

    common.retrieve(jnlpurl, jnlpfile, force = True)

    try:
        xml = lxml.etree.parse(jnlpfile).getroot()
    except lxml.etree.XMLSyntaxError:
        os.unlink(jnlpfile)
        print("%s: couldn't retrieve jnlp: credentials incorrect?" % ap.prog)
        sys.exit(1)

    xmlargs = xml.xpath("//argument")
    for i in range(len(xmlargs) - 1):
        if xmlargs[i].text == "-play":
            common.retrieve(xmlargs[i + 1].text, vcrfile)
            xmlargs[i + 1].text = "file://" + config["elluminate-base"] + "/" + vcrfile
            break

    fetchjars(xml)

    xml.set("codebase", "file://" + config["elluminate-base"] + "/" + JARS)

    f = open(jnlpfile, "wb")
    f.write(lxml.etree.tostring(xml, xml_declaration = True))
    f.close()

    print(jnlpfile)
def save(args):
    jnlpurl = args.url

    if re.search("/internal/", jnlpurl):
        login(args)

    if re.search("/(mr|p).jnlp\?", jnlpurl):
        jnlpurl = getjnlpurl(jnlpurl)

    vcrfile = urlparse.parse_qs(jnlpurl[jnlpurl.index("?") + 1:])["psid"][0]
    jnlpfile = vcrfile + ".jnlp"

    common.retrieve(jnlpurl, jnlpfile, force = True)

    try:
        xml = lxml.etree.parse(jnlpfile).getroot()
    except lxml.etree.XMLSyntaxError:
        os.unlink(jnlpfile)
        print "%s: couldn't retrieve jnlp: credentials incorrect?" % ap.prog
        sys.exit(1)

    xmlargs = xml.xpath("//argument")
    for i in range(len(xmlargs) - 1):
        if xmlargs[i].text == "-play":
            common.retrieve(xmlargs[i + 1].text, vcrfile)
            xmlargs[i + 1].text = "file://" + config["elluminate-base"] + "/" + vcrfile
            break

    fetchjars(xml)

    xml.set("codebase", "file://" + config["elluminate-base"] + "/" + JARS)

    f = open(jnlpfile, "w")
    f.write(lxml.etree.tostring(xml, xml_declaration = True))
    f.close()

    print jnlpfile
def xpath(elem, path):
    return elem.xpath(path, namespaces = { "xhtml" : "http://www.w3.org/1999/xhtml" })

if __name__ == "__main__":
    warnings = 0

    global config
    config = common.load_config()
    args = parse_args()

    common.mkdirs(config["product-docs-base"])
    os.chdir(config["product-docs-base"])
    lock = common.Lock(".lock")

    urlbase = "http://docs.redhat.com/docs/%(locale)s/" % args

    common.retrieve(urlbase + "toc.html", "toc.html")
    common.mkro("toc.html")

    toc = lxml.etree.parse("toc.html").getroot()

    for url in xpath(toc, "//xhtml:a[@class='type' and text()='%(type)s']/@href" % args):
        url = url[2:]  # trim leading ./
        path = url[:url.index("/%(type)s/" % args)].replace("_", " ")
        common.mkdirs(path)

        path = path + "/" + url.split("/")[-1]
        try:
            common.retrieve(urlbase + url, path)
        except urllib2.HTTPError, e:
            if e.code == 403:
                print >>sys.stderr, "WARNING: %s on %s, continuing..." % (e, urlbase + url)
                warnings += 1
def fetchjars(xml):
    for ref in sorted(xml.xpath("//@href")):
        common.retrieve(xml.get("codebase") + "/" + ref, JARS + "/" + ref)
def download(opener, href, dest):
    common.mkdirs(os.path.split(dest)[0])
    common.retrieve(href, dest, opener = opener, tries = 10)
    common.mkro(dest)
            f = common.retrieve_m(config["clearspace-root"] + item["href"],
                                  tries = tries)
            doc = WikiDoc(f.read())
            f.close()

            path = doc.path + "/" + doc.filename

            if want(path):
                skip = False
                if os.path.exists(path):
                    st = os.stat(path)
                    if st.st_mtime == doc.mtime:
                        skip = True

                if not skip:
                    common.mkdirs(doc.path)
                    common.retrieve(config["clearspace-root"] + doc.filehref, path,
                                    force = True, tries = tries)
                    common.mkro(path)
                    os.utime(path, (doc.mtime, doc.mtime))

            updatedbs(db, keep, item["href"], path)

        if len(index.items) != step:
            break

    for dirpath, dirnames, filenames in os.walk(".", topdown = False):
        # remove local files which are no longer found in clearspace
        for f in filenames:
            path = os.path.relpath(dirpath, ".") + "/" + f
            if not path.startswith("./.") and path not in keep:
                os.unlink(path)
return bytes == "%PDF" if __name__ == "__main__": global config config = common.load_config() print >>sys.stderr, "Utility needs update since relaunch of www.redhat.com, feel free to submit patches..." sys.exit(1) common.mkdirs(config["references-base"]) os.chdir(config["references-base"]) lock = common.Lock(".lock") common.retrieve("http://www.redhat.com/customersuccess/", "index.html") common.mkro("index.html") toc = lxml.html.soupparser.parse("index.html").getroot() for url in toc.xpath("//a[substring-after(@href, '.') = 'pdf']/../../.."): url = copy.deepcopy(url) title = url.xpath("//h4//a/text()")[0].replace("/", "_") href = url.xpath("//a[substring-after(@href, '.') = 'pdf']/@href")[0] print >>sys.stderr, title f = common.retrieve_tmpfile("http://www.redhat.com" + href) if ispdf(f): # a few links on /customersuccess are currently broken HTML files common.sendfile_disk(f, title + ".pdf") common.mkro(title + ".pdf") f.close()
def download(url, dest):
    if not os.path.exists(dest):
        common.retrieve(url, dest)
        common.mkro(dest)
        return True

    return False
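# Minimal sketch of how the idempotent helper above might be driven; the URL
# to destination mapping is a placeholder (the destination directories are
# assumed to exist already, since this variant does not call common.mkdirs),
# and the count is printed purely for illustration.
import sys

urls = {
    "https://example.com/a.pdf": "docs/a.pdf",   # placeholder mapping
    "https://example.com/b.pdf": "docs/b.pdf",
}
fetched = sum(download(url, dest) for url, dest in urls.items())
print("%u new file(s) downloaded" % fetched, file=sys.stderr)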