def save_session(wb, queue=None):
    """Serialize the web, and the queue when one is given, under io.LOGDIR.

    The base file name is derived from the hostname of ``wb.root.url``.
    The web is always written to ``<base>.web``.  A truthy ``queue`` is
    written to ``<base>.session``; otherwise an existing ``<base>.session``
    file is deleted.
    """
    host = urlrewrite.get_hostname(wb.root.url)
    basename = urlrewrite.hostname_to_filename(host)
    web_file = basename + ".web"
    session_file = basename + ".session"

    target = shcolor.color(shcolor.YELLOW, basename + ".{web,session}")
    io.write_err("Saving session to %s ..." % target)

    io.serialize(wb, web_file, dir=io.LOGDIR)
    if queue:
        io.serialize(queue, session_file, dir=io.LOGDIR)
    else:
        # only the web is being saved, i.e. spidering is complete, so an
        # old session file is removed
        if io.file_exists(session_file, dir=io.LOGDIR):
            io.delete(session_file, dir=io.LOGDIR)

    io.write_err(shcolor.color(shcolor.GREEN, "done\n"))
def save_session(wb, queue=None):
    """Serialize the web, and the queue when one is given, under io.LOGDIR.

    The base file name is derived from the hostname of ``wb.root.url``.
    The web is always written to ``<base>.web``.  A truthy ``queue`` is
    written to ``<base>.session``; otherwise an existing ``<base>.session``
    file is deleted.
    """
    host = urlrewrite.get_hostname(wb.root.url)
    basename = urlrewrite.hostname_to_filename(host)
    web_file = basename + ".web"
    session_file = basename + ".session"

    target = shcolor.color(shcolor.YELLOW, basename + ".{web,session}")
    io.write_err("Saving session to %s ..." % target)

    io.serialize(wb, web_file, dir=io.LOGDIR)
    if queue:
        io.serialize(queue, session_file, dir=io.LOGDIR)
    else:
        # only the web is being saved, i.e. spidering is complete, so an
        # old session file is removed
        if io.file_exists(session_file, dir=io.LOGDIR):
            io.delete(session_file, dir=io.LOGDIR)

    io.write_err(shcolor.color(shcolor.GREEN, "done\n"))
def restore_session(url):
    """Load the saved web and session queue for the host of ``url``.

    Both files are looked up under io.LOGDIR using a base name derived
    from the hostname of ``url``.  A restored queue is passed through
    ``recipe.overrule_records`` before it is returned.

    Returns a ``(queue, web)`` tuple; an element is None when its file
    does not exist.
    """
    host = urlrewrite.get_hostname(url)
    basename = urlrewrite.hostname_to_filename(host)
    web_file = basename + ".web"
    session_file = basename + ".session"

    web = None
    queue = None

    if io.file_exists(web_file, dir=io.LOGDIR):
        label = shcolor.color(shcolor.YELLOW, web_file)
        io.write_err("Restoring web from %s ..." % label)
        web = io.deserialize(web_file, dir=io.LOGDIR)
        io.write_err(shcolor.color(shcolor.GREEN, "done\n"))

    if io.file_exists(session_file, dir=io.LOGDIR):
        label = shcolor.color(shcolor.YELLOW, session_file)
        io.write_err("Restoring session from %s ..." % label)
        stored_queue = io.deserialize(session_file, dir=io.LOGDIR)
        queue = recipe.overrule_records(stored_queue)
        io.write_err(shcolor.color(shcolor.GREEN, "done\n"))

    return queue, web
def restore_session(url):
    """Load the saved web and session queue for the host of ``url``.

    Both files are looked up under io.LOGDIR using a base name derived
    from the hostname of ``url``.  A restored queue is passed through
    ``recipe.overrule_records`` before it is returned.

    Returns a ``(queue, web)`` tuple; an element is None when its file
    does not exist.
    """
    host = urlrewrite.get_hostname(url)
    basename = urlrewrite.hostname_to_filename(host)
    web_file = basename + ".web"
    session_file = basename + ".session"

    web = None
    queue = None

    if io.file_exists(web_file, dir=io.LOGDIR):
        label = shcolor.color(shcolor.YELLOW, web_file)
        io.write_err("Restoring web from %s ..." % label)
        web = io.deserialize(web_file, dir=io.LOGDIR)
        io.write_err(shcolor.color(shcolor.GREEN, "done\n"))

    if io.file_exists(session_file, dir=io.LOGDIR):
        label = shcolor.color(shcolor.YELLOW, session_file)
        io.write_err("Restoring session from %s ..." % label)
        stored_queue = io.deserialize(session_file, dir=io.LOGDIR)
        queue = recipe.overrule_records(stored_queue)
        io.write_err(shcolor.color(shcolor.GREEN, "done\n"))

    return queue, web
def rewrite_recipe(recipe, url):
    """Normalize every rule of ``recipe`` in place, honoring the environment.

    For each rule:

    * ``"depth"`` defaults to 1 and is overridden by ``int($DEPTH)`` when
      the DEPTH environment variable is set to a non-empty value.
    * ``"host_filter"`` is set to the hostname of ``url`` when HOST_FILTER
      is set.
    * ``switch_key(rule, "dump", "fetch")`` is applied when FETCH_ALL is
      set; otherwise ``switch_key(rule, "fetch", "dump")`` is applied when
      DUMP_ALL is set.
    * String values under ``"dump"``, ``"fetch"`` and ``"spider"`` are
      replaced with their compiled regular expressions.

    Returns None; the rules are modified in place.

    Raises:
        PatternError: if one of the patterns is not a valid regex.
        ValueError: if DEPTH is set but is not an integer literal.
    """
    # The environment does not change between rules, so read it once.
    depth_override = os.environ.get("DEPTH")
    host_filter_on = os.environ.get("HOST_FILTER")
    fetch_all = os.environ.get("FETCH_ALL")
    dump_all = os.environ.get("DUMP_ALL")

    for rule in recipe:
        if "depth" not in rule:
            rule["depth"] = 1
        if depth_override:
            # Converted per rule so an empty recipe never trips on a bad DEPTH.
            rule["depth"] = int(depth_override)
        if host_filter_on:
            rule["host_filter"] = urlrewrite.get_hostname(url)
        if fetch_all:
            switch_key(rule, "dump", "fetch")
        elif dump_all:
            switch_key(rule, "fetch", "dump")

        # compile regexes; values that are not strings are left untouched
        for key in ("dump", "fetch", "spider"):
            if key in rule and isinstance(rule[key], str):
                pattern = rule[key]
                try:
                    rule[key] = re.compile(pattern)
                except re.error as e:
                    raise PatternError("Pattern error: %s: %s"
                                       % (e.args[0], pattern))
def apply_hostfilter(filter_hostname, url):
    """Tell whether ``url`` passes the host filter.

    When the HOST_FILTER environment variable is unset or empty, every url
    passes.  Otherwise the url passes only if its hostname equals
    ``filter_hostname``.
    """
    filtering_enabled = os.environ.get("HOST_FILTER")
    if not filtering_enabled:
        return True
    url_hostname = urlrewrite.get_hostname(url)
    return url_hostname == filter_hostname