def save(self):
    """Write the web, and the queue if there is one, to the log directory.

    The base file name is derived from the hostname of ``self.wb.root.url``.
    ``self.wb`` is always serialized to ``<base>.web``. A truthy
    ``self.queue`` is serialized to ``<base>.session``; otherwise an
    existing ``<base>.session`` file is deleted.
    """
    base = urlrewrite.hostname_to_filename(
        urlrewrite.get_hostname(self.wb.root.url))
    web_file = base + ".web"
    session_file = base + ".session"
    logdir = ioutils.LOGDIR

    ioutils.write_err("Saving session to %s ..."
                      % ansicolor.yellow(base + ".{web,session}"))

    ioutils.serialize(self.wb, web_file, dir=logdir)
    if self.queue:
        ioutils.serialize(self.queue, session_file, dir=logdir)
    elif ioutils.file_exists(session_file, dir=logdir):
        # Only the web is being saved, i.e. spidering is complete, so a
        # session file left over from an earlier run is removed.
        ioutils.delete(session_file, dir=logdir)

    ioutils.write_err(ansicolor.green("done\n"))
def restore(cls, url):
    """Build an instance of ``cls`` from the files saved for *url*'s host.

    ``<base>.web`` and ``<base>.session`` are looked up in the log
    directory, where ``<base>`` is derived from the hostname of *url*.
    Each file that exists is deserialized; a restored session is also
    passed through ``recipe.overrule_records``. A missing file yields
    ``None`` for the matching argument.

    Returns ``cls(wb=<web or None>, queue=<session or None>)``.
    """
    base = urlrewrite.hostname_to_filename(urlrewrite.get_hostname(url))
    web_file = base + ".web"
    session_file = base + ".session"

    wb = None
    queue = None

    if ioutils.file_exists(web_file, dir=ioutils.LOGDIR):
        ioutils.write_err("Restoring web from %s ..."
                          % ansicolor.yellow(web_file))
        wb = ioutils.deserialize(web_file, dir=ioutils.LOGDIR)
        ioutils.write_err(ansicolor.green("done\n"))

    if ioutils.file_exists(session_file, dir=ioutils.LOGDIR):
        ioutils.write_err("Restoring session from %s ..."
                          % ansicolor.yellow(session_file))
        stored = ioutils.deserialize(session_file, dir=ioutils.LOGDIR)
        queue = recipe.overrule_records(stored)
        ioutils.write_err(ansicolor.green("done\n"))

    return cls(wb=wb, queue=queue)
def rewrite_recipe(recipe, url):
    """Normalize every rule of *recipe* in place and return *recipe*.

    For each rule (a dict):

    * ``"depth"`` is set to 1 when absent, then overridden with
      ``int(DEPTH)`` when the ``DEPTH`` environment variable is non-empty.
    * ``"host_filter"`` is set to the hostname of *url* when ``HOST_FILTER``
      is non-empty.
    * ``switch_key(rule, "dump", "fetch")`` is applied when ``FETCH_ALL`` is
      non-empty; otherwise ``switch_key(rule, "fetch", "dump")`` is applied
      when ``DUMP_ALL`` is non-empty.
    * String values under ``"dump"``, ``"fetch"`` and ``"spider"`` are
      replaced with their compiled regular expression. Non-string values
      (e.g. already compiled patterns) are left untouched.

    Raises:
        PatternError: a pattern string is not a valid regular expression.
        ValueError: ``DEPTH`` is set but is not an integer literal.
    """
    # The environment does not change per rule, so read it once up front.
    env = os.environ
    depth_override = env.get("DEPTH")
    host_filter_on = env.get("HOST_FILTER")
    fetch_all = env.get("FETCH_ALL")
    dump_all = env.get("DUMP_ALL")

    for rule in recipe:
        if "depth" not in rule:
            rule["depth"] = 1
        if depth_override:
            rule["depth"] = int(depth_override)
        if host_filter_on:
            rule["host_filter"] = urlrewrite.get_hostname(url)

        # FETCH_ALL takes precedence over DUMP_ALL when both are set.
        if fetch_all:
            switch_key(rule, "dump", "fetch")
        elif dump_all:
            switch_key(rule, "fetch", "dump")

        # Compile regexes. isinstance (rather than an exact type match) so
        # that str subclasses are compiled as well.
        for key in ("dump", "fetch", "spider"):
            if key in rule and isinstance(rule[key], str):
                try:
                    rule[key] = re.compile(rule[key])
                except re.error as e:
                    raise PatternError("Pattern error: %s: %s"
                                       % (e.args[0], rule[key]))
    return recipe
def apply_hostfilter(filter_hostname, url):
    """Tell whether *url* passes the host filter.

    When the ``HOST_FILTER`` environment variable is unset or empty,
    filtering is off and every url passes. Otherwise the url passes only
    if its hostname equals *filter_hostname*.
    """
    filtering_enabled = os.environ.get("HOST_FILTER")
    if not filtering_enabled:
        return True

    url_hostname = urlrewrite.get_hostname(url)
    return url_hostname == filter_hostname