示例#1
0
 def save(self):
     """Persist the web and, when work is still queued, the session.

     File names are derived from the hostname of ``self.wb.root.url``.
     ``self.wb`` is always serialized to ``<name>.web``.  A truthy
     ``self.queue`` is serialized to ``<name>.session``; otherwise an
     existing ``<name>.session`` file is deleted.  All files live under
     ``ioutils.LOGDIR`` and progress is reported via ``ioutils.write_err``.
     """
     base = urlrewrite.hostname_to_filename(
         urlrewrite.get_hostname(self.wb.root.url))
     web_file = base + ".web"
     session_file = base + ".session"

     label = ansicolor.yellow(base + ".{web,session}")
     ioutils.write_err("Saving session to %s ..." % label)

     ioutils.serialize(self.wb, web_file, dir=ioutils.LOGDIR)

     if not self.queue:
         # Only the web is being saved, i.e. spidering is complete, so a
         # session file left over from an earlier run is removed.
         if ioutils.file_exists(session_file, dir=ioutils.LOGDIR):
             ioutils.delete(session_file, dir=ioutils.LOGDIR)
     else:
         ioutils.serialize(self.queue, session_file, dir=ioutils.LOGDIR)

     ioutils.write_err(ansicolor.green("done\n"))
示例#2
0
 def restore(cls, url):
     """Create an instance from state previously saved for *url*.

     File names are derived from the hostname of *url* and looked up
     under ``ioutils.LOGDIR``.  ``<name>.web`` is deserialized into the
     web and ``<name>.session`` into the queue, the latter being passed
     through ``recipe.overrule_records``.  Either one is ``None`` when
     its file does not exist.

     NOTE(review): takes ``cls`` and returns ``cls(...)``, so this is
     presumably decorated as a classmethod outside this excerpt.
     """
     base = urlrewrite.hostname_to_filename(urlrewrite.get_hostname(url))
     web_file = base + ".web"
     session_file = base + ".session"

     web = None
     if ioutils.file_exists(web_file, dir=ioutils.LOGDIR):
         ioutils.write_err(
             "Restoring web from %s ..." % ansicolor.yellow(web_file))
         web = ioutils.deserialize(web_file, dir=ioutils.LOGDIR)
         ioutils.write_err(ansicolor.green("done\n"))

     pending = None
     if ioutils.file_exists(session_file, dir=ioutils.LOGDIR):
         ioutils.write_err(
             "Restoring session from %s ..." % ansicolor.yellow(session_file))
         stored = ioutils.deserialize(session_file, dir=ioutils.LOGDIR)
         pending = recipe.overrule_records(stored)
         ioutils.write_err(ansicolor.green("done\n"))

     return cls(wb=web, queue=pending)
示例#3
0
def rewrite_recipe(recipe, url):
    """Normalize every rule of *recipe* in place and return *recipe*.

    For each rule:

    * ``depth`` defaults to 1 and is replaced by ``int(DEPTH)`` when the
      ``DEPTH`` environment variable is set to a non-empty value.
    * ``host_filter`` is set to the hostname of *url* when ``HOST_FILTER``
      is set.
    * When ``FETCH_ALL`` is set, ``switch_key(rule, "dump", "fetch")`` is
      called; otherwise, when ``DUMP_ALL`` is set,
      ``switch_key(rule, "fetch", "dump")`` is called.
    * String values stored under ``dump``, ``fetch`` and ``spider`` are
      compiled into regex objects; non-string values are left alone.

    Raises:
        PatternError: if one of the patterns is not a valid regex.
        ValueError: if a rule is processed while ``DEPTH`` is set to
            something ``int()`` cannot parse.
    """
    # The environment does not change while the rules are processed, so
    # each variable is looked up once rather than once per rule.
    environ = os.environ
    depth_override = environ.get("DEPTH")
    host_filter = environ.get("HOST_FILTER")
    fetch_all = environ.get("FETCH_ALL")
    dump_all = environ.get("DUMP_ALL")

    for rule in recipe:
        if "depth" not in rule:
            rule["depth"] = 1
        if depth_override:
            rule["depth"] = int(depth_override)

        if host_filter:
            rule["host_filter"] = urlrewrite.get_hostname(url)
        if fetch_all:
            switch_key(rule, "dump", "fetch")
        elif dump_all:
            switch_key(rule, "fetch", "dump")

        # compile regexes
        for key in ("dump", "fetch", "spider"):
            pattern = rule.get(key)
            # isinstance (rather than an exact type comparison) so that
            # str subclasses are compiled as well.
            if isinstance(pattern, str):
                try:
                    rule[key] = re.compile(pattern)
                except re.error as e:
                    raise PatternError(
                        "Pattern error: %s: %s" % (e.args[0], pattern))
    return recipe
示例#4
0
def apply_hostfilter(filter_hostname, url):
    """Tell whether *url* passes the host filter.

    Filtering is active only while the ``HOST_FILTER`` environment
    variable is set to a non-empty value; when it is not, every url
    passes.  When active, the url passes only if its hostname equals
    *filter_hostname*.
    """
    if not os.environ.get("HOST_FILTER"):
        return True
    url_hostname = urlrewrite.get_hostname(url)
    return url_hostname == filter_hostname