Пример #1
0
 def print_popular(self):
     """Print the ten most-referenced urls, ranked by incoming link count."""
     counts = [(len(node.incoming), node) for node in self.index.values()]
     if not counts:
         # Empty web: nothing to rank (original crashed on counts[0][0]).
         return
     # Sort on the count alone; the original compared node objects on ties,
     # which is arbitrary on py2 and a TypeError on py3.
     counts.sort(key=lambda pair: pair[0], reverse=True)
     # Column width: wide enough for the highest count, minimum 2
     # (equivalent to the original len(str(...).rjust(2))).
     width = max(2, len(str(counts[0][0])))
     io.write_err("Showing most referenced urls:\n")
     for (count, node) in counts[:10]:
         io.write_err(" %s  %s\n" % (str(count).rjust(width), node.url))
Пример #2
0
 def print_popular(self):
     """Show the ten most referenced urls in the web."""
     # Pair every node with its incoming-reference count for ranking.
     ranked = [(len(node.incoming), node) for node in self.index.values()]
     ranked.sort(reverse=True)
     # Column width of the highest count, padded to at least 2.
     width = len(str(ranked[0][0]).rjust(2))
     io.write_err("Showing most referenced urls:\n")
     for (count, node) in ranked[:10]:
         io.write_err(" %s  %s\n" % (str(count).rjust(width), node.url))
Пример #3
0
def save_session(wb, queue=None):
    """Persist the web (and, if given, the crawl queue) for wb's root host."""
    hostname = urlrewrite.get_hostname(wb.root.url)
    filename = urlrewrite.hostname_to_filename(hostname)
    label = shcolor.color(shcolor.YELLOW, filename + ".{web,session}")
    io.write_err("Saving session to %s ..." % label)
    io.serialize(wb, filename + ".web", dir=io.LOGDIR)
    session_file = filename + ".session"
    if queue:
        io.serialize(queue, session_file, dir=io.LOGDIR)
    elif io.file_exists(session_file, dir=io.LOGDIR):
        # Only the web is being saved, i.e. spidering completed, so any
        # previously saved session is stale and gets removed.
        io.delete(session_file, dir=io.LOGDIR)
    io.write_err(shcolor.color(shcolor.GREEN, "done\n"))
Пример #4
0
def save_session(wb, queue=None):
    """Serialize wb to disk; store or clear the crawl queue alongside it."""
    filename = urlrewrite.hostname_to_filename(
        urlrewrite.get_hostname(wb.root.url))
    io.write_err("Saving session to %s ..."
                 % shcolor.color(shcolor.YELLOW, filename + ".{web,session}"))
    io.serialize(wb, filename + ".web", dir=io.LOGDIR)
    if not queue:
        # No queue means spidering is complete: only the web is saved,
        # and any old session file is deleted.
        if io.file_exists(filename + ".session", dir=io.LOGDIR):
            io.delete(filename + ".session", dir=io.LOGDIR)
    else:
        io.serialize(queue, filename + ".session", dir=io.LOGDIR)
    io.write_err(shcolor.color(shcolor.GREEN, "done\n"))
Пример #5
0
def main():
    """Read stream urls from stdin, one per line, and dump each with mplayer."""
    args = None  # remembered so the error report below can show the command
    try:
        line = sys.stdin.readline()
        while line:
            path = line.strip()
            filename = os.path.basename(path)
            args = ["mplayer", "-dumpstream", "-dumpfile", filename, path]
            # mplayer exits nonzero on failure: log the url and move on.
            retval = subprocess.call(args)
            if retval:
                logerror(path)

            line = sys.stdin.readline()
    except KeyboardInterrupt:
        io.write_abort()
    except Exception as e:  # 'as' syntax: valid on py2.6+ and py3
        s = "%s\n" % traceback.format_exc()
        s += "%s\n" % str(e)
        # args is None when we failed before building the first command line
        # (the original raised NameError here in that case).
        s += "Invocation string: %s\n" % str(args)
        io.write_err(s)
Пример #6
0
def main():
    """Dump every stream url read from stdin using mplayer."""
    args = None  # kept for the diagnostic message in the handler below
    try:
        line = sys.stdin.readline()
        while line:
            path = line.strip()
            filename = os.path.basename(path)
            args = ["mplayer", "-dumpstream", "-dumpfile", filename, path]
            retval = subprocess.call(args)
            if retval:
                # nonzero exit from mplayer: record the failing url
                logerror(path)

            line = sys.stdin.readline()
    except KeyboardInterrupt:
        io.write_abort()
    except Exception as e:  # py2.6+/py3-compatible form of 'except X, e'
        s = "%s\n" % traceback.format_exc()
        s += "%s\n" % str(e)
        # Guarded by the args=None initializer: the original hit a
        # NameError here if the failure preceded the first iteration.
        s += "Invocation string: %s\n" % str(args)
        io.write_err(s)
Пример #7
0
def restore_session(url):
    """Load a previously saved web and crawl queue for url's host.

    Returns a (queue, web) pair; either element is None when no saved
    file exists for it.
    """
    hostname = urlrewrite.get_hostname(url)
    filename = urlrewrite.hostname_to_filename(hostname)
    queue, webobj = None, None
    if io.file_exists(filename + ".web", dir=io.LOGDIR):
        io.write_err("Restoring web from %s ..." %
                     shcolor.color(shcolor.YELLOW, filename + ".web"))
        webobj = io.deserialize(filename + ".web", dir=io.LOGDIR)
        io.write_err(shcolor.color(shcolor.GREEN, "done\n"))
    if io.file_exists(filename + ".session", dir=io.LOGDIR):
        io.write_err("Restoring session from %s ..." %
                     shcolor.color(shcolor.YELLOW, filename + ".session"))
        queue = io.deserialize(filename + ".session", dir=io.LOGDIR)
        # let the active recipe override what was stored in the session
        queue = recipe.overrule_records(queue)
        io.write_err(shcolor.color(shcolor.GREEN, "done\n"))
    return queue, webobj
Пример #8
0
def restore_session(url):
    """Deserialize any saved (queue, web) state for url's host from LOGDIR."""
    filename = urlrewrite.hostname_to_filename(urlrewrite.get_hostname(url))
    web_file = filename + ".web"
    session_file = filename + ".session"
    q, wb = None, None
    if io.file_exists(web_file, dir=io.LOGDIR):
        io.write_err("Restoring web from %s ..." %
                     shcolor.color(shcolor.YELLOW, web_file))
        wb = io.deserialize(web_file, dir=io.LOGDIR)
        io.write_err(shcolor.color(shcolor.GREEN, "done\n"))
    if io.file_exists(session_file, dir=io.LOGDIR):
        io.write_err("Restoring session from %s ..." %
                     shcolor.color(shcolor.YELLOW, session_file))
        q = io.deserialize(session_file, dir=io.LOGDIR)
        # recipe rules take precedence over the records stored on disk
        q = recipe.overrule_records(q)
        io.write_err(shcolor.color(shcolor.GREEN, "done\n"))
    return q, wb
Пример #9
0
 def print_multiple(self):
     """Print groups of aliased urls (documents reachable by more than one url)."""
     groups = []
     for node in self.index.values():
         if len(node.aliases) > 1:
             pair = (len(node.aliases), node.aliases)
             # several nodes may share an alias list; list each group once
             if pair not in groups:
                 groups.append(pair)
     if not groups:
         return
     groups.sort(reverse=True)
     width = len(str(groups[0][0]))  # length of highest count
     io.write_err("Showing documents with multiple urls:\n")
     last = len(groups) - 1
     # enumerate replaces the original .index() calls, which were O(n^2)
     # and would mislabel an alias list whose first url repeats later.
     for (gi, (count, aliases)) in enumerate(groups):
         for (ai, url) in enumerate(aliases):
             # the count labels only the first url of its group
             prefix = str(count).rjust(width) if ai == 0 else "".rjust(width)
             io.write_err(" %s  %s\n" % (prefix, url))
         if gi != last:
             # blank separator between groups, but not after the last one
             io.write_err("\n")
Пример #10
0
 def print_multiple(self):
     """Show every document that is reachable through multiple urls."""
     groups = []
     for node in self.index.values():
         if len(node.aliases) > 1:
             pair = (len(node.aliases), node.aliases)
             # de-duplicate: nodes of one document share the alias list
             if pair not in groups:
                 groups.append(pair)
     if not groups:
         return
     groups.sort(reverse=True)
     width = len(str(groups[0][0]))  # length of highest count
     io.write_err("Showing documents with multiple urls:\n")
     last_index = len(groups) - 1
     # Positional indices via enumerate fix the original's repeated
     # .index() scans (accidental O(n^2), wrong on duplicated first alias).
     for (gi, (count, aliases)) in enumerate(groups):
         for (ai, url) in enumerate(aliases):
             if ai == 0:
                 prefix = str(count).rjust(width)  # count on first line only
             else:
                 prefix = "".rjust(width)
             io.write_err(" %s  %s\n" % (prefix, url))
         if gi != last_index:
             io.write_err("\n")  # separator between groups
Пример #11
0
 def print_trace(self, path):
     """Print each hop on the path from the root, numbered from 0."""
     if not path:
         return
     io.write_err("Showing trace from root:\n")
     # Width of the largest printed index (indices run 0..len-1). The
     # original used 1 + len(path) / 10, which over-pads badly for
     # len >= 20 (e.g. width 11 at len == 100) and becomes float
     # division under py3.
     width = len(str(len(path) - 1))
     for (i, hop) in enumerate(path):
         io.write_err(" %s  %s\n" % (str(i).rjust(width), hop))
Пример #12
0
      type="int",
      metavar="<depth>",
      dest="depth",
      help="Spider to this depth")
    (opts, args) = io.parse_args(parser)
    try:
        if opts.fetch:
            os.environ["FETCH_ALL"] = "1"
        elif opts.dump:
            os.environ["DUMP_ALL"] = "1"
        if opts.host:
            os.environ["HOST_FILTER"] = "1"
        if opts.depth:
            os.environ["DEPTH"] = str(opts.depth)

        url = args[0]
        (q, w) = restore_session(url)
        if opts.recipe:
            rules = recipe.load_recipe(opts.recipe, url)
        else:
            pattern = args[1]
            rules = recipe.get_recipe(pattern, url)
        queue = q or recipe.get_queue(url, mode=fetch.Fetcher.SPIDER)
        wb = w or web.Web(url)
    except recipe.PatternError, e:
        io.write_err(shcolor.color(shcolor.RED, "%s\n" % e))
        sys.exit(1)
    except IndexError:
        io.opts_help(None, None, None, parser)
    main(queue, rules, wb)
Пример #13
0
 def print_trace(self, path):
     """Write the hop-by-hop trace from the root, one numbered line per hop."""
     if not path:
         return
     io.write_err("Showing trace from root:\n")
     # Pad indices to the width of the largest one. Replaces the broken
     # 1 + (len(path) / 10) formula, which grows linearly with len(path)
     # instead of logarithmically and float-divides on py3.
     width = len(str(len(path) - 1))
     for (i, hop) in enumerate(path):
         io.write_err(" %s  %s\n" % (str(i).rjust(width), hop))
Пример #14
0
    a("--fetch", action="store_true", help="Fetch urls, don't dump")
    a("--dump", action="store_true", help="Dump urls, don't fetch")
    a("--host", action="store_true", help="Only spider this host")
    a("--depth", type="int", metavar="<depth>", dest="depth", help="Spider to this depth")
    (opts, args) = io.parse_args(parser)
    try:
        if opts.fetch:
            os.environ["FETCH_ALL"] = "1"
        elif opts.dump:
            os.environ["DUMP_ALL"] = "1"
        if opts.host:
            os.environ["HOST_FILTER"] = "1"
        if opts.depth:
            os.environ["DEPTH"] = str(opts.depth)

        url = args[0]
        (q, w) = restore_session(url)
        if opts.recipe:
            rules = recipe.load_recipe(opts.recipe, url)
        else:
            pattern = args[1]
            rules = recipe.get_recipe(pattern, url)
        queue = q or recipe.get_queue(url, mode=fetch.Fetcher.SPIDER)
        wb = w or web.Web(url)
    except recipe.PatternError, e:
        io.write_err(shcolor.color(shcolor.RED, "%s\n" % e))
        sys.exit(1)
    except IndexError:
        io.opts_help(None, None, None, parser)
    main(queue, rules, wb)
Пример #15
0
 def assert_in_web(self, url):
     """Exit the program with status 1 if url has not been indexed."""
     if url in self.index:
         return
     io.write_err("Url %s not in the web\n" %
                  shcolor.color(shcolor.YELLOW, url))
     sys.exit(1)
Пример #16
0
 def print_stats(self):
     """Print the root url and the number of urls held in the web."""
     summary = ("Root url : %s\n" % self.root.url
                + "Web size : %s urls\n" % len(self.index))
     io.write_err(summary)
Пример #17
0
    a("--pause", type="int", metavar="<pause>", dest="pause", help="Pause for x seconds between requests")
    a("--depth", type="int", metavar="<depth>", dest="depth", help="Spider to this depth")
    (opts, args) = io.parse_args(parser)
    try:
        if opts.fetch:
            os.environ["FETCH_ALL"] = "1"
        elif opts.dump:
            os.environ["DUMP_ALL"] = "1"
        if opts.host:
            os.environ["HOST_FILTER"] = "1"
        if opts.pause:
            os.environ["PAUSE"] = str(opts.pause)
        if opts.depth:
            os.environ["DEPTH"] = str(opts.depth)

        url = args[0]
        (q, w) = restore_session(url)
        if opts.recipe:
            rules = recipe.load_recipe(opts.recipe, url)
        else:
            pattern = args[1]
            rules = recipe.get_recipe(pattern, url)
        queue = q or recipe.get_queue(url, mode=fetch.Fetcher.SPIDER)
        wb = w or web.Web(url)
    except recipe.PatternError, e:
        io.write_err(ansicolor.red("%s\n" % e))
        sys.exit(1)
    except IndexError:
        io.opts_help(None, None, None, parser)
    main(queue, rules, wb)
Пример #18
0
 def print_stats(self):
     """Write a short summary of the web: its root url and total url count."""
     lines = ["Root url : %s\n" % self.root.url,
              "Web size : %s urls\n" % len(self.index)]
     io.write_err("".join(lines))
Пример #19
0
 def assert_in_web(self, url):
     """Abort with exit status 1 when url is unknown to this web."""
     if url not in self.index:
         message = "Url %s not in the web\n" % shcolor.color(shcolor.YELLOW,
                                                             url)
         io.write_err(message)
         sys.exit(1)
Пример #20
0
 def assert_in_web(self, url):
     """Exit with status 1 unless url is present in the web's index."""
     if url in self.index:
         return
     io.write_err("Url %s not in the web\n" % ansicolor.yellow(url))
     sys.exit(1)