def run_script():
    """CLI entry point for the spider.

    Parses command-line options, hands mode switches to the rest of the
    program through environment variables, builds or restores a spidering
    session and runs the SpiderFetcher on it.
    """
    parser, add = ioutils.init_opts("<url> ['<pattern>'] [options]")
    add("--recipe", metavar="<recipe>", dest="recipe", help="Use a spidering recipe")
    add("--fetch", action="store_true", help="Fetch urls, don't dump")
    add("--dump", action="store_true", help="Dump urls, don't fetch")
    add("--host", action="store_true", help="Only spider this host")
    add("--pause", type="int", metavar="<pause>", dest="pause", help="Pause for x seconds between requests")
    add("--depth", type="int", metavar="<depth>", dest="depth", help="Spider to this depth")
    opts, args = ioutils.parse_args(parser)
    try:
        # Mode switches are communicated via environment variables.
        if opts.fetch:
            os.environ["FETCH_ALL"] = "1"
        elif opts.dump:
            os.environ["DUMP_ALL"] = "1"
        if opts.host:
            os.environ["HOST_FILTER"] = "1"
        # Numeric tunables, exported only when given on the command line.
        for env_key, value in (("PAUSE", opts.pause), ("DEPTH", opts.depth)):
            if value:
                os.environ[env_key] = str(value)
        # Missing positional args raise IndexError, handled below.
        url = args[0]
        if opts.recipe:
            rules = recipe.load_recipe(opts.recipe, url)
        else:
            rules = recipe.get_recipe(args[1], url)
        session = Session.restore(url)
        session.rules = rules
        if session.queue is None:
            session.queue = recipe.get_queue(url, mode=fetch.Fetcher.SPIDER)
        if session.wb is None:
            session.wb = web.Web(url)
    except recipe.PatternError as e:
        ioutils.write_err(ansicolor.red("%s\n" % e))
        sys.exit(1)
    except IndexError:
        # Required argument missing: show usage.
        # NOTE(review): presumably opts_help exits the process; if it
        # returned, 'session' below would be unbound — confirm in ioutils.
        ioutils.opts_help(None, None, None, parser)
    fetcher = SpiderFetcher(session)
    fetcher.main()
# NOTE(review): fragment — the enclosing function's def line is outside this
# chunk; indentation reconstructed from the collapsed source.
# Bucket regex match objects by the id of the pattern that produced them.
for rx in it:
    (rx_id, match) = rx
    try:
        regexs[rx_id].append(match)
    except KeyError:
        # First match for this pattern id: create the bucket, then append.
        regexs[rx_id] = []
        regexs[rx_id].append(match)
# One list of (start, end) spans of the named group 'url' per pattern id,
# ordered by pattern id.
spanlists = [map(lambda m: m.span('url'), regexs[rx_id]) for rx_id in sorted(regexs.keys())]
# NOTE(review): 'str' here shadows the builtin — presumably a parameter of the
# enclosing (unseen) function holding the text to highlight; verify upstream.
return ansicolor.highlight_string(str, *spanlists)

if __name__ == "__main__":
    (parser, a) = ioutils.init_opts("[ <url|file> [options] | --test ]")
    a("--dump", action="store_true", help="Dump urls")
    a("--test", action="store_true", help="Run spider testsuite")
    (opts, args) = ioutils.parse_args(parser)
    try:
        url = None
        if opts.test:
            # 'testcases' is defined elsewhere in the file (not visible here).
            data = testcases
        else:
            # A missing positional argument raises IndexError; the handler is
            # not visible in this chunk.
            url = args[0]
            data = urllib.urlopen(url).read()  # Python 2 urllib API
        if opts.dump:
            for u in unique(unbox_it_to_ss(findall(data, url))):
                print(u)
        else:
            # NOTE(review): chunk truncated here — the else-branch body is not
            # visible in this view.
# NOTE(review): fragment — the enclosing method (and any retry loop wrapping
# it) is outside this chunk; indentation reconstructed from the collapsed
# source.
self.launch()
# Stop unless the launch failed with an error classified as temporary
# (err.is_temporal) — permanent errors are not worth retrying.
if not self.error or not err.is_temporal(self.error):
    return
# Out of retry budget.
if self.tries < 1:
    return
# retry after a short delay
self.write_progress(wait=True)
time.sleep(self.retry_wait)

if __name__ == "__main__":
    (parser, a) = ioutils.init_opts("<url>+ [options]")
    a("--fullpath", action="store_true", help="Use full path as filename to avoid name collisions")
    a("-c", "--continue", dest="cont", action="store_true", help="Resume downloads")
    a("-t", "--tries", dest="tries", type="int", action="store", help="Number of retries")
    a("-q", "--quiet", dest="quiet", action="store_true", help="Turn off logging")
    a("--spidertest", action="store_true", help="Test spider with url")
    (opts, args) = ioutils.parse_args(parser)
    # Options are handed to the rest of the program via environment variables.
    if getattr(opts, 'cont', None):
        os.environ["CONT"] = "1"
    if getattr(opts, 'tries', None):
        # Respect a TRIES value already present in the environment.
        if not os.environ.get("TRIES"):
            os.environ["TRIES"] = str(opts.tries)
    if opts.quiet:
        os.environ["LOGGING"] = str(False)
    try:
        # NOTE(review): chunk truncated here — the try-body is not visible in
        # this view.
def logerror(path):
    """Record a path whose stream dump failed to the error log."""
    ioutils.savelog("Path failed: %s\n" % path, "error_dumpstream")


def main():
    """Read stream paths from stdin (one per line) and dump each with mplayer.

    For every non-blank line, invokes ``mplayer -dumpstream`` writing to a
    file named after the path's basename. Failed dumps are recorded via
    logerror(); Ctrl-C aborts cleanly; any other error is reported with a
    traceback and the last mplayer invocation.
    """
    # Initialized up front so the generic handler below can always report it
    # (previously it could raise NameError if the failure happened before the
    # first assignment, masking the real error).
    args = None
    try:
        for line in sys.stdin:
            path = line.strip()
            if not path:
                # Skip blank lines instead of invoking mplayer on "".
                continue
            filename = os.path.basename(path)
            args = ["mplayer", "-dumpstream", "-dumpfile", filename, path]
            # subprocess.call returns the child's exit code; nonzero = failure.
            if subprocess.call(args):
                logerror(path)
    except KeyboardInterrupt:
        ioutils.write_abort()
    except Exception as e:
        s = "%s\n" % traceback.format_exc()
        s += "%s\n" % str(e)
        s += "Invocation string: %s\n" % str(args)
        ioutils.write_err(s)


if __name__ == "__main__":
    (parser, a) = ioutils.init_opts("< <file>")
    (opts, args) = ioutils.parse_args(parser)
    main()
node.outgoing[n] = None #for node in self.index.values(): # print(node.incoming) # print(node.outgoing) def _from_pickle(self): for node in self.index.values(): for n in node.incoming: node.incoming[n] = self.index[n] for n in node.outgoing: node.outgoing[n] = self.index[n] if __name__ == "__main__": (parser, a) = ioutils.init_opts("<web> [options]") a("--dump", action="store_true", help="Dump all urls in web") a("--in", metavar="<url>", dest="into", help="Find incoming urls to <url>") a("--out", metavar="<url>", help="Find outgoing urls from <url>") a("--aliases", metavar="<url>", help="Find other urls for the document at <url>") a("--multiple", action="store_true", help="Find documents with multiple urls") a("--trace", metavar="<url>", help="Trace path from root to <url>") a("--deepest", action="store_true", help="Trace url furthest from root") a("--popular", action="store_true", help="Find the most referenced urls") a("--test", action="store_true", help="Run trace loop test") (opts, args) = ioutils.parse_args(parser) try: if opts.test: wb = Web() wb.root = Node("a") wb.index["a"] = wb.root