Example No. 1
0
def result_iterator(results, notifier = None,\
        proxy = None, reader = func.netstr_reader):
        """Iterate over the entries of a job's results.

        Expands dir:// result urls into their component urls, opens each
        result (local file://, or disco:// either through the mounted
        resultfs or a remote connection) and yields every entry produced
        by *reader*.

        :param results: iterable of result urls (file://, disco://, dir://)
        :param notifier: optional callable invoked with each url before reading
        :param proxy: optional proxy passed to url resolution helpers
        :param reader: callable (fd, size, fname) -> iterable of entries
        :raises JobException: if a url has an unrecognized scheme
        """
        res = []
        for dir_url in results:
                if dir_url.startswith("dir://"):
                        res += util.parse_dir(dir_url, proxy)
                else:
                        res.append(dir_url)

        # Only the data root is needed from the configuration tuple.
        _, _, root = util.load_conf()

        for url in res:
                if url.startswith("file://"):
                        fname = url[7:]
                        fd = file(fname)
                        sze = os.stat(fname).st_size
                elif url.startswith("disco://"):
                        host, fname = url[8:].split("/", 1)
                        url = util.proxy_url(proxy, fname, host)
                        if util.resultfs_enabled:
                                # resultfs exposes results as local files
                                # under the data root.
                                f = "%s/data/%s" % (root, fname)
                                fd = file(f)
                                sze = os.stat(f).st_size
                        else:
                                sze, fd = comm.open_remote(url)
                else:
                        raise JobException("Invalid result url: %s" % url)

                if notifier:
                        notifier(url)

                try:
                        for x in reader(fd, sze, fname):
                                yield x
                finally:
                        # Close each result's descriptor; the original
                        # leaked one open fd per result.
                        fd.close()
Example No. 2
0
        def __init__(self, input_files, do_sort, mem_sort_limit):
                """Collect reduce inputs and choose an iteration strategy.

                dir:// inputs are expanded into their component urls.
                Without sorting, inputs are streamed directly.  With
                sorting, an external sort (download_and_sort) is used when
                the total input size exceeds mem_sort_limit; otherwise the
                entries are sorted in memory.
                """
                self.inputs = []
                for url in input_files:
                        if url.startswith("dir://"):
                                self.inputs.extend(parse_dir(url))
                        else:
                                self.inputs.append(url)

                self.line_count = 0
                if not do_sort:
                        self.iterator = self.multi_file_iterator(self.inputs)
                        return

                total_size = 0
                for url in self.inputs:
                        sze, fd = connect_input(url)
                        total_size += sze

                msg("Reduce[%d] input is %.2fMB" %\
                        (this_partition(), total_size / 1024.0**2))

                if total_size > mem_sort_limit:
                        self.iterator = self.download_and_sort()
                else:
                        msg("Sorting in memory")
                        entries = list(self.multi_file_iterator(self.inputs, False))
                        entries.sort(num_cmp)
                        self.iterator = self.list_iterator(entries)
Example No. 3
0
def deref(program, *files):
    """Usage: [file ...]

    Dereference the dir:// urls in file[s] or stdin and print them to stdout.
    """
    from disco.util import parse_dir
    for line in fileinput.input(files):
        for url in parse_dir(line.strip()):
            print url
Example No. 4
0
def deref(program, *files):
    """Usage: [file ...]

    Dereference the dir:// urls in file[s] or stdin and print them to stdout.
    """
    # NOTE(review): imported locally, presumably to avoid a circular
    # import at module load time -- confirm against the package layout.
    from disco.util import parse_dir
    # fileinput falls back to stdin when no file arguments are given.
    for line in fileinput.input(files):
        for url in parse_dir(line.strip()):
            print url
Example No. 5
0
File: core.py  Project: rca/disco
def result_iterator(results, notifier = None,\
        proxy = None, reader = func.netstr_reader):
        """Iterate over the entries of a job's results.

        Resolves an optional proxy (argument or DISCO_PROXY environment
        variable), expands dir:// result urls, then opens each result
        either as a local file (file://) or over HTTP and yields every
        entry produced by *reader*.

        :param results: iterable of result urls (file://, disco://, dir://)
        :param notifier: optional callable invoked with each url before reading
        :param proxy: optional proxy address; falls back to DISCO_PROXY
        :param reader: callable (fd, size, fname) -> iterable of entries
        :raises Exception: on a non-200 HTTP response
        """
        if not proxy:
                proxy = os.environ.get("DISCO_PROXY", None)
        if proxy:
                # Normalize the proxy to a bare host:port address.
                if proxy.startswith("disco://"):
                        proxy = "%s:%s" % (proxy[8:], util.MASTER_PORT)
                elif proxy.startswith("http://"):
                        proxy = proxy[7:]
        res = []
        for dir_url in results:
                if dir_url.startswith("dir://"):
                        res += util.parse_dir(dir_url, proxy)
                else:
                        res.append(dir_url)

        for url in res:
                if url.startswith("file://"):
                        fname = url[7:]
                        fd = file(fname)
                        sze = os.stat(fname).st_size
                        http = None
                else:
                        host, fname = url[8:].split("/", 1)
                        if proxy:
                                ext_host = proxy
                                fname = "/disco/node/%s/%s" % (host, fname)
                        else:
                                ext_host = host + ":" + util.HTTP_PORT
                        ext_file = "/" + fname

                        http = httplib.HTTPConnection(ext_host)
                        http.request("GET", ext_file, "")
                        fd = http.getresponse()
                        if fd.status != 200:
                                # Fixed: raising a string is a TypeError in
                                # Python >= 2.6; raise a real exception.
                                raise Exception("HTTP error %d" % fd.status)

                        sze = int(fd.getheader("content-length"))

                if notifier:
                        notifier(url)

                for x in reader(fd, sze, fname):
                        yield x

                if http:
                        http.close()
                else:
                        fd.close()
Example No. 6
0
    def __init__(self, *args, **kwargs):
        """Normalize, complete and validate the job dict.

        Applies backwards-compatible argument renames, resolves the
        required modules, expands input urls, derives nr_reduces from the
        map/partition configuration, merges scheduler settings, and
        finally sanity-checks all keys against the known defaults.

        :raises DiscoError: on inconsistent or unknown job arguments.
        """
        super(JobDict, self).__init__(*args, **kwargs)

        # -- backwards compatibility --
        if 'fun_map' in self and 'map' not in self:
            self['map'] = self.pop('fun_map')

        if 'input_files' in kwargs and 'input' not in self:
            self['input'] = self.pop('input_files')

        if 'reduce_writer' in self or 'map_writer' in self:
            warn("Writers are deprecated - use output_stream.add() instead",
                    DeprecationWarning)

        # -- required modules and files --
        # Auto-detect modules from every user-supplied callable when the
        # caller did not list them explicitly.
        if self['required_modules'] is None:
            functions = util.flatten(util.iterify(self[f])
                                     for f in chain(self.functions, self.stacks))
            self['required_modules'] = find_modules([f for f in functions
                                                     if callable(f)])

        # -- external flags --
        # A dict-valued map/reduce denotes an external (non-Python) task.
        if isinstance(self['map'], dict):
            self['ext_map'] = True
        if isinstance(self['reduce'], dict):
            self['ext_reduce'] = True

        # -- input --
        ddfs = self.pop('ddfs', None)
        # Each input becomes a list of replica urls.
        self['input'] = [list(util.iterify(url))
                         for i in self['input']
                         for url in util.urllist(i, listdirs=bool(self['map']),
                                                 ddfs=ddfs)]

        # partitions must be an integer internally
        self['partitions'] = self['partitions'] or 0

        # set nr_reduces: ignored if there is not actually a reduce specified
        if self['map']:
            # partitioned map has N reduces; non-partitioned map has 1 reduce
            self['nr_reduces'] = self['partitions'] or 1
        elif self.input_is_partitioned:
            # Only reduce, with partitions: len(dir://) specifies nr_reduces
            self['nr_reduces'] = len(util.parse_dir(self['input'][0][0]))
        else:
            # Only reduce, without partitions can only have 1 reduce
            self['nr_reduces'] = 1

        # merge_partitions iff the inputs to reduce are partitioned
        if self['merge_partitions']:
            if self['partitions'] or self.input_is_partitioned:
                self['nr_reduces'] = 1
            else:
                raise DiscoError("Can't merge partitions without partitions")

        # -- scheduler --
        # Overlay the caller's scheduler settings on the class defaults.
        scheduler = self.__class__.defaults['scheduler'].copy()
        scheduler.update(self['scheduler'])
        if int(scheduler['max_cores']) < 1:
            raise DiscoError("max_cores must be >= 1")
        self['scheduler'] = scheduler

        # -- sanity checks --
        if not self['map'] and not self['reduce']:
            raise DiscoError("Must specify map and/or reduce")

        for key in self:
            if key not in self.defaults:
                raise DiscoError("Unknown job argument: %s" % key)
Example No. 7
0
File: core.py  Project: nick-b/disco
        def _run(self, **kw):
                """Build a job request from keyword arguments and submit
                it to the master.

                :raises Exception: if required arguments are missing or
                        the master does not acknowledge the job.
                """
                # d(x): argument value with fallback to the Job defaults.
                d = lambda x: kw.get(x, Job.defaults[x])

                # Backwards compatibility (fun_map == map, input_files == input).
                if "fun_map" in kw:
                        kw["map"] = kw["fun_map"]

                if "input_files" in kw:
                        kw["input"] = kw["input_files"]

                # Fixed: raising strings is a TypeError in Python >= 2.6;
                # raise real exception objects instead.
                if not ("map" in kw and "input" in kw):
                        raise Exception("Arguments 'map' and 'input' are required")

                if len(kw["input"]) < 1:
                        raise Exception("Must have at least one input file")

                # Expand dir:// urls into their component inputs.
                inputs = []
                for inp in kw["input"]:
                        if inp.startswith("dir://"):
                                inputs += util.parse_dir(inp)
                        else:
                                inputs.append(inp)

                req = {"name": self.name,
                       "input": " ".join(inputs),
                       "version": ".".join(map(str, sys.version_info[:2])),
                       "map_reader": marshal.dumps(d("map_reader").func_code),
                       "partition": marshal.dumps(d("partition").func_code),
                       "params": cPickle.dumps(d("params")),
                       "sort": str(int(d("sort"))),
                       "mem_sort_limit": str(d("mem_sort_limit"))}

                # A dict-valued map denotes an external map task.
                if type(kw["map"]) == dict:
                        req["ext_map"] = marshal.dumps(kw["map"])
                else:
                        req["map"] = marshal.dumps(kw["map"].func_code)

                if "ext_params" in kw:
                        if type(kw["ext_params"]) == dict:
                                req["ext_params"] =\
                                        encode_netstring_fd(kw["ext_params"])
                        else:
                                req["ext_params"] = kw["ext_params"]

                # Cap nr_maps at the number of inputs.
                if "nr_maps" not in kw or kw["nr_maps"] > len(inputs):
                        nr_maps = len(inputs)
                else:
                        nr_maps = kw["nr_maps"]
                req["nr_maps"] = str(nr_maps)

                nr_reduces = d("nr_reduces")
                if "reduce" in kw:
                        if type(kw["reduce"]) == dict:
                                req["ext_reduce"] = marshal.dumps(kw["reduce"])
                                req["reduce"] = ""
                        else:
                                req["reduce"] = marshal.dumps(
                                        kw["reduce"].func_code)
                        # Default: half as many reduces as maps, at least one.
                        nr_reduces = nr_reduces or max(nr_maps / 2, 1)
                        req["chunked"] = "True"
                else:
                        nr_reduces = nr_reduces or 1
                req["nr_reduces"] = str(nr_reduces)

                # An explicit chunked argument overrides the derived value.
                if d("chunked") != None:
                        if d("chunked"):
                                req["chunked"] = "True"
                        elif "chunked" in req:
                                del req["chunked"]

                if "combiner" in kw:
                        req["combiner"] =\
                                marshal.dumps(kw["combiner"].func_code)

                self.msg = encode_netstring_fd(req)
                reply = self.master.request("/disco/job/new", self.msg)

                if reply != "job started":
                        raise Exception("Failed to start a job. Server replied: " + reply)
Example No. 8
0
File: core.py  Project: rca/disco
        def _run(self, **kw):
                """Validate keyword arguments, assemble the job request
                dictionary and submit it to the master.

                :raises Exception: on invalid arguments, or if the
                        master does not acknowledge the job.
                """
                # d(x): argument value with fallback to the Job defaults.
                d = lambda x: kw.get(x, Job.defaults[x])

                # Backwards compatibility
                # (fun_map == map, input_files == input)
                if "fun_map" in kw:
                        kw["map"] = kw["fun_map"]

                if "input_files" in kw:
                        kw["input"] = kw["input_files"]

                if not "input" in kw:
                        raise Exception("input is required")

                if not ("map" in kw or "reduce" in kw):
                        raise Exception("Specify map and/or reduce")

                for p in kw:
                        if p not in Job.defaults:
                                raise Exception("Unknown argument: %s" % p)

                inputs = kw["input"]

                req = {"name": self.name,
                       "version": ".".join(map(str, sys.version_info[:2])),
                       "params": cPickle.dumps(d("params")),
                       "sort": str(int(d("sort"))),
                       "mem_sort_limit": str(d("mem_sort_limit")),
                       "status_interval": str(d("status_interval")),
                       "required_modules": " ".join(d("required_modules")),
                       "profile": str(int(d("profile")))}

                if "map" in kw:
                        # A dict-valued map denotes an external map task.
                        if type(kw["map"]) == dict:
                                req["ext_map"] = marshal.dumps(kw["map"])
                        else:
                                req["map"] = marshal.dumps(kw["map"].func_code)

                        # Cap nr_maps at the number of inputs.
                        if "nr_maps" not in kw or kw["nr_maps"] > len(inputs):
                                nr_maps = len(inputs)
                        else:
                                nr_maps = kw["nr_maps"]

                        if "map_init" in kw:
                                req["map_init"] = marshal.dumps(\
                                        kw["map_init"].func_code)

                        req["map_reader"] =\
                                marshal.dumps(d("map_reader").func_code)
                        req["map_writer"] =\
                                marshal.dumps(d("map_writer").func_code)
                        req["partition"] =\
                                marshal.dumps(d("partition").func_code)

                        # Expand dir:// urls into their component inputs.
                        parsed_inputs = []
                        for inp in inputs:
                                if inp.startswith("dir://"):
                                        parsed_inputs += util.parse_dir(inp)
                                else:
                                        parsed_inputs.append(inp)
                        inputs = parsed_inputs
                else:
                        # Reduce-only job: external (non dir://) inputs
                        # constrain how nr_reduces may be chosen.
                        addr = [x for x in inputs\
                                if not x.startswith("dir://")]

                        if d("nr_reduces") == None and not addr:
                                raise Exception("nr_reduces must match to "\
                                        "the number of partitions in the "\
                                        "input data")

                        if d("nr_reduces") != 1 and addr:
                                raise Exception("nr_reduces must be 1 when "\
                                        "using external inputs without "\
                                        "the map phase")
                        nr_maps = 0

                req["input"] = " ".join(inputs)
                req["nr_maps"] = str(nr_maps)

                if "ext_params" in kw:
                        if type(kw["ext_params"]) == dict:
                                req["ext_params"] =\
                                        encode_netstring_fd(kw["ext_params"])
                        else:
                                req["ext_params"] = kw["ext_params"]

                nr_reduces = d("nr_reduces")
                if "reduce" in kw:
                        if type(kw["reduce"]) == dict:
                                req["ext_reduce"] = marshal.dumps(kw["reduce"])
                                req["reduce"] = ""
                        else:
                                req["reduce"] = marshal.dumps(
                                        kw["reduce"].func_code)
                        # Default: half as many reduces as maps, at least one.
                        nr_reduces = nr_reduces or max(nr_maps / 2, 1)
                        req["chunked"] = "True"

                        req["reduce_reader"] =\
                                marshal.dumps(d("reduce_reader").func_code)
                        req["reduce_writer"] =\
                                marshal.dumps(d("reduce_writer").func_code)

                        if "reduce_init" in kw:
                                req["reduce_init"] = marshal.dumps(\
                                        kw["reduce_init"].func_code)
                else:
                        nr_reduces = nr_reduces or 1

                req["nr_reduces"] = str(nr_reduces)

                # An explicit chunked argument overrides the derived value.
                if d("chunked") != None:
                        if d("chunked"):
                                req["chunked"] = "True"
                        elif "chunked" in req:
                                del req["chunked"]

                if "combiner" in kw:
                        req["combiner"] =\
                                marshal.dumps(kw["combiner"].func_code)

                self.msg = encode_netstring_fd(req)
                reply = self.master.request("/disco/job/new", self.msg)

                if reply != "job started":
                        raise Exception("Failed to start a job. Server replied: " + reply)
Example No. 9
0
        def _run(self, **kw):
                """Validate keyword arguments, assemble the job request
                and submit it to the master.

                :raises DiscoError: on invalid, deprecated or
                        inconsistent arguments, or if the master does not
                        acknowledge the job.
                """
                # d(x): argument value with fallback to the Job defaults.
                d = lambda x: kw.get(x, Job.defaults[x])

                # -- check parameters --

                # Backwards compatibility
                # (fun_map == map, input_files == input)
                if "fun_map" in kw:
                        kw["map"] = kw["fun_map"]

                if "input_files" in kw:
                        kw["input"] = kw["input_files"]

                if "chunked" in kw:
                        raise DiscoError("Argument 'chunked' is deprecated")

                if not "input" in kw:
                        raise DiscoError("input is required")

                if not ("map" in kw or "reduce" in kw):
                        raise DiscoError("Specify map and/or reduce")

                for p in kw:
                        if p not in Job.defaults:
                                raise DiscoError("Unknown argument: %s" % p)

                inputs = kw["input"]

                # -- initialize request --

                req = {"name": self.name,
                       "version": ".".join(map(str, sys.version_info[:2])),
                       "params": cPickle.dumps(d("params"), cPickle.HIGHEST_PROTOCOL),
                       "sort": str(int(d("sort"))),
                       "mem_sort_limit": str(d("mem_sort_limit")),
                       "status_interval": str(d("status_interval")),
                       "profile": str(int(d("profile")))}

                # -- required modules --

                # Modules may be given explicitly; otherwise they are
                # auto-detected from the user-supplied functions.
                if "required_modules" in kw:
                        rm = kw["required_modules"]
                else:
                        funlist = []
                        for f in Job.funs:
                                df = d(f)
                                if type(df) == types.FunctionType:
                                        funlist.append(df)
                                elif type(df) == list:
                                        funlist += df
                        rm = modutil.find_modules(funlist)
                # A tuple entry (name, path) means the module file itself
                # must be shipped to the nodes as well as imported.
                send_mod = []
                imp_mod = []
                for mod in rm:
                        if type(mod) == tuple:
                                send_mod.append(mod[1])
                                mod = mod[0]
                        imp_mod.append(mod)

                req["required_modules"] = " ".join(imp_mod)
                rf = util.pack_files(send_mod)

                # -- required files --

                if "required_files" in kw:
                        if type(kw["required_files"]) == dict:
                                rf.update(kw["required_files"])
                        else:
                                rf.update(util.pack_files(\
                                        kw["required_files"]))
                if rf:
                        req["required_files"] = marshal.dumps(rf)

                # -- map --

                if "map" in kw:
                        # A dict-valued map denotes an external map task.
                        if type(kw["map"]) == dict:
                                req["ext_map"] = marshal.dumps(kw["map"])
                        else:
                                req["map"] = marshal.dumps(kw["map"].func_code)

                        if "map_init" in kw:
                                req["map_init"] = marshal.dumps(\
                                        kw["map_init"].func_code)

                        req["map_reader"] =\
                                marshal.dumps(d("map_reader").func_code)
                        req["map_writer"] =\
                                marshal.dumps(d("map_writer").func_code)
                        req["partition"] =\
                                marshal.dumps(d("partition").func_code)

                        if "combiner" in kw:
                                req["combiner"] =\
                                        marshal.dumps(kw["combiner"].func_code)

                        # Expand dir:// urls; a list input denotes
                        # redundant replicas joined by newlines.
                        parsed_inputs = []
                        for inp in inputs:
                                if type(inp) == list:
                                        parsed_inputs.append(
                                                "\n".join(reversed(inp)))
                                elif inp.startswith("dir://"):
                                        parsed_inputs += util.parse_dir(inp)
                                else:
                                        parsed_inputs.append(inp)
                        inputs = parsed_inputs

                        # Cap nr_maps at the number of inputs.
                        if "nr_maps" not in kw or kw["nr_maps"] > len(inputs):
                                nr_maps = len(inputs)
                        else:
                                nr_maps = kw["nr_maps"]

                # -- only reduce --

                else:
                        nr_maps = 0
                        ext_inputs = []
                        red_inputs = []
                        for inp in inputs:
                                if type(inp) == list:
                                        raise DiscoError("Reduce doesn't "\
                                                "accept redundant inputs")
                                elif inp.startswith("dir://"):
                                        if inp.endswith(".txt"):
                                                ext_inputs.append(inp)
                                        else:
                                                red_inputs.append(inp)
                                else:
                                        ext_inputs.append(inp)

                        if ext_inputs and red_inputs:
                                raise DiscoError("Can't mix partitioned "\
                                        "inputs with other inputs")
                        elif red_inputs:
                                # Partition count is encoded after the last
                                # colon of each partitioned input url; all
                                # inputs must agree on it.
                                q = lambda x: int(x.split(":")[-1]) + 1
                                nr_red = q(red_inputs[0])
                                for x in red_inputs:
                                        if q(x) != nr_red:
                                                raise DiscoError(\
                                                "Number of partitions must "\
                                                "match in all inputs")
                                n = d("nr_reduces") or nr_red
                                if n != nr_red:
                                        raise DiscoError(
                                        "Specified nr_reduces = %d but "\
                                        "number of partitions in the input "\
                                        "is %d" % (n, nr_red))
                                kw["nr_reduces"] = nr_red
                                inputs = red_inputs
                        elif d("nr_reduces") != 1:
                                raise DiscoError("nr_reduces must be 1 when "\
                                        "using non-partitioned inputs "\
                                        "without the map phase")
                        else:
                                inputs = ext_inputs

                # shuffle fixes a pathological case in the fifo scheduler:
                # if inputs for a node are consequent, data locality will be
                # lost after K inputs where K is the number of cores.
                # Randomizing the order of inputs makes this pathological case
                # unlikely. This issue will be fixed in the new scheduler.
                random.shuffle(inputs)

                req["input"] = " ".join(inputs)
                req["nr_maps"] = str(nr_maps)

                if "ext_params" in kw:
                        if type(kw["ext_params"]) == dict:
                                req["ext_params"] =\
                                        encode_netstring_fd(kw["ext_params"])
                        else:
                                req["ext_params"] = kw["ext_params"]

                # -- reduce --

                nr_reduces = d("nr_reduces")
                if "reduce" in kw:
                        if type(kw["reduce"]) == dict:
                                req["ext_reduce"] = marshal.dumps(kw["reduce"])
                                req["reduce"] = ""
                        else:
                                req["reduce"] = marshal.dumps(
                                        kw["reduce"].func_code)
                        # Default: half as many reduces as maps, capped at 100.
                        nr_reduces = nr_reduces or min(max(nr_maps / 2, 1), 100)

                        req["reduce_reader"] =\
                                marshal.dumps(d("reduce_reader").func_code)
                        req["reduce_writer"] =\
                                marshal.dumps(d("reduce_writer").func_code)

                        if "reduce_init" in kw:
                                req["reduce_init"] = marshal.dumps(\
                                        kw["reduce_init"].func_code)
                else:
                        nr_reduces = nr_reduces or 0

                req["nr_reduces"] = str(nr_reduces)

                # -- encode and send the request --

                self.msg = encode_netstring_fd(req)
                reply = self.master.request("/disco/job/new", self.msg)

                if reply != "job started":
                        raise DiscoError("Failed to start a job. Server replied: " + reply)