Example #1
        def __init__(self, input_files, do_sort, mem_sort_limit):
                self.inputs = []
                part = PART_SUFFIX % this_partition()
                for input in input_files:
                        if input.startswith("dir://"):
                                try:
                                        self.inputs += parse_dir(input,
                                                part_id = this_partition())
                                except Exception:
                                        data_err("Couldn't resolve address %s"\
                                                % input, input)
                        else:
                                self.inputs.append(input)

                self.line_count = 0
                if do_sort:
                        total_size = 0
                        for input in self.inputs:
                                sze, fd = connect_input(input)
                                total_size += sze

                        msg("Reduce[%d] input is %.2fMB" %\
                                (this_partition(), total_size / 1024.0**2))

                        if total_size > mem_sort_limit:
                                self.iterator = self.download_and_sort()
                        else: 
                                msg("Sorting in memory")
                                m = list(self.multi_file_iterator(self.inputs, False))
                                m.sort(num_cmp)
                                self.iterator = self.list_iterator(m)
                else:
                        self.iterator = self.multi_file_iterator(self.inputs)
Example #2
def re_reader(item_re_str, fd, content_len, fname, output_tail = False, read_buffer_size=8192):
    item_re = re.compile(item_re_str)
    buf = ""
    tot = 0
    while True:
        if content_len:
            r = fd.read(min(read_buffer_size, content_len - tot))
        else:
            r = fd.read(read_buffer_size)
        tot += len(r)
        buf += r

        m = item_re.match(buf)
        while m:
            yield m.groups()
            buf = buf[m.end():]
            m = item_re.match(buf)

        if not len(r) or (content_len is not None and tot >= content_len):
            if content_len is not None and tot < content_len:
                data_err("Truncated input (%s). "\
                     "Expected %d bytes, got %d" %\
                     (fname, content_len, tot), fname)
            if len(buf):
                if output_tail:
                    yield [buf]
                else:
                    msg("Couldn't match the last %d "\
                        "bytes in %s. Some bytes may be "\
                        "missing from input." %\
                        (len(buf), fname))
            break
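
A minimal driver for the re_reader generator above, offered only as a sketch: StringIO stands in for the descriptor that connect_input() would normally supply, the data and names are made up for illustration, and the snippet's own module-level imports (re, msg, data_err) are assumed to be in scope.

from StringIO import StringIO

data = "alpha\nbeta\ngamma\n"
# each match of "(.*?)\n" yields a one-element tuple holding the line body
for (word,) in re_reader("(.*?)\n", StringIO(data), len(data), "in-memory"):
    print word          # alpha, beta, gamma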
Example #3
        def __init__(self, input_files, do_sort, mem_sort_limit):
                self.inputs = []
                part = PART_SUFFIX % this_partition()
                for input in input_files:
                        if input.startswith("dir://"):
                                self.inputs += [x for x in parse_dir(input)\
                                        if x.startswith("chunk://") or\
                                           x.endswith(part)]
                        else:
                                self.inputs.append(input)

                self.line_count = 0
                if do_sort:
                        total_size = 0
                        for input in self.inputs:
                                sze, fd = connect_input(input)
                                total_size += sze

                        msg("Reduce[%d] input is %.2fMB" %\
                                (this_partition(), total_size / 1024.0**2))

                        if total_size > mem_sort_limit:
                                self.iterator = self.download_and_sort()
                        else: 
                                msg("Sorting in memory")
                                m = list(self.multi_file_iterator(self.inputs, False))
                                m.sort(num_cmp)
                                self.iterator = self.list_iterator(m)
                else:
                        self.iterator = self.multi_file_iterator(self.inputs)
Example #4
 def list_iterator(self, lst):
     i = 0
     for x in lst:
         yield x
         i += 1
         if status_interval and not i % status_interval:
             msg("%d entries reduced" % i)
     msg("Reduce done: %d entries reduced in total" % i)
Example #5
def op_map(job):
        global job_name
        
        job_input = this_inputs()
        msg("Received a new map job!")
        
        if len(job_input) != 1:
                err("Map can only handle one input. Got: %s" % 
                        " ".join(job_input))

        nr_reduces = int(job['nr_reduces'])
        required_modules = job['required_modules'].split()
        fun_map_reader.func_code = marshal.loads(job['map_reader'])
        fun_map_writer.func_code = marshal.loads(job['map_writer'])
        fun_partition.func_code = marshal.loads(job['partition'])
        for m in required_modules:
                fun_map_reader.func_globals.setdefault(m, __import__(m))
                fun_partition.func_globals.setdefault(m, __import__(m))
        
        if 'ext_map' in job:
                if 'ext_params' in job:
                        map_params = job['ext_params']
                else:
                        map_params = "0\n"
                external.prepare(job['ext_map'],
                        map_params, EXT_MAP % job_name)
                fun_map.func_code = external.ext_map.func_code
        else:
                map_params = cPickle.loads(job['params'])        
                fun_map.func_code = marshal.loads(job['map'])
        
        for m in required_modules:
                fun_map.func_globals.setdefault(m, __import__(m))

        if 'map_init' in job:
                fun_init.func_code = marshal.loads(job['map_init'])

        if 'combiner' in job:
                fun_combiner.func_code = marshal.loads(job['combiner'])
                for m in required_modules:
                        fun_combiner.func_globals.setdefault(m, __import__(m))
                partitions = [MapOutput(i, map_params, fun_combiner)\
                        for i in range(nr_reduces)]
        else:
                partitions = [MapOutput(i, map_params) for i in range(nr_reduces)]
        
        run_map(job_input[0], partitions, map_params)
        for p in partitions:
                p.close()
        if 'chunked' in job:
                merge_chunks(partitions)
                out = "chunk://%s/%s/map-chunk-%d" %\
                        (this_host(), job_name, this_partition())
        else:
                out = partitions[0].disco_address()
        
        external.close_ext()
        msg("%d %s" % (this_partition(), out), "OUT")
Example #6
def op_map(job):
    msg("Received a new map job!")

    if len(Task.inputs) != 1:
        err("Map can only handle one input. Got: %s" %
            " ".join(Task.inputs))

    global fun_reader, fun_writer, fun_partition
    fun_reader = util.unpack(job['map_reader'], globals=globals())
    fun_writer = util.unpack(job['map_writer'], globals=globals())
    fun_partition = util.unpack(job['partition'], globals=globals())

    global fun_init
    if 'map_init' in job:
        fun_init = util.unpack(job['map_init'], globals=globals())

    global fun_map
    if 'ext_map' in job:
        if 'ext_params' in job:
            map_params = job['ext_params']
        else:
            map_params = "0\n"

        path = Task.path("EXT_MAP")
        external.prepare(job['ext_map'], map_params, path)
        fun_map = external.ext_map
    else:
        map_params = util.unpack(job['params'], globals=globals())
        fun_map = util.unpack(job['map'], globals=globals())

    global fun_combiner
    if 'combiner' in job:
        fun_combiner = util.unpack(job['combiner'], globals=globals())

    init_common(job)

    nr_part = max(1, Task.num_partitions)

    if 'combiner' in job:
        partitions = [MapOutput(i, map_params, fun_combiner)\
            for i in range(nr_part)]
    else:
        partitions = [MapOutput(i, map_params) for i in range(nr_part)]

    run_map(Task.inputs[0], partitions, map_params)
    external.close_ext()

    urls = {}
    for i, p in enumerate(partitions):
        p.close()
        urls["%d %s" % (i, p.url())] = True

    index, index_url = Task.map_index
    safe_update(index, urls)
    OutputURL(index_url)
Example #7
        def multi_file_iterator(self, inputs, progress = True,
                                reader = fun_reduce_reader):
                i = 0
                for fname in inputs:
                        sze, fd = connect_input(fname)
                        for x in reader(fd, sze, fname):
                                yield x
                                i += 1
                                if progress and status_interval and\
                                        not i % status_interval:
                                        msg("%d entries reduced" % i)

                if progress:
                        msg("Reduce done: %d entries reduced in total" % i)
Example #8
    def multi_file_iterator(self, inputs, params, progress = True,
                reader = fun_reader):
        i = 0
        for url in inputs:
            fd, sze, url = connect_input(url, params)
            for x in reader(fd, sze, url):
                yield x
                i += 1
                if progress and status_interval and\
                    not i % status_interval:
                    msg("%d entries reduced" % i)

        if progress:
            msg("Reduce done: %d entries reduced in total" % i)
Example #9
    def download_and_sort(self, params):
        dlname = Task.path("REDUCE_DL", Task.id)
        msg("Reduce will be downloaded to %s" % dlname)
        out_fd = AtomicFile(dlname, "w")
        for url in self.inputs:
            fd, sze, url = connect_input(url, params)
            for k, v in fun_reader(fd, sze, url):
                if " " in k:
                    err("Spaces are not allowed in keys "\
                        "with external sort.")
                if "\0" in v:
                    err("Zero bytes are not allowed in "\
                        "values with external sort. "\
                        "Consider using base64 encoding.")
                out_fd.write("%s %s\0" % (k, v))
        out_fd.close()
        msg("Reduce input downloaded ok")

        msg("Starting external sort")
        sortname = Task.path("REDUCE_SORTED", Task.id)
        ensure_path(os.path.dirname(sortname))
        cmd = ["sort", "-n", "-k", "1,1", "-z",\
            "-t", " ", "-o", sortname, dlname]

        proc = subprocess.Popen(cmd)
        ret = proc.wait()
        if ret:
            err("Sorting %s to %s failed (%d)" %\
                (dlname, sortname, ret))

        msg("External sort done: %s" % sortname)
        return self.multi_file_iterator([sortname], params, reader =\
            lambda fd, sze, url:\
                re_reader("(?s)(.*?) (.*?)\000", fd, sze, url))
Example #10
def run_map(job_input, partitions, param):
    i = 0
    fd, sze, url = connect_input(job_input, param)
    nr_reduces = max(1, Task.num_partitions)
    reader = fun_reader(fd, sze, url)
    fun_init(reader, param)

    for entry in reader:
        for key, value in fun_map(entry, param):
            p = fun_partition(key, nr_reduces, param)
            partitions[p].add(key, value)
        i += 1
        if status_interval and not i % status_interval:
            msg("%d entries mapped" % i)

    msg("Done: %d entries mapped in total" % i)
Example #11
def map(line, params):
    """
    hackreduce:search:history format:
        None, timestamp, id, search, frequency?
    """
    from datetime import datetime, timedelta
    from disco.util import msg

    time_grouping = 30

    try:
        unknown, timestamp, uid, query, frequency = line.split("','")
    except ValueError:
        # malformed line: log it and skip this record instead of
        # falling through with undefined fields
        msg(line)
        return

    # bad hack :-(
    time = timestamp.replace("'", "")
    date_obj = datetime.fromtimestamp(float(time[:-3])) # timestamp has milliseconds, shave em off
    nearest_minute = date_obj - timedelta(
            minutes=date_obj.minute % time_grouping, 
            seconds=date_obj.second, 
            microseconds=date_obj.microsecond)


    # Give a score if the words within each query are in any of the 4 lists.
    sex = ('cockrings', 'sex', 't**s', 'naked', 'girls', 'f**k', 'suck', 'teen', 
                'hot', 'cum', 'topless', 'nude', )

    travel = ('fly', 'flight', 'plane', 'drive', 'europe', 'america', 'tours', 
                'map', 'hotel', 'cheap', 'asia', )

    nerd = ('java ', 'c ', 'c++', 'php', 'visual basic', 'perl', 
            'python', 'c#', 'javascript', 'ruby', 'erlang', 'lisp', )

    cooking = ('ice', 'cream', 'recipe', 'pasta', 'sauce', 'soup', 'meat', )


    score = {'sex': 0, 'nerd': 0, 'travel': 0, 'cooking': 0}

    for word in query.split():
        for key in score.keys():
            score[key] += int(word.lower() in locals()[key])

    yield (nearest_minute, score)
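
The time-bucketing idiom from the map above in isolation: round a timestamp down to its 30-minute bucket. The example timestamp is arbitrary.

from datetime import datetime, timedelta

time_grouping = 30                          # minutes per bucket, as in the map above
date_obj = datetime(2011, 2, 5, 14, 47, 33, 123456)
nearest_minute = date_obj - timedelta(minutes=date_obj.minute % time_grouping,
                                      seconds=date_obj.second,
                                      microseconds=date_obj.microsecond)
print nearest_minute                        # 2011-02-05 14:30:00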
Example #12
def map(line, params):
    """
    hackreduce:search:history format:
        None, timestamp, id, search, frequency?
    """
    from datetime import datetime, timedelta
    from disco.util import msg

    try:
        unknown, timestamp, uid, query, frequency = line.split("','")
    except ValueError:
        # malformed line: log it and skip this record instead of
        # falling through with undefined fields
        msg(line)
        return

    # bad hack :-(
    time = timestamp.replace("'", "")
    date_obj = datetime.fromtimestamp(float(time[:-3])) # timestamp has milliseconds, shave em off
    nearest_minute = date_obj - timedelta(minutes=date_obj.minute % 1, seconds=date_obj.second, microseconds=date_obj.microsecond)

    yield (nearest_minute, {'unique_id': uid, 'query': query, 'frequency': frequency})
Example #13
    def __init__(self, input_files, do_sort, mem_sort_limit, params):
        self.inputs = [url for input in input_files
                   for url in util.urllist(input, partid=Task.id)]
        random.shuffle(self.inputs)
        self.line_count = 0
        if do_sort:
            total_size = 0
            for input in self.inputs:
                fd, sze, url = connect_input(input, params)
                total_size += sze

            msg("Reduce[%d] input is %.2fMB" %\
                (Task.id, total_size / 1024.0**2))

            if total_size > mem_sort_limit:
                self.iterator = self.download_and_sort(params)
            else:
                msg("Sorting in memory")
                m = list(self.multi_file_iterator(self.inputs, False))
                m.sort(num_cmp)
                self.iterator = self.list_iterator(m)
        else:
            self.iterator = self.multi_file_iterator(self.inputs, params)
Example #14
def op_reduce(job):
        job_inputs = this_inputs()

        msg("Received a new reduce job!")
        
        do_sort = int(job['sort'])
        mem_sort_limit = int(job['mem_sort_limit'])
        req_mod = job['required_modules'].split()
        
        if 'reduce_init' in job:
                fun_init.func_code = marshal.loads(job['reduce_init'])

        fun_reduce_reader.func_code = marshal.loads(job['reduce_reader'])
        fun_reduce_writer.func_code = marshal.loads(job['reduce_writer'])
        
        if 'required_files' in job:
                write_files(marshal.loads(job['required_files']), REQ_FILES)
                sys.path.insert(0, REQ_FILES)
        
        import_modules(req_mod, [fun_reduce_reader, fun_reduce_writer,\
            fun_reduce, fun_init])
         
        if 'ext_reduce' in job:
                if "ext_params" in job:
                        red_params = job['ext_params']
                else:
                        red_params = "0\n"
                external.prepare(job['ext_reduce'], red_params, EXT_REDUCE)
                fun_reduce.func_code = external.ext_reduce.func_code
        else:
                fun_reduce.func_code = marshal.loads(job['reduce'])
                red_params = cPickle.loads(job['params'])

        red_in = ReduceReader(job_inputs, do_sort, mem_sort_limit).iter()
        red_out = ReduceOutput(red_params)
        
        msg("Starting reduce")
        fun_init(red_in, red_params)
        fun_reduce(red_in, red_out, red_params)
        msg("Reduce done")
        
        red_out.close()
        external.close_ext()
        
        index = cStringIO.StringIO(os.path.basename(red_out.fname) + "\n")
        safe_append(index, REDUCE_INDEX)
        msg("dir://%s/%sreduce-index.txt" % (this_host(), JOB_HOME), "OUT")
Example #15
def op_reduce(job):
        global job_name

        job_inputs = this_inputs()

        msg("Received a new reduce job!")
        
        do_sort = int(job['sort'])
        mem_sort_limit = int(job['mem_sort_limit'])
        required_modules = job['required_modules'].split()
        
        if 'reduce_init' in job:
                fun_init.func_code = marshal.loads(job['reduce_init'])

        fun_reduce_reader.func_code = marshal.loads(job['reduce_reader'])
        fun_reduce_writer.func_code = marshal.loads(job['reduce_writer'])
        
        if 'ext_reduce' in job:
                if "ext_params" in job:
                        red_params = job['ext_params']
                else:
                        red_params = "0\n"
                external.prepare(job['ext_reduce'], red_params,
                        EXT_REDUCE % job_name)
                fun_reduce.func_code = external.ext_reduce.func_code
        else:
                fun_reduce.func_code = marshal.loads(job['reduce'])
                red_params = cPickle.loads(job['params'])

        for m in required_modules:
                fun_reduce.func_globals.setdefault(m, __import__(m))

        red_in = ReduceReader(job_inputs, do_sort, mem_sort_limit).iter()
        red_out = ReduceOutput(red_params)
        
        msg("Starting reduce")
        fun_init(red_in, red_params)
        fun_reduce(red_in, red_out, red_params)
        msg("Reduce done")
        
        red_out.close()
        external.close_ext()

        msg("%d %s" % (this_partition(), red_out.disco_address()), "OUT")
Example #16
        def download_and_sort(self):
                dlname = REDUCE_DL % (job_name, this_partition())
                ensure_path(dlname, False)
                msg("Reduce will be downloaded to %s" % dlname)
                out_fd = file(dlname + ".partial", "w")
                for fname in self.inputs:
                        sze, fd = connect_input(fname)
                        for k, v in fun_reduce_reader(fd, sze, fname):
                                if " " in k:
                                        err("Spaces are not allowed in keys "\
                                            "with external sort.")
                                if "\0" in v:
                                        err("Zero bytes are not allowed in "\
                                            "values with external sort. "\
                                            "Consider using base64 encoding.")
                                out_fd.write("%s %s\0" % (k, v))
                out_fd.close()
                os.rename(dlname + ".partial", dlname)
                msg("Reduce input downloaded ok")

                msg("Starting external sort")
                sortname = REDUCE_SORTED % (job_name, this_partition())
                ensure_path(sortname, False)
                cmd = ["sort", "-n", "-s", "-k", "1,1", "-z",\
                        "-t", " ", "-o", sortname, dlname]

                proc = subprocess.Popen(cmd)
                ret = proc.wait()
                if ret:
                        err("Sorting %s to %s failed (%d)" %\
                                (dlname, sortname, ret))
                
                msg("External sort done: %s" % sortname)
                return self.multi_file_iterator([sortname], reader =\
                        lambda fd, sze, fname:\
                                re_reader("(.*?) (.*?)\000", fd, sze, fname))
Example #17
def op_reduce(job):
    msg("Received a new reduce job!")

    do_sort = int(job['sort'])
    mem_sort_limit = int(job['mem_sort_limit'])

    global fun_init
    if 'reduce_init' in job:
        fun_init = util.unpack(job['reduce_init'], globals=globals())

    global fun_reader, fun_writer
    fun_reader = util.unpack(job['reduce_reader'], globals=globals())
    fun_writer = util.unpack(job['reduce_writer'], globals=globals())

    global fun_reduce
    if 'ext_reduce' in job:
        if "ext_params" in job:
            red_params = job['ext_params']
        else:
            red_params = "0\n"

        path = Task.path("EXT_MAP")
        external.prepare(job['ext_reduce'], red_params, path)
        fun_reduce = external.ext_reduce
    else:
        fun_reduce = util.unpack(job['reduce'], globals=globals())
        red_params = util.unpack(job['params'], globals=globals())

    init_common(job)

    red_in = ReduceReader(Task.inputs, do_sort,
            mem_sort_limit, red_params).iter()
    red_out = ReduceOutput(red_params)

    msg("Starting reduce")
    fun_init(red_in, red_params)
    fun_reduce(red_in, red_out, red_params)
    msg("Reduce done")

    red_out.close()
    external.close_ext()

    index, index_url = Task.reduce_index
    safe_update(index, {"%d %s" % (Task.id, red_out.url()): True})
    OutputURL(index_url)
Example #18
def op_map(job):
        job_input = this_inputs()
        msg("Received a new map job!")
        
        if len(job_input) != 1:
                err("Map can only handle one input. Got: %s" % 
                        " ".join(job_input))

        nr_reduces = int(job['nr_reduces'])
        nr_part = max(1, nr_reduces)
        fun_map_reader.func_code = marshal.loads(job['map_reader'])
        fun_map_writer.func_code = marshal.loads(job['map_writer'])
        fun_partition.func_code = marshal.loads(job['partition'])

        if 'map_init' in job:
                fun_init.func_code = marshal.loads(job['map_init'])
        
        if 'required_files' in job:
                write_files(marshal.loads(job['required_files']), REQ_FILES)
                sys.path.insert(0, REQ_FILES)

        req_mod = job['required_modules'].split()
        import_modules(req_mod, [fun_map_reader, fun_map_writer,
            fun_partition, fun_map, fun_combiner, fun_init])

        if 'ext_map' in job:
                if 'ext_params' in job:
                        map_params = job['ext_params']
                else:
                        map_params = "0\n"
                external.prepare(job['ext_map'], map_params, EXT_MAP)
                fun_map.func_code = external.ext_map.func_code
        else:
                map_params = cPickle.loads(job['params'])        
                fun_map.func_code = marshal.loads(job['map'])
        

        if 'combiner' in job:
                fun_combiner.func_code = marshal.loads(job['combiner'])
                partitions = [MapOutput(i, map_params, fun_combiner)\
                        for i in range(nr_part)]
        else:
                partitions = [MapOutput(i, map_params) for i in range(nr_part)]
        
        run_map(job_input[0], partitions, map_params)
        external.close_ext()
        
        for p in partitions:
                p.close()

        if nr_reduces:
                merge_partitions(partitions)
                n = os.path.basename(PART_OUTPUT % 0)
                msg("dir://%s/%s%s:%d" % (this_host(), JOB_HOME, n,
                        len(partitions) - 1), "OUT")
        else:
                res = [os.path.basename(p.fname) for p in partitions]
                index = cStringIO.StringIO("\n".join(res) + "\n")
                safe_append(index, MAP_INDEX)
                msg("dir://%s/%smap-index.txt" %\
                        (this_host(), JOB_HOME), "OUT")