Exemplo n.º 1
0
    def write_permacache(cls, fd=STDIN, out=STDOUT, num=1000):
        """Reduce the computed listings read from fd into the permacache.

        The records are ranked with cls._sorting_key, and cls.store_keys
        is supplied as the reducer's ``post`` hook.

        :param file fd: input stream holding the computed listings
        :param file out: output stream handed through to the reducer
        :param int num: maximum listing size
        """
        reducer_options = {
            "num": num,
            "post": cls.store_keys,
            "fd": fd,
            "out": out,
        }
        mr_tools.mr_reduce_max_per_key(cls._sorting_key, **reducer_options)
Exemplo n.º 2
0
    def write_permacache(cls, fd=STDIN, out=STDOUT, num=1000):
        """Store the listings computed upstream (read from fd) in permacache.

        :param file fd: input stream
        :param file out: output stream forwarded to the reducer
        :param int num: maximum listing size
        """
        # cls._sorting_key ranks the records; cls.store_keys is given to
        # the reducer as its ``post`` hook
        rank = cls._sorting_key
        store = cls.store_keys
        mr_tools.mr_reduce_max_per_key(rank, num=num, post=store, fd=fd, out=out)
Exemplo n.º 3
0
def top1k_writefiles(dirname):
    """Write the top 1k of every key to a file of its own.

    One file per key (named after the key) is created inside dirname,
    which makes restarting after a failure much easier.
    """
    def sort_key(fields):
        # rank on every field except the last, converted to float
        return map(float, fields[:-1])

    def dump_key(key, maxes):
        # one line per item: the key, then the item's fields, tab-separated
        with open(os.path.join(dirname, key), 'w') as out:
            prefix = '%s\t' % key
            for fields in maxes:
                out.write(prefix)
                out.write('\t'.join(fields))
                out.write('\n')

    mr_tools.mr_reduce_max_per_key(sort_key, num=1000, post=dump_key)
Exemplo n.º 4
0
    def reduce_listings(cls, fd=STDIN, out=STDOUT, num=1000):
        """Reducer meant for debugging.

        Works like write_permacache, except that the reduced listing is
        sent to stdout rather than stored in the permacache, so the
        final result can be inspected before it is written out.

        :param file fd: input stream
        :param file out: output stream forwarded to the reducer
        :param int num: maximum listing size
        """
        # no ``post`` hook is supplied here, unlike write_permacache
        stream_options = {"num": num, "fd": fd, "out": out}
        mr_tools.mr_reduce_max_per_key(cls._sorting_key, **stream_options)
Exemplo n.º 5
0
def top1k_writefiles(dirname):
    """Split the top 1k of each key out into a separate file per key.

    The files are created directly inside dirname and named after the
    key; having them on disk makes restarting after a failure much
    easier.
    """
    def numeric_rank(record):
        # every field but the last one, as floats
        return map(float, record[:-1])

    def write_key_file(key, maxes):
        target = os.path.join(dirname, key)
        with open(target, 'w') as handle:
            # each item becomes "<key>\t<field>\t<field>...\n"
            for record in maxes:
                handle.write('%s\t' % key)
                handle.write('\t'.join(record))
                handle.write('\n')

    mr_tools.mr_reduce_max_per_key(numeric_rank,
                                   num=1000,
                                   post=write_key_file)
Exemplo n.º 6
0
    def reduce_listings(cls, fd=STDIN, out=STDOUT, num=1000):
        """Debugging counterpart of write_permacache.

        Runs the same reduction, but the reduced version of the listing
        goes to stdout instead of to the permacache. Handy for checking
        the final result before it is written out.

        :param file fd: input stream
        :param file out: output stream forwarded to the reducer
        :param int num: maximum listing size
        """
        rank = cls._sorting_key
        mr_tools.mr_reduce_max_per_key(rank, num=num, fd=fd, out=out)
Exemplo n.º 7
0
def top1k_writefiles(dirname):
    """Divide up the top 1k of each key into its own file to make
    restarting after a failure much easier. Pairs with
    write_permacache_from_dir.

    Each key's file is placed in a subdirectory of dirname that is
    named after a prefix of the md5 hex digest of the key; missing
    subdirectories are created on demand.

    :param dirname: root directory that receives the hashed
        subdirectories and the per-key files
    """
    def hashdir(name, levels=(3,)):
        # levels lists how long each stage of the hashed dirname
        # should be. So (2, 2) would make dirs like
        # 'ab/cd/thelisting.txt' (and this function would just return
        # the string 'ab/cd', so that you have the dirname that you
        # can create before os.path.joining to the filename).
        # The default is a tuple rather than a list so that it cannot
        # be modified between calls.
        digest = md5(name).hexdigest()

        start = 0
        dirs = []
        for width in levels:
            dirs.append(digest[start:start + width])
            start += width

        return os.path.join(*dirs)

    def post(key, maxes):
        # the key's md5 hex digest (something like
        # 12345678901234567890123456789012) picks the subdirectory:
        # with hashdir's default levels that is a single directory
        # named after the first three hex characters. We may want to
        # tweak the levels as the number of listings grows.

        hd = os.path.join(dirname, hashdir(key))
        try:
            os.makedirs(hd)
        except OSError as e:
            # a directory that already exists is fine; re-raise the rest
            if e.errno != errno.EEXIST:
                raise
        filename = os.path.join(hd, key)

        # one line per item: the key, then the item's fields, tab-separated
        with open(filename, 'w') as f:
            for item in maxes:
                f.write('%s\t' % key)
                f.write('\t'.join(item))
                f.write('\n')

    # rank on every field except the last, converted to float
    mr_tools.mr_reduce_max_per_key(lambda x: map(float, x[:-1]),
                                   num=1000,
                                   post=post)
Exemplo n.º 8
0
def top1k_writefiles(dirname):
    """Divide up the top 1k of each key into its own file to make
    restarting after a failure much easier. Pairs with
    write_permacache_from_dir.

    Each key's file is placed in a subdirectory of dirname that is
    named after a prefix of the md5 hex digest of the key; missing
    subdirectories are created on demand.

    :param dirname: root directory that receives the hashed
        subdirectories and the per-key files
    """

    def hashdir(name, levels=(3,)):
        # levels lists how long each stage of the hashed dirname
        # should be. So (2, 2) would make dirs like
        # 'ab/cd/thelisting.txt' (and this function would just return
        # the string 'ab/cd', so that you have the dirname that you
        # can create before os.path.joining to the filename).
        # The default is a tuple rather than a list so that it cannot
        # be modified between calls.
        digest = md5(name).hexdigest()

        start = 0
        dirs = []
        for width in levels:
            dirs.append(digest[start : start + width])
            start += width

        return os.path.join(*dirs)

    def post(key, maxes):
        # the key's md5 hex digest (something like
        # 12345678901234567890123456789012) picks the subdirectory:
        # with hashdir's default levels that is a single directory
        # named after the first three hex characters. We may want to
        # tweak the levels as the number of listings grows.

        hd = os.path.join(dirname, hashdir(key))
        try:
            os.makedirs(hd)
        except OSError as e:
            # a directory that already exists is fine; re-raise the rest
            if e.errno != errno.EEXIST:
                raise
        filename = os.path.join(hd, key)

        # one line per item: the key, then the item's fields, tab-separated
        with open(filename, "w") as f:
            for item in maxes:
                f.write("%s\t" % key)
                f.write("\t".join(item))
                f.write("\n")

    # rank on every field except the last, converted to float
    mr_tools.mr_reduce_max_per_key(lambda x: map(float, x[:-1]), num=1000, post=post)
Exemplo n.º 9
0
def top1k_writepermacache(fd=sys.stdin):
    """Reduce the records read from fd and pass the result to store_keys.

    The reducer is asked for num=1000 entries per key, ranked on every
    field but the last (as floats); store_keys is supplied as its
    ``post`` hook.
    """
    def sort_key(fields):
        return map(float, fields[:-1])

    mr_tools.mr_reduce_max_per_key(sort_key,
                                   num=1000,
                                   post=store_keys,
                                   fd=fd)
Exemplo n.º 10
0
def write_permacache(fd=sys.stdin):
    """Reduce the listings read from fd and hand them to store_keys.

    The reducer is asked for num=1000 entries per key, ranked on every
    field but the last (as floats). store_keys is its ``post`` hook;
    going by this function's name it writes to the permacache.
    """
    def numeric_rank(record):
        return map(float, record[:-1])

    reducer_options = {'num': 1000, 'post': store_keys, 'fd': fd}
    mr_tools.mr_reduce_max_per_key(numeric_rank, **reducer_options)
Exemplo n.º 11
0
def reduce_listings(fd=sys.stdin):
    """Debugging variant of write_permacache.

    Runs the same reduction, but just sends the reduced version of the
    listing to stdout instead of to the permacache, which is handy for
    seeing the final result before it's written out.
    """
    def sort_key(fields):
        # rank on every field but the last, as floats
        return map(float, fields[:-1])

    mr_tools.mr_reduce_max_per_key(sort_key, num=1000, fd=fd)