Пример #1
0
def decode_data(data, encoding_guess, can_be_binary=True):
    """Attempt to turn raw string data into unicode text.

    Returns an ``(is_text, data)`` tuple. When ``is_text`` is True, ``data``
    is the decoded unicode; when it is False, ``data`` is handed back
    untouched.

    ``encoding_guess`` is tried first. Only if that fails with a
    ``UnicodeDecodeError`` is chardet's ``UniversalDetector`` consulted.

    Pass ``can_be_binary=False`` to skip the up-front binary check.
    """
    # Only the first 1024 items of data are inspected by the binary check.
    if can_be_binary and is_binary_string(data[:1024]):
        return False, data

    try:
        return True, data.decode(encoding_guess)
    except UnicodeDecodeError:
        # Fall through to chardet. It is slow, which is why it is a last
        # resort rather than the first thing we try.
        pass

    sniffer = UniversalDetector()
    for piece in ichunks(80, data):
        sniffer.feed(piece)
        if sniffer.done:
            # The detector has made up its mind; no need to feed the rest.
            break
    sniffer.close()

    detected = sniffer.result['encoding']
    if not detected:
        return False, data

    try:
        return True, data.decode(detected)
    except (UnicodeDecodeError, LookupError):
        # Either the bytes do not decode with the detected encoding, or
        # chardet named an encoding Python does not know. Leave data as str.
        return False, data
Пример #2
0
Файл: mime.py Проект: vck/dxr
def decode_data(data, encoding_guess, can_be_binary=True):
    """Decode ``data`` to unicode when it appears to be text.

    The result is an ``(is_text, data)`` tuple: ``(True, <unicode>)`` when an
    encoding was found that decodes the input, otherwise ``(False, data)``
    with the input returned unchanged.

    Set ``can_be_binary`` to False to bypass the initial binary check.
    """
    looks_binary = can_be_binary and is_binary_string(data[:1024])
    if looks_binary:
        return False, data

    try:
        text = data.decode(encoding_guess)
    except UnicodeDecodeError:
        # The default guess failed, so ask chardet. chardet is really slow,
        # hence it is used only as a fallback.
        guesser = UniversalDetector()
        for block in ichunks(80, data):
            guesser.feed(block)
            if guesser.done:
                break
        guesser.close()

        charset = guesser.result['encoding']
        if charset:
            try:
                return True, data.decode(charset)
            except (UnicodeDecodeError, LookupError):
                # Undecodable after all, or chardet reported an encoding
                # name that Python does not recognize.
                pass
        return False, data
    else:
        return True, text
Пример #3
0
def mapper(text='gatsby'):
    """Split the input text file into small tasks and dispatch them.

    Reads ``map_reduce2-<text>.txt`` from this module's directory in batches
    of 50 lines. Each batch is decoded from UTF-8, joined with single spaces
    and wrapped in an ``m.s(...)`` signature. The signatures are then handed
    to ``chord`` with ``g.s() | r.s()`` as the follow-up, and the result of
    calling ``.delay()`` on that chord is returned.
    """
    import os
    here = os.path.dirname(__file__)
    fname = os.path.join(here, 'map_reduce2-{text}.txt'.format(text=text))
    with open(fname, 'r') as f:
        tasks = [
            m.s(' '.join(line.decode('utf-8') for line in batch))
            for batch in ichunks(50, f)
        ]
    callback = g.s() | r.s()
    return chord(tasks, callback).delay()
Пример #4
0
def mergesort(filename, output=None, key=None, maxitems=1e6, progress=True):
    """Given an input file sort it by performing a merge sort on disk.

    :param filename: Either a filename as a ``str`` or a ``py._path.local.LocalPath`` instance.
    :type filename:  ``str`` or ``py._path.local.LocalPath``

    :param output: An optional output filename as a ``str`` or a ``py._path.local.LocalPath`` instance.
                   When omitted, the input file is overwritten with the sorted data.
    :type output:  ``str`` or ``py._path.local.LocalPath`` or ``None``

    :param key: An optional key to sort the data on.
    :type key:  ``function`` or ``None``

    :param maxitems: Maximum number of items to hold in memory at a time.
    :type maxitems:  ``int``

    :param progress: Whether or not to display a progress bar
    :type progress: ``bool``

    This uses ``py._path.local.LocalPath.make_numbered_dir`` to create temporary scratch space to work
    with when splitting the input file into sorted chunks. The mergesort is processed iteratively in-memory
    using the ``~merge`` function which is almost identical to ``~heapq.merge`` but adds in the support of
    an optional key function.
    """

    p = filename if isinstance(filename, LocalPath) else LocalPath(filename)
    if output is None:
        output = p
    elif not isinstance(output, LocalPath):
        # The documented contract accepts a plain ``str``; coerce it so the
        # ``output.open`` call below works for both kinds of argument.
        output = LocalPath(output)
    key = key if key is not None else lambda x: x

    scratch = LocalPath.make_numbered_dir(prefix="mergesort-")

    # Count lines inside a ``with`` so the handle is closed deterministically
    # instead of being left for the garbage collector.
    with p.open("r") as f:
        nlines = sum(1 for line in f)

    # Compute a reasonable chunksize < maxitems by repeated halving, starting
    # from half of the input. The size is never allowed to drop below 1, so
    # empty/one-line inputs and tiny ``maxitems`` values neither divide by
    # zero below nor loop forever here.
    chunksize = nlines // 2
    while chunksize >= maxitems and chunksize > 1:
        chunksize //= 2
    chunksize = max(1, chunksize)

    # Split the file up into n sorted files
    if progress:
        # Ceiling division: a trailing partial chunk still counts as a step.
        nchunks = (nlines + chunksize - 1) // chunksize
        bar = ProgressBar("Split/Sorting Data", max=nchunks)
    for i, items in enumerate(ichunks(chunksize, jsonstream(p))):
        with scratch.ensure("{0:d}.json".format(i)).open("w") as f:
            f.write("\n".join(map(dumps, sorted(items, key=key))))
        if progress:
            bar.next()
    if progress:
        bar.finish()

    # Merge the sorted scratch files back together into the output file.
    # NOTE(review): ``key`` is not forwarded to ``merge`` here, exactly as in
    # the original code -- confirm whether that is intentional.
    q = scratch.listdir("*.json")
    with output.open("w") as f:
        if progress:
            bar = ProgressBar("Merge/Sorting Data", max=nlines)
        for item in merge(*imap(jsonstream, q)):
            f.write("{0:s}\n".format(dumps(item)))
            if progress:
                bar.next()
        if progress:
            bar.finish()
Пример #5
0
 def path_chunks(tree):
     """Return an iterable of worker-sized iterables of paths.

     The paths come from ``unignored`` applied to the tree's source folder,
     ignore paths and ignore filenames; they are grouped 500 at a time.
     """
     paths = unignored(tree.source_folder,
                       tree.ignore_paths,
                       tree.ignore_filenames)
     return ichunks(500, paths)
Пример #6
0
Файл: build.py Проект: vck/dxr
 def path_chunks(tree):
     """Batch the tree's unignored paths into worker-sized groups.

     Returns whatever ``ichunks`` produces for a batch size of 500 over the
     result of ``unignored(source_folder, ignore_paths, ignore_filenames)``.
     """
     batch_size = 500
     candidates = unignored(
         tree.source_folder, tree.ignore_paths, tree.ignore_filenames)
     return ichunks(batch_size, candidates)
Пример #7
0
def render_recent_collections_block(limit=10):
    """Render the block of most recent collections for the index page.

    Returns a dict with ``limit``, ``qs`` (the ``limit`` newest collections
    by descending id) and ``videos_chunks`` (``qs`` grouped in pairs via
    ``ichunks``). The key names match the local variable names the original
    exposed through ``locals()``.
    """
    newest_first = Collection.objects.all().order_by('-id')
    recent = newest_first[:limit]
    return {
        'limit': limit,
        'qs': recent,
        'videos_chunks': ichunks(2, recent),
    }
Пример #8
0
def render_recent_galleries_block(limit=10):
    """Render the block of most recent galleries for the index page.

    The returned dict carries ``limit``, ``qs`` (galleries ordered by
    descending id, cut to ``limit``) and ``galleries_chunks`` (``qs`` split
    into groups of two by ``ichunks``). These are the same names the
    original returned via ``locals()``.
    """
    latest = Gallery.objects.all().order_by('-id')[:limit]
    paired = ichunks(2, latest)
    return dict(limit=limit, qs=latest, galleries_chunks=paired)