def decode_data(data, encoding_guess, can_be_binary=True):
    """Given string data, return an (is_text, data) tuple, where data is
    returned as unicode if we think it's text and were able to determine an
    encoding for it.

    If can_be_binary is False, then skip the initial is_binary check.
    """
    if can_be_binary and is_binary_string(data[:1024]):
        # Looks like binary; don't attempt any decoding.
        return False, data

    # First, try the caller's suggested encoding.
    try:
        return True, data.decode(encoding_guess)
    except UnicodeDecodeError:
        pass

    # Fall back to chardet - chardet is really slow, which is why we
    # don't just do chardet from the start.
    detector = UniversalDetector()
    for piece in ichunks(80, data):
        detector.feed(piece)
        if detector.done:
            break
    detector.close()

    guessed = detector.result['encoding']
    if guessed:
        try:
            return True, data.decode(guessed)
        except (UnicodeDecodeError, LookupError):
            # Either we couldn't decode or chardet gave us an encoding
            # that python doesn't recognize (yes, it can do that).
            pass

    # Leave data as str.
    return False, data
def mapper(text='gatsby'):
    """Splits a problem into a number of small tasks"""
    import os
    # Input file lives next to this module.
    fname = os.path.join(
        os.path.dirname(__file__),
        'map_reduce2-{text}.txt'.format(text=text))
    # Build one subtask per 50-line chunk of the input file.
    with open(fname, 'r') as source:
        subtasks = [
            m.s(' '.join(line.decode('utf-8') for line in chunk))
            for chunk in ichunks(50, source)
        ]
    # Fan out the subtasks and chain the reduce steps as the chord callback.
    return chord(subtasks, g.s() | r.s()).delay()
def mergesort(filename, output=None, key=None, maxitems=1e6, progress=True):
    """Given an input file sort it by performing a merge sort on disk.

    :param filename: Either a filename as a ``str`` or a
                     ``py._path.local.LocalPath`` instance.
    :type filename:  ``str`` or ``py._path.local.LocalPath``

    :param output: An optional output filename as a ``str`` or a
                   ``py._path.local.LocalPath`` instance.
    :type output:  ``str`` or ``py._path.local.LocalPath`` or ``None``

    :param key: An optional key to sort the data on.
    :type key:  ``function`` or ``None``

    :param maxitems: Maximum number of items to hold in memory at a time.
    :type maxitems:  ``int``

    :param progress: Whether or not to display a progress bar
    :type progress:  ``bool``

    This uses ``py._path.local.LocalPath.make_numbered_dir`` to create
    temporary scratch space to work with when splitting the input file into
    sorted chunks. The mergesort is processed iteratively in-memory using the
    ``~merge`` function which is almost identical to ``~heapq.merge`` but
    adds in the support of an optional key function.
    """
    p = filename if isinstance(filename, LocalPath) else LocalPath(filename)
    output = p if output is None else output
    key = key if key is not None else lambda x: x

    scratch = LocalPath.make_numbered_dir(prefix="mergesort-")

    nlines = sum(1 for line in p.open("r"))

    # Compute a reasonable chunksize < maxitems
    chunksize = first(ifilter(lambda x: x < maxitems,
                              imap(lambda x: nlines / (2 ** x), count(1))))
    # For tiny inputs (0 or 1 lines) the halving sequence immediately hits 0,
    # which previously caused a ZeroDivisionError below; clamp to at least 1.
    chunksize = max(chunksize, 1)

    # Split the file up into n sorted files
    if progress:
        bar = ProgressBar("Split/Sorting Data", max=(nlines / chunksize))
    for i, items in enumerate(ichunks(chunksize, jsonstream(p))):
        with scratch.ensure("{0:d}.json".format(i)).open("w") as f:
            f.write("\n".join(map(dumps, sorted(items, key=key))))
        if progress:
            bar.next()
    if progress:
        bar.finish()

    q = scratch.listdir("*.json")

    with output.open("w") as f:
        if progress:
            bar = ProgressBar("Merge/Sorting Data", max=nlines)
        # BUG FIX: the chunks above are sorted with ``key`` but were
        # previously merged WITHOUT it, so the final output was not globally
        # sorted whenever a custom key was supplied. ``merge`` supports an
        # optional key (see docstring), so pass it through.
        for item in merge(*imap(jsonstream, q), key=key):
            f.write("{0:s}\n".format(dumps(item)))
            if progress:
                bar.next()
    if progress:
        bar.finish()
def path_chunks(tree):
    """Return an iterable of worker-sized iterables of paths."""
    # All paths in the tree that survive the ignore filters.
    paths = unignored(tree.source_folder,
                      tree.ignore_paths,
                      tree.ignore_filenames)
    return ichunks(500, paths)
def path_chunks(tree):
    """Return an iterable of worker-sized iterables of paths."""
    chunk_size = 500  # number of paths handed to each worker
    return ichunks(
        chunk_size,
        unignored(tree.source_folder, tree.ignore_paths, tree.ignore_filenames))
def render_recent_collections_block(limit=10):
    """
    Renders last block with last collections for index page.

    :param limit: maximum number of collections to include.
    :returns: template context dict with ``limit``, ``qs`` and
              ``videos_chunks`` (the collections paired up for layout).
    """
    qs = Collection.objects.all().order_by('-id')[:limit]
    videos_chunks = ichunks(2, qs)
    # Explicit context instead of ``return locals()``: same keys/values, but
    # renaming a local can no longer silently change the template context.
    return {
        'limit': limit,
        'qs': qs,
        'videos_chunks': videos_chunks,
    }
def render_recent_galleries_block(limit=10):
    """
    Renders last galleries block for index page.

    :param limit: maximum number of galleries to include.
    :returns: template context dict with ``limit``, ``qs`` and
              ``galleries_chunks`` (the galleries paired up for layout).
    """
    qs = Gallery.objects.all().order_by('-id')[:limit]
    galleries_chunks = ichunks(2, qs)
    # Explicit context instead of ``return locals()``: same keys/values, but
    # renaming a local can no longer silently change the template context.
    return {
        'limit': limit,
        'qs': qs,
        'galleries_chunks': galleries_chunks,
    }