def clone(self, target=_UNSET, via_column=_UNSET, name=_UNSET, bounds=_UNSET, **coax_kwargs):
    # Copy the stored coax kwargs so cloning never mutates this instance.
    kwargs = dict(self.coax_kwargs)
    kwargs.update(coax_kwargs)
    return How(
        target=pdutils.coalesce(target, self.target, unset=_UNSET),
        via_column=pdutils.coalesce(via_column, self.via_column, unset=_UNSET),
        name=pdutils.coalesce(name, self._name, unset=_UNSET),
        bounds=pdutils.coalesce(bounds, self.bounds, unset=_UNSET),
        **kwargs,
    )
def __repr__(self):
    # NOTE: Do not change this to use any values that require additional arrays to be fetched!
    num_chunks = pdutils.coalesce(self._matched_num_chunks, '?')
    num_rows = pdutils.coalesce(self._matched_num_rows, '?')
    csv_bytes = pdutils.coalesce(self._matched_csv_bytes, '?')
    gz_bytes = pdutils.coalesce(self._matched_gz_bytes, '?')
    parts = [
        'chunks=%s' % (num_chunks,),
        'rows=%s' % (strutils.format_number(num_rows, units=strutils.NUMBER_UNITS),),
        'csv_bytes=%s' % (strutils.format_number(csv_bytes, units=strutils.SI_UNITS),),
        'gz_bytes=%s' % (strutils.format_number(gz_bytes, units=strutils.SI_UNITS),),
    ]
    return '<%s %r %s>' % (self.__class__.__name__, str(self.layout), ' '.join(x for x in parts if x))
def weighted_tqdm(items, weights=None, default_weight=0, total=None, **kwargs):
    weights = pdutils.coalesce(weights, {})
    total = total if not pdutils.is_empty(total) else (
        sum(weights.values()) if weights else None)
    if not weights and not default_weight:
        yield from items
    else:
        with tqdm.tqdm(items, total=total, **kwargs) as tq:
            for item in items:
                yield item
                weight = weights.get(item, default_weight)
                tq.update(weight)
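# Usage sketch (hypothetical paths and weights; `process` is a stand-in callable):
#
#   weights = {'small.csv': 10, 'big.csv': 90}
#   for path in weighted_tqdm(['small.csv', 'big.csv'], weights=weights, desc='processing'):
#       process(path)
#
# The bar advances by each item's weight rather than by 1, so progress reflects work done
# (e.g. bytes) instead of item count.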
def allocate(csv_path_size, num_groups):
    def insert_chunk(allocation, chunk):
        # The smallest group is in groups[0].
        group = allocation.groups.pop(0)
        group_size = allocation.group_size.pop(0)
        # Insert chunk into the smallest group in the correct sort order (smallest-largest chunks).
        chunk_index = bisect.bisect_left(group.chunk_size, chunk.total_size)
        group.chunks.insert(chunk_index, chunk)
        group.chunk_size.insert(chunk_index, chunk.total_size)
        # Reinsert group back into the allocation in the correct sort order (smallest-largest groups).
        group_size = group_size + chunk.total_size
        group_index = bisect.bisect_left(allocation.group_size, group_size)
        allocation.groups.insert(group_index, group)
        allocation.group_size.insert(group_index, group_size)

    def optimize(allocation, threshold_percent=0.1):
        while True:
            diff_size = (allocation.group_size[-1] - allocation.group_size[0])
            diff_percent = diff_size / (allocation.group_size[0] or diff_size)
            if diff_percent <= threshold_percent:
                break
            # The largest group is in groups[-1].
            largest_group = allocation.groups.pop(-1)
            largest_group_size = allocation.group_size.pop(-1)
            # Remove the largest chunk from the largest group.
            largest_chunk = largest_group.chunks.pop(-1)
            _ = largest_group.chunk_size.pop(-1)
            # Re-insert the (formerly) largest group back where it belongs.
            # TODO(rob): insert_chunk pointlessly pops and re-inserts this again immediately.
            largest_group_size = largest_group_size - largest_chunk.total_size
            group_index = bisect.bisect_left(allocation.group_size, largest_group_size)
            allocation.groups.insert(group_index, largest_group)
            allocation.group_size.insert(group_index, largest_group_size)
            # Cut the largest chunk into 2, and re-insert both halves.
            head_chunk, bomb_chunk = largest_chunk.split()
            insert_chunk(allocation=allocation, chunk=head_chunk)
            insert_chunk(allocation=allocation, chunk=bomb_chunk)

    # Create initial allocation of whole csv_path to groups.
    num_groups = pdutils.coalesce(num_groups, NUM_CPUS)
    allocation = Allocation(num_groups=num_groups, num_files=0)
    for csv_path, csv_size in csv_path_size:
        allocation.num_files += 1
        csv_range = (0, csv_size)
        chunk = Chunk(path=csv_path, size_range=csv_range, total_size=csv_size)
        insert_chunk(allocation=allocation, chunk=chunk)
    # Optimize groups by halving the largest chunk from the largest group.
    optimize(allocation=allocation)
    return allocation
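# Usage sketch (hypothetical sizes; relies on the Chunk/Allocation types used above):
#
#   allocation = allocate(
#       csv_path_size=[('a.csv', 1000), ('b.csv', 250), ('c.csv', 250)], num_groups=2)
#
# Expect two groups of roughly equal total size; the large 'a.csv' may be split into
# smaller chunks by optimize() until the largest and smallest groups differ by <= ~10%.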
def how_snug_labeler(how, layouts):
    if how.target.labeler is not None:
        # How.target has a labeler; may be LabelColumnTarget or GTypeLabelerTarget.
        gtype = how.target.gtype
        snug_step = how.target.labeler.step
        snug_head = how.target.labeler.head
    elif layouts:
        # Extract the minimum value of step for this how across all layouts.
        layout_gtypes = set()
        layout_steps = set()
        layout_heads = set()
        for layout in layouts:
            # Bind how to layout if possible; ignore this how if it can't be bound to layout (DROP).
            # This will legitimately happen with the 2 layouts: ('lat_lng', 'year_month').
            bound_how = how.bind_layout(layout=layout, errors_missing=lib_errors.DROP)
            if bound_how is not None:
                layout_gtypes.add(bound_how.target.gtype)
                layout_steps.add(bound_how.target.labeler.step)
                layout_heads.add(bound_how.target.labeler.head)
        if len(layout_gtypes) > 1:
            # Since a GTypeLabeler can label exactly one gtype, crash if we found multiple.
            raise lib_errors.MultipleLabelColumnsError(
                'found %d gtype matches in layouts: %r' % (len(layout_gtypes), how))
        if len(layout_gtypes) == 0:
            # A spurious how clause (no matches with any layouts) also cannot create a labeler.
            raise lib_errors.NoLabelColumnsError('found 0 gtype matches in layouts: %r' % (how,))
        # We were provided with at least one layout; use min(step/head) for their labeler.
        gtype = more_itertools.one(layout_gtypes)
        snug_step = min(layout_steps)
        snug_head = min(layout_heads)
    else:
        # With no provided layouts, how.target *must* have a gtype or we can't create a labeler.
        gtype = how.target.gtype
        snug_step = gtype.SANE_STEP
        snug_head = pdutils.coalesce(gtype.HEAD, 0)
    return gtype.labeler(step=snug_step, head=snug_head)
def balance(csv_paths, out_dir, num_groups=None, max_workers=None, relative_to=None,
            symlink_ok=True, csv_path_weights=None, tqdm_desc='splitting'):
    """Computes an allocation and materializes it by splitting large files and symlinking others.

    This function creates roughly equal-sized directories of csvs inside `out_dir`, in preparation
    for processing them in parallel. Confusingly, the `num_groups` parameter specifies the number
    of directories to allocate csvs into, while the `max_workers` parameter specifies the number
    of workers to use while materializing the allocation.

    csv_paths: iterable of csv file paths
    out_dir: non-existent directory to write new files into
    num_groups: number of groups to allocate files into; default max_workers
    max_workers: max number of parallel processes to run; default NUM_CPUS
    relative_to: drop this prefix when writing files into out_dir; default csv_paths common prefix
    symlink_ok: if True, symlink non-split files into out_dir; if False, copy files
    csv_path_weights {path: int}: optional mapping of path to allocation weight; default csv bytes
    """
    max_workers = pdutils.coalesce(max_workers, NUM_CPUS)
    num_groups = pdutils.coalesce(num_groups, max_workers)
    # Weigh each csv file (by bytes unless csv_path_weights is given), and allocate them into num_groups.
    csv_paths = tuple(csv_paths)
    if csv_path_weights is None:
        csv_path_weights = dict(csv_path_bytes(csv_paths=csv_paths))
    logging.info('allocating csvs into %s groups...' % num_groups)
    allocation = allocate(csv_path_size=tuple(csv_path_weights.items()), num_groups=num_groups)
    logging.info(allocation)
    # Perform all required splits into their split_num directory.
    relative_to = relative_to or os.path.commonpath(csv_paths)
    file_splits = allocation.file_splits()
    job_kw = {
        'out_dir': str(out_dir),
        'relative_to': str(relative_to),
        'num_groups': num_groups,
        'symlink_ok': symlink_ok,
    }
    jobs = ((_write_groups, dict(path=str(path), file_splits=fs, **job_kw))
            for path, fs in file_splits.items())
    with tqdm.tqdm(desc=tqdm_desc, total=sum(csv_path_weights.values()),
                   mininterval=1, maxinterval=1) as tq:
        with futures.ProcessPoolExecutor(max_workers=max_workers) as pool:
            pending = set(
                pool.submit(fn, **kw) for fn, kw in itertools.islice(jobs, max_workers * 2))
            while pending:
                done, pending = futures.wait(pending, return_when=futures.FIRST_COMPLETED)
                for job in done:
                    finished_path = pathlib.Path(job.result())
                    tq.update(csv_path_weights[finished_path])
                for fn, kw in itertools.islice(jobs, len(done)):
                    pending.add(pool.submit(fn, **kw))
    return allocation
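# Usage sketch (hypothetical paths; out_dir must not already exist):
#
#   allocation = balance(
#       csv_paths=pathlib.Path('data').glob('*.csv'),
#       out_dir='data_balanced',
#       num_groups=8,
#       max_workers=8)
#
# Per the docstring, data_balanced/ should end up holding 8 roughly equal-sized groups of
# symlinked and split csvs, ready for parallel processing.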
def clone(self, target=_UNSET, head_bombs=_UNSET):
    return SequenceFilter(
        target=pdutils.coalesce(target, self.target, unset=_UNSET),
        head_bombs=pdutils.coalesce(head_bombs, self.head_bombs, unset=_UNSET),
    )
def labeler(cls, step=None, head=None, depth=None):
    if ((depth is not None and step is not None)
            or (depth is None and step is None)):
        raise ValueError('must provide exactly one of depth, step/head: depth=%r, step=%r' % (depth, step))
    step = pdutils.coalesce(step, (depth is not None) and cls.depth_step(depth=depth))
    head = pdutils.coalesce(head, cls.HEAD)
    return super().labeler(step=step, head=head)
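# Usage sketch (hypothetical gtype class; exactly one of depth or step may be given):
#
#   SomeGType.labeler(depth=3)            # step derived via depth_step(depth=3)
#   SomeGType.labeler(step=0.5, head=0)   # explicit step/head
#   SomeGType.labeler()                   # raises ValueError (neither given)
#   SomeGType.labeler(depth=3, step=0.5)  # raises ValueError (both given)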
def depth_step(cls, depth, head=None, bomb=None):
    head = pdutils.coalesce(head, cls.HEAD)
    bomb = pdutils.coalesce(bomb, cls.BOMB, cls.TAIL)
    return (bomb - head) / 2**depth
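# Worked example (hypothetical bounds): with head=0 and bomb=512, each extra depth level
# halves the step: depth_step(depth=0) == 512, depth_step(depth=1) == 256,
# depth_step(depth=3) == 64.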
def bucket(cls, val, step, head=None):
    head = pdutils.coalesce(head, cls.HEAD, 0)
    if is_scalar(val):
        return cls._bucket_scalar(val, step=step, head=head)
    return cls._bucket_pd(val, step=step, head=head)
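# Usage sketch (hypothetical gtype; presumably assigns values to step-sized bins starting at head):
#
#   SomeGType.bucket(17, step=5)                  # scalar path via _bucket_scalar
#   SomeGType.bucket(pd.Series([3, 17]), step=5)  # vectorized path via _bucket_pd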