def filter_quantile_union(df: DataFrame, quantile: int,
                          quantile_range: str = 'upper') -> Tuple[DataFrame, Series]:
    """
    Get the part of the data above or below a given quantile

    Parameters
    ----------
    df : DataFrame
        The dataframe from which the distribution should be estimated
    quantile : int
        The quartile (1, 2 or 3) to be used as the threshold
    quantile_range : str, default 'upper'
        The region to keep: 'upper' keeps values above the quantile,
        'lower' keeps values below it

    Returns
    -------
    Two-element tuple
        A tuple where the first item is the resulting filtered dataframe
        and the second is a series with the quantile threshold values
    """
    if quantile == 1:
        q = df.quantile(q=0.25)
    elif quantile == 2:
        q = df.quantile(q=0.5)
    elif quantile == 3:
        q = df.quantile(q=0.75)
    else:
        raise ValueError("Invalid quantile: expected 1, 2 or 3")

    q = q.compute()
    q2 = q.to_frame()
    q2 = q2.transpose()
    q2 = q2.reset_index()

    if quantile_range == 'upper':
        data = df.loc[(df.AnswerCount > q2.AnswerCount)
                      | (df.ViewCount > q2.ViewCount)
                      | (df.CommentCount > q2.CommentCount)
                      | (df.FavoriteCount > q2.FavoriteCount)
                      | (df.Score > q2.Score)]
    elif quantile_range == 'lower':
        data = df.loc[(df.AnswerCount < q2.AnswerCount)
                      | (df.ViewCount < q2.ViewCount)
                      | (df.CommentCount < q2.CommentCount)
                      | (df.FavoriteCount < q2.FavoriteCount)
                      | (df.Score < q2.Score)]
    else:
        raise ValueError("Invalid quantile_range: expected 'upper' or 'lower'")

    return data, q
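# Usage sketch (not from the original source): assumes `posts.csv` is a hypothetical
# dump of Stack Overflow posts containing the numeric columns referenced above
# (AnswerCount, ViewCount, CommentCount, FavoriteCount, Score), loaded as a Dask
# DataFrame.  filter_quantile_union() then keeps rows exceeding the third quartile
# in at least one of those columns.
def _example_filter_quantile_union():
    import dask.dataframe as dd

    posts = dd.read_csv("posts.csv")  # hypothetical path
    upper, thresholds = filter_quantile_union(posts, quantile=3, quantile_range="upper")
    return upper.compute(), thresholds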
def compute_and_set_divisions(df: DataFrame, **kwargs) -> DataFrame:
    mins, maxes, lens = _compute_partition_stats(df.index, allow_overlap=True, **kwargs)
    if len(mins) == len(df.divisions) - 1:
        df._divisions = tuple(mins) + (maxes[-1],)
        # Divisions are usable as-is only if no partition overlaps its predecessor
        if not any(mins[i] >= maxes[i - 1] for i in range(1, len(mins))):
            return df
    return fix_overlap(df, mins, maxes, lens)
def _agg(self, how, meta=None, fill_value=np.nan, how_args=(), how_kwargs={}):
    """Aggregate using one or more operations

    Parameters
    ----------
    how : str
        Name of aggregation operation
    fill_value : scalar, optional
        Value to use for missing values, applied during upsampling.
        Default is NaN.
    how_args : optional
        Positional arguments for aggregation operation.
    how_kwargs : optional
        Keyword arguments for aggregation operation.

    Returns
    -------
    Dask DataFrame or Series
    """
    rule = self._rule
    kwargs = self._kwargs
    name = "resample-" + tokenize(self.obj, rule, kwargs, how, *how_args, **how_kwargs)

    # Create a grouper to determine closed and label conventions
    newdivs, outdivs = _resample_bin_and_out_divs(self.obj.divisions, rule, **kwargs)

    # Repartition divs into bins. These won't match labels after mapping
    partitioned = self.obj.repartition(newdivs, force=True)

    keys = partitioned.__dask_keys__()
    dsk = {}

    args = zip(keys, outdivs, outdivs[1:], ["left"] * (len(keys) - 1) + [None])
    for i, (k, s, e, c) in enumerate(args):
        dsk[(name, i)] = (
            _resample_series,
            k,
            s,
            e,
            c,
            rule,
            kwargs,
            how,
            fill_value,
            list(how_args),
            how_kwargs,
        )

    # Infer output metadata
    meta_r = self.obj._meta_nonempty.resample(self._rule, **self._kwargs)
    meta = getattr(meta_r, how)(*how_args, **how_kwargs)

    graph = HighLevelGraph.from_collections(name, dsk, dependencies=[partitioned])
    if isinstance(meta, pd.DataFrame):
        return DataFrame(graph, name, meta, outdivs)
    return Series(graph, name, meta, outdivs)
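# Usage sketch (not from the original source): the public entry point is the Dask
# resampler, whose reduction methods (mean, sum, count, ...) funnel into
# `_agg(how=...)` above.  The data below is synthetic.
def _example_resample_agg():
    import pandas as pd
    import dask.dataframe as dd

    idx = pd.date_range("2021-01-01", periods=96, freq="15min")
    pdf = pd.DataFrame({"value": range(96)}, index=idx)
    ddf = dd.from_pandas(pdf, npartitions=4)
    return ddf.resample("1h").mean().compute()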
def _calculate_divisions(
    df: DataFrame,
    partition_col: Series,
    repartition: bool,
    npartitions: int,
    upsample: float = 1.0,
    partition_size: float = 128e6,
) -> Tuple[List, List, List]:
    """
    Utility function to calculate divisions for calls to `map_partitions`
    """
    sizes = df.map_partitions(sizeof) if repartition else []
    divisions = partition_col._repartition_quantiles(npartitions, upsample=upsample)
    mins = partition_col.map_partitions(M.min)
    maxes = partition_col.map_partitions(M.max)

    try:
        divisions, sizes, mins, maxes = compute(divisions, sizes, mins, maxes)
    except TypeError as e:
        # When there are nulls and a column is non-numeric, a TypeError is sometimes
        # raised as a result of
        # 1) computing mins/maxes above, 2) every null being switched to NaN, and
        # 3) NaN being a float.
        # Also, Pandas ExtensionDtypes may cause TypeErrors when dealing with special
        # nulls such as pd.NaT or pd.NA.
        # If this happens, we hint the user about eliminating nulls beforehand.
        if not is_numeric_dtype(partition_col.dtype):
            obj, suggested_method = (
                ("column", f"`.dropna(subset=['{partition_col.name}'])`")
                if any(partition_col._name == df[c]._name for c in df)
                else ("series", "`.loc[series[~series.isna()]]`")
            )
            raise NotImplementedError(
                f"Divisions calculation failed for non-numeric {obj} '{partition_col.name}'.\n"
                f"This is probably due to the presence of nulls, which Dask does not entirely support in the index.\n"
                f"We suggest you try with {suggested_method}."
            ) from e
        # For numeric types there shouldn't be problems with nulls, so we re-raise
        # this particular TypeError as-is
        else:
            raise e

    divisions = methods.tolist(divisions)
    if type(sizes) is not list:
        sizes = methods.tolist(sizes)
    mins = methods.tolist(mins)
    maxes = methods.tolist(maxes)

    empty_dataframe_detected = pd.isna(divisions).all()
    if repartition or empty_dataframe_detected:
        total = sum(sizes)
        npartitions = max(math.ceil(total / partition_size), 1)
        npartitions = min(npartitions, df.npartitions)
        n = len(divisions)
        try:
            divisions = np.interp(
                x=np.linspace(0, n - 1, npartitions + 1),
                xp=np.linspace(0, n - 1, n),
                fp=divisions,
            ).tolist()
        except (TypeError, ValueError):  # str type
            indexes = np.linspace(0, n - 1, npartitions + 1).astype(int)
            divisions = [divisions[i] for i in indexes]
    else:
        # Drop duplicate divisions returned by partition quantiles
        divisions = list(toolz.unique(divisions[:-1])) + [divisions[-1]]

    mins = remove_nans(mins)
    maxes = remove_nans(maxes)
    if pd.api.types.is_categorical_dtype(partition_col.dtype):
        dtype = partition_col.dtype
        mins = pd.Categorical(mins, dtype=dtype).codes.tolist()
        maxes = pd.Categorical(maxes, dtype=dtype).codes.tolist()

    return divisions, mins, maxes
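# Usage sketch (not from the original source): `_calculate_divisions` is an internal
# helper normally invoked by sort_values()/set_index(); calling it directly like this
# is only for illustration.
def _example_calculate_divisions():
    import pandas as pd
    import dask.dataframe as dd

    ddf = dd.from_pandas(pd.DataFrame({"x": range(100)}), npartitions=4)
    divisions, mins, maxes = _calculate_divisions(
        ddf, ddf["x"], repartition=False, npartitions=4
    )
    return divisions, mins, maxes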
def set_partition(
    df: DataFrame,
    index: Union[str, Series],
    divisions: Sequence,
    max_branch: int = 32,
    drop: bool = True,
    shuffle: Optional[str] = None,
    compute: Optional[bool] = None,
) -> DataFrame:
    """Group DataFrame by index

    Sets a new index and partitions data along that index according to
    divisions.  Divisions are often found by computing approximate quantiles.
    The function ``set_index`` will do both of these steps.

    Parameters
    ----------
    df: DataFrame/Series
        Data that we want to re-partition
    index: string or Series
        Column to become the new index
    divisions: list
        Values to form new divisions between partitions
    drop: bool, default True
        Whether to delete columns to be used as the new index
    shuffle: str (optional)
        Either 'disk' for an on-disk shuffle or 'tasks' to use the task
        scheduling framework.  Use 'disk' if you are on a single machine
        and 'tasks' if you are on a distributed cluster.
    max_branch: int (optional)
        If using the task-based shuffle, the amount of splitting each
        partition undergoes.  Increase this for fewer copies but more
        scheduler overhead.

    See Also
    --------
    set_index
    shuffle
    partd
    """
    meta = df._meta._constructor_sliced([0])
    if isinstance(divisions, tuple):
        # pd.isna considers tuples to be scalars. Convert to a list.
        divisions = list(divisions)

    if not isinstance(index, Series):
        dtype = df[index].dtype
    else:
        dtype = index.dtype

    if pd.isna(divisions).any() and pd.api.types.is_integer_dtype(dtype):
        # Can't construct a Series[int64] when any / all of the divisions are NaN.
        divisions = df._meta._constructor_sliced(divisions)
    elif (
        pd.api.types.is_categorical_dtype(dtype)
        and UNKNOWN_CATEGORIES in dtype.categories
    ):
        # If categories are unknown, leave as a string dtype instead.
        divisions = df._meta._constructor_sliced(divisions)
    else:
        divisions = df._meta._constructor_sliced(divisions, dtype=dtype)

    if not isinstance(index, Series):
        partitions = df[index].map_partitions(
            set_partitions_pre, divisions=divisions, meta=meta
        )
        df2 = df.assign(_partitions=partitions)
    else:
        partitions = index.map_partitions(
            set_partitions_pre, divisions=divisions, meta=meta
        )
        df2 = df.assign(_partitions=partitions, _index=index)

    df3 = rearrange_by_column(
        df2,
        "_partitions",
        max_branch=max_branch,
        npartitions=len(divisions) - 1,
        shuffle=shuffle,
        compute=compute,
        ignore_index=True,
    )

    if not isinstance(index, Series):
        df4 = df3.map_partitions(
            set_index_post_scalar,
            index_name=index,
            drop=drop,
            column_dtype=df.columns.dtype,
        )
    else:
        df4 = df3.map_partitions(
            set_index_post_series,
            index_name=index.name,
            drop=drop,
            column_dtype=df.columns.dtype,
        )

    df4.divisions = tuple(methods.tolist(divisions))

    return df4.map_partitions(M.sort_index)
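# Usage sketch (not from the original source): in practice set_index() computes
# approximate-quantile divisions and then calls set_partition(); the hand-picked
# divisions below are purely illustrative.
def _example_set_partition():
    import pandas as pd
    import dask.dataframe as dd

    pdf = pd.DataFrame({"x": [4, 1, 3, 2, 0, 5, 7, 6], "y": range(8)})
    ddf = dd.from_pandas(pdf, npartitions=2)
    return set_partition(ddf, "x", divisions=[0, 4, 7]).compute()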
def sort_values(
    df: DataFrame,
    by: Union[str, List[str]],
    npartitions: Optional[Union[int, Literal["auto"]]] = None,
    ascending: Union[bool, List[bool]] = True,
    na_position: Union[Literal["first"], Literal["last"]] = "last",
    upsample: float = 1.0,
    partition_size: float = 128e6,
    sort_function: Optional[Callable[[pd.DataFrame], pd.DataFrame]] = None,
    sort_function_kwargs: Optional[Mapping[str, Any]] = None,
    **kwargs,
) -> DataFrame:
    """See DataFrame.sort_values for docstring"""
    if na_position not in ("first", "last"):
        raise ValueError("na_position must be either 'first' or 'last'")
    if not isinstance(by, list):
        by = [by]
    if len(by) > 1 and df.npartitions > 1 or any(not isinstance(b, str) for b in by):
        raise NotImplementedError(
            "Dataframes only support sorting by named columns which must be passed as a "
            "string or a list of strings; multi-partition dataframes only support sorting "
            "by a single column.\n"
            "You passed %s" % str(by)
        )

    sort_kwargs = {
        "by": by,
        "ascending": ascending,
        "na_position": na_position,
    }
    if sort_function is None:
        sort_function = M.sort_values
    if sort_function_kwargs is not None:
        sort_kwargs.update(sort_function_kwargs)

    if df.npartitions == 1:
        return df.map_partitions(sort_function, **sort_kwargs)

    if npartitions == "auto":
        repartition = True
        npartitions = max(100, df.npartitions)
    else:
        if npartitions is None:
            npartitions = df.npartitions
        repartition = False

    sort_by_col = df[by[0]]

    divisions, mins, maxes = _calculate_divisions(
        df, sort_by_col, repartition, npartitions, upsample, partition_size
    )

    if len(divisions) == 2:
        return df.repartition(npartitions=1).map_partitions(
            sort_function, **sort_kwargs
        )

    if not isinstance(ascending, bool):
        # support [True] as input
        if (
            isinstance(ascending, list)
            and len(ascending) == 1
            and isinstance(ascending[0], bool)
        ):
            ascending = ascending[0]
        else:
            raise NotImplementedError(
                f"Dask currently only supports a single boolean for ascending. You passed {str(ascending)}"
            )

    if (
        all(not pd.isna(x) for x in divisions)
        and mins == sorted(mins, reverse=not ascending)
        and maxes == sorted(maxes, reverse=not ascending)
        and all(
            mx < mn
            for mx, mn in zip(
                maxes[:-1] if ascending else maxes[1:],
                mins[1:] if ascending else mins[:-1],
            )
        )
        and npartitions == df.npartitions
    ):
        # divisions are in the right place
        return df.map_partitions(sort_function, **sort_kwargs)

    df = rearrange_by_divisions(
        df,
        by,
        divisions,
        ascending=ascending,
        na_position=na_position,
        duplicates=False,
    )
    df = df.map_partitions(sort_function, **sort_kwargs)
    return df
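# Usage sketch (not from the original source): users normally reach this code through
# the public DataFrame.sort_values API rather than calling sort_values() here directly.
def _example_sort_values():
    import pandas as pd
    import dask.dataframe as dd

    pdf = pd.DataFrame({"a": [3, 1, 2, 5, 4], "b": list("vwxyz")})
    ddf = dd.from_pandas(pdf, npartitions=2)
    return ddf.sort_values("a").compute()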
def shuffle(
    df: DataFrame,
    column_names: List[str],
    npartitions: Optional[int] = None,
    ignore_index: bool = False,
) -> DataFrame:
    """Order divisions of DataFrame so that all values within column(s) align

    This enacts a task-based shuffle using explicit-comms. It requires a full
    dataset read, serialization and shuffle. This is expensive. If possible
    you should avoid shuffles.

    This does not preserve a meaningful index/partitioning scheme. This is not
    deterministic if done in parallel.

    Requires an active client.

    Parameters
    ----------
    df: dask.dataframe.DataFrame
        Dataframe to shuffle
    column_names: list of strings
        List of column names on which we want to split.
    npartitions: int or None
        The desired number of output partitions. If None, the number of output
        partitions equals `df.npartitions`
    ignore_index: bool
        Ignore index during shuffle. If True, performance may improve,
        but index values will not be preserved.

    Returns
    -------
    df: dask.dataframe.DataFrame
        Shuffled dataframe

    Developer Notes
    ---------------
    The implementation consists of three steps:
      (a) Extend the dask graph of `df` with a call to `shuffle_group()` for
          each dataframe partition and submit the graph.
      (b) Submit a task on each worker that shuffles (all-to-all communicates)
          the groups from (a) and returns a list of dataframe-partitions.
      (c) Submit a dask graph that extracts (using `getitem()`) individual
          dataframe-partitions from (b).
    """
    c = comms.default_comms()

    # By default we preserve the number of partitions
    if npartitions is None:
        npartitions = df.npartitions

    # Step (a): partition/group each dataframe-partition
    name = (
        "explicit-comms-shuffle-group-"
        f"{tokenize(df, column_names, npartitions, ignore_index)}"
    )
    df = df.persist()  # Making sure optimizations are applied to the existing graph
    dsk = dict(df.__dask_graph__())
    output_keys = []
    for input_key in df.__dask_keys__():
        output_key = (name, input_key[1])
        dsk[output_key] = (
            shuffle_group,
            input_key,
            column_names,
            0,
            npartitions,
            npartitions,
            ignore_index,
            npartitions,
        )
        output_keys.append(output_key)

    # Compute `df_groups`, which is a list of futures, one future per partition in `df`.
    # Each future points to a dict of length `df.npartitions` that maps each
    # partition-id to a DataFrame.
    df_groups = compute_as_if_collection(type(df), dsk, output_keys, sync=False)
    wait(df_groups)
    for f in df_groups:  # Check for errors
        if f.status == "error":
            f.result()  # raise exception

    # Step (b): find out which workers have what part of `df_groups`,
    # find the number of output partitions each worker should have,
    # and submit `local_shuffle()` on each worker.
    key_to_part = {str(part.key): part for part in df_groups}
    in_parts = defaultdict(list)  # Map worker -> [list of futures]
    for key, workers in c.client.who_has(df_groups).items():
        # Note, if multiple workers have the part, we pick the first worker
        in_parts[first(workers)].append(key_to_part[key])

    # Let's create a dict that specifies the number of partitions each worker has
    in_nparts = {}
    workers = set()  # All ranks that have a partition of `df`
    for rank, worker in enumerate(c.worker_addresses):
        nparts = len(in_parts.get(worker, ()))
        if nparts > 0:
            in_nparts[rank] = nparts
            workers.add(rank)
    workers_sorted = sorted(workers)

    # Find the output partitions for each worker
    div = npartitions // len(workers)
    rank_to_out_part_ids = {}  # rank -> [list of partition id]
    for i, rank in enumerate(workers_sorted):
        rank_to_out_part_ids[rank] = list(range(div * i, div * (i + 1)))
    for rank, i in zip(workers_sorted, range(div * len(workers), npartitions)):
        rank_to_out_part_ids[rank].append(i)

    # Run `local_shuffle()` on each worker
    result_futures = {}
    for rank, worker in enumerate(c.worker_addresses):
        if rank in workers:
            result_futures[rank] = c.submit(
                worker,
                local_shuffle,
                in_nparts,
                in_parts[worker],
                rank_to_out_part_ids,
                ignore_index,
            )
    distributed.wait(list(result_futures.values()))
    del df_groups

    # Step (c): extract individual dataframe-partitions
    name = f"explicit-comms-shuffle-getitem-{tokenize(name)}"
    dsk = {}
    meta = None
    for rank, parts in rank_to_out_part_ids.items():
        for i, part_id in enumerate(parts):
            dsk[(name, part_id)] = (getitem, result_futures[rank], i)
            if meta is None:
                # Get the meta from the first output partition
                meta = delayed(make_meta)(
                    delayed(getitem)(result_futures[rank], i)
                ).compute()
    assert meta is not None

    divs = [None] * (len(dsk) + 1)
    return new_dd_object(dsk, name, meta, divs).persist()
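# Usage sketch (not from the original source): the explicit-comms shuffle needs a
# running distributed cluster whose comms have been initialized (e.g. via dask_cuda's
# LocalCUDACluster).  Import paths and cluster setup vary between dask_cuda versions,
# so treat this as illustrative only.
def _example_explicit_comms_shuffle():
    import pandas as pd
    import dask.dataframe as dd
    from dask_cuda import LocalCUDACluster
    from distributed import Client

    with LocalCUDACluster(n_workers=2) as cluster, Client(cluster):
        pdf = pd.DataFrame({"key": [1, 2, 1, 3, 2, 3], "payload": range(6)})
        ddf = dd.from_pandas(pdf, npartitions=3)
        # `shuffle` refers to the explicit-comms function defined above
        return shuffle(ddf, ["key"]).compute()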
def from_bcolz(x, chunksize=None, categorize=True, index=None, lock=lock, **kwargs):
    """Read BColz CTable into a Dask Dataframe

    BColz is a fast on-disk compressed column store with careful attention
    given to compression.  https://bcolz.readthedocs.io/en/latest/

    Parameters
    ----------
    x : bcolz.ctable
    chunksize : int, optional
        The size (in rows) of blocks to pull out from ctable.
    categorize : bool, defaults to True
        Automatically categorize all string dtypes
    index : string, optional
        Column to make the index
    lock: bool or Lock
        Lock to use when reading, or False for no lock (not thread-safe)

    See Also
    --------
    from_array: more generic function not optimized for bcolz
    """
    if lock is True:
        lock = Lock()

    import bcolz

    import dask.array as da

    if isinstance(x, str):
        x = bcolz.ctable(rootdir=x)
    bc_chunklen = max(x[name].chunklen for name in x.names)
    if chunksize is None and bc_chunklen > 10000:
        chunksize = bc_chunklen

    categories = dict()
    if categorize:
        for name in x.names:
            if (
                np.issubdtype(x.dtype[name], np.string_)
                or np.issubdtype(x.dtype[name], np.unicode_)
                or np.issubdtype(x.dtype[name], np.object_)
            ):
                a = da.from_array(x[name], chunks=(chunksize * len(x.names),))
                categories[name] = da.unique(a).compute()

    columns = tuple(x.dtype.names)
    divisions = tuple(range(0, len(x), chunksize))
    divisions = divisions + (len(x) - 1,)
    if x.rootdir:
        token = tokenize(
            (x.rootdir, os.path.getmtime(x.rootdir)),
            chunksize,
            categorize,
            index,
            kwargs,
        )
    else:
        token = tokenize(
            (id(x), x.shape, x.dtype), chunksize, categorize, index, kwargs
        )
    new_name = "from_bcolz-" + token

    dsk = {
        (new_name, i): (
            dataframe_from_ctable,
            x,
            (slice(i * chunksize, (i + 1) * chunksize),),
            columns,
            categories,
            lock,
        )
        for i in range(0, int(ceil(len(x) / chunksize)))
    }

    meta = dataframe_from_ctable(x, slice(0, 0), columns, categories, lock)
    result = DataFrame(dsk, new_name, meta, divisions)

    if index:
        assert index in x.names
        a = da.from_array(x[index], chunks=(chunksize * len(x.names),))
        q = np.linspace(0, 100, len(x) // chunksize + 2)
        divisions = tuple(da.percentile(a, q).compute())
        return set_partition(result, index, divisions, **kwargs)
    else:
        return result
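# Usage sketch (not from the original source): bcolz is an optional dependency that is
# no longer actively maintained, so this is illustrative only.  An in-memory ctable is
# built from NumPy arrays and read back through from_bcolz().
def _example_from_bcolz():
    import bcolz
    import numpy as np

    ct = bcolz.ctable(
        columns=[np.arange(1000), np.random.rand(1000)],
        names=["id", "value"],
    )
    return from_bcolz(ct, chunksize=250).compute()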
import csv

import pandas as pd

try:
    # Write the word-count dict out as key,value rows and read it back to verify
    with open('ws1.csv', 'w', encoding="utf-8", newline="") as fp:
        csv.writer(fp).writerows(word_dict.items())
    with open('ws2.csv', 'w', encoding="utf-8", newline="") as fp:
        csv.writer(fp).writerows(word_dict.items())
    print(pd.read_csv('ws1.csv'))
    print(pd.read_csv('ws2.csv'))
except Exception as e:
    print(e)

from pandas import Series, DataFrame

li_data2 = Series(word_dict)
print(li_data2)
print(li_data2.value_counts()[:5])

import matplotlib.pyplot as plt

plt.rc("font", family="malgun gothic")
plt.plot(li_data2.value_counts()[:5])
plt.xlabel("count type")            # originally "횟수 종류"
plt.ylabel("occurrences by type")   # originally "종류별 발생수"
plt.legend(["count"])               # originally "횟수"
plt.show()

print('!!!!!!!!!!!!!!!!!!!!!!!!!!')
df = DataFrame(wordlist)
print(df)