def AUC(self, x, y, z):
    # x : observed parameter (boolean outcome indicator)
    # y : modelled parameter (score to be ranked)
    # z : whether the variance of the AUC should also be calculated [True or False]

    ### Calculate parameters ###
    N1 = x.sum()
    N2 = len(x) - N1
    R = y.rank()
    R1 = R.loc[x == True]
    R2 = R.loc[x == False]

    ### Calculate Area Under the Curve ###
    U = N1 * N2 + N1 * (N1 + 1) / 2 - R1.sum()
    AUC = U / (N1 * N2)
    # AUC = u.ab.sum() / (N1 * N2)  # not used (alternative calculation)

    ### Variance of the AUC [optional; only practical for small samples] ###
    s2 = []
    if z:
        def aggregate(t1, t2):
            return t1.apply(lambda a: sum((a == t2) * 0.5 + (a < t2) * 1))

        Ua = dd.map_partitions(aggregate, dd.from_pandas(R1, npartitions=4), R2).compute(scheduler='processes')
        Ub = dd.map_partitions(aggregate, dd.from_pandas(R2, npartitions=4), R1).compute(scheduler='processes')
        V10 = Ua / N2
        V01 = Ub / N1
        s2 = np.var(V10, ddof=1) / N1 + np.var(V01, ddof=1) / N2

    return AUC, s2

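# Hedged usage sketch (not from the source): assumes the AUC method above is
# defined on some class, here hypothetically called `Model`, and that pandas,
# numpy and dask.dataframe are already imported in that module.
import pandas as pd

observed = pd.Series([True, False, True, False, False])  # x: observed outcomes
scores = pd.Series([0.9, 0.2, 0.7, 0.4, 0.1])            # y: modelled scores

# auc, s2 = Model().AUC(observed, scores, z=True)
# `auc` is the rank-based (Mann-Whitney U) AUC estimate; `s2` is its variance,
# computed via dask.dataframe.map_partitions over the ranked scores.
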
def lateral(self, table, name_generator, func, args, alias):
    func = func.lower()

    if func not in self.lateral_functions:
        raise ValueError('unknown lateral function %s' % func)

    if func not in self.lateral_meta:
        raise ValueError('unknown meta for lateral function %s' % func)

    alias = name_generator.get(alias)
    name_generator = name_generator.fix(all_unique(args))

    meta = pd.concat([
        table._meta,
        self.add_table_to_columns(self.lateral_meta[func], alias),
    ], axis=1)

    return dd.map_partitions(
        self.lateral_partitions, table, name_generator, func, args, alias,
        # NOTE: pass empty_result as kw to prevent aligning it
        meta=meta, empty_result=meta,
    )

def execute_udf_node(op, *args, **kwargs):
    # We have rewritten op.func to be a closure enclosing
    # the kwargs, and therefore we do not need to pass
    # kwargs here. This is true for all udf execution in this
    # file.
    # See ibis.udf.vectorized.UserDefinedFunction
    if isinstance(op.return_type, dt.Struct):
        meta = make_struct_op_meta(op)
        df = dd.map_partitions(op.func, *args, meta=meta)
        return df
    else:
        name = args[0].name if len(args) == 1 else None
        meta = pandas.Series([], name=name, dtype=op.return_type.to_dask())
        df = dd.map_partitions(op.func, *args, meta=meta)
        return df

def execute_udf_node_groupby(op, *args, **kwargs):
    func = op.func

    # all grouping keys must be identical
    assert_identical_grouping_keys(*args)

    # we're performing a scalar operation on a grouped column, so
    # perform the operation directly on the underlying Series
    # and regroup after it's finished
    args_objs = [make_selected_obj(arg) for arg in args]
    groupings = args[0].index

    return dd.map_partitions(func, *args_objs).groupby(groupings)

def test_map_partitions_column_info():
    df = pd.DataFrame({'x': [1, 2, 3, 4], 'y': [5, 6, 7, 8]})
    a = dd.from_pandas(df, npartitions=2)

    b = dd.map_partitions(lambda x: x, a.columns, a)
    assert b.columns == a.columns
    assert eq(df, b)

    b = dd.map_partitions(lambda x: x, a.x.name, a.x)
    assert b.name == a.x.name
    assert eq(df.x, b)

    b = dd.map_partitions(lambda x: x, a.x.name, a.x)
    assert b.name == a.x.name
    assert eq(df.x, b)

    b = dd.map_partitions(lambda df: df.x + df.y, None, a)
    assert b.name is None
    assert isinstance(b, dd.Series)

    b = dd.map_partitions(lambda df: df.x + 1, 'x', a)
    assert isinstance(b, dd.Series)
    assert b.name == 'x'

def test_map_partitions_column_info():
    df = pd.DataFrame({"x": [1, 2, 3, 4], "y": [5, 6, 7, 8]})
    a = dd.from_pandas(df, npartitions=2)

    b = dd.map_partitions(lambda x: x, a.columns, a)
    tm.assert_index_equal(b.columns, a.columns)
    assert eq(df, b)

    b = dd.map_partitions(lambda x: x, a.x.name, a.x)
    assert b.name == a.x.name
    assert eq(df.x, b)

    b = dd.map_partitions(lambda x: x, a.x.name, a.x)
    assert b.name == a.x.name
    assert eq(df.x, b)

    b = dd.map_partitions(lambda df: df.x + df.y, None, a)
    assert b.name is None
    assert isinstance(b, dd.Series)

    b = dd.map_partitions(lambda df: df.x + 1, "x", a)
    assert isinstance(b, dd.Series)
    assert b.name == "x"

def test_map_partitions_names():
    func = lambda x: x
    assert sorted(dd.map_partitions(func, d.columns, d).dask) == \
        sorted(dd.map_partitions(func, d.columns, d).dask)
    assert sorted(dd.map_partitions(lambda x: x, d.columns, d, token=1).dask) == \
        sorted(dd.map_partitions(lambda x: x, d.columns, d, token=1).dask)

    func = lambda x, y: x
    assert sorted(dd.map_partitions(func, d.columns, d, d).dask) == \
        sorted(dd.map_partitions(func, d.columns, d, d).dask)

def add_columns(self, table, columns, name_generator):
    name_generator = name_generator.fix(all_unique(columns))
    meta = super(DaskModel, self).add_columns(table._meta_nonempty, columns, name_generator)
    meta = meta.iloc[:0]
    return dd.map_partitions(
        self.add_columns_partitions, table, columns, name_generator,
        # NOTE: pass empty_result as kw to prevent aligning it
        meta=meta, empty_result=meta,
    )

def encode_nans(X, col, new_col=None, copy_data=True):
    """Add a float indicator column flagging NaNs in ``col``."""
    if not new_col:
        new_col = col + "_nan"

    assert new_col not in X.columns, f"AddIndicators::nan ind : {new_col} already exists in data"

    if isinstance(X, dd.DataFrame):
        ser = dd.map_partitions(pd.isna, X[col], meta=float)
    else:
        ser = pd.isna(X[col]).astype(float)

    if copy_data:
        ret_data = copy.copy(X)
    else:
        ret_data = X

    ret_data[new_col] = ser
    assert_no_duplicate_columns(ret_data)
    return ret_data

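# Hedged usage sketch (not from the source): encode_nans above also relies on
# an assert_no_duplicate_columns helper from its own module, so the call is
# shown commented out; the "age" column and values are illustrative only.
import numpy as np
import pandas as pd

frame = pd.DataFrame({"age": [21.0, np.nan, 35.0]})
# out = encode_nans(frame, "age")
# out["age_nan"] -> 0.0, 1.0, 0.0 (float indicator of missing values in "age")
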
def maximal_independent_set(
    ds: DataMapping,
    **kwargs
) -> DataMapping:
    """Dask MIS

    Returns
    -------
    dask.dataframe.DataFrame
    """
    df = ds
    if isinstance(df, Dataset):
        df = df.to_dataframe()
    if isinstance(df, pd.DataFrame):
        df = dd.from_pandas(df, npartitions=1)
    if not isinstance(df, dd.DataFrame):
        raise ValueError(f'Unable to coerce mapping of type "{type(df)}" to dask DataFrame')

    def func(df):
        ds = core.maximal_independent_set(df, **kwargs)
        return ds.to_dataframe()

    return dd.map_partitions(func, df, meta=[('index_to_drop', df.dtypes['i'])])

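# Hedged usage sketch (not from the source): the exact input columns are
# dictated by core.maximal_independent_set, which is not shown here; the
# 'i'/'j'/'cmp' pairwise-comparison layout below is an assumption.
import pandas as pd

pairs = pd.DataFrame({"i": [0, 0, 1], "j": [1, 2, 2], "cmp": [0, 1, -1]})
# dropped = maximal_independent_set(pairs).compute()
# `dropped["index_to_drop"]` would list the row indices to remove; a plain
# pandas input is coerced to a single-partition dask DataFrame internally.
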
def test_map_partitions_multi_argument():
    assert eq(dd.map_partitions(lambda a, b: a + b, None, d.a, d.b),
              full.a + full.b)
    assert eq(dd.map_partitions(lambda a, b, c: a + b + c, None, d.a, d.b, 1),
              full.a + full.b + 1)

def test_map_partitions_multi_argument():
    assert eq(dd.map_partitions(lambda a, b: a + b, 'c', d.a, d.b),
              full.a + full.b)
    assert eq(dd.map_partitions(lambda a, b, c: a + b + c, 'c', d.a, d.b, 1),
              full.a + full.b + 1)

def points_from_xy(x, y, crs=None):
    s = dd.map_partitions(_points_from_xy, x, y, crs=crs)
    example = gpd.GeoSeries(Point(0, 0))
    return GeoSeries(s.dask, s._name, example, [all_space] * s.npartitions)

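# Hedged usage sketch (not from the source): assumes x and y are dask Series
# of coordinates from the same collection; _points_from_xy, GeoSeries and
# all_space are internals of the module the function above comes from, so the
# call itself is shown commented out.
import dask.dataframe as dd
import pandas as pd

coords = dd.from_pandas(pd.DataFrame({"lon": [0.0, 1.0], "lat": [2.0, 3.0]}), npartitions=1)
# geoms = points_from_xy(coords.lon, coords.lat, crs="EPSG:4326")
# `geoms` would be a partitioned GeoSeries of point geometries.
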
def select_rename(self, df, spec):
    return dd.map_partitions(
        super(DaskModel, self).select_rename, df, spec)

def filter_table(self, table, expr, name_generator):
    name_generator = name_generator.fix(all_unique(expr))
    return dd.map_partitions(
        super(DaskModel, self).filter_table, table, expr, name_generator)