Example #1
    def AUC(self, x, y, z):
        # Requires numpy as np and dask.dataframe as dd at module level.
        # x : observed outcomes (boolean Series; True = event observed)
        # y : modelled scores for the same observations
        # z : whether to also compute the variance of the AUC (True/False)
        ### Calculate parameters ###
        N1 = x.sum()            # number of events
        N2 = len(x) - N1        # number of non-events
        R = y.rank()            # ranks of the modelled scores
        R1 = R.loc[x == True]   # ranks belonging to events
        R2 = R.loc[x == False]  # ranks belonging to non-events
        ### Calculate Area Under the Curve via the Mann-Whitney U statistic ###
        U = N1 * N2 + N1 * (N1 + 1) / 2 - R1.sum()
        AUC = U / (N1 * N2)
        ### Variance of the AUC [optional; feasible only for small samples] ###
        s2 = None
        if z:

            def aggregate(t1, t2):
                # placement values: ties count 0.5, strict orderings count 1
                return t1.apply(lambda a: sum((a == t2) * 0.5 + (a < t2) * 1))

            Ua = dd.map_partitions(aggregate, dd.from_pandas(R1, npartitions=4),
                                   R2).compute(scheduler='processes')
            Ub = dd.map_partitions(aggregate, dd.from_pandas(R2, npartitions=4),
                                   R1).compute(scheduler='processes')
            V10 = Ua / N2
            V01 = Ub / N1
            # Hanley-McNeil-style variance from the placement values
            s2 = np.var(V10, ddof=1) / N1 + np.var(V01, ddof=1) / N2
        return AUC, s2
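
A minimal usage sketch for the method above; the host class (here called Validator) and the toy data are assumptions, not part of the original. Note that with this U convention, lower modelled scores indicate events:

import pandas as pd

v = Validator()                                   # hypothetical owner of AUC()
x = pd.Series([True, False, True, False, True])   # observed outcomes
y = pd.Series([0.1, 0.8, 0.2, 0.6, 0.3])          # modelled scores
auc, s2 = v.AUC(x, y, True)                       # s2 is None when z is False
print(auc, s2)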
Example #2
    def lateral(self, table, name_generator, func, args, alias):
        func = func.lower()
        if func not in self.lateral_functions:
            raise ValueError('unknown lateral function %s' % func)

        if func not in self.lateral_meta:
            raise ValueError('unknown meta for lateral function %s' % func)

        alias = name_generator.get(alias)
        name_generator = name_generator.fix(all_unique(args))

        meta = pd.concat(
            [
                table._meta,
                self.add_table_to_columns(self.lateral_meta[func], alias),
            ],
            axis=1,
        )

        return dd.map_partitions(
            self.lateral_partitions,
            table,
            name_generator,
            func,
            args,
            alias,

            # NOTE: pass empty_result as kw to prevent aligning it
            meta=meta,
            empty_result=meta,
        )
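
Per the NOTE above, plain (non-dask) keyword arguments to dd.map_partitions are forwarded whole to the function on every partition rather than being treated as partitioned inputs to align. A minimal standalone sketch of that behaviour; the function and names here are illustrative:

import pandas as pd
import dask.dataframe as dd

ddf = dd.from_pandas(pd.DataFrame({'x': [1, 2, 3]}), npartitions=2)

def passthrough(part, empty_result=None):
    # empty_result arrives as one whole object on every partition
    return part if len(part) else empty_result

out = dd.map_partitions(passthrough, ddf, empty_result=pd.DataFrame({'x': []}))
print(out.compute())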
Example #3
    def execute_udf_node(op, *args, **kwargs):
        # We have rewritten op.func to be a closure enclosing
        # the kwargs, and therefore, we do not need to pass
        # kwargs here. This is true for all udf execution in this
        # file.
        # See ibis.udf.vectorized.UserDefinedFunction
        if isinstance(op.return_type, dt.Struct):
            meta = make_struct_op_meta(op)

            df = dd.map_partitions(op.func, *args, meta=meta)
            return df
        else:
            name = args[0].name if len(args) == 1 else None
            meta = pandas.Series([], name=name, dtype=op.return_type.to_dask())
            df = dd.map_partitions(op.func, *args, meta=meta)

            return df
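
A minimal standalone sketch of the else-branch pattern above (an elementwise function over a Series with an explicit empty-Series meta); the doubling function is an assumption:

import pandas as pd
import dask.dataframe as dd

s = dd.from_pandas(pd.Series([1, 2, 3], name='v'), npartitions=2)
meta = pd.Series([], name='v', dtype='int64')     # same style of meta as above
out = dd.map_partitions(lambda part: part * 2, s, meta=meta)
print(out.compute().tolist())                     # [2, 4, 6]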
Example #4
    def execute_udf_node_groupby(op, *args, **kwargs):
        func = op.func

        # all grouping keys must be identical
        assert_identical_grouping_keys(*args)

        # we're performing a scalar operation on a grouped column, so
        # perform the operation directly on the underlying Series
        # and regroup after it's finished
        args_objs = [make_selected_obj(arg) for arg in args]
        groupings = args[0].index
        return dd.map_partitions(func, *args_objs).groupby(groupings)
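
A minimal sketch of the map-then-regroup pattern used here, with a plain doubling function standing in for the UDF machinery:

import pandas as pd
import dask.dataframe as dd

pdf = pd.DataFrame({'k': [0, 0, 1], 'v': [1.0, 2.0, 3.0]})
ddf = dd.from_pandas(pdf, npartitions=1)
doubled = dd.map_partitions(lambda s: s * 2, ddf['v'])  # scalar op on the Series
print(doubled.groupby(ddf['k']).sum().compute())        # regroup afterwards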
Example #5
def test_map_partitions_column_info():
    df = pd.DataFrame({'x': [1, 2, 3, 4], 'y': [5, 6, 7, 8]})
    a = dd.from_pandas(df, npartitions=2)

    b = dd.map_partitions(lambda x: x, a.columns, a)
    assert (b.columns == a.columns).all()
    assert eq(df, b)

    b = dd.map_partitions(lambda x: x, a.x.name, a.x)
    assert b.name == a.x.name
    assert eq(df.x, b)

    b = dd.map_partitions(lambda x: x, a.x.name, a.x)
    assert b.name == a.x.name
    assert eq(df.x, b)

    b = dd.map_partitions(lambda df: df.x + df.y, None, a)
    assert b.name is None
    assert isinstance(b, dd.Series)

    b = dd.map_partitions(lambda df: df.x + 1, 'x', a)
    assert isinstance(b, dd.Series)
    assert b.name == 'x'
Example #6
def test_map_partitions_column_info():
    df = pd.DataFrame({"x": [1, 2, 3, 4], "y": [5, 6, 7, 8]})
    a = dd.from_pandas(df, npartitions=2)

    b = dd.map_partitions(lambda x: x, a.columns, a)
    tm.assert_index_equal(b.columns, a.columns)
    assert eq(df, b)

    b = dd.map_partitions(lambda x: x, a.x.name, a.x)
    assert b.name == a.x.name
    assert eq(df.x, b)

    b = dd.map_partitions(lambda x: x, a.x.name, a.x)
    assert b.name == a.x.name
    assert eq(df.x, b)

    b = dd.map_partitions(lambda df: df.x + df.y, None, a)
    assert b.name is None
    assert isinstance(b, dd.Series)

    b = dd.map_partitions(lambda df: df.x + 1, "x", a)
    assert isinstance(b, dd.Series)
    assert b.name == "x"
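
These two tests exercise an older dask signature in which the expected column names were passed positionally right after the function. In current dask the same intent is expressed with the meta keyword; roughly (a sketch, not the original test):

import pandas as pd
import dask.dataframe as dd

df = pd.DataFrame({'x': [1, 2, 3, 4], 'y': [5, 6, 7, 8]})
a = dd.from_pandas(df, npartitions=2)
b = dd.map_partitions(lambda part: part.x + part.y, a, meta=('x', 'int64'))
assert b.name == 'x'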
Example #7
def test_map_partitions_names():
    func = lambda x: x
    assert sorted(dd.map_partitions(func, d.columns, d).dask) == \
           sorted(dd.map_partitions(func, d.columns, d).dask)
    assert sorted(dd.map_partitions(lambda x: x, d.columns, d, token=1).dask) == \
           sorted(dd.map_partitions(lambda x: x, d.columns, d, token=1).dask)

    func = lambda x, y: x
    assert sorted(dd.map_partitions(func, d.columns, d, d).dask) == \
           sorted(dd.map_partitions(func, d.columns, d, d).dask)
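
The token keyword seen above controls the key prefix used in the task graph, which is why equal tokens yield equal graphs; a minimal sketch (peeking at the private _name attribute for illustration only):

import pandas as pd
import dask.dataframe as dd

a = dd.from_pandas(pd.DataFrame({'x': [1, 2]}), npartitions=1)
b = dd.map_partitions(lambda p: p, a, token='identity')
print(b._name)  # key prefix starts with 'identity'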
Example #8
    def add_columns(self, table, columns, name_generator):
        name_generator = name_generator.fix(all_unique(columns))

        meta = super(DaskModel, self).add_columns(table._meta_nonempty,
                                                  columns, name_generator)
        meta = meta.iloc[:0]  # keep only the dtypes: an empty frame serves as meta

        return dd.map_partitions(
            self.add_columns_partitions,
            table,
            columns,
            name_generator,

            # NOTE: pass empty_result as kw to prevent aligning it
            meta=meta,
            empty_result=meta,
        )
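
The meta construction above (run the pandas-level implementation on table._meta_nonempty, then truncate with .iloc[:0]) yields an empty frame with the correct output dtypes. A standalone illustration of the same trick, with an assumed assign step in place of add_columns:

import pandas as pd
import dask.dataframe as dd

ddf = dd.from_pandas(pd.DataFrame({'x': [1, 2, 3]}), npartitions=2)
meta = ddf._meta_nonempty.assign(y=lambda d: d['x'] * 1.0).iloc[:0]
out = dd.map_partitions(lambda d: d.assign(y=d['x'] * 1.0), ddf, meta=meta)
print(out.dtypes)   # x: int64, y: float64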
Example #9
def encode_nans(X, col, new_col=None, copy_data=True):
    """Encode NaNs in a dataframe."""
    if not new_col:
        new_col = col + "_nan"
    assert new_col not in X.columns, f"AddIndicators::nan ind : {new_col} already exists in data"
    if isinstance(X, dd.DataFrame):
        ser = dd.map_partitions(pd.isna, X[col], meta=float)
    else:
        ser = pd.isna(X[col]).astype(float)

    if copy_data:
        ret_data = copy.copy(X)
    else:
        ret_data = X
    ret_data[new_col] = ser
    assert_no_duplicate_columns(ret_data)
    return ret_data
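
A minimal usage sketch for encode_nans as defined above (it also needs copy, pandas, dask.dataframe, and the module's assert_no_duplicate_columns helper in scope):

import pandas as pd
import dask.dataframe as dd

ddf = dd.from_pandas(pd.DataFrame({'a': [1.0, None, 3.0]}), npartitions=2)
out = encode_nans(ddf, 'a')          # adds a float 'a_nan' indicator column
print(out.compute())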
Example #10
def maximal_independent_set(
    ds: DataMapping,
    **kwargs
) -> DataMapping:
    """Dask MIS

    Returns
    -------
    dask.dataframe.DataFrame
    """
    df = ds
    if isinstance(df, Dataset):
        df = df.to_dataframe()
    if isinstance(df, pd.DataFrame):
        df = dd.from_pandas(df, npartitions=1)
    if not isinstance(df, dd.DataFrame):
        raise ValueError(f'Unable to coerce mapping of type "{type(df)}" to dask DataFrame')

    def func(df):
        ds = core.maximal_independent_set(df, **kwargs)
        return ds.to_dataframe()

    return dd.map_partitions(func, df, meta=[('index_to_drop', df.dtypes['i'])])
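
The meta=[('index_to_drop', ...)] form declares a single-column DataFrame output. A minimal standalone sketch of that meta style; the partition function here is an assumption:

import pandas as pd
import dask.dataframe as dd

ddf = dd.from_pandas(pd.DataFrame({'i': [3, 1, 2]}), npartitions=1)
out = dd.map_partitions(
    lambda d: pd.DataFrame({'index_to_drop': d['i'].nlargest(1)}),
    ddf,
    meta=[('index_to_drop', ddf.dtypes['i'])],
)
print(out.compute())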
Example #11
def test_map_partitions_multi_argument():
    assert eq(dd.map_partitions(lambda a, b: a + b, None, d.a, d.b),
              full.a + full.b)
    assert eq(dd.map_partitions(lambda a, b, c: a + b + c, None, d.a, d.b, 1),
              full.a + full.b + 1)
Example #12
def test_map_partitions_multi_argument():
    assert eq(dd.map_partitions(lambda a, b: a + b, 'c', d.a, d.b),
              full.a + full.b)
    assert eq(dd.map_partitions(lambda a, b, c: a + b + c, 'c', d.a, d.b, 1),
              full.a + full.b + 1)
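
Roughly the modern-API equivalent of the multi-argument calls above, letting dask infer meta by itself (a sketch, not the original test):

import pandas as pd
import dask.dataframe as dd

a = dd.from_pandas(pd.Series([1, 2, 3]), npartitions=2)
b = dd.from_pandas(pd.Series([10, 20, 30]), npartitions=2)
out = dd.map_partitions(lambda x, y, c: x + y + c, a, b, 1)
print(out.compute().tolist())   # [12, 23, 34]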
Example #13
def points_from_xy(x, y, crs=None):
    s = dd.map_partitions(_points_from_xy, x, y, crs=crs)
    example = gpd.GeoSeries(Point(0, 0))
    return GeoSeries(s.dask, s._name, example, [all_space] * s.npartitions)
Example #14
    def select_rename(self, df, spec):
        return dd.map_partitions(
            super(DaskModel, self).select_rename, df, spec)
Example #15
    def filter_table(self, table, expr, name_generator):
        name_generator = name_generator.fix(all_unique(expr))
        return dd.map_partitions(
            super(DaskModel, self).filter_table, table, expr, name_generator)
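
Both of the last two methods follow the same delegation pattern: run the pandas-level implementation once per partition. A minimal standalone sketch, with an assumed stand-in for the superclass method:

import pandas as pd
import dask.dataframe as dd

def filter_table_pandas(df):      # stand-in for the pandas-level implementation
    return df[df['x'] > 1]

ddf = dd.from_pandas(pd.DataFrame({'x': [1, 2, 3]}), npartitions=2)
print(dd.map_partitions(filter_table_pandas, ddf).compute())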