示例#1
0
class DaskCollect(OpVertex):
    """
    Evaluate a collection of dask delayed objects with ``dask.compute``.

    Every positional input handed to ``run`` must unpack into an
    ``(op, delayed)`` pair.
    """
    dask_progress_bar = BaseParameter(
        True, "Include a diagnostic Progressbar from dask")
    num_workers = BaseParameter(4, "Number of dask workers")
    scheduler = BaseParameter('processes', "Dask scheduler option")

    def _node_color(self):
        """Return the colour string used for this node."""
        return '#2fbc2d'

    def run(self, *ops):
        """
        Compute all delayed objects and return the results.

        :param ops: ``(op, delayed)`` pairs; the ops are stored on
            ``self.ops`` and the delayed objects on ``self.delays``.
        :return: whatever ``dask.compute`` returns for the delayed objects.
        """
        self.ops = [op for op, _ in ops]
        self.delays = [delayed for _, delayed in ops]

        options = {
            'scheduler': self.scheduler,
            'num_workers': self.num_workers,
        }

        if not self.dask_progress_bar:
            return dask.compute(*self.delays, **options)

        # Run the computation inside the ProgressBar context manager.
        with ProgressBar():
            return dask.compute(*self.delays, **options)
示例#2
0
class DaskParallel(OpVertex):
    """
    Wrap an Op so that its ``run`` is deferred through ``dask.delayed``.

    ``parallel_op`` may be either an Op class (instantiated with
    ``op_kwargs``) or an already constructed Op instance.
    """
    parallel_op = BaseParameter()
    op_kwargs = BaseParameter(dict())

    def _node_color(self):
        """Return the colour string used for this node."""
        return '#2fbc2d'

    def _node_shape(self):
        """Return the shape name used for this node."""
        return 'doubleoctagon'

    def requires(self):
        """
        Resolve the wrapped op and return its own requirements.

        The resolved instance is stored on ``self.inst_op`` for ``run``.
        """
        wrapped = self.parallel_op
        # A class is instantiated with the provided keyword arguments;
        # anything else is taken to be an existing instance.
        self.inst_op = (wrapped(**self.op_kwargs)
                        if inspect.isclass(wrapped) else wrapped)
        return self.inst_op.requires()

    def run(self, *args, **kwargs):
        """Return ``(inst_op, delayed)`` where delayed wraps ``inst_op.run``."""
        deferred_run = dask.delayed(self.inst_op.run)
        return (self.inst_op, deferred_run(*args, **kwargs))

    def get_name(self):
        """Delegate the name to the wrapped op."""
        return self.parallel_op.get_name()
示例#3
0
class DropDuplicates(DFOp):
    """
    Remove duplicate rows from a frame via ``df.drop_duplicates``.

    Parameters ``subset``, ``keep`` and ``inplace`` are forwarded unchanged.
    """
    subset = BaseParameter(None)
    keep = BaseParameter('first')
    inplace = BaseParameter(False)

    def run(self, df):
        """
        Drop duplicate rows from ``df``.

        :param df: frame to de-duplicate.
        :return: the de-duplicated frame. With ``inplace=True`` this is
            ``df`` itself (mutated) rather than the ``None`` that pandas
            returns, so downstream ops always receive a frame.
        """
        deduplicated = df.drop_duplicates(subset=self.subset,
                                          keep=self.keep,
                                          inplace=self.inplace)
        # pandas returns None for in-place operations; hand back the
        # mutated input instead of propagating None through the DAG.
        return df if self.inplace else deduplicated
示例#4
0
class RenameColumns(DFOp):
    """
    Rename the columns of a frame via ``df.rename``.

    Parameters ``columns``, ``copy``, ``inplace`` and ``level`` are
    forwarded unchanged.
    """
    columns = BaseParameter(None)
    copy = BaseParameter(True)
    inplace = BaseParameter(False)
    level = BaseParameter(None)

    def run(self, df):
        """
        Rename columns of ``df``.

        :param df: frame whose columns are renamed.
        :return: the renamed frame. With ``inplace=True`` this is ``df``
            itself (mutated) rather than the ``None`` that pandas returns,
            so downstream ops always receive a frame.
        """
        renamed = df.rename(columns=self.columns,
                            copy=self.copy,
                            inplace=self.inplace,
                            level=self.level)
        # pandas returns None for in-place operations; hand back the
        # mutated input instead of propagating None through the DAG.
        return df if self.inplace else renamed
示例#5
0
class DropNa(DFOp):
    """
    Drop missing values from a frame via ``df.dropna`` (never in place).

    ``thresh`` takes precedence over ``how``: when ``thresh`` is set, only
    ``thresh`` is forwarded; otherwise only ``how`` is forwarded.
    """
    axis = BaseParameter(0)
    how = BaseParameter('any')
    thresh = BaseParameter(None)
    subset = BaseParameter(None)

    def run(self, df):
        """
        Return a copy of ``df`` with missing entries dropped.

        :param df: frame to filter.
        :return: the result of ``df.dropna(..., inplace=False)``.
        """
        # Recent pandas raises TypeError when both ``how`` and ``thresh``
        # are passed explicitly (even ``thresh=None``), so forward exactly
        # one of them. Older pandas ignored ``how`` whenever ``thresh`` was
        # given, which is the precedence kept here.
        if self.thresh is None:
            criterion = {'how': self.how}
        else:
            criterion = {'thresh': self.thresh}

        return df.dropna(axis=self.axis,
                         subset=self.subset,
                         inplace=False,
                         **criterion)
示例#6
0
class AssignColumn(DFOp):
    """
    Assign one or more columns on the incoming frame.

    If ``assignments`` is given, each of its ``(column, value)`` items is
    assigned; otherwise the single ``column`` / ``value`` pair is used.
    """
    column = BaseParameter(None)
    value = BaseParameter(None)
    assignments = BaseParameter(None)

    def run(self, df):
        """
        Set the configured column(s) on ``df`` and return it.

        Note that ``df`` is modified directly via item assignment.
        """
        if self.assignments is None:
            # Single-column form.
            df[self.column] = self.value
            return df

        for name, data in self.assignments.items():
            df[name] = data
        return df
示例#7
0
class Read_CSV(OpVertex):
    """
    Load a CSV into a frame with ``pd.read_csv``.

    ``delimiter`` is treated as an alias for ``sep``: when it is set it
    overrides ``sep``; otherwise ``sep`` is used.
    """
    filepath_or_buffer = BaseParameter()
    sep = BaseParameter(',')
    delimiter = BaseParameter(None)
    header = BaseParameter('infer')

    def run(self, path=None):
        """
        Read the CSV and return the parsed frame.

        :param path: optional source overriding ``filepath_or_buffer``.
        :return: the object returned by ``pd.read_csv``.
        """
        source = self.filepath_or_buffer if path is None else path

        # Recent pandas raises ValueError when an explicit ``sep`` and a
        # non-None ``delimiter`` are both supplied, so collapse the two
        # parameters into a single separator before calling read_csv.
        separator = self.sep if self.delimiter is None else self.delimiter

        return pd.read_csv(filepath_or_buffer=source,
                           sep=separator,
                           header=self.header)
示例#8
0
class HistMulti(OpVertex):
    """
    Overlay histograms of several named data sets on a single axis.
    """
    title = BaseParameter('')
    figsize = BaseParameter((8, 5))
    xlabel = BaseParameter('x')
    ylabel = BaseParameter('y')
    logx = BaseParameter(False)
    logy = BaseParameter(False)
    passthrough = BaseParameter(
        False,
        "If False, returns axis of plot rather than the input data")

    def hist_compare(self, **hist_data):
        """
        Draw one histogram per keyword argument on a shared axis.

        :param hist_data: mapping of legend label to the data to histogram.
        :return: ``hist_data`` when ``passthrough`` is truthy, otherwise
            the matplotlib axis that was drawn on.
        """
        _, ax = plt.subplots(figsize=self.figsize)
        hist_kwargs = dict(bins=20, alpha=.4, density=False)

        for label, values in hist_data.items():
            ax.hist(values, label=label, **hist_kwargs)

        ax.legend(fontsize=15)
        ax.set_title(self.title, fontsize=18)
        ax.set_xlabel(self.xlabel, fontsize=16)
        ax.set_ylabel(self.ylabel, fontsize=16)
        ax.tick_params(labelsize=15)

        if self.logy:
            ax.set_yscale('log')
        if self.logx:
            ax.set_xscale('log')

        return hist_data if self.passthrough else ax
示例#9
0
class Subplots(OpVertex):
    """
    Create a matplotlib figure and return its axes as a flat list.

    The figure is kept on ``self.fig`` and the flattened axes on
    ``self.axs``.
    """
    nrows = BaseParameter(1)
    ncols = BaseParameter(1)
    sharex = BaseParameter(False)
    sharey = BaseParameter(False)
    subplot_kw = BaseParameter(None)
    gridspec_kw = BaseParameter(None)
    figsize = BaseParameter((12, 6))
    _never_cache = True

    def run(self):
        """
        Build the subplot grid.

        :return: list of axes ordered row by row.
        """
        from matplotlib import pyplot as plt

        layout = dict(nrows=self.nrows,
                      ncols=self.ncols,
                      sharex=self.sharex,
                      sharey=self.sharey,
                      figsize=self.figsize,
                      subplot_kw=self.subplot_kw,
                      gridspec_kw=self.gridspec_kw)

        # squeeze=False always yields a 2-D (nrows x ncols) grid, so it can
        # be flattened uniformly regardless of the requested shape.
        self.fig, grid = plt.subplots(squeeze=False, **layout)
        self.axs = [axis for row in grid for axis in row]
        return self.axs
示例#10
0
class Join(DFOp):
    """
    Chain ``DataFrame.join`` across all input frames, left to right.
    """
    how = BaseParameter('left')
    lsuffix = BaseParameter('')
    rsuffix = BaseParameter('')
    sort = BaseParameter(False)

    def requires(self):
        """Not implemented on this class."""
        raise NotImplementedError()

    def run(self, *args):
        """
        Join every frame in ``args`` onto the first one in order.

        :param args: pandas DataFrames; the first is the starting frame.
        :return: the frame produced by the successive joins.
        """
        assert all(isinstance(frame, pd.DataFrame) for frame in args)

        joined, others = args[0], args[1:]
        for other in others:
            joined = joined.join(other,
                                 how=self.how,
                                 lsuffix=self.lsuffix,
                                 rsuffix=self.rsuffix,
                                 sort=self.sort)
        return joined
示例#11
0
class LambdaOp(OpVertex):
    """
    Op that applies the callable stored in ``f`` to its inputs.
    """
    f = BaseParameter(help_msg="Function that is applied to the input")

    def _node_color(self):
        """Return the colour string used for this node."""
        return '#d65768'

    def requires(self):
        """Always raises: requirements are not defined on this class."""
        raise NotImplementedError("Incomplete LambdaOp - must be applied")

    def run(self, *args, **kwargs):
        """Call ``f`` with the given arguments and return its result."""
        func = self.f
        return func(*args, **kwargs)
示例#12
0
class FrameBrowse(DFOp, FrameBrowseMaixin):
    """
    Display the notebook visualisation produced by
    ``FrameBrowseMaixin.op_nb_viz`` for the incoming frame.
    """
    passthrough = BaseParameter(True)

    def op_nb_viz(self, op_out, viz_out=None):
        """Delegate to ``FrameBrowseMaixin.op_nb_viz``."""
        return FrameBrowseMaixin.op_nb_viz(op_out, viz_out)

    def run(self, df):
        """
        Display the visualisation of ``df``.

        :return: ``df`` when ``passthrough`` is truthy, otherwise ``None``.
        """
        display(self.op_nb_viz(df))
        return df if self.passthrough else None
示例#13
0
class Concat(DFOp):
    """
    Concatenate all input objects with ``pd.concat``.

    Every parameter is forwarded to ``pd.concat`` under the same name.
    """
    axis = BaseParameter(0)
    join = BaseParameter('outer')
    ignore_index = BaseParameter(False)
    keys = BaseParameter(None)
    levels = BaseParameter(None)
    verify_integrity = BaseParameter(False)
    sort = BaseParameter(None)
    copy = BaseParameter(True)

    def requires(self):
        """Not implemented on this class."""
        raise NotImplementedError()

    def run(self, *args):
        """Return ``pd.concat`` applied to the tuple of inputs."""
        options = {
            'axis': self.axis,
            'join': self.join,
            'ignore_index': self.ignore_index,
            'keys': self.keys,
            'levels': self.levels,
            'verify_integrity': self.verify_integrity,
            'sort': self.sort,
            'copy': self.copy,
        }
        return pd.concat(args, **options)
示例#14
0
class ContourPlot(OpVertex):
    """
    Draw a filled contour plot of a value column over ``x`` / ``y``
    coordinates, with the sample points overlaid as a scatter.

    The input frame must provide ``'x'`` and ``'y'`` columns plus the
    value column being plotted.
    """
    sample_ixes = BaseParameter(None)
    title = BaseParameter('')
    bands = BaseParameter(None)
    value_column = BaseParameter(None)
    dot_color_column = BaseParameter(None)
    cbar = BaseParameter(False)

    _never_cache = True

    @staticmethod
    def contour_plot(_s_df,
                     value_column,
                     dot_color=None,
                     vmin=None,
                     vmax=None,
                     ax=None,
                     cbar=False,
                     title=''):
        """
        Render the contour plot onto ``ax`` (created when not supplied).

        :param _s_df: frame with ``'x'``, ``'y'`` and ``value_column``.
        :param value_column: name of the column holding the plotted values.
        :param dot_color: colour values passed to the scatter as ``c``.
        :param vmin: lower colour limit; defaults to ``-max(|zi|)``.
        :param vmax: upper colour limit; defaults to ``max(|zi|)``.
        :param ax: axis to draw on; a new figure is created if ``None``.
        :param cbar: whether to add a colour bar for the contour.
        :param title: axis title.
        :return: the axis that was drawn on.
        """
        if ax is None:
            _, ax = plt.subplots()

        x = _s_df['x']
        y = _s_df['y']
        z = _s_df[value_column]

        # The pivoted frame defines the grid: its index/columns are the
        # distinct x / y coordinates to interpolate onto.
        _s_2d_df = _s_df.dropna().pivot(index='x',
                                        columns='y',
                                        values=value_column)
        xi = _s_2d_df.index.tolist()
        yi = _s_2d_df.columns.tolist()

        # NOTE(review): this call signature looks like the legacy
        # matplotlib.mlab.griddata helper - confirm which griddata is
        # imported by the module.
        zi = griddata(x, y, z, xi, yi, interp='linear')

        # Default to colour limits symmetric around zero.
        vmax = abs(zi).max() if vmax is None else vmax
        vmin = -abs(zi).max() if vmin is None else vmin

        ctr = ax.contourf(xi,
                          yi,
                          zi,
                          30,
                          cmap=plt.cm.hsv,
                          vmax=vmax,
                          vmin=vmin)
        if cbar:
            plt.colorbar(ctr, ax=ax)

        ax.scatter(
            x=x,
            y=y,
            cmap='gray',
            s=5,
            c=dot_color,
        )

        ax.set_title(title)

        return ax

    def run(
        self,
        df,
        ax=None,
        value_column=None,
    ):
        """
        Plot ``df`` (after dropping rows with missing values).

        :param df: frame with ``'x'``, ``'y'`` and the value column.
        :param ax: optional axis to draw on.
        :param value_column: overrides the ``value_column`` parameter.
        :return: the axis that was drawn on (previously the axis was
            discarded and ``None`` was returned).
        """
        value_column = self.value_column if value_column is None else value_column
        plt_df = df.dropna()

        # Colour the dots by the plotted value unless a dedicated colour
        # column has been configured.
        if self.dot_color_column is None:
            dot_color = plt_df[value_column]
        else:
            dot_color = plt_df[self.dot_color_column]

        return self.contour_plot(plt_df,
                                 value_column=value_column,
                                 title=self.title,
                                 ax=ax,
                                 dot_color=dot_color,
                                 cbar=self.cbar)
示例#15
0
class DenseAutoencoder(OpVertex):
    """
    Parameters and builder for a fully connected autoencoder.

    The class parameters mirror the keyword arguments of
    ``build_autoencoder``.
    """
    input_feature_dim = BaseParameter()
    output_feature_dim = BaseParameter()
    encoding_dim = BaseParameter(25)
    dropout = BaseParameter(.15)
    input_dropout = BaseParameter(0.15)
    width = BaseParameter(128)
    depth = BaseParameter(1)
    hidden_activation = BaseParameter('relu')
    output_activation = BaseParameter('relu')
    lr = BaseParameter(0.001)
    decay = BaseParameter(0.0)
    optimizer = BaseParameter(None)
    loss = BaseParameter('mean_squared_error')
    batchnorm = BaseParameter(True)
    kernel_initializer = BaseParameter('glorot_normal')
    bias_initializer = BaseParameter('zeros')

    @staticmethod
    def build_autoencoder(input_feature_dim,
                          output_feature_dim,
                          encoding_dim=25,
                          dropout=.15,
                          input_dropout=0.15,
                          width=128,
                          depth=1,
                          hidden_activation='relu',
                          output_activation='relu',
                          lr=0.001,
                          decay=0.0,
                          optimizer=None,
                          loss='mean_squared_error',
                          batchnorm=True,
                          kernel_initializer='glorot_normal',
                          bias_initializer='zeros'):
        """
        Build the encoder, decoder and compiled autoencoder models.

        The encoder is ``depth`` hidden blocks (Dense -> optional
        BatchNormalization -> Activation -> Dropout) followed by a Dense
        bottleneck of size ``encoding_dim``. The decoder mirrors it with
        ``depth`` hidden blocks followed by a Dense output of size
        ``output_feature_dim``.

        The standalone decoder model reuses the very same layer objects as
        the autoencoder's decoding half, so it maps an encoding all the way
        to the reconstructed output and shares its weights. (Previously it
        wrapped only the first decoder Dense layer and failed with
        IndexError when ``depth`` was 0.)

        :param input_feature_dim: size of the input vector.
        :param output_feature_dim: size of the reconstructed output vector.
        :param encoding_dim: size of the bottleneck encoding.
        :param dropout: dropout rate after each hidden block and after the
            encoding.
        :param input_dropout: dropout rate applied to the raw input;
            skipped when ``None`` or not positive.
        :param width: number of units in each hidden Dense layer.
        :param depth: number of hidden blocks on each side.
        :param hidden_activation: activation for hidden layers and encoding.
        :param output_activation: activation applied to the output layer.
        :param lr: learning rate for the default Adam optimizer.
        :param decay: decay for the default Adam optimizer.
        :param optimizer: optimizer to compile with; Adam when ``None``.
        :param loss: loss used to compile the autoencoder.
        :param batchnorm: whether to insert BatchNormalization layers.
        :param kernel_initializer: initializer for Dense kernels.
        :param bias_initializer: initializer for Dense biases.
        :return: ``(encoder, decoder, autoencoder)``; only the autoencoder
            is compiled.
        """

        def dense(units, **extra):
            # All Dense layers are linear; activations are separate layers
            # so that BatchNormalization can sit in between.
            return Dense(units,
                         activation=None,
                         kernel_initializer=kernel_initializer,
                         bias_initializer=bias_initializer,
                         **extra)

        # ---- encoder -----------------------------------------------------
        input_sym = Input(shape=(input_feature_dim, ))
        enc_x = input_sym

        if input_dropout is not None and input_dropout > 0.:
            enc_x = Dropout(input_dropout)(enc_x)

        for _ in range(depth):
            enc_x = dense(width)(enc_x)
            if batchnorm:
                enc_x = BatchNormalization()(enc_x)
            enc_x = Activation(hidden_activation)(enc_x)
            enc_x = Dropout(dropout)(enc_x)

        encoded = dense(encoding_dim)(enc_x)
        if batchnorm:
            encoded = BatchNormalization()(encoded)
        encoded = Activation(hidden_activation, name='encoder')(encoded)

        # ---- decoder -----------------------------------------------------
        # Instantiate the decoder layers once so they can be applied both
        # to the encoder output and to a standalone encoding input.
        decoder_layers = [Dropout(dropout)]
        for d in range(depth):
            decoder_layers.append(dense(width, name='decoder_l%d' % d))
            if batchnorm:
                decoder_layers.append(BatchNormalization())
            decoder_layers.append(Activation(hidden_activation))
            decoder_layers.append(Dropout(dropout))
        decoder_layers.append(dense(output_feature_dim))
        decoder_layers.append(Activation(output_activation))

        decoded = encoded
        for layer in decoder_layers:
            decoded = layer(decoded)

        autoencoder = Model(input_sym, decoded)
        encoder = Model(input_sym, encoded)

        # Standalone decoder: feed a placeholder encoding through the same
        # (weight-sharing) decoder layers.
        encoded_input = Input(shape=(encoding_dim, ))
        decoder_out = encoded_input
        for layer in decoder_layers:
            decoder_out = layer(decoder_out)
        decoder = Model(encoded_input, decoder_out)

        if optimizer is None:
            optimizer = Adam(lr=lr, decay=decay)

        autoencoder.compile(optimizer=optimizer, loss=loss)
        return encoder, decoder, autoencoder
示例#16
0
class ApplyMap(DFOp):
    """
    Apply ``func`` to every element of a frame.
    """
    func = BaseParameter()

    def run(self, df):
        """
        Return ``df`` with ``func`` applied element-wise.

        Uses ``DataFrame.map`` when the frame's type provides it, since
        ``DataFrame.applymap`` is deprecated in recent pandas; otherwise
        falls back to ``applymap``.
        """
        # Check on the type, not the instance: on older pandas ``df.map``
        # could resolve to a column that happens to be named "map".
        if hasattr(type(df), 'map'):
            return df.map(func=self.func)
        return df.applymap(func=self.func)
示例#17
0
class Query(DFOp):
    """
    Filter a frame with the query expression stored in ``q``.
    """
    q = BaseParameter()

    def run(self, df):
        """Return the result of ``df.query`` for the configured expression."""
        expression = self.q
        return df.query(expression)
示例#18
0
class SelectColumns(DFOp):
    """
    Index a frame with the configured ``columns``.
    """
    columns = BaseParameter(None)

    def run(self, df):
        """Return ``df[columns]``."""
        selection = self.columns
        return df[selection]
示例#19
0
class Merge(DFOp):
    """
    Merge all input frames left to right with ``DataFrame.merge``.

    Every parameter is forwarded to each ``merge`` call under the same name.
    """
    how = BaseParameter('inner')

    on = BaseParameter(None)
    left_on = BaseParameter(None)
    right_on = BaseParameter(None)
    left_index = BaseParameter(False)
    right_index = BaseParameter(False)
    sort = BaseParameter(False)
    suffixes = BaseParameter(('_x', '_y'))
    copy = BaseParameter(True)
    indicator = BaseParameter(False)
    validate = BaseParameter(None)

    def requires(self):
        """Not implemented on this class."""
        raise NotImplementedError()

    def run(self, *args):
        """
        Fold the inputs into a single frame.

        :param args: frames to merge; the first is the starting frame.
        :return: the frame produced by the successive merges.
        """
        options = dict(on=self.on,
                       how=self.how,
                       left_on=self.left_on,
                       right_on=self.right_on,
                       left_index=self.left_index,
                       right_index=self.right_index,
                       sort=self.sort,
                       suffixes=self.suffixes,
                       copy=self.copy,
                       indicator=self.indicator,
                       validate=self.validate)

        frames = list(args)
        merged = frames[0]
        for other in frames[1:]:
            merged = merged.merge(other, **options)

        return merged
示例#20
0
class Select(OpVertex):
    """
    Pick a single item out of the input by subscripting it with ``i``.
    """
    i = BaseParameter(0)
    _never_cache = True

    def run(self, l):
        """Return ``l[i]``."""
        key = self.i
        return l[key]
示例#21
0
class InputOp(OpVertex):
    """
    Op that returns the object stored in ``obj``.
    """
    obj = BaseParameter(help_msg="The object to wrap and return")

    def run(self):
        """Return the wrapped object unchanged."""
        wrapped = self.obj
        return wrapped
示例#22
0
class Drop(DFOp):
    """
    Remove the configured ``labels`` along ``axis`` via ``df.drop``.
    """
    labels = BaseParameter()
    axis = BaseParameter(0)

    def run(self, df):
        """Return the result of ``df.drop`` for the configured labels/axis."""
        options = {'labels': self.labels, 'axis': self.axis}
        return df.drop(**options)