예제 #1
0
    def score(self, x, y):
        """Evaluate the fitted parameters on ``x``/``y`` and return a hit ratio.

        A Weld program is generated that, for every (row of ``x``, label of ``y``)
        pair, computes the logistic function of the dot product between
        ``self.th`` and the row, and adds 1 to a running sum when that value is
        >= 0.5 and the label equals 1.0 (0 is added otherwise).

        Parameters
        ----------
        x : sequence of rows, registered with Weld as vec[vec[f32]]
        y : labels, registered with Weld as vec[f32]

        Returns
        -------
        The evaluated sum divided by ``len(x)``.
        """
        # NOTE(review): only samples with label 1.0 can contribute to the sum, so
        # correctly predicted 0-labels are not counted -- confirm this is intended.
        weld_program = """
            @(loopsize: %(xlen)sL)
            result(for(
                zip(%(x)s, %(y)s),
                merger[f32, +],
                |b, i, e| let res = f32(1) / (f32(1) + exp(f32(0) - result(
                    @(loopsize: %(thlen)sL)
                    for(
                        zip(%(th)s, e.$0),
                        merger[f32, +],
                        |b2, i2, e2| merge(b2, e2.$0 * e2.$1)
                    )
                )));
                if(res >= f32(0.5) && e.$1 == f32(1.0), merge(b, f32(1)), merge(b, f32(0)))
            ))
        """

        obj = WeldObject(NumPyEncoder(), NumPyDecoder())

        # Inputs are registered in the order th, x, y.
        th_name = obj.update(self.th, WeldVec(WeldFloat()))
        x_name = obj.update(x, WeldVec(WeldVec(WeldFloat())))
        y_name = obj.update(y, WeldVec(WeldFloat()))
        n_samples = len(x)

        substitutions = {
            'th': th_name,
            'x': x_name,
            'y': y_name,
            'xlen': str(n_samples),
            'thlen': str(len(self.th)),
        }
        obj.weld_code = weld_program % substitutions

        hits = obj.evaluate(WeldFloat())
        return hits / n_samples
예제 #2
0
    def fit(self, x, y, weldobj=None):
        """Fit ``self.th`` by running a generated Weld ``iterate`` program.

        The program performs ``self.n_iters`` update steps. Each step looks up a
        pre-sampled row index, computes the logistic function of the dot product
        of the current parameters with that row minus the row's label, and
        rebuilds the parameter vector using a step size of 1/sqrt(step index)
        (1 for the first step) and the ``self.lam`` term.

        Parameters
        ----------
        x : 2-D array (``x.shape`` is unpacked into rows, columns)
        y : labels, registered with Weld as vec[f32]
        weldobj : optional WeldObject to reuse; a new one is created when falsy

        Returns
        -------
        self, with ``self.th`` set to the evaluated parameter vector.
        """
        # todo x is required to be a matrix here (i think this is ok, just have brittle types)
        if not weldobj:
            weldobj = WeldObject(NumPyEncoder(), NumPyDecoder())
        self.weldobj = weldobj
        engine = self.weldobj

        n_rows, n_cols = x.shape
        theta0 = np.zeros(n_cols, dtype=np.float32)

        # Draw all row indices up front (with replacement) so the Weld loop only
        # has to look them up.
        row_ids = np.arange(n_rows, dtype=np.int64)
        sampled_rows = np.random.choice(row_ids, self.n_iters, replace=True)

        weld_program = """
          @(loopsize: %(niters)sL)
          iterate(
            {%(isamps)s, i64(0), %(th)s},
            |p| { {
              p.$0, p.$1 + i64(1),
                let i = lookup(p.$0, p.$1);
                let xi = lookup(%(x)s, i);
                let step = if(p.$1 > i64(0), f32(1) / sqrt(f32(p.$1)), f32(1));
                let hx = f32(1) / (f32(1) + exp(f32(0) - f32(result(
                  @(loopsize: %(th_len)sL)
                  for(
                    zip(p.$2, xi),
                    merger[f32, +],
                    |b, ii, e| merge(b, e.$0 * e.$1)
                  )
                )))) - f32(lookup(%(y)s, i));

                result(@(loopsize: %(th_len)sL)
                  for(
                    p.$2, appender[f32], |b, j, e| merge(b, e - f32(step) * (f32(hx) * lookup(xi, j) / f32(%(m)s) + f32(%(lam)s) / f32(%(m)s) * e))
                  ))
            }, p.$1 < i64(%(niters)s) - 1L }).$2"""

        # Fill the substitutions one by one; inputs are registered in the order
        # isamps, th, x, y.
        params = {}
        params['niters'] = str(self.n_iters)
        params['isamps'] = engine.update(sampled_rows, WeldVec(WeldLong()))
        params['th'] = engine.update(theta0, WeldVec(WeldFloat()))
        params['th_len'] = str(len(theta0))
        params['x'] = engine.update(x, WeldVec(WeldVec(WeldFloat())))
        params['y'] = engine.update(y, WeldVec(WeldFloat()))
        params['m'] = str(float(n_rows))
        params['lam'] = str(float(self.lam))

        engine.weld_code = weld_program % params
        self.th = engine.evaluate(WeldVec(WeldFloat()))

        return self
예제 #3
0
def weld_subset(array, slice_):
    """ Return a subset of the input array

    Omitted slice fields (``None``) are given their usual Python defaults:
    ``start`` becomes 0, ``step`` becomes 1, and a missing ``stop`` means
    "up to the end of the array" (expressed with Weld's ``len``).
    Negative indices/steps are not translated.

    Parameters
    ----------
    array : np.array or WeldObject
        1-dimensional array
    slice_ : slice
        subset to return

    Returns
    -------
    WeldObject
        representation of this computation

    """
    weld_obj = WeldObject(NumPyEncoder(), NumPyDecoder())

    array_var = weld_obj.update(array)
    if isinstance(array, WeldObject):
        # lazy input: refer to it by id and record it as a dependency
        array_var = array.obj_id
        weld_obj.dependencies[array_var] = array

    # slice objects report omitted fields as None; without these defaults the
    # generated code would contain the literal text "i64(None)"
    start = 0 if slice_.start is None else slice_.start
    step = 1 if slice_.step is None else slice_.step

    if slice_.stop is None:
        # number of elements from start to the end of the array
        size_expr = 'len(%s) - i64(%s)' % (array_var, start)
    else:
        size_expr = 'i64(%s)' % (slice_.stop - start)

    if step == 1:
        weld_template = """
        slice(
            %(array)s,
            %(slice_start)s,
            %(slice_stop)s
        )"""
    else:
        # NOTE(review): the third argument given to iter here is a length
        # (stop - start), as in the original code. If Weld's iter expects an end
        # index, this is only right for start == 0 -- confirm against the Weld docs.
        weld_template = """
        result(
            for(
                iter(%(array)s, %(slice_start)s, %(slice_stop)s, %(slice_step)s),
                appender,
                |b, i, n| 
                    merge(b, n)
            )  
        )"""

    weld_obj.weld_code = weld_template % {
        'array': array_var,
        'slice_start': 'i64(%s)' % start,
        'slice_stop': size_expr,
        'slice_step': 'i64(%s)' % step
    }

    return weld_obj
예제 #4
0
class Column(LazyData):
    """Single named column of a table, read on demand.

    Parameters
    ----------
    name
        column label used to index the DataFrame produced by the reader
    table
        owning table; this class uses its ``file_id``, ``slice_start``, ``nrows``
        and ``usecols`` attributes and its ``read_file`` method
    data_id
        stored as-is; not used inside this class
    dtype : np.dtype
        element type; object-dtype ('O') data is converted to str when read

    """
    encoder = NumPyEncoder()
    decoder = NumPyDecoder()

    def __init__(self, name, table, data_id, dtype):
        self.name = name
        self.table = table
        self.data_id = data_id
        self.dtype = dtype

    def _object_as_str(self, data):
        """Return `data` converted to str when this column has object dtype."""
        # treat any object dtype as str; builtin `str` is used because the
        # `np.str` alias no longer exists in current NumPy releases
        if self.dtype.char == 'O':
            return data.astype(str)

        return data

    def eager_read(self):
        """Read the column values for the table's current row range."""
        # make use of cache by retrieving
        df = LazyResult.retrieve_file(self.table.file_id)

        slice_ = slice(self.table.slice_start, self.table.nrows, 1)

        return self._object_as_str(df[self.name][slice_].values)

    def eager_head(self, n=10):
        """Read only the first `n` values of the column."""
        # skip the cache and re-use read_file method with param from Table
        # which will now only read first n rows
        df = self.table.read_file(n)

        return self._object_as_str(df[self.name][:n].values)

    def lazy_skip_columns(self, columns):
        """Remove each of `columns` from the table's ``usecols``."""
        # pandas allows skipping some columns efficiently through the usecols parameter
        for column in columns:
            self.table.usecols.remove(column)

    def lazy_slice_rows(self, slice_):
        """Record the requested row range on the table for later reads."""
        # the parser needs to read until stop anyway, and filter later through eager_read
        self.table.slice_start = slice_.start
        self.table.nrows = slice_.stop
예제 #5
0
    def predict(self, x):
        """Classify a single sample with the current parameters ``self.th``.

        A Weld program computes the logistic function of the dot product of
        ``self.th`` and ``x``; the result is thresholded at 0.5.

        Parameters
        ----------
        x : one sample, registered with Weld as vec[f32]

        Returns
        -------
        float
            1.0 when the evaluated value is >= 0.5, otherwise 0.0
        """
        weld_program = """
            f32(1) / (f32(1) + exp(f32(0) - result(
                @(loopsize: %(th_len)sL)
                for(
                    zip(%(th)s, %(x)s),
                    merger[f32, +],
                    |b, i, e| merge(b, e.$0 * e.$1)
                )
            )))
        """
        obj = WeldObject(NumPyEncoder(), NumPyDecoder())

        # Inputs are registered in the order th, x.
        th_name = obj.update(self.th, WeldVec(WeldFloat()))
        x_name = obj.update(x, WeldVec(WeldFloat()))

        obj.weld_code = weld_program % {
            'th': th_name,
            'x': x_name,
            'th_len': str(len(self.th)),
        }

        probability = obj.evaluate(WeldFloat(), verbose=False)
        if probability >= 0.5:
            return 1.0
        return 0.0
예제 #6
0
class Variable(LazyData, LazyResult):
    """ Weld-ed netCDF4.Variable.

    Functionality is currently (very) restricted to an example operation, printing, and evaluating.

    Parameters
    ----------
    file_id : str
        generated by Dataset from FileMapping
    column_name : str
        the variable name in the dataset
    dimensions : tuple
        same as netCDF4.Variable.dimensions
    shape : tuple
        same as netCDF4.Variable.shape
    attributes : OrderedDict
        all Variable metadata
    expression : str or WeldObject
        str if created by netCDF4_weld.Dataset, else WeldObject tracking the computations created by operations
        on this variable; note that expression must be == column_name if created by Dataset!
    dtype : np.dtype
        type of the elements in this variable

    See also
    --------
    netCDF4.Variable

    """
    encoder = NumPyEncoder()
    decoder = NumPyDecoder()

    def __init__(self, file_id, column_name, dimensions, shape, attributes, expression, dtype):
        inferred_dtype = self._infer_dtype(dtype, attributes)
        weld_type = numpy_to_weld_type(inferred_dtype)
        LazyResult.__init__(self, expression, weld_type, 1)

        self.file_id = file_id
        self.column_name = column_name
        self.dimensions = dimensions
        self.shape = shape
        self.attributes = attributes
        # when reading data with netCDF4, the values are multiplied by the scale_factor if it exists,
        # which means that even if data is of type int, the scale factor is often float making the result a float
        self.dtype = inferred_dtype

        # same as [:]
        # the param used to lazy_slice_rows
        self.tuple_slices = slice(None)
        # row slice requested through lazy_slice_rows; None until a slice is given
        self._slice = None

    @staticmethod
    def _infer_dtype(dtype, attributes):
        """Return the dtype the data will have once read, based on `attributes`.

        float32 when a 'scale_factor' attribute is present, str when a 'calendar'
        attribute is present, otherwise `dtype` unchanged.
        """
        # TODO: can it be float64?
        if 'scale_factor' in attributes:
            return np.dtype(np.float32)
        # calendar is stored as int in netCDF4, but we want the datetime format later which is encoded as a str(?)
        if 'calendar' in attributes:
            # builtin str: the `np.str` alias no longer exists in current NumPy releases
            return np.dtype(str)

        return dtype

    def eager_read(self, slice_=None):
        """Read this variable from file and return it as a 1-dimensional array.

        Parameters
        ----------
        slice_ : slice or tuple of slices, optional
            what to read; defaults to ``self.tuple_slices``

        Returns
        -------
        np.array
            flattened data; date strings when the variable has a 'calendar' attribute
        """
        ds = LazyResult.retrieve_file(self.file_id)

        # implemented like this to allow re-use of this method from eager_head
        if slice_ is None:
            slice_ = self.tuple_slices

        # want just np.array, no MaskedArray; let netCDF4 do the work of replacing missing values
        ds.variables[self.column_name].set_auto_mask(False)
        # the actual read from file call
        data = ds.variables[self.column_name][slice_]

        # TODO: transpose might be required when data variables have dimensions in a different order than the
        # dimensions declarations

        # want dimension = 1
        data = data.reshape(-1)

        attributes = ds.variables[self.column_name].__dict__
        # xarray creates a pandas DatetimeIndex with Timestamps (as it should); to save time however,
        # a shortcut is taken to convert netCDF4 python date -> pandas timestamp -> py datetime
        # TODO: weld pandas DatetimeIndex & Timestamp
        if 'calendar' in attributes:
            dates = netCDF4.num2date(data, attributes['units'], calendar=attributes['calendar'])
            data = np.array([str(pd.Timestamp(k).date()) for k in dates], dtype=str)

        # at this point, netcdf is expected to read a subset; however, it reads slightly more at the end, so slice;
        # self._slice is empty when using eager head
        if self._slice is not None and self.column_name not in self.dimensions:
            len_slice = self._slice.stop - self._slice.start
            return data[:len_slice]

        return data

    def eager_head(self, n=10):
        """Read the first `n` rows, converted to per-dimension slices via ``self.shape``."""
        tuple_slices = convert_row_to_nd_slices(slice(0, n, 1), self.shape)

        # bypass the cache and call directly
        return self.eager_read(slice_=tuple_slices)

    def lazy_skip_columns(self, columns):
        """No-op for netCDF variables."""
        # nothing to do since netcdf is able to read specific columns only
        pass

    def lazy_slice_rows(self, slice_):
        """Record which rows to read on the next ``eager_read``.

        Parameters
        ----------
        slice_ : slice or tuple of slices
            a row slice is converted to slices over all dimensions; a tuple is stored as given

        Raises
        ------
        TypeError
            if `slice_` is neither a slice nor a tuple
        """
        # user wants a slice of rows, so convert to netCDF4 slices for all dimensions
        if isinstance(slice_, slice):
            slice_ = replace_slice_defaults(slice_)
            self._slice = slice_
            self.tuple_slices = convert_row_to_nd_slices(slice_, self.shape)
        elif isinstance(slice_, tuple):  # assumed correct
            self.tuple_slices = slice_
        else:
            raise TypeError('expected either slice or tuple of slices')

    def __repr__(self):
        return "{}(column_name={}, dtype={}, dimensions={}, attributes={})".format(self.__class__.__name__,
                                                                                   self.column_name,
                                                                                   self.dtype,
                                                                                   repr(self.dimensions),
                                                                                   repr(self.attributes))

    def __str__(self):
        return str(self.expr)

    # this and add are to show that one could also implement/do Weld operations at this level, not just in pandas
    def _element_wise_op(self, array, value, operation):
        """Build a WeldObject applying ``<element> <operation> <value>`` to every element of `array`.

        Parameters
        ----------
        array : np.array or WeldObject
            input data
        value
            right-hand operand, inserted into the Weld code as text
        operation : str
            operator symbol inserted into the Weld code, e.g. '+'

        Returns
        -------
        WeldObject
            representation of this computation
        """
        weld_obj = WeldObject(Variable.encoder, Variable.decoder)

        array_var = weld_obj.update(array)

        if isinstance(array, WeldObject):
            # lazy input: refer to it by id and record it as a dependency
            array_var = array.obj_id
            weld_obj.dependencies[array_var] = array

        weld_template = """
        result(
            for(%(array)s, 
                appender[%(type)s], 
                |b: appender[%(type)s], i: i64, n: %(type)s| 
                    merge(b, n %(operation)s %(value)s)
            )
        )"""

        weld_obj.weld_code = weld_template % {'array': array_var,
                                              'value': value,
                                              'operation': operation,
                                              'type': numpy_to_weld_type(self.dtype)}

        return weld_obj

    def __add__(self, value):
        """Return a new Variable whose expression adds `value` to every element."""
        # arguments follow the __init__ order: dimensions first, then shape
        return Variable(self.file_id,
                        self.column_name,
                        self.dimensions,
                        self.shape,
                        self.attributes,
                        self._element_wise_op(self.expr, value, '+'),
                        self.dtype)
예제 #7
0
import numpy as np
from grizzly.encoders import NumPyEncoder, NumPyDecoder, numpy_to_weld_type
from weld.types import WeldLong
from weld.weldobject import WeldObject

from lazy_result import LazyResult

# Module-level encoder/decoder pair passed to the WeldObjects created in this module;
# the methods are only intended to work with numpy, so have a single encoder/decoder
_encoder = NumPyEncoder()
_decoder = NumPyDecoder()

# TODO: could generalize to return either values or indices


def _duplicate_elements_indices(array, n, weld_type, cartesian=False):
    weld_obj = WeldObject(_encoder, _decoder)

    array_var = weld_obj.update(array)

    if isinstance(array, WeldObject):
        array_var = array.obj_id
        weld_obj.dependencies[array_var] = array

    if isinstance(n, WeldObject):
        weld_obj.update(n)
        weld_obj.dependencies[n.obj_id] = n
        n = 'len(%s)' % n.obj_id
    elif isinstance(n, np.ndarray):
        array_var = weld_obj.update(n)
        n = 'len(%s)' % array_var