def _aggregate(self, operation):
    """ Run a grouped aggregation and wrap the outcome in a DataFrame

    Parameters
    ----------
    operation
        forwarded unchanged to weld_groupby_aggregate

    Returns
    -------
    DataFrame
        one Series per entry of self.columns; indexed by an Index when grouping
        by a single column and by a MultiIndex otherwise

    """
    by_weld_types = [str(numpy_to_weld_type(by_type))
                     for by_type in self.by_types]
    columns_weld_types = [str(numpy_to_weld_type(column_type))
                          for column_type in self.columns_types]
    grouped = weld_groupby_aggregate(self.expr,
                                     by_weld_types,
                                     columns_weld_types,
                                     operation)

    # NOTE(review): the boolean passed to weld_get_column presumably selects
    # between the grouping keys (True) and the aggregated values (False)
    if len(self.by) != 1:
        level_arrays = [
            LazyResult(weld_get_column(grouped, position, True),
                       numpy_to_weld_type(self.by_types[position]),
                       1)
            for position, _ in enumerate(self.by)
        ]
        new_index = MultiIndex.from_arrays(level_arrays, self.by)
    else:
        new_index = Index(weld_get_column(grouped, 0, True),
                          self.by_types[0],
                          self.by[0])

    new_data = OrderedDict()
    for position, column_name in enumerate(self.columns):
        new_data[column_name] = Series(weld_get_column(grouped, position, False),
                                       self.columns_types[position],
                                       new_index,
                                       column_name)

    return DataFrame(new_data, new_index)
def __getitem__(self, item):
    """ Select part of the Index

    Parameters
    ----------
    item : slice or LazyResult
        a slice yields a sliced Index; a LazyResult of bool yields an Index
        holding only the labels whose matching entry is True

    Returns
    -------
    Index

    Raises
    ------
    ValueError
        if a LazyResult is passed whose weld type is not bool
    TypeError
        if item is neither a slice nor a LazyResult

    """
    if not isinstance(item, (slice, LazyResult)):
        raise TypeError(
            'expected slice or LazyResult of bool in Index.__getitem__')

    if isinstance(item, slice):
        # the slice is registered on this object through update_rows before
        # the new Index is built from the same expression
        self.update_rows(replace_slice_defaults(item))

        return Index(self.expr, self.dtype)

    bool_type = str(numpy_to_weld_type('bool'))
    if str(item.weld_type) != bool_type:
        raise ValueError(
            'expected LazyResult of bool to filter Index elements')

    filtered = weld_filter(self.expr, item.expr)

    return Index(filtered, self.dtype)
def get_weld_type(data):
    """ Return the weld type describing data

    Parameters
    ----------
    data : LazyResult or np.ndarray
        a LazyResult reports its own weld_type; an ndarray has its dtype
        converted through numpy_to_weld_type

    Returns
    -------
    the weld type of data

    Raises
    ------
    TypeError
        if data is neither a LazyResult nor an np.ndarray

    """
    if isinstance(data, LazyResult):
        return data.weld_type

    if isinstance(data, np.ndarray):
        return numpy_to_weld_type(data.dtype)

    raise TypeError('expected LazyResult or np.ndarray')
def read_csv(path):
    """ Read a csv file as a DataFrame

    Parameters
    ----------
    path : str
        path of the file

    Returns
    -------
    DataFrame
        one lazy column per column of the csv table, with an int64 Index
        produced by weld_range from 0 with step 1 up to the length of a column

    Raises
    ------
    IndexError
        if the table exposes no columns

    """
    table = csv_weld.Table(path)

    new_columns = {}
    for column_name in table.columns:
        column = table.columns[column_name]
        # only the placeholder object is used; the generated input name is
        # not needed here
        weld_obj, _input_name = LazyResult.generate_placeholder_weld_object(
            column.data_id, column.encoder, column.decoder)
        new_columns[column_name] = LazyResult(weld_obj,
                                              numpy_to_weld_type(column.dtype),
                                              1)

    # any column can serve as the length reference for the index
    # (presumably all columns have the same number of rows);
    # list(...)[0] also works where keys()/values() return views, not lists
    reference_column = list(new_columns.values())[0]
    index_weld_obj = weld_range(
        0, 'len({})'.format(reference_column.expr.weld_code), 1)
    # make the index expression aware of the column it takes its length from
    index_weld_obj.update(reference_column.expr)

    return DataFrame(new_columns, Index(index_weld_obj, np.dtype(np.int64)))
def __init__(self, data, dtype, name=None):
    """ Create an Index over raw or lazy data

    Parameters
    ----------
    data : np.ndarray or WeldObject
        the labels, either materialized or as a lazy expression
    dtype
        numpy dtype of the labels; converted with numpy_to_weld_type for the
        parent constructor and stored as self.dtype
    name : str, optional
        name of the Index; 'Index' is used when None

    Raises
    ------
    TypeError
        if data is neither an np.ndarray nor a WeldObject

    """
    if not isinstance(data, (np.ndarray, WeldObject)):
        # the message names this class so the failing constructor is
        # identifiable from the traceback text alone
        raise TypeError(
            'expected np.ndarray or WeldObject in Index.__init__')

    super(Index, self).__init__(data, numpy_to_weld_type(dtype), 1)

    self.dtype = dtype
    self.name = 'Index' if name is None else name
def test_getitem_slice(self):
    """ Slicing series[:2] keeps the first two elements and their index """
    int64 = np.dtype(np.int64)

    source = LazyResult(np.array([1, 2, 3]), numpy_to_weld_type('int64'), 1)
    full_series = Series(source.expr, int64, RangeIndex(0, 3, 1))

    expected = Series(np.array([1, 2]), int64, RangeIndex(0, 2, 1))
    actual = full_series[:2]

    test_equal_series(expected, actual)
def test_getitem_filter(self):
    """ Filtering an Index with a bool LazyResult keeps only the True labels """
    # np.bool_ instead of the np.bool alias, which is not available in every
    # NumPy release; both describe the same boolean dtype
    bool_dtype = np.dtype(np.bool_)
    to_filter = LazyResult(np.array([True, False, True], dtype=bool_dtype),
                           numpy_to_weld_type(bool_dtype),
                           1)

    result = pdw.Index(np.array([1, 2, 3]), np.dtype(np.int64))[to_filter]

    expected_result = pdw.Index(np.array([1, 3]), np.dtype(np.int64))

    np.testing.assert_array_equal(
        evaluate_if_necessary(expected_result).data,
        evaluate_if_necessary(result).data)
def __init__(self, file_id, column_name, dimensions, shape, attributes, expression, dtype):
    """ Set up a lazily evaluated variable

    Parameters
    ----------
    file_id
        stored as self.file_id
    column_name
        stored as self.column_name
    dimensions
        stored as self.dimensions
    shape
        stored as self.shape
    attributes
        stored as self.attributes; also consulted when inferring the dtype
    expression
        handed to LazyResult.__init__ as the data of this variable
    dtype
        declared dtype; the dtype actually kept is the one returned by
        self._infer_dtype(dtype, attributes)

    """
    # when reading data with netCDF4, the values are multiplied by the
    # scale_factor if it exists, so int data frequently ends up as float;
    # hence the dtype is inferred rather than taken as declared
    actual_dtype = self._infer_dtype(dtype, attributes)

    LazyResult.__init__(self, expression, numpy_to_weld_type(actual_dtype), 1)

    self.file_id = file_id
    self.column_name = column_name
    self.dimensions = dimensions
    self.shape = shape
    self.attributes = attributes
    self.dtype = actual_dtype

    # slice(None) is the same as [:], i.e. everything is selected;
    # this is the param used to lazy_slice_rows
    self.tuple_slices = slice(None)
    self._slice = None
def duplicate_elements_indices(array, n, cartesian=False):
    """ Expands array by multiplying each element n times

    Parameters
    ----------
    array : np.ndarray or LazyResult
        the source data
    n : int, long, np.ndarray or LazyResult
        how many times to repeat each element; if np.ndarray or LazyResult,
        will use its length
    cartesian : bool
        True if used internally by cartesian_product to signify the operation
        has been done once already and hence must behave slightly different by
        using the number in the array instead of the index of that number
        (since at this point the array already contains indexes)

    Returns
    -------
    LazyResult
        the expanded array containing the indices, not the elements

    Raises
    ------
    NotImplementedError
        if array is neither a LazyResult nor an np.ndarray
    TypeError
        if n is not an integral number, an np.ndarray or a LazyResult

    Examples
    --------
    >>> duplicate_elements_indices(np.array([1, 2, 3]), 2)
    [0, 0, 1, 1, 2, 2]

    """
    # imported locally so the integral check does not rely on the `long`
    # builtin, which only exists on Python 2
    import numbers

    if isinstance(array, LazyResult):
        weld_type = array.weld_type
        array = array.expr
    elif isinstance(array, np.ndarray):
        weld_type = numpy_to_weld_type(array.dtype)
    else:
        raise NotImplementedError

    if isinstance(n, LazyResult):
        n = n.expr
    elif isinstance(n, np.ndarray):
        n = len(n)
    elif isinstance(n, bool) or not isinstance(n, numbers.Integral):
        # any integral count is accepted (the ndarray branch above already
        # forwards a plain int from len()); bool stays rejected even though
        # it is formally an integral type
        raise TypeError(
            'expected either a long value or a LazyResult to use its length')

    return LazyResult(
        _duplicate_elements_indices(array, n, weld_type, cartesian),
        WeldLong(),
        1)
def __getitem__(self, item):
    """ Retrieve a portion of the MultiIndex

    Parameters
    ----------
    item : slice or LazyResult
        if slice, returns a sliced MultiIndex;
        if LazyResult, returns a filtered MultiIndex only with the labels
        corresponding to True in the LazyResult

    Returns
    -------
    MultiIndex
        same levels and names as this one, with every labels array
        sliced or filtered

    Raises
    ------
    ValueError
        if a LazyResult is passed whose weld type is not bool
    TypeError
        if item is neither a slice nor a LazyResult

    """
    # TODO: filter unnecessary levels too, both slice and LazyResult
    if isinstance(item, slice):
        item = replace_slice_defaults(item)

        new_labels = [
            LazyResult(weld_subset(get_expression_or_raw(label), item),
                       get_weld_type(label),
                       1)
            for label in self.labels
        ]

        return MultiIndex(self.levels, new_labels, self.names)
    elif isinstance(item, LazyResult):
        if str(item.weld_type) != str(numpy_to_weld_type('bool')):
            # message refers to this class rather than to DataFrame rows
            raise ValueError(
                'expected LazyResult of bool to filter MultiIndex elements')

        new_labels = []
        for label in self.labels:
            label_data, weld_type = get_weld_info(label, True, True)
            new_labels.append(
                LazyResult(weld_filter(label_data, item.expr), weld_type, 1))

        return MultiIndex(self.levels, new_labels, self.names)
    else:
        raise TypeError(
            'expected slice or LazyResult of bool in MultiIndex.__getitem__'
        )
def test_getitem_filter(self):
    """ Filtering a MultiIndex keeps the levels and filters every labels array """
    levels = [
        LazyResult(np.array([1, 2]), WeldLong(), 1),
        LazyResult(np.array([3, 4]), WeldLong(), 1)
    ]
    names = ['a', 'b']

    # np.bool_ instead of the np.bool alias, which is not available in every
    # NumPy release; both describe the same boolean dtype
    to_filter = LazyResult(
        np.array([True, False, True, False], dtype=np.bool_),
        numpy_to_weld_type(np.dtype(np.bool_)),
        1)

    result = pdw.MultiIndex.from_product(levels, names)[to_filter]

    expected_result = pdw.MultiIndex(
        [
            LazyResult(np.array([1, 2]), WeldLong(), 1),
            LazyResult(np.array([3, 4]), WeldLong(), 1)
        ],
        [
            LazyResult(np.array([0, 1]), WeldLong(), 1),
            LazyResult(np.array([0, 0]), WeldLong(), 1)
        ],
        ['a', 'b'])

    test_equal_multiindex(expected_result, result)
def __getitem__(self, item):
    """ Lazy operation to select a subset of the series

    Has consequences! When slicing, any previous and/or following operations
    on the data within will be done only on this subset of the data

    Parameters
    ----------
    item : slice or LazyResult
        if slice, a slice of the data for the number of desired rows;
        currently must contain a stop value and will not work as expected
        for start != 0 and stride != 1;
        if LazyResult, returns a filtered Series only with the elements
        corresponding to True in the item LazyResult

    Returns
    -------
    Series
        carries the sliced or filtered index and the name of this Series

    Raises
    ------
    ValueError
        if a LazyResult is passed whose weld type is not bool
    TypeError
        if item is neither a slice nor a LazyResult

    """
    if isinstance(item, slice):
        item = replace_slice_defaults(item)

        # the slice is registered on this object before the result is built
        self.update_rows(item)
        new_index = self.index[item]

        return Series(self.expr, self.dtype, new_index, self.name)
    elif isinstance(item, LazyResult):
        if str(item.weld_type) != str(numpy_to_weld_type('bool')):
            # message refers to this class rather than to DataFrame rows
            raise ValueError(
                'expected LazyResult of bool to filter Series elements')

        new_index = self.index[item]

        return Series(weld_filter(self.expr, item.expr),
                      self.dtype,
                      new_index,
                      self.name)
    else:
        raise TypeError(
            'expected a slice or a Series of bool in Series.__getitem__')
def _element_wise_op(self, array, value, operation):
    """ Build the weld object applying an operation to every element

    Parameters
    ----------
    array : np.ndarray or WeldObject
        the input data; a WeldObject is registered as a dependency and
        referenced through its obj_id
    value
        right-hand operand, substituted as-is into the weld code
    operation
        operator text placed between each element and value

    Returns
    -------
    WeldObject
        its weld_code appends `element operation value` for each element

    """
    result_obj = WeldObject(Variable.encoder, Variable.decoder)

    input_name = result_obj.update(array)
    if isinstance(array, WeldObject):
        # lazy input: refer to it by its own id and record the dependency
        input_name = array.obj_id
        result_obj.dependencies[input_name] = array

    weld_template = """ result( for(%(array)s, appender[%(type)s], |b: appender[%(type)s], i: i64, n: %(type)s| merge(b, n %(operation)s %(value)s) ) )"""

    substitutions = {
        'array': input_name,
        'value': value,
        'operation': operation,
        'type': numpy_to_weld_type(self.dtype),
    }
    result_obj.weld_code = weld_template % substitutions

    return result_obj
def __getitem__(self, item):
    """ Retrieve a portion of the DataFrame

    Has consequences! When slicing, any previous and/or following operations
    on the data within will be done only on this subset of the data

    Parameters
    ----------
    item : str or slice or list of str or LazyResult
        if str, returns a column as a Series;
        if slice, returns a sliced DataFrame;
        if list, returns a DataFrame with only the columns from the list;
        if LazyResult, returns a filtered DataFrame only with the rows
        corresponding to True in the LazyResult

    Returns
    -------
    Series or DataFrame

    Raises
    ------
    ValueError
        if a LazyResult is passed whose weld type is not bool
    TypeError
        if item is of an unsupported type, or a list holding a non-str entry

    """
    if isinstance(item, str):
        element = self.data[item]
        data, dtype = get_weld_info(element, expression=True, dtype=True)

        return Series(data, dtype, self.index, item)
    elif isinstance(item, slice):
        item = replace_slice_defaults(item)

        new_data = {}
        for column_name in self:
            # making series because Series has the proper method to slice
            # something; re-use the str branch above
            series = self[str(column_name)]
            # the actual slice handled by Series getitem
            new_data[column_name] = series[item]

        # index slice handled by index
        new_index = self.index[item]

        return DataFrame(new_data, new_index)
    elif isinstance(item, list):
        new_data = {}
        for column_name in item:
            if not isinstance(column_name, str):
                raise TypeError(
                    'expected a list of column names as strings')

            new_data[column_name] = self.data[column_name]

        return DataFrame(new_data, self.index)
    elif isinstance(item, LazyResult):
        if str(item.weld_type) != str(numpy_to_weld_type('bool')):
            raise ValueError(
                'expected series of bool to filter DataFrame rows')

        # filter the index once, up front, so that every filtered column is
        # paired with the index matching its filtered data
        new_index = self.index[item]

        new_data = {}
        for column_name in self:
            data, _weld_type, dtype = get_weld_info(self.data[column_name],
                                                    expression=True,
                                                    weld_type=True,
                                                    dtype=True)
            new_data[column_name] = Series(weld_filter(data, item.expr),
                                           dtype,
                                           new_index,
                                           column_name)

        return DataFrame(new_data, new_index)
    else:
        raise TypeError(
            'expected a str, slice, list, or Series in DataFrame.__getitem__'
        )