def reset_index(self):
    """ Returns a new DataFrame in which the previous index is stored as columns

    The index columns are placed first, followed by the existing data columns;
    the returned DataFrame gets a fresh RangeIndex starting at 0 with step 1.

    Returns
    -------
    DataFrame

    """
    columns = OrderedDict()

    index = self.index
    if isinstance(index, (Index, RangeIndex)):
        # single-level index: it becomes a column as-is, under its own name
        columns[index.name] = index
    else:
        # anything else is treated as a MultiIndex: one column per level,
        # obtained by mapping the labels back onto the level values
        for i, level in enumerate(index.levels):
            values = weld_index_to_values(get_expression_or_raw(level),
                                          get_expression_or_raw(index.labels[i]))
            columns[index.names[i]] = LazyResult(values, get_weld_type(level), 1)

    # the data columns follow the index columns
    columns.update(self.data)

    # the last column is used to determine the number of rows;
    # note that the count is evaluated on the spot
    last_column = get_expression_or_raw(columns.values()[-1])
    n_rows = LazyResult(weld_count(last_column), WeldLong(), 0).evaluate()

    return DataFrame(columns, RangeIndex(0, n_rows, 1))
def read_csv(path):
    """ Read a csv file as a DataFrame

    Each column of the table is wrapped in a LazyResult around a placeholder
    weld object; the index is a weld range over the length of one of the columns.

    Parameters
    ----------
    path : str
        path of the file

    Returns
    -------
    DataFrame

    """
    table = csv_weld.Table(path)

    new_columns = {}
    for column_name, column in table.columns.items():
        # only the placeholder object is used here, not the generated input name
        weld_obj, _ = LazyResult.generate_placeholder_weld_object(column.data_id,
                                                                  column.encoder,
                                                                  column.decoder)
        new_columns[column_name] = LazyResult(weld_obj,
                                              numpy_to_weld_type(column.dtype),
                                              1)

    # any column can provide the length for the index; take whichever key comes first
    first_name = new_columns.keys()[0]
    any_column = new_columns[first_name]

    length_code = 'len({})'.format(any_column.expr.weld_code)
    index_weld_obj = weld_range(0, length_code, 1)
    # the range code refers to that column, so bring its expression into the index object
    index_weld_obj.update(any_column.expr)

    return DataFrame(new_columns, Index(index_weld_obj, np.dtype(np.int64)))
def __init__(self, path):
    """ Register this table as a lazy file and create its columns from the header metadata

    Parameters
    ----------
    path : str
        path of the file  (presumably a csv file; read_metadata is not visible here -- TODO confirm)

    """
    # the generated id is the key under which this table is registered with LazyResult
    self.file_id = LazyResult.generate_file_id(Table._FILE_FORMAT)
    LazyResult.register_lazy_file(self.file_id, self)

    self.path = path

    # NOTE(review): read_metadata presumably relies on self.path being set already -- confirm
    header_df = self.read_metadata()

    # the params used to lazy_slice_rows and lazy_skip_columns
    self.slice_start = None
    self.nrows = None
    # iterating the header yields the column names (the same iteration is used in _create_columns)
    self.usecols = list(header_df)

    self.columns = self._create_columns(header_df)
def _create_columns(self, header_df):
    """ Create a Column for every entry of the header and register each one as lazy data

    Parameters
    ----------
    header_df
        iterated to obtain the column names and indexed by name to obtain each dtype

    Returns
    -------
    dict
        column name -> Column

    """
    from weld.weldobject import WeldObject

    result = {}
    for name in header_df:
        data_id = LazyResult.generate_data_id(name)
        dtype = header_df[name].dtype
        column = Column(name, self, data_id, dtype)
        result[name] = column

        # the column is registered under the weld input name derived from its data_id
        input_name = WeldObject.generate_input_name(data_id)
        LazyResult.register_lazy_data(input_name, column)

    return result
def array_to_labels(array, levels, levels_type):
    """ Extracts the indices of the values in the array

    Parameters
    ----------
    array : np.ndarray or LazyResult
        the source data
    levels : np.ndarray or LazyResult
        the unique items from the array, currently sorted by default (see TODOs)
    levels_type : WeldType
        of the levels

    Returns
    -------
    LazyResult
        the labels for MultiIndex

    """
    # LazyResult inputs are unwrapped to their expression; anything else is passed through
    array_expr = array.expr if isinstance(array, LazyResult) else array
    levels_expr = levels.expr if isinstance(levels, LazyResult) else levels

    labels = _array_to_labels(array_expr, levels_expr, levels_type)

    return LazyResult(labels, WeldLong(), 1)
def _index_to_values(levels, labels):
    """ Wrap weld_index_to_values in a LazyResult carrying the weld type of the levels

    Parameters
    ----------
    levels
        the level values; both its expression and weld type are extracted
    labels
        the labels; unwrapped through get_expression_or_raw

    Returns
    -------
    LazyResult

    """
    levels_expr, levels_type = get_weld_info(levels, expression=True, weld_type=True)
    labels_expr = get_expression_or_raw(labels)

    values = weld_index_to_values(levels_expr, labels_expr)

    return LazyResult(values, levels_type, 1)
def eager_read(self, slice_=None):
    """ Read the data of this variable from the file

    Parameters
    ----------
    slice_ : optional
        what to read from the variable; when None, self.tuple_slices is used

    Returns
    -------
    the data flattened to 1 dimension; when the variable has a 'calendar' attribute,
    the values are converted to date strings

    """
    ds = LazyResult.retrieve_file(self.file_id)
    variable = ds.variables[self.column_name]

    # the parameter exists to allow re-use of this method from eager_head
    if slice_ is None:
        slice_ = self.tuple_slices

    # want just np.array, no MaskedArray; let netCDF4 do the work of replacing missing values
    variable.set_auto_mask(False)

    # the actual read from file call, flattened since dimension = 1 is wanted
    # TODO: transpose might be required when data variables have dimensions in a different order than the
    # dimensions declarations
    data = variable[slice_].reshape(-1)

    attributes = variable.__dict__
    # xarray creates a pandas DatetimeIndex with Timestamps (as it should); to save time however,
    # a shortcut is taken to convert netCDF4 python date -> pandas timestamp -> py datetime
    # TODO: weld pandas DatetimeIndex & Timestamp
    if 'calendar' in attributes:
        dates = netCDF4.num2date(data, attributes['units'], calendar=attributes['calendar'])
        data = np.array([str(pd.Timestamp(date).date()) for date in dates], dtype=np.str)

    # at this point, netcdf is expected to read a subset; however, it reads slightly more at the end, so slice;
    # self._slice is empty when using eager head
    needs_trim = self._slice is not None and self.column_name not in self.dimensions
    if not needs_trim:
        return data

    len_slice = self._slice.stop - self._slice.start
    return data[:len_slice]
def _aggregate(self, operation, verbose=True, decode=True, passes=None,
               num_threads=1, apply_experimental_transforms=False):
    """ Apply an aggregation to every column and collect the evaluated results in a Series

    Each column is aggregated and evaluated on its own, in iteration order.

    Parameters
    ----------
    operation : str or unicode
        the aggregation, forwarded to weld_aggregate
    verbose, decode, passes, num_threads, apply_experimental_transforms
        forwarded positionally to LazyResult.evaluate

    Returns
    -------
    Series
        the aggregated values cast to float64, indexed by the column names as str

    """
    assert isinstance(operation, (str, unicode))

    names = []
    values = []
    for column_name in self:
        names.append(column_name)

        # get as series and apply the operation
        series = self[str(column_name)]
        aggregated = LazyResult(weld_aggregate(series.expr, operation, series.weld_type),
                                series.weld_type,
                                0)
        values.append(aggregated.evaluate(verbose, decode, passes, num_threads,
                                          apply_experimental_transforms))

    data = np.array(values).astype(np.float64)
    index = Index(np.array(names).astype(np.str), np.dtype(np.str))

    return Series(data, np.dtype(np.float64), index)
def test_head(self):
    """ head(2) of a 3-element series is expected to equal the first 2 values """
    int64 = np.dtype(np.int64)
    data = LazyResult(np.array([1, 2, 3]), int64, 1)
    series = Series(data.expr, int64, RangeIndex(0, 2, 1))

    np.testing.assert_array_equal(np.array([1, 2]), series.head(2))
def _create_variables(self):
    """ Create a Variable for every variable of the dataset and register each one as lazy data

    Returns
    -------
    OrderedDict
        variable name -> Variable, in the iteration order of self.ds.variables

    """
    created = OrderedDict()

    for name, source in self.ds.variables.items():
        # generate a data_id to act as placeholder to the data
        data_id = LazyResult.generate_data_id(name)
        weld_obj, weld_input_id = LazyResult.generate_placeholder_weld_object(data_id,
                                                                              Variable.encoder,
                                                                              Variable.decoder)

        variable = Variable(self.file_id,
                            name,
                            source.dimensions,
                            source.shape,
                            source.__dict__,
                            weld_obj,
                            source.dtype)
        LazyResult.register_lazy_data(weld_input_id, variable)

        created[name] = variable

    return created
def __init__(self, file_id, column_name, dimensions, shape, attributes, expression, dtype):
    """ Initialise the variable as a LazyResult of dimension 1 and store its file metadata

    Parameters
    ----------
    file_id
        id of the file this variable belongs to; used by eager_read to retrieve the file
    column_name
        name of the variable inside the file
    dimensions
        dimensions of the variable, as provided by the caller
    shape
        shape of the variable, as provided by the caller
    attributes
        attributes of the variable; also passed to _infer_dtype
    expression
        the expression handed to LazyResult.__init__
    dtype
        dtype as declared in the file; the stored dtype is the one inferred by _infer_dtype

    """
    # the weld type is derived from the inferred dtype, not from the declared one
    inferred_dtype = self._infer_dtype(dtype, attributes)
    weld_type = numpy_to_weld_type(inferred_dtype)

    LazyResult.__init__(self, expression, weld_type, 1)

    self.file_id = file_id
    self.column_name = column_name
    self.dimensions = dimensions
    self.shape = shape
    self.attributes = attributes
    # when reading data with netCDF4, the values are multiplied by the scale_factor if it exists,
    # which means that even if data is of type int, the scale factor is often float making the result a float
    self.dtype = inferred_dtype

    # same as [:]
    # the param used to lazy_slice_rows
    self.tuple_slices = slice(None)
    # stays None until a slice is set elsewhere; eager_read checks it before trimming
    self._slice = None
def _create_columns(self, header_df):
    """ Create a Column for every entry of the header, register it and read its data right away

    Parameters
    ----------
    header_df
        iterated to obtain the column names and indexed by name to obtain each dtype

    Returns
    -------
    dict
        column name -> Column

    """
    from weld.weldobject import WeldObject

    result = {}
    for name in header_df:
        data_id = LazyResult.generate_data_id(name)
        dtype = header_df[name].dtype
        column = Column(name, self, data_id, dtype)

        input_name = WeldObject.generate_input_name(data_id)
        LazyResult.register_lazy_data(input_name, column)

        # force read it eagerly
        eager_data = column.eager_read()
        LazyResult.input_mapping[str(input_name)] = eager_data

        result[name] = column

    return result
def test_getitem_slice(self):
    """ series[:2] is expected to equal a series of the first 2 values with RangeIndex(0, 2, 1) """
    data = LazyResult(np.array([1, 2, 3]), numpy_to_weld_type('int64'), 1)
    series = Series(data.expr, np.dtype(np.int64), RangeIndex(0, 3, 1))

    expected_result = Series(np.array([1, 2]),
                             np.dtype(np.int64),
                             RangeIndex(0, 2, 1))

    test_equal_series(expected_result, series[:2])
def __getitem__(self, item):
    """ Retrieve a portion of the MultiIndex

    Only the labels are sliced/filtered; levels and names are carried over unchanged.

    Parameters
    ----------
    item : slice or LazyResult
        if slice, returns a sliced MultiIndex;
        if LazyResult, returns a filtered MultiIndex only with the labels
        corresponding to True in the LazyResult

    Returns
    -------
    MultiIndex

    Raises
    ------
    TypeError
        if item is neither a slice nor a LazyResult
    ValueError
        if item is a LazyResult whose weld type is not bool

    """
    # TODO: filter unnecessary levels too, both slice and LazyResult
    if not isinstance(item, (slice, LazyResult)):
        raise TypeError(
            'expected slice or LazyResult of bool in MultiIndex.__getitem__'
        )

    if isinstance(item, slice):
        bounds = replace_slice_defaults(item)
        sliced_labels = [
            LazyResult(weld_subset(get_expression_or_raw(label), bounds),
                       get_weld_type(label),
                       1)
            for label in self.labels
        ]

        return MultiIndex(self.levels, sliced_labels, self.names)

    # from here on, item is a LazyResult
    if str(item.weld_type) != str(numpy_to_weld_type('bool')):
        raise ValueError(
            'expected series of bool to filter DataFrame rows')

    filtered_labels = []
    for label in self.labels:
        label_expr, label_type = get_weld_info(label, True, True)
        filtered = weld_filter(label_expr, item.expr)
        filtered_labels.append(LazyResult(filtered, label_type, 1))

    return MultiIndex(self.levels, filtered_labels, self.names)
def __init__(self, path, variables=None, dimensions=None):
    """ Register this dataset as a lazy file and set up its variables, dimensions and columns

    Parameters
    ----------
    path
        path of the file
    variables : optional
        the variables to use; when None, they are created through _create_variables
    dimensions : optional
        the dimensions to use; when None, an OrderedDict of name -> size
        is built from the dimensions of the dataset metadata

    """
    self.file_id = LazyResult.generate_file_id(Dataset._FILE_FORMAT)
    LazyResult.register_lazy_file(self.file_id, self)

    self.path = path
    self.ds = self.read_metadata()

    self.variables = self._create_variables() if variables is None else variables

    if dimensions is None:
        dimensions = OrderedDict((name, dimension.size)
                                 for name, dimension in self.ds.dimensions.items())
    self.dimensions = dimensions

    # the columns are the variables which are not dimensions
    self._columns = [name for name in self.variables if name not in self.dimensions]
def test_getitem_filter(self):
    """ filtering Index [1, 2, 3] with [True, False, True] is expected to give Index [1, 3] """
    bool_dtype = np.dtype(np.bool)
    mask = np.array([True, False, True], dtype=bool_dtype)
    to_filter = LazyResult(mask, numpy_to_weld_type(np.dtype(np.bool)), 1)

    index = pdw.Index(np.array([1, 2, 3]), np.dtype(np.int64))
    result = index[to_filter]

    expected_result = pdw.Index(np.array([1, 3]), np.dtype(np.int64))

    expected_data = evaluate_if_necessary(expected_result).data
    result_data = evaluate_if_necessary(result).data
    np.testing.assert_array_equal(expected_data, result_data)
def _merge_multi(self, index1, index2):
    """ Compute the bool arrays marking which entries of two 3-level indexes to keep

    Parameters
    ----------
    index1, index2
        the indexes to merge; both must have exactly 3 levels and provide expand()

    Returns
    -------
    list of LazyResult
        2 results of type WeldBit, built from the first two outputs of weld_merge_triple_index

    """
    assert len(index1.levels) == len(index2.levels) == 3

    expanded1 = index1.expand()
    expanded2 = index2.expand()

    # the first 3 items of each expanded index are passed on to the merge
    first = [get_expression_or_raw(expanded1[i]) for i in range(3)]
    second = [get_expression_or_raw(expanded2[i]) for i in range(3)]

    merged = weld_merge_triple_index([first, second], DataFrame._cache_flag)

    return [LazyResult(merged[0], WeldBit(), 1),
            LazyResult(merged[1], WeldBit(), 1)]
def eager_read(self):
    """ Read the values of this column from the file of its table

    Returns
    -------
    the values of the column for the rows selected by the table;
    columns of object dtype are converted to str

    """
    # make use of cache by retrieving
    df = LazyResult.retrieve_file(self.table.file_id)

    # NOTE(review): nrows is passed as the slice stop -- confirm it is maintained as an
    # end position rather than as a number of rows
    rows = slice(self.table.slice_start, self.table.nrows, 1)
    values = df[self.name][rows].values

    if self.dtype.char != 'O':
        return values

    # treat any object dtype as str
    return values.astype(np.str)
def from_arrays(cls, arrays, names):
    """ Build an instance of cls from arrays: the unique values become levels, their positions labels

    Parameters
    ----------
    arrays : list
        one array per level; each is passed through get_weld_type and get_expression_or_raw
    names : list
        passed unchanged to cls as the names

    Returns
    -------
    cls(levels, labels, names)

    """
    weld_types = [get_weld_type(array) for array in arrays]
    expressions = [get_expression_or_raw(array) for array in arrays]

    levels = [LazyResult(weld_unique(expression, weld_type), weld_type, 1)
              for expression, weld_type in zip(expressions, weld_types)]
    levels_types = [level.weld_type for level in levels]

    labels = [npw.array_to_labels(expression, level, level_type)
              for expression, level, level_type in zip(expressions, levels, levels_types)]

    return cls(levels, labels, names)
def duplicate_elements_indices(array, n, cartesian=False):
    """ Expands array by multiplying each element n times

    Parameters
    ----------
    array : np.ndarray or LazyResult
        the source data
    n : long or np.ndarray or LazyResult
        how many times to repeat each element; an np.ndarray contributes its length
        and a LazyResult its expression; note that a plain int is rejected
    cartesian : bool
        True if used internally by cartesian_product to signify the operation has been done once already
        and hence must behave slightly different by using the number in the array instead of the index
        of that number (since at this point the array already contains indexes)

    Returns
    -------
    LazyResult
        the expanded array containing the indices, not the elements

    Raises
    ------
    NotImplementedError
        if array is neither a LazyResult nor a np.ndarray
    TypeError
        if n is none of the accepted types

    Examples
    --------
    >>> duplicate_elements_indices(np.array([1, 2, 3]), 2L)
    [0, 0, 1, 1, 2, 2]

    """
    if not isinstance(array, (LazyResult, np.ndarray)):
        raise NotImplementedError

    if isinstance(array, LazyResult):
        weld_type, array = array.weld_type, array.expr
    else:
        weld_type = numpy_to_weld_type(array.dtype)

    if isinstance(n, LazyResult):
        n = n.expr
    elif isinstance(n, np.ndarray):
        n = len(n)
    elif not isinstance(n, long):
        raise TypeError(
            'expected either a long value or a LazyResult to use its length')

    indices = _duplicate_elements_indices(array, n, weld_type, cartesian)

    return LazyResult(indices, WeldLong(), 1)
def test_getitem_filter(self):
    """ filtering the product of levels [1, 2] x [3, 4] with [True, False, True, False]
    is expected to keep labels [0, 1] and [0, 0] """

    def long_array(values):
        return LazyResult(np.array(values), WeldLong(), 1)

    levels = [long_array([1, 2]), long_array([3, 4])]
    names = ['a', 'b']

    mask = np.array([True, False, True, False], dtype=np.bool)
    to_filter = LazyResult(mask, numpy_to_weld_type(np.dtype(np.bool)), 1)

    result = pdw.MultiIndex.from_product(levels, names)[to_filter]

    expected_levels = [long_array([1, 2]), long_array([3, 4])]
    expected_labels = [long_array([0, 1]), long_array([0, 0])]
    expected_result = pdw.MultiIndex(expected_levels, expected_labels, ['a', 'b'])

    test_equal_multiindex(expected_result, result)
def test_from_product(self):
    """ the product of levels [1, 2] x [3, 4] is expected to have
    labels [0, 0, 1, 1] and [0, 1, 0, 1] """

    def long_array(values):
        return LazyResult(np.array(values), WeldLong(), 1)

    levels = [long_array([1, 2]), long_array([3, 4])]
    names = ['a', 'b']

    result = pdw.MultiIndex.from_product(levels, names)

    expected_levels = [long_array([1, 2]), long_array([3, 4])]
    expected_labels = [long_array([0, 0, 1, 1]), long_array([0, 1, 0, 1])]
    expected_result = pdw.MultiIndex(expected_levels, expected_labels, ['a', 'b'])

    test_equal_multiindex(expected_result, result)
def _merge_single(self, index1, index2):
    """ Compute the bool arrays marking which entries of two single-level indexes to keep

    Parameters
    ----------
    index1, index2
        the indexes to merge; each is unwrapped through get_expression_or_raw

    Returns
    -------
    list of LazyResult
        2 results of type WeldBit, built from the first two outputs of weld_merge_single_index

    """
    indexes = [get_expression_or_raw(index1), get_expression_or_raw(index2)]
    merged = weld_merge_single_index(indexes, DataFrame._cache_flag)

    return [LazyResult(merged[0], WeldBit(), 1),
            LazyResult(merged[1], WeldBit(), 1)]
def _aggregate(self, operation):
    """ Lazily aggregate this data with the given operation

    Parameters
    ----------
    operation : str or unicode
        the aggregation, forwarded to weld_aggregate

    Returns
    -------
    LazyResult
        of dimension 0, with the same weld type as this data

    """
    assert isinstance(operation, (str, unicode))

    aggregated = weld_aggregate(self.expr, operation, self.weld_type)

    return LazyResult(aggregated, self.weld_type, 0)
def weld_merge_single_index(indexes, cache=True):
    """ Returns bool arrays for which indexes shall be kept

    Both arrays are walked in lockstep, advancing whichever side holds the smaller value;
    an element is marked True only when it is equal to the current element of the other array.
    When either input is empty the walk is skipped, so no lookup is attempted on an empty
    array and every element is marked False (same guard as in weld_merge_triple_index).

    NOTE(review): the lockstep walk presumably requires both indexes to be sorted already,
    as documented for weld_merge_triple_index -- confirm

    Parameters
    ----------
    indexes : list of np.array or WeldObject
        input array
    cache : bool
        flag to indicate whether to cache result as intermediate result

    Returns
    -------
    list of WeldObject
        representation of the computations

    Examples
    --------
    >>> index1 = np.array([1, 3, 4, 5, 6])
    >>> index2 = np.array([2, 3, 5])
    >>> result = weld_merge_single_index([index1, index2])
    >>> LazyResult(result[0], WeldBit(), 1).evaluate(verbose=False)
    [False True False True False]
    >>> LazyResult(result[1], WeldBit(), 1).evaluate(verbose=False)
    [False True True]

    """
    weld_obj = WeldObject(_encoder, _decoder)

    weld_ids = []
    for array in indexes:
        array_var = weld_obj.update(array)

        # a WeldObject input is referenced through its obj_id and tracked as a dependency
        if isinstance(array, WeldObject):
            array_var = array.obj_id
            weld_obj.dependencies[array_var] = array

        weld_ids.append(array_var)

    weld_template = """
    let len1 = len(%(array1)s);
    let len2 = len(%(array2)s);
    # bool arrays shall be padded until maxLen so that result can be cached as np.ndarray of ndim=2
    let maxlen = if(len1 > len2, len1, len2);
    # the lockstep walk is only valid when both arrays have at least 1 element
    let res = if(len1 > 0L && len2 > 0L,
        iterate({0L, 0L, appender[bool], appender[bool]},
            |p|
                let val1 = lookup(%(array1)s, p.$0);
                let val2 = lookup(%(array2)s, p.$1);
                let iter_output =
                    if(val1 == val2,
                        {p.$0 + 1L, p.$1 + 1L, merge(p.$2, true), merge(p.$3, true)},
                        if(val1 < val2,
                            {p.$0 + 1L, p.$1, merge(p.$2, false), p.$3},
                            {p.$0, p.$1 + 1L, p.$2, merge(p.$3, false)}
                        )
                    );

                {
                    iter_output,
                    iter_output.$0 < len1 &&
                    iter_output.$1 < len2
                }
        ),
        {0L, 0L, appender[bool], appender[bool]}
    );
    # iterate over remaining un-checked elements in both arrays
    let res = if (res.$0 < maxlen, iterate(res,
            |p|
                {
                    {p.$0 + 1L, p.$1, merge(p.$2, false), p.$3},
                    p.$0 + 1L < maxlen
                }
    ), res);
    let res = if (res.$1 < maxlen, iterate(res,
            |p|
                {
                    {p.$0, p.$1 + 1L, p.$2, merge(p.$3, false)},
                    p.$1 + 1L < maxlen
                }
    ), res);
    let b = appender[vec[bool]];
    let c = merge(b, result(res.$2));
    result(merge(c, result(res.$3)))"""

    weld_obj.weld_code = weld_template % {'array1': weld_ids[0],
                                          'array2': weld_ids[1]}

    # this has both required bool arrays into 1 ndarray; note that arrays have been padded with False until of same len
    # TODO: this could still be a single vec/array with the arrays concatenated instead to avoid decoder with ndim=2 mallocs
    result = LazyResult(weld_obj, WeldBit(), 2)

    # creating the actual results to return
    weld_objects = []
    weld_ids = []
    weld_col_ids = []

    if cache:
        # the combined result is registered once and both outputs refer to it by id
        id_ = LazyResult.generate_intermediate_id('sindex_merge')
        weld_input_id = WeldObject.generate_input_name(id_)
        LazyResult.register_intermediate_result(weld_input_id, result)

        for i in range(2):
            weld_obj = WeldObject(_encoder, _decoder)
            result_var = weld_obj.update(id_)
            assert result_var is not None

            weld_objects.append(weld_obj)
            weld_ids.append(result_var)
    else:
        # without caching, both outputs depend directly on the combined expression
        for i in range(2):
            weld_obj = WeldObject(_encoder, _decoder)
            result_var = weld_obj.update(result.expr)
            assert result_var is None
            result_var = result.expr.obj_id
            weld_obj.dependencies[result_var] = result.expr

            weld_objects.append(weld_obj)
            weld_ids.append(result_var)

    # need 1 array from each resulting tables to get actual length
    for i in range(2):
        array_var = weld_objects[i].update(indexes[i])

        if isinstance(indexes[i], WeldObject):
            array_var = indexes[i].obj_id
            weld_objects[i].dependencies[array_var] = indexes[i]

        weld_col_ids.append(array_var)

    # row i of the combined result, with the padding cut off at the length of input i
    weld_templ = """slice(lookup(%(array)s, %(i)s), 0L, len(%(col)s))"""

    for i in range(2):
        weld_objects[i].weld_code = weld_templ % {'array': weld_ids[i],
                                                  'i': str(i) + 'L',
                                                  'col': weld_col_ids[i]}

    return weld_objects
def mean(self):
    """ Lazily compute the mean of this data

    Returns
    -------
    LazyResult
        of type WeldDouble and dimension 0

    """
    mean_expr = weld_mean(self.expr, self.weld_type)

    return LazyResult(mean_expr, WeldDouble(), 0)
def cartesian_product_indices(arrays, cache=True):
    """ Performs cartesian product between all arrays

    Returns the indices instead of the actual values

    Parameters
    ----------
    arrays : list of (np.ndarray or LazyResult)
        list containing arrays that need to be in the product
    cache : bool, optional
        flag to indicate whether to cache result as intermediate result

    Returns
    -------
    list of LazyResult
        one per input array

    Raises
    ------
    ValueError
        if fewer than 2 arrays are given

    Examples
    --------
    >>> cartesian_product_indices([np.array([1, 2]), np.array([3, 4])])
    [[0, 0, 1, 1], [0, 1, 0, 1]]

    See also
    --------
    pandas.MultiIndex

    """
    if len(arrays) < 2:
        raise ValueError('expected at least 2 arrays')

    # this contains the entire np.ndarray with all results of cartesian product
    product = LazyResult(_cartesian_product_indices(arrays), WeldLong(), 2)

    # construct the actual weld_objects corresponding to single result columns/arrays
    n_arrays = len(arrays)
    column_objects = []
    product_ids = []

    if cache:
        # the whole product is registered once and every column refers to it by id
        id_ = LazyResult.generate_intermediate_id('cartesian_product')
        weld_input_name = WeldObject.generate_input_name(id_)
        LazyResult.register_intermediate_result(weld_input_name, product)

        for _ in range(n_arrays):
            column_obj = WeldObject(_encoder, _decoder)
            product_id = column_obj.update(id_)
            assert product_id is not None

            column_objects.append(column_obj)
            product_ids.append(product_id)
    else:
        # without caching, every column depends directly on the product expression
        for _ in range(n_arrays):
            column_obj = WeldObject(_encoder, _decoder)
            product_id = column_obj.update(product.expr)
            assert product_id is None
            product_id = product.expr.obj_id
            column_obj.dependencies[product_id] = product.expr

            column_objects.append(column_obj)
            product_ids.append(product_id)

    # column i is row i of the product
    weld_template = """lookup(%(array)s, %(i)sL)"""
    for i, (column_obj, product_id) in enumerate(zip(column_objects, product_ids)):
        column_obj.weld_code = weld_template % {'array': product_id,
                                                'i': str(i)}

    return [LazyResult(column_obj, WeldLong(), 1) for column_obj in column_objects]
def weld_merge_triple_index(indexes, cache=True):
    """ Returns bool arrays for which indexes shall be kept

    Note it does NOT work correctly with duplicate elements; indexes MUST be already sorted

    Parameters
    ----------
    indexes : list of list of np.array or WeldObject
        list of len 2 with first and second elements being the labels in a list
        for the first and second DataFrame MultiIndex, respectively
    cache : bool
        flag to indicate whether to cache result as intermediate result

    Returns
    -------
    list of WeldObject
        representation of the computations, one for each DataFrame

    """
    assert len(indexes) == 2
    assert len(indexes[0]) == len(indexes[1]) == 3

    # flatten the list: positions 0-2 belong to the first index, 3-5 to the second
    flat = []
    for group in indexes:
        flat.extend(group)

    # weld object computing both bool arrays at once;
    # the weld ids of its inputs are collected for the template
    merge_obj = WeldObject(_encoder, _decoder)
    input_ids = []
    for array in flat:
        array_var = merge_obj.update(array)

        # a WeldObject input is referenced through its obj_id and tracked as a dependency
        if isinstance(array, WeldObject):
            array_var = array.obj_id
            merge_obj.dependencies[array_var] = array

        input_ids.append(array_var)

    weld_template = """
    let len1 = len(%(array1)s);
    let len2 = len(%(array4)s);
    # bool arrays shall be padded until maxLen so that result can be cached as np.ndarray of ndim=2
    let maxlen = if(len1 > len2, len1, len2);
    let indexes1 = {%(array1)s, %(array2)s, %(array3)s};
    let indexes2 = {%(array4)s, %(array5)s, %(array6)s};
    let res = if(len1 > 0L && len2 > 0L,
        iterate({0L, 0L, appender[bool], appender[bool]},
            |p|
                let val1 = {lookup(indexes1.$0, p.$0), lookup(indexes1.$1, p.$0), lookup(indexes1.$2, p.$0)};
                let val2 = {lookup(indexes2.$0, p.$1), lookup(indexes2.$1, p.$1), lookup(indexes2.$2, p.$1)};
                let iter_output =
                    if(val1.$0 == val2.$0,
                        if(val1.$1 == val2.$1,
                            if(val1.$2 == val2.$2,
                                {p.$0 + 1L, p.$1 + 1L, merge(p.$2, true), merge(p.$3, true)},
                                if(val1.$2 < val2.$2,
                                    {p.$0 + 1L, p.$1, merge(p.$2, false), p.$3},
                                    {p.$0, p.$1 + 1L, p.$2, merge(p.$3, false)}
                                )
                            ),
                            if(val1.$1 < val2.$1,
                                {p.$0 + 1L, p.$1, merge(p.$2, false), p.$3},
                                {p.$0, p.$1 + 1L, p.$2, merge(p.$3, false)}
                            )
                        ),
                        if(val1.$0 < val2.$0,
                            {p.$0 + 1L, p.$1, merge(p.$2, false), p.$3},
                            {p.$0, p.$1 + 1L, p.$2, merge(p.$3, false)}
                        )
                    );
                {
                    iter_output,
                    iter_output.$0 < len1 &&
                    iter_output.$1 < len2
                }
        ),
        {0L, 0L, appender[bool], appender[bool]}
    );
    # iterate over remaining un-checked elements in both arrays and append False until maxLen
    let res = if(res.$0 < maxlen, iterate(res,
            |p|
                {
                    {p.$0 + 1L, p.$1, merge(p.$2, false), p.$3},
                    p.$0 + 1L < maxlen
                }
    ), res);
    let res = if(res.$1 < maxlen, iterate(res,
            |p|
                {
                    {p.$0, p.$1 + 1L, p.$2, merge(p.$3, false)},
                    p.$1 + 1L < maxlen
                }
    ), res);
    let b = appender[vec[bool]];
    let c = merge(b, result(res.$2));
    result(merge(c, result(res.$3)))"""

    # placeholders array1..array6 map onto the flattened inputs in order
    merge_obj.weld_code = weld_template % {'array%d' % (i + 1): input_ids[i]
                                           for i in range(6)}

    # both bool arrays in a single result of dimension 2
    result = LazyResult(merge_obj, WeldBit(), 2)

    weld_objects = []
    result_ids = []

    if cache:
        # the combined result is registered once and both outputs refer to it by id
        id_ = LazyResult.generate_intermediate_id('mindex_merge')
        weld_input_name = WeldObject.generate_input_name(id_)
        LazyResult.register_intermediate_result(weld_input_name, result)

        for _ in range(2):
            output_obj = WeldObject(_encoder, _decoder)
            result_var = output_obj.update(id_)
            assert result_var is not None

            weld_objects.append(output_obj)
            result_ids.append(result_var)
    else:
        # without caching, both outputs depend directly on the combined expression
        for _ in range(2):
            output_obj = WeldObject(_encoder, _decoder)
            result_var = output_obj.update(result.expr)
            assert result_var is None
            result_var = result.expr.obj_id
            output_obj.dependencies[result_var] = result.expr

            weld_objects.append(output_obj)
            result_ids.append(result_var)

    # need 1 array from each resulting tables to get actual length;
    # the first label array of each index is used
    col_ids = []
    for i in range(2):
        first_label = flat[i * 3]
        array_var = weld_objects[i].update(first_label)

        if isinstance(first_label, WeldObject):
            array_var = first_label.obj_id
            weld_objects[i].dependencies[array_var] = first_label

        col_ids.append(array_var)

    # row i of the combined result, with the padding cut off at the length of index i
    weld_templ = """slice(lookup(%(array)s, %(i)s), 0L, len(%(col)s))"""
    for i in range(2):
        weld_objects[i].weld_code = weld_templ % {'array': result_ids[i],
                                                  'i': str(i) + 'L',
                                                  'col': col_ids[i]}

    return weld_objects
def count(self):
    """ Lazily count the elements of this data

    Returns
    -------
    LazyResult
        of type WeldLong and dimension 0

    """
    count_expr = weld_count(self.expr)

    return LazyResult(count_expr, WeldLong(), 0)
def std(self):
    """ Lazily compute the standard deviation of this data

    Returns
    -------
    LazyResult
        of type WeldDouble and dimension 0

    """
    std_expr = weld_standard_deviation(self.expr, self.weld_type)

    return LazyResult(std_expr, WeldDouble(), 0)