def weld_merge_triple_index(indexes, cache=True):
    """Return bool arrays marking which rows of two 3-level indexes to keep.

    Both indexes are walked in lockstep, comparing the label triples level
    by level; a position is marked True only when an equal triple is met in
    the other index, otherwise False.

    Note it does NOT work correctly with duplicate elements; indexes MUST be
    already sorted.

    Parameters
    ----------
    indexes : list of list of np.array or WeldObject
        list of len 2 with first and second elements being the labels in a
        list for the first and second DataFrame MultiIndex, respectively
    cache : bool
        flag to indicate whether to cache result as intermediate result

    Returns
    -------
    list of WeldObject
        representation of the computations, one for each DataFrame

    Raises
    ------
    ValueError
        if indexes does not consist of exactly 2 lists of exactly 3 arrays

    """
    # explicit raises instead of asserts so the checks survive `python -O`
    if len(indexes) != 2:
        raise ValueError('expected exactly 2 indexes')
    if not (len(indexes[0]) == len(indexes[1]) == 3):
        raise ValueError('expected exactly 3 levels in each index')

    # flatten the list: positions 0-2 belong to the first index, 3-5 to the second
    indexes = [elem for sublist in indexes for elem in sublist]

    # create final weld objects of what will be the bool arrays
    # also save the weld_ids for the inputs
    weld_obj = WeldObject(_encoder, _decoder)
    weld_ids = []
    for array in indexes:
        array_var = weld_obj.update(array)
        if isinstance(array, WeldObject):
            # lazy inputs are referenced through their object id and tracked as a dependency
            array_var = array.obj_id
            weld_obj.dependencies[array_var] = array
        weld_ids.append(array_var)

    # NOTE: Weld comments (#) run until end of line, so the line breaks below matter
    weld_template = """
    let len1 = len(%(array1)s);
    let len2 = len(%(array4)s);
    # bool arrays shall be padded until maxLen so that result can be cached as np.ndarray of ndim=2
    let maxlen = if(len1 > len2, len1, len2);
    let indexes1 = {%(array1)s, %(array2)s, %(array3)s};
    let indexes2 = {%(array4)s, %(array5)s, %(array6)s};
    let res = if(len1 > 0L && len2 > 0L,
        iterate({0L, 0L, appender[bool], appender[bool]},
            |p|
                let val1 = {lookup(indexes1.$0, p.$0), lookup(indexes1.$1, p.$0), lookup(indexes1.$2, p.$0)};
                let val2 = {lookup(indexes2.$0, p.$1), lookup(indexes2.$1, p.$1), lookup(indexes2.$2, p.$1)};
                let iter_output =
                    if(val1.$0 == val2.$0,
                        if(val1.$1 == val2.$1,
                            if(val1.$2 == val2.$2,
                                {p.$0 + 1L, p.$1 + 1L, merge(p.$2, true), merge(p.$3, true)},
                                if(val1.$2 < val2.$2,
                                    {p.$0 + 1L, p.$1, merge(p.$2, false), p.$3},
                                    {p.$0, p.$1 + 1L, p.$2, merge(p.$3, false)}
                                )
                            ),
                            if(val1.$1 < val2.$1,
                                {p.$0 + 1L, p.$1, merge(p.$2, false), p.$3},
                                {p.$0, p.$1 + 1L, p.$2, merge(p.$3, false)}
                            )
                        ),
                        if(val1.$0 < val2.$0,
                            {p.$0 + 1L, p.$1, merge(p.$2, false), p.$3},
                            {p.$0, p.$1 + 1L, p.$2, merge(p.$3, false)}
                        )
                    );
                {
                    iter_output,
                    iter_output.$0 < len1 && iter_output.$1 < len2
                }
        ),
        {0L, 0L, appender[bool], appender[bool]}
    );
    # iterate over remaining un-checked elements in both arrays and append False until maxLen
    let res = if(res.$0 < maxlen,
        iterate(res, |p| { {p.$0 + 1L, p.$1, merge(p.$2, false), p.$3}, p.$0 + 1L < maxlen } ),
        res);
    let res = if(res.$1 < maxlen,
        iterate(res, |p| { {p.$0, p.$1 + 1L, p.$2, merge(p.$3, false)}, p.$1 + 1L < maxlen } ),
        res);
    let b = appender[vec[bool]];
    let c = merge(b, result(res.$2));
    result(merge(c, result(res.$3)))"""

    weld_obj.weld_code = weld_template % {'array1': weld_ids[0],
                                          'array2': weld_ids[1],
                                          'array3': weld_ids[2],
                                          'array4': weld_ids[3],
                                          'array5': weld_ids[4],
                                          'array6': weld_ids[5]}

    # both bool arrays live in a single 2-dimensional result
    result = LazyResult(weld_obj, WeldBit(), 2)

    # creating the actual results to return
    weld_objects = []
    weld_ids = []
    weld_col_ids = []

    if cache:
        id_ = LazyResult.generate_intermediate_id('mindex_merge')
        weld_input_name = WeldObject.generate_input_name(id_)
        LazyResult.register_intermediate_result(weld_input_name, result)

        for _ in range(2):
            weld_obj = WeldObject(_encoder, _decoder)
            result_var = weld_obj.update(id_)
            assert result_var is not None
            weld_objects.append(weld_obj)
            weld_ids.append(result_var)
    else:
        for _ in range(2):
            weld_obj = WeldObject(_encoder, _decoder)
            result_var = weld_obj.update(result.expr)
            assert result_var is None
            result_var = result.expr.obj_id
            weld_obj.dependencies[result_var] = result.expr
            weld_objects.append(weld_obj)
            weld_ids.append(result_var)

    # need 1 array from each resulting tables to get actual length
    for i in range(2):
        first_level = indexes[i * 3]
        array_var = weld_objects[i].update(first_level)
        if isinstance(first_level, WeldObject):
            array_var = first_level.obj_id
            weld_objects[i].dependencies[array_var] = first_level
        weld_col_ids.append(array_var)

    # pick row i of the 2-dimensional result and cut the padding off again
    weld_templ = """slice(lookup(%(array)s, %(i)s), 0L, len(%(col)s))"""
    for i in range(2):
        weld_objects[i].weld_code = weld_templ % {'array': weld_ids[i],
                                                  'i': str(i) + 'L',
                                                  'col': weld_col_ids[i]}

    return weld_objects
def cartesian_product_indices(arrays, cache=True):
    """Build the cartesian product of all arrays, expressed as indices.

    The product itself is computed once as a single 2-dimensional result;
    one lazy column per input array is then extracted from it.

    Parameters
    ----------
    arrays : list of (np.ndarray or LazyResult)
        list containing arrays that need to be in the product
    cache : bool, optional
        flag to indicate whether to cache result as intermediate result

    Returns
    -------
    list of LazyResult
        one 1-dimensional result of indices per input array

    Raises
    ------
    ValueError
        if fewer than 2 arrays are passed

    Examples
    --------
    >>> cartesian_product_indices([np.array([1, 2]), np.array([3, 4])])
    [[0, 0, 1, 1], [0, 1, 0, 1]]

    See also
    --------
    pandas.MultiIndex

    """
    if len(arrays) < 2:
        raise ValueError('expected at least 2 arrays')

    # the whole product: a single np.ndarray holding every result column
    product = LazyResult(_cartesian_product_indices(arrays), WeldLong(), 2)

    # one weld object per result column, each reading from the product
    column_objs = []
    source_ids = []

    if cache:
        intermediate_id = LazyResult.generate_intermediate_id('cartesian_product')
        input_name = WeldObject.generate_input_name(intermediate_id)
        LazyResult.register_intermediate_result(input_name, product)

        for _ in range(len(arrays)):
            column_obj = WeldObject(_encoder, _decoder)
            source_id = column_obj.update(intermediate_id)
            assert source_id is not None
            column_objs.append(column_obj)
            source_ids.append(source_id)
    else:
        for _ in range(len(arrays)):
            column_obj = WeldObject(_encoder, _decoder)
            source_id = column_obj.update(product.expr)
            assert source_id is None
            # lazy expression: reference it by object id and track the dependency
            source_id = product.expr.obj_id
            column_obj.dependencies[source_id] = product.expr
            column_objs.append(column_obj)
            source_ids.append(source_id)

    # column number `position` of the product is row `position` of the 2-d result
    column_template = """lookup(%(array)s, %(i)sL)"""
    for position, (column_obj, source_id) in enumerate(zip(column_objs, source_ids)):
        column_obj.weld_code = column_template % {'array': source_id,
                                                  'i': str(position)}

    return [LazyResult(column_obj, WeldLong(), 1) for column_obj in column_objs]
def weld_merge_single_index(indexes, cache=True):
    """Return bool arrays marking which elements of two indexes to keep.

    Both indexes are walked in lockstep; a position is marked True only when
    an equal value is met in the other index, otherwise False. An empty
    index yields an all-False result for the other one.

    NOTE(review): the lockstep walk presumably requires both indexes to be
    sorted and free of duplicates, as documented for
    weld_merge_triple_index -- confirm against callers.

    Parameters
    ----------
    indexes : list of np.array or WeldObject
        input array
    cache : bool
        flag to indicate whether to cache result as intermediate result

    Returns
    -------
    list of WeldObject
        representation of the computations

    Examples
    --------
    >>> index1 = np.array([1, 3, 4, 5, 6])
    >>> index2 = np.array([2, 3, 5])
    >>> result = weld_merge_single_index([index1, index2])
    >>> LazyResult(result[0], WeldBit(), 1).evaluate(verbose=False)
    [False True False True False]
    >>> LazyResult(result[1], WeldBit(), 1).evaluate(verbose=False)
    [False True True]

    """
    weld_obj = WeldObject(_encoder, _decoder)

    weld_ids = []
    for array in indexes:
        array_var = weld_obj.update(array)
        if isinstance(array, WeldObject):
            # lazy inputs are referenced through their object id and tracked as a dependency
            array_var = array.obj_id
            weld_obj.dependencies[array_var] = array
        weld_ids.append(array_var)

    # The merge loop looks up position 0 of both arrays right away, so it is
    # only entered when both are non-empty (same guard as in
    # weld_merge_triple_index); otherwise the padding below fills in False.
    # NOTE: Weld comments (#) run until end of line, so the line breaks matter
    weld_template = """
    let len1 = len(%(array1)s);
    let len2 = len(%(array2)s);
    # bool arrays shall be padded until maxLen so that result can be cached as np.ndarray of ndim=2
    let maxlen = if(len1 > len2, len1, len2);
    let res = if(len1 > 0L && len2 > 0L,
        iterate({0L, 0L, appender[bool], appender[bool]},
            |p|
                let val1 = lookup(%(array1)s, p.$0);
                let val2 = lookup(%(array2)s, p.$1);
                let iter_output =
                    if(val1 == val2,
                        {p.$0 + 1L, p.$1 + 1L, merge(p.$2, true), merge(p.$3, true)},
                        if(val1 < val2,
                            {p.$0 + 1L, p.$1, merge(p.$2, false), p.$3},
                            {p.$0, p.$1 + 1L, p.$2, merge(p.$3, false)}
                        )
                    );
                {
                    iter_output,
                    iter_output.$0 < len1 && iter_output.$1 < len2
                }
        ),
        {0L, 0L, appender[bool], appender[bool]}
    );
    # iterate over remaining un-checked elements in both arrays
    let res = if(res.$0 < maxlen,
        iterate(res, |p| { {p.$0 + 1L, p.$1, merge(p.$2, false), p.$3}, p.$0 + 1L < maxlen } ),
        res);
    let res = if(res.$1 < maxlen,
        iterate(res, |p| { {p.$0, p.$1 + 1L, p.$2, merge(p.$3, false)}, p.$1 + 1L < maxlen } ),
        res);
    let b = appender[vec[bool]];
    let c = merge(b, result(res.$2));
    result(merge(c, result(res.$3)))"""

    weld_obj.weld_code = weld_template % {'array1': weld_ids[0],
                                          'array2': weld_ids[1]}

    # this has both required bool arrays into 1 ndarray; note that arrays have been padded with False until of same len
    # TODO: this could still be a single vec/array with the arrays concatenated instead to avoid decoder with ndim=2 mallocs
    result = LazyResult(weld_obj, WeldBit(), 2)

    # creating the actual results to return
    weld_objects = []
    weld_ids = []
    weld_col_ids = []

    if cache:
        id_ = LazyResult.generate_intermediate_id('sindex_merge')
        weld_input_id = WeldObject.generate_input_name(id_)
        LazyResult.register_intermediate_result(weld_input_id, result)

        for _ in range(2):
            weld_obj = WeldObject(_encoder, _decoder)
            result_var = weld_obj.update(id_)
            assert result_var is not None
            weld_objects.append(weld_obj)
            weld_ids.append(result_var)
    else:
        for _ in range(2):
            weld_obj = WeldObject(_encoder, _decoder)
            result_var = weld_obj.update(result.expr)
            assert result_var is None
            result_var = result.expr.obj_id
            weld_obj.dependencies[result_var] = result.expr
            weld_objects.append(weld_obj)
            weld_ids.append(result_var)

    # need 1 array from each resulting tables to get actual length
    for i in range(2):
        array_var = weld_objects[i].update(indexes[i])
        if isinstance(indexes[i], WeldObject):
            array_var = indexes[i].obj_id
            weld_objects[i].dependencies[array_var] = indexes[i]
        weld_col_ids.append(array_var)

    # pick row i of the 2-dimensional result and cut the padding off again
    weld_templ = """slice(lookup(%(array)s, %(i)s), 0L, len(%(col)s))"""
    for i in range(2):
        weld_objects[i].weld_code = weld_templ % {'array': weld_ids[i],
                                                  'i': str(i) + 'L',
                                                  'col': weld_col_ids[i]}

    return weld_objects