def _handle_str_contains(self, assign, lhs, rhs, fname):
    """Lower a ``str_contains_*`` hiframes call into an explicit prange loop.

    A small kernel ``f(str_arr, pat)`` is generated as source text, compiled
    to Numba IR with ``compile_to_numba_ir`` and its argument nodes are
    replaced with the operands of the original call.

    Args:
        assign: the assignment whose value is the call being replaced; its
            ``value`` is rebound to the kernel's output array ``S``.
        lhs: unused here; kept so all handlers share the same signature.
        rhs: the call expression; ``rhs.args[0]`` is the string array and
            ``rhs.args[1]`` the pattern.
        fname: ``'str_contains_regex'`` or ``'str_contains_noregex'``.

    Returns:
        A ``(f_blocks, [assign])`` tuple: the generated blocks and the
        statements that follow them.

    Raises:
        AssertionError: if ``fname`` is not one of the two supported names.
    """
    if fname == 'str_contains_regex':
        comp_func = 'hpat.str_ext.contains_regex'
    elif fname == 'str_contains_noregex':
        comp_func = 'hpat.str_ext.contains_noregex'
    else:
        # explicit raise instead of ``assert False`` so the check is not
        # stripped under ``python -O`` (which would leave comp_func unbound)
        raise AssertionError(
            "unsupported str.contains variant: {}".format(fname))

    str_arr = rhs.args[0]
    pat = rhs.args[1]

    # The loop body must be indented deeper than the ``for`` header,
    # otherwise exec() fails with an IndentationError.
    func_text = (
        'def f(str_arr, pat):\n'
        '    l = len(str_arr)\n'
        '    S = np.empty(l, dtype=np.bool_)\n'
        '    for i in numba.parfor.internal_prange(l):\n'
        '        S[i] = {}(str_arr[i], pat)\n'
    ).format(comp_func)

    loc_vars = {}
    exec(func_text, {}, loc_vars)
    f = loc_vars['f']

    f_blocks = compile_to_numba_ir(
        f, {'numba': numba, 'np': np, 'hpat': hpat}, self.typingctx,
        (if_series_to_array_type(self.typemap[str_arr.name]),
         if_series_to_array_type(self.typemap[pat.name])),
        self.typemap, self.calltypes).blocks
    first_block = f_blocks[min(f_blocks.keys())]
    replace_arg_nodes(first_block, [str_arr, pat])

    # replace call with result of parfor (S)
    # S is taken from the second-to-last statement of the first block of f
    # (the last statement is presumably the block terminator - TODO confirm)
    assign.value = first_block.body[-2].target
    return (f_blocks, [assign])
def _handle_df_col_filter(self, lhs_name, rhs, assign):
    """Expand ``df['col2'] = df['col1'][bool_arr]`` into filter IR blocks.

    Args:
        lhs_name: name of the assignment target variable.
        rhs: the expression on the right-hand side of ``assign``.
        assign: the assignment being examined; its target becomes the
            output array of the generated code.

    Returns:
        The blocks compiled from ``_column_filter_impl_float`` (with an
        ``empty_like`` allocation of the target prepended to the first
        block), or ``None`` when ``rhs`` is not a boolean-array getitem
        between two known dataframe columns.
    """
    # find df['col2'] = df['col1'][arr]
    # since columns should have the same size, output is filled with NaNs
    # TODO: check for float, make sure col1 and col2 are in the same df
    is_col_filter = (
        rhs.op == 'getitem'
        and rhs.value.name in self.df_cols
        and lhs_name in self.df_cols
        and self.is_bool_arr(rhs.index.name))
    if not is_col_filter:
        return None

    out_var = assign.target
    src_arr = rhs.value
    mask_var = rhs.index

    impl_blocks = compile_to_numba_ir(
        _column_filter_impl_float,
        {'numba': numba, 'np': np},
        self.typingctx,
        (if_series_to_array_type(self.typemap[out_var.name]),
         if_series_to_array_type(self.typemap[src_arr.name]),
         self.typemap[mask_var.name]),
        self.typemap,
        self.calltypes).blocks

    entry_block = impl_blocks[min(impl_blocks.keys())]
    replace_arg_nodes(entry_block, [out_var, src_arr, mask_var])

    # allocate the output (same shape as the input column) before the
    # filter code runs
    alloc_nodes = gen_np_call(
        'empty_like', np.empty_like, out_var, [src_arr],
        self.typingctx, self.typemap, self.calltypes)
    entry_block.body = alloc_nodes + entry_block.body
    return impl_blocks
def _run_pd_DatetimeIndex(self, assign, lhs, rhs):
    """Transform a ``pd.DatetimeIndex()`` call with a string array argument.

    The call is replaced by a prange loop that parses every string with
    ``hpat.pd_timestamp_ext.parse_datetime_str``.

    Args:
        assign: the original assignment (unused here).
        lhs: variable that receives the parsed array.
        rhs: the call expression; the data is taken either from the
            ``data`` keyword or from the single positional argument.

    Returns:
        The blocks of the compiled replacement kernel.

    Raises:
        ValueError: if ``data`` is passed by keyword together with
            positional arguments, or if it is not passed by keyword and the
            call does not have exactly one positional argument.
    """
    kws = dict(rhs.kws)
    if 'data' in kws:
        data = kws['data']
        if len(rhs.args) != 0:  # pragma: no cover
            raise ValueError(
                "only data argument supported in pd.DatetimeIndex()")
    else:
        if len(rhs.args) != 1:  # pragma: no cover
            raise ValueError(
                "data argument in pd.DatetimeIndex() expected")
        data = rhs.args[0]

    # NOTE: the statement layout of this kernel is relied on below
    # (body[-4] of the last block); do not reorder or add statements.
    def f(str_arr):
        numba.parfor.init_prange()
        n = len(str_arr)
        S = numba.unsafe.ndarray.empty_inferred((n, ))
        for i in numba.parfor.internal_prange(n):
            S[i] = hpat.pd_timestamp_ext.parse_datetime_str(str_arr[i])
        ret = S

    f_ir = compile_to_numba_ir(
        f, {'hpat': hpat, 'numba': numba}, self.typingctx,
        (if_series_to_array_type(self.typemap[data.name]), ),
        self.typemap, self.calltypes)
    topo_order = find_topo_order(f_ir.blocks)
    # redirect the kernel's result assignment to the original target
    # (presumably the ``ret = S`` statement - TODO confirm)
    f_ir.blocks[topo_order[-1]].body[-4].target = lhs
    replace_arg_nodes(f_ir.blocks[topo_order[0]], [data])
    return f_ir.blocks
def _handle_empty_like(self, assign, lhs, rhs):
    """Rewrite ``B = empty_like(A)`` as ``B = empty(<size of A>, A.dtype)``.

    Args:
        assign: the assignment being replaced; its target receives the
            newly allocated array.
        lhs: unused here; kept so all handlers share the same signature.
        rhs: the call expression; ``rhs.args[0]`` is the template array.

    Returns:
        The list of IR statements that replaces the original assignment.
    """
    # B = empty_like(A) -> B = empty(len(A), dtype)
    src_arr = rhs.args[0]
    src_typ = self.typemap[src_arr.name]

    if src_typ.ndim != 1:
        def f(_in_arr):  # pragma: no cover
            _alloc_size = _in_arr.shape
            _out_arr = np.empty(_alloc_size, _in_arr.dtype)
    else:
        # generate simpler len() for 1D case
        def f(_in_arr):  # pragma: no cover
            _alloc_size = len(_in_arr)
            _out_arr = np.empty(_alloc_size, _in_arr.dtype)

    impl_ir = compile_to_numba_ir(
        f, {'np': np}, self.typingctx,
        (if_series_to_array_type(src_typ), ),
        self.typemap, self.calltypes)
    impl_block = impl_ir.blocks.popitem()[1]
    replace_arg_nodes(impl_block, [src_arr])

    # drop the trailing three statements (remove none return)
    out_nodes = impl_block.body[:-3]
    out_nodes[-1].target = assign.target
    return out_nodes
def _handle_string_array_expr(self, lhs, rhs, assign):
    """Convert a comparison involving a string array into a prange loop.

    Handles ``str_arr <op> x`` and ``x <op> str_arr`` for the operators
    ``==, !=, >=, >, <=, <``. Each string-array operand is indexed
    element-wise (``A[i]`` / ``B[i]``) in the generated kernel, and its
    entry in ``self.typemap`` is replaced with ``string_array_type``.

    Args:
        lhs: unused here; kept so all handlers share the same signature.
        rhs: the expression on the right-hand side of ``assign``.
        assign: the assignment being examined; on a match its ``value`` is
            rebound to the kernel's boolean output array ``S``.

    Returns:
        ``(f_blocks, [assign])`` when ``rhs`` is such a comparison,
        otherwise ``None``.
    """
    # convert str_arr==str into parfor
    if not (rhs.op == 'binop'
            and rhs.fn in ('==', '!=', '>=', '>', '<=', '<')
            and (is_str_arr_typ(self.typemap[rhs.lhs.name])
                 or is_str_arr_typ(self.typemap[rhs.rhs.name]))):
        return None

    arg1 = rhs.lhs
    arg2 = rhs.rhs
    arg1_access = 'A'
    arg2_access = 'B'
    len_call = 'len(A)'

    if is_str_arr_typ(self.typemap[arg1.name]):
        arg1_access = 'A[i]'
        # replace type now for correct typing of len, etc.
        self.typemap.pop(arg1.name)
        self.typemap[arg1.name] = string_array_type

    if is_str_arr_typ(self.typemap[arg2.name]):
        # index the *second* operand; assigning to arg1_access here would
        # generate ``B[i] <op> B`` and drop the first operand entirely
        arg2_access = 'B[i]'
        len_call = 'len(B)'
        self.typemap.pop(arg2.name)
        self.typemap[arg2.name] = string_array_type

    # The loop body must be indented deeper than the ``for`` header,
    # otherwise exec() fails with an IndentationError.
    func_text = (
        'def f(A, B):\n'
        '    l = {len_call}\n'
        '    S = np.empty(l, dtype=np.bool_)\n'
        '    for i in numba.parfor.internal_prange(l):\n'
        '        S[i] = {left} {op} {right}\n'
    ).format(len_call=len_call, left=arg1_access, op=rhs.fn,
             right=arg2_access)

    loc_vars = {}
    exec(func_text, {}, loc_vars)
    f = loc_vars['f']

    f_blocks = compile_to_numba_ir(
        f, {'numba': numba, 'np': np}, self.typingctx,
        (if_series_to_array_type(self.typemap[arg1.name]),
         if_series_to_array_type(self.typemap[arg2.name])),
        self.typemap, self.calltypes).blocks
    first_block = f_blocks[min(f_blocks.keys())]
    replace_arg_nodes(first_block, [arg1, arg2])

    # replace == expression with result of parfor (S)
    # S is taken from the second-to-last statement of the first block of f
    assign.value = first_block.body[-2].target
    return (f_blocks, [assign])
def generic(self, args, kws):
    """Type a call whose arguments are column names followed by columns.

    The first half of ``args`` must be literal types carrying the column
    names (``literal_value``); the second half are the column types, which
    are converted with ``if_series_to_array_type``. An ``'Index'`` column
    of 1D C-contiguous int64 array type is prepended.

    Returns:
        A signature returning ``DataFrameTupleIterator`` for ``args``.
    """
    assert not kws
    n_args = len(args)
    assert n_args % 2 == 0, "name and column pairs expected"
    half = n_args // 2

    names = [name_typ.literal_value for name_typ in args[:half]]
    col_types = [if_series_to_array_type(col) for col in args[half:]]

    # XXX index handling, assuming implicit index
    # NOTE(review): this is a substring test on the first column name only
    assert "Index" not in names[0]
    names.insert(0, 'Index')
    col_types.insert(0, types.Array(types.int64, 1, 'C'))

    return signature(DataFrameTupleIterator(names, col_types), *args)
def generic(self, args, kws):
    """Type a concatenation of a single tuple/list-of-arrays argument.

    A ``UniTuple`` of string arrays yields ``string_array_type``; any other
    argument is converted with ``if_series_to_array_type`` and typed by
    Numba's ``np.concatenate`` typer.

    Returns:
        A signature from the argument type to the concatenated type.
    """
    assert not kws
    assert len(args) == 1
    arr_list = args[0]

    is_str_arr_tuple = (isinstance(arr_list, types.UniTuple)
                        and is_str_arr_typ(arr_list.dtype))
    if is_str_arr_tuple:
        return signature(string_array_type, arr_list)

    # use typer of np.concatenate
    as_arrays = if_series_to_array_type(arr_list)
    concat_typer = numba.typing.npydecl.NdConcatenate(
        self.context).generic()
    return signature(concat_typer(as_arrays), arr_list)
def _handle_dt_index_binop(self, lhs, rhs, assign):
    """Lower a comparison between a DatetimeIndex and a string.

    The string operand is parsed once with
    ``hpat.pd_timestamp_ext.parse_datetime_str`` and then compared against
    every element of the datetime index inside a prange loop. Operand order
    of the original expression is preserved in the generated comparison.

    Args:
        lhs: unused here; kept so all handlers share the same signature.
        rhs: the binary expression (``rhs.lhs``, ``rhs.rhs``, ``rhs.fn``).
        assign: the assignment being replaced; its ``value`` is rebound to
            the kernel's output array ``S``.

    Returns:
        ``(f_blocks, [assign])``.

    Raises:
        ValueError: if either operand's type is neither
            ``dt_index_series_type`` nor ``string_type``.
    """
    arg1, arg2 = rhs.lhs, rhs.rhs
    allowed_types = (dt_index_series_type, string_type)
    if (self.typemap[arg1.name] not in allowed_types
            or self.typemap[arg2.name] not in allowed_types):
        raise ValueError("DatetimeIndex operation not supported")

    if self.typemap[arg1.name] == dt_index_series_type:
        unpack = 'dt_index, _str = arg1, arg2'
        comp = 'dt_index[i] {} other'.format(rhs.fn)
    else:
        unpack = 'dt_index, _str = arg2, arg1'
        comp = 'other {} dt_index[i]'.format(rhs.fn)

    # The loop body must be indented deeper than the ``for`` header,
    # otherwise exec() fails with an IndentationError.
    func_text = (
        'def f(arg1, arg2):\n'
        '    {unpack}\n'
        '    l = len(dt_index)\n'
        '    other = hpat.pd_timestamp_ext.parse_datetime_str(_str)\n'
        '    S = numba.unsafe.ndarray.empty_inferred((l,))\n'
        '    for i in numba.parfor.internal_prange(l):\n'
        '        S[i] = {comp}\n'
    ).format(unpack=unpack, comp=comp)

    loc_vars = {}
    exec(func_text, {}, loc_vars)
    f = loc_vars['f']

    f_blocks = compile_to_numba_ir(
        f, {'numba': numba, 'np': np, 'hpat': hpat}, self.typingctx,
        (if_series_to_array_type(self.typemap[arg1.name]),
         if_series_to_array_type(self.typemap[arg2.name])),
        self.typemap, self.calltypes).blocks
    first_block = f_blocks[min(f_blocks.keys())]
    replace_arg_nodes(first_block, [arg1, arg2])

    # replace == expression with result of parfor (S)
    # S is taken from the second-to-last statement of the first block of f
    assign.value = first_block.body[-2].target
    return (f_blocks, [assign])
def _run_call_hiframes(self, assign, lhs, rhs, func_name):
    """Dispatch a call into ``hpat.hiframes_api`` to its typed lowering.

    Args:
        assign: the assignment whose value is the call.
        lhs: target variable of the assignment (forwarded to sub-handlers).
        rhs: the call expression.
        func_name: name of the called hiframes function.

    Returns:
        Either a list of replacement statements, or whatever the delegated
        handler (``_handle_str_contains`` / ``_handle_df_col_calls``)
        returns.
    """
    # series <-> array conversions are no-ops at this stage
    if func_name in ('to_series_type', 'to_arr_from_series'):
        assign.value = rhs.args[0]
        return [assign]

    if func_name in ('str_contains_regex', 'str_contains_noregex'):
        return self._handle_str_contains(assign, lhs, rhs, func_name)

    # arr = fix_df_array(col) -> arr=col if col is array
    if func_name == 'fix_df_array':
        col_typ = self.typemap[rhs.args[0].name]
        if isinstance(col_typ, (types.Array, StringArrayType)):
            assign.value = rhs.args[0]
            return [assign]

    # arr = fix_rolling_array(col) -> arr=col if col is float array
    if func_name == 'fix_rolling_array':
        col = rhs.args[0]
        if isinstance(self.typemap[col.name].dtype, types.Float):
            assign.value = rhs.args[0]
            return [assign]

        # non-float column: convert to float64
        def f(column):  # pragma: no cover
            a = column.astype(np.float64)

        cast_block = compile_to_numba_ir(
            f, {'hpat': hpat, 'np': np}, self.typingctx,
            (if_series_to_array_type(self.typemap[col.name]), ),
            self.typemap, self.calltypes).blocks.popitem()[1]
        replace_arg_nodes(cast_block, [col])
        # drop the trailing three statements and retarget the cast result
        cast_nodes = cast_block.body[:-3]
        cast_nodes[-1].target = assign.target
        return cast_nodes

    return self._handle_df_col_calls(assign, lhs, rhs, func_name)
def _handle_df_col_calls(self, assign, lhs, rhs, func_name):
    """Replace dataframe-column helper calls with their implementations.

    Supported ``func_name`` values are ``'count'``, ``'fillna'``,
    ``'column_sum'``, ``'mean'`` and ``'var'``.

    Args:
        assign: the assignment whose value is the call.
        lhs: unused here; kept so all handlers share the same signature.
        rhs: the call expression supplying the operands.
        func_name: name of the called function.

    Returns:
        The compiled implementation blocks for a supported function, or
        ``[assign]`` unchanged for anything else.
    """
    if func_name == 'fillna':
        out_arr, in_arr, val = rhs.args[0], rhs.args[1], rhs.args[2]
        fill_blocks = compile_to_numba_ir(
            _column_fillna_impl, {'numba': numba, 'np': np},
            self.typingctx,
            (if_series_to_array_type(self.typemap[out_arr.name]),
             if_series_to_array_type(self.typemap[in_arr.name]),
             if_series_to_array_type(self.typemap[val.name])),
            self.typemap, self.calltypes).blocks
        entry_label = min(fill_blocks.keys())
        replace_arg_nodes(fill_blocks[entry_label], [out_arr, in_arr, val])
        return fill_blocks

    # the single-column reductions share one lowering; only the
    # implementation differs (looked up lazily, one name per branch)
    if func_name == 'count':
        reduce_impl = _column_count_impl
    elif func_name == 'column_sum':
        reduce_impl = _column_sum_impl
    elif func_name == 'mean':
        reduce_impl = _column_mean_impl
    elif func_name == 'var':
        reduce_impl = _column_var_impl
    else:
        return [assign]

    column = rhs.args[0]
    reduce_blocks = compile_to_numba_ir(
        reduce_impl, {'numba': numba, 'np': np, 'hpat': hpat},
        self.typingctx,
        (if_series_to_array_type(self.typemap[column.name]), ),
        self.typemap, self.calltypes).blocks
    order = find_topo_order(reduce_blocks)
    entry_label, exit_label = order[0], order[-1]
    replace_arg_nodes(reduce_blocks[entry_label], [column])
    # assign results to lhs output
    reduce_blocks[exit_label].body[-3].target = assign.target
    return reduce_blocks
def run(self):
    """Run the pass over ``self.func_ir`` and replace Series types.

    Steps, in order:
      1. Every assignment of every block (topological order) is passed to
         ``_run_assign``; its result (list of statements, dict of blocks,
         or ``(blocks, statements)`` tuple) is spliced into the IR.
      2. Series types (and bound functions on Series) in ``self.typemap``
         are replaced with their array counterparts.
      3. Signatures in ``self.calltypes`` are updated in place, and call
         signatures are recomputed from the callee type.
      4. ``self.func_ir._definitions`` is rebuilt.

    Returns:
        ``if_series_to_unbox(self.return_type)``.
    """
    ir_blocks = self.func_ir.blocks
    for label in find_topo_order(ir_blocks):
        pending = []
        for stmt in ir_blocks[label].body:
            if not isinstance(stmt, ir.Assign):
                pending.append(stmt)
                continue

            result = self._run_assign(stmt)
            if isinstance(result, list):
                pending.extend(result)
            elif isinstance(result, dict):
                # new blocks were generated; continue in the block that
                # include_new_blocks reports
                label = include_new_blocks(
                    ir_blocks, result, label, pending)
                pending = []
            elif isinstance(result, tuple):
                new_blocks, trailing_nodes = result
                label = include_new_blocks(
                    ir_blocks, new_blocks, label, pending)
                pending = trailing_nodes
        ir_blocks[label].body = pending

    if debug_prints():  # pragma: no cover
        print("--- types before Series replacement:", self.typemap)
        print("calltypes: ", self.calltypes)

    # collect replacements first; typemap is not mutated while iterating
    series_replacements = {}
    for var_name, var_typ in self.typemap.items():
        if isinstance(var_typ, SeriesType):
            series_replacements[var_name] = series_to_array_type(var_typ)
        # replace array.call() variable types
        if (isinstance(var_typ, types.BoundFunction)
                and isinstance(var_typ.this, SeriesType)):
            arr_this = series_to_array_type(var_typ.this)
            # TODO: handle string arrays, etc.
            assert var_typ.typing_key.startswith('array.')
            attr_name = var_typ.typing_key[len('array.'):]
            resolver = getattr(ArrayAttribute, 'resolve_' + attr_name)
            # methods are either installed with install_array_method or
            # using @bound_function in arraydecl.py
            if hasattr(resolver, '__wrapped__'):
                resolver = bound_function(var_typ.typing_key)(
                    resolver.__wrapped__)
            series_replacements[var_name] = resolver(
                ArrayAttribute(self.typingctx), arr_this)

    for var_name, new_typ in series_replacements.items():
        self.typemap.pop(var_name)
        self.typemap[var_name] = new_typ

    calltype_replacements = {}
    # replace sig of getitem/setitem/... series type with array
    for call, sig in self.calltypes.items():
        if sig is None:
            continue
        assert isinstance(sig, Signature)
        sig.return_type = if_series_to_array_type(sig.return_type)
        sig.args = tuple(if_series_to_array_type(a) for a in sig.args)

        # XXX: side effect: force update of call signatures
        if not (isinstance(call, ir.Expr) and call.op == 'call'):
            continue
        # StencilFunc requires kws for typing so sig.args can't be used
        # reusing sig.args since some types become Const in sig
        argtyps = sig.args[:len(call.args)]
        kwtyps = {name: self.typemap[v.name] for name, v in call.kws}
        new_sig = self.typemap[call.func.name].get_call_type(
            self.typingctx, argtyps, kwtyps)
        # calltypes of things like BoundFunction (array.call) need to
        # be updated for lowering to work
        # XXX: new_sig could be None for things like np.int32()
        if call in self.calltypes and new_sig is not None:
            old_sig = self.calltypes[call]
            # fix types with undefined dtypes in empty_inferred, etc.
            fixed_ret = _fix_typ_undefs(
                new_sig.return_type, old_sig.return_type)
            fixed_args = tuple(
                _fix_typ_undefs(new_a, old_a)
                for new_a, old_a in zip(new_sig.args, old_sig.args))
            calltype_replacements[call] = Signature(
                fixed_ret, fixed_args, new_sig.recvr, new_sig.pysig)

    for call, new_sig in calltype_replacements.items():
        self.calltypes.pop(call)
        self.calltypes[call] = new_sig

    if debug_prints():  # pragma: no cover
        print("--- types after Series replacement:", self.typemap)
        print("calltypes: ", self.calltypes)

    self.func_ir._definitions = get_definitions(self.func_ir.blocks)
    return if_series_to_unbox(self.return_type)
def _run_assign(self, assign):
    """Transform one assignment, returning what should replace it.

    Args:
        assign: the ``ir.Assign`` statement to process.

    Returns:
        One of: a list of statements, a dict of generated blocks, or a
        ``(blocks, statements)`` tuple, depending on the handler that
        matched. Assignments that need no change come back as ``[assign]``.
    """
    target_name = assign.target.name
    expr = assign.value
    if not isinstance(expr, ir.Expr):
        return [assign]

    # arr = S.values
    if (expr.op == 'getattr'
            and isinstance(self.typemap[expr.value.name], SeriesType)
            and expr.attr == 'values'):
        # simply return the column
        assign.value = expr.value
        return [assign]

    str_cmp_out = self._handle_string_array_expr(target_name, expr, assign)
    if str_cmp_out is not None:
        return str_cmp_out

    col_filter_out = self._handle_df_col_filter(target_name, expr, assign)
    if col_filter_out is not None:
        return col_filter_out

    # replace getitems on dt_index/dt64 series with Timestamp function
    if (expr.op in ('getitem', 'static_getitem')
            and self.typemap[expr.value.name] == dt_index_series_type):
        ind_var = expr.index if expr.op == 'getitem' else expr.index_var
        in_arr = expr.value

        def f(_in_arr, _ind):
            dt = _in_arr[_ind]
            s = np.int64(dt)
            res = hpat.pd_timestamp_ext.convert_datetime64_to_timestamp(
                s)

        assert self.typemap[ind_var.name] == types.intp
        getitem_block = compile_to_numba_ir(
            f, {'numba': numba, 'np': np, 'hpat': hpat}, self.typingctx,
            (if_series_to_array_type(self.typemap[in_arr.name]),
             types.intp),
            self.typemap, self.calltypes).blocks.popitem()[1]
        replace_arg_nodes(getitem_block, [in_arr, ind_var])
        # drop the trailing three statements (remove none return)
        getitem_nodes = getitem_block.body[:-3]
        getitem_nodes[-1].target = assign.target
        return getitem_nodes

    if expr.op == 'call':
        fdef = guard(find_callname, self.func_ir, expr)
        if fdef is None:
            # could be make_function from list comprehension which is ok
            func_def = guard(get_definition, self.func_ir, expr.func)
            is_make_function = (isinstance(func_def, ir.Expr)
                                and func_def.op == 'make_function')
            if not is_make_function:
                warnings.warn(
                    "function call couldn't be found for initial analysis")
            return [assign]

        func_name, func_mod = fdef

        if fdef == ('DatetimeIndex', 'pandas'):
            return self._run_pd_DatetimeIndex(assign, assign.target, expr)

        if func_mod == 'hpat.hiframes_api':
            return self._run_call_hiframes(
                assign, assign.target, expr, func_name)

        if fdef == ('empty_like', 'numpy'):
            return self._handle_empty_like(assign, target_name, expr)

    if self._is_dt_index_binop(expr):
        return self._handle_dt_index_binop(target_name, expr, assign)

    return [assign]
def generic(self, args, kws):
    """Type a single-argument call that returns the array form of its input.

    The return type is ``if_series_to_array_type(args[0], True)``.
    NOTE(review): the meaning of the second positional ``True`` is defined
    by ``if_series_to_array_type`` and is not visible here.
    """
    assert not kws
    assert len(args) == 1
    ret_typ = if_series_to_array_type(args[0], True)
    return signature(ret_typ, *args)
def generic(self, args, kws):
    """Type a single-argument call that returns the array form of its input.

    The return type is ``if_series_to_array_type`` applied to the argument.
    """
    assert not kws
    assert len(args) == 1
    in_typ = args[0]
    out_typ = if_series_to_array_type(in_typ)
    return signature(out_typ, in_typ)