def compute(self, context, bool_expr):
    """Return, for each individual, the number of consecutive periods
    (counting backward from the current period) during which *bool_expr*
    has been continuously True.

    Starts from the value of the expression in the current period, then
    walks backward one period at a time until the expression stops being
    True for every individual that is still "running", or until the
    entity's base period is reached.

    Individuals for which the expression is False in the current period
    get 0.  Periods with no stored value for an individual are treated as
    "missing" and do not break the streak by themselves (see the
    ``missing`` mask below).
    """
    entity = context.entity
    baseperiod = entity.base_period
    period = context.period - 1
    value = expr_eval(bool_expr, context)
    # using a full int so that the "store" type check works
    # FIX: np.int / np.float aliases were deprecated in NumPy 1.20 and
    # removed in 1.24 (AttributeError); the builtin int is what the alias
    # resolved to, so behavior is unchanged.
    result = value.astype(int)
    res_size = len(entity.array)
    # last period (searching backward) for which the expression was True;
    # initialized to the current period for everybody
    last_period_true = np.full(res_size, period + 1, dtype=int)
    id_to_rownum = context.id_to_rownum
    still_running = value.copy()
    while np.any(still_running) and period >= baseperiod:
        ids, values = self.value_for_period(bool_expr, period, context,
                                            fill=None)

        # missing: individuals with no stored value at all for this period
        missing = np.ones(res_size, dtype=bool)
        # period_value: value of the expression at that period (False when
        # there is no value)
        period_value = np.zeros(res_size, dtype=bool)
        if len(ids):
            value_rows = id_to_rownum[ids]
            safe_put(missing, value_rows, False)
            safe_put(period_value, value_rows, values)

        # still running AND True this period: extend the streak by the
        # number of periods since the last True observation
        value = still_running & period_value
        result += value * (last_period_true - period)

        # a streak survives a period only if the expression was True or the
        # value was missing for that individual
        still_running &= period_value | missing
        last_period_true[period_value] = period
        period -= 1
    return result
def compute(self, context, expr):
    """Return, for each individual, the sum of *expr* over all past
    periods (from the entity's base period up to, but excluding, the
    current period).

    Periods where an individual has no stored value for the expression's
    column contribute nothing to the sum.  The result dtype follows the
    expression's dtype: bool and int sum to int, float sums to float.
    """
    entity = context.entity
    baseperiod = entity.base_period
    period = context.period - 1
    # summing bools yields a count, hence bool -> int
    typemap = {bool: int, int: int, float: float}
    res_type = typemap[getdtype(expr, context)]
    res_size = len(entity.array)
    sum_values = np.zeros(res_size, dtype=res_type)
    id_to_rownum = context.id_to_rownum
    while period >= baseperiod:
        ids, values = self.value_for_period(expr, period, context,
                                            fill=None)

        # filter out lines which are present because there was a value for
        # that individual at that period but not for that column
        acceptable_rows = hasvalue(values)
        acceptable_ids = ids[acceptable_rows]
        if len(acceptable_ids):
            acceptable_values = values[acceptable_rows]
            value_rows = id_to_rownum[acceptable_ids]
            # FIX: np.float was deprecated in NumPy 1.20 and removed in
            # 1.24; the builtin float is the exact equivalent.
            period_value = np.zeros(res_size, dtype=float)
            safe_put(period_value, value_rows, acceptable_values)
            sum_values += period_value
        period -= 1
    return sum_values
def compute(self, context, expr):
    """Return, for each individual, the time-weighted average of *expr*
    over all past periods (from the entity's base period up to, but
    excluding, the current period).

    Each observed value is weighted by the number of periods until the
    next observation (i.e. values are carried forward across gaps when
    counting), so ``sum_values / num_values`` is an average over elapsed
    periods, not over observations.

    NOTE(review): individuals with no observation at all divide 0 by 0,
    which yields NaN (with a runtime warning) — presumably intentional;
    confirm against callers.
    """
    entity = context.entity
    baseperiod = entity.base_period
    period = context.period - 1
    res_size = len(entity.array)
    # FIX: np.int / np.float aliases were deprecated in NumPy 1.20 and
    # removed in 1.24; the builtins are what the aliases resolved to.
    num_values = np.zeros(res_size, dtype=int)
    # current period
    last_period_wh_value = np.full(res_size, context.period, dtype=int)
    sum_values = np.zeros(res_size, dtype=float)
    id_to_rownum = context.id_to_rownum
    while period >= baseperiod:
        ids, values = self.value_for_period(expr, period, context,
                                            fill=None)

        # filter out lines which are present because there was a value for
        # that individual at that period but not for that column
        acceptable_rows = hasvalue(values)
        acceptable_ids = ids[acceptable_rows]
        if len(acceptable_ids):
            acceptable_values = values[acceptable_rows]
            value_rows = id_to_rownum[acceptable_ids]

            has_value = np.zeros(res_size, dtype=bool)
            safe_put(has_value, value_rows, True)

            period_value = np.zeros(res_size, dtype=float)
            safe_put(period_value, value_rows, acceptable_values)

            # weight = number of periods since the last observed value
            num_values += has_value * (last_period_wh_value - period)
            sum_values += period_value
            last_period_wh_value[has_value] = period
        period -= 1
    return sum_values / num_values
def fill_missing_values(ids, values, context, filler='auto'):
    """Map past-period values onto the current period's rows.

    ids: ids present in past period
    values: values in past period
    context: current period context
    filler: value used for individuals with no past value; 'auto' picks
        the default "missing" value for the values' dtype.

    Returns an array of ``context_length(context)`` elements with
    ``values.dtype``, where rows whose id existed in the past period get
    the past value and all other rows get *filler*.
    """
    # FIX: the original used "filler is 'auto'": identity comparison with a
    # string literal relies on CPython interning (SyntaxWarning since 3.8)
    # and is not guaranteed to be True even for equal strings. Use ==.
    if filler == 'auto':
        filler = get_default_value(values)
    result = np.full(context_length(context), filler, dtype=values.dtype)
    if len(ids):
        id_to_rownum = context.id_to_rownum
        # if there was more objects in the past than in the current
        # period. Currently, remove() keeps old ids, so this never
        # happens, but if we ever change remove(), we'll need to add
        # such a check everywhere we use id_to_rownum
        # invalid_ids = ids > len(id_to_rownum)
        # if np.any(invalid_ids):
        #     fix ids
        rows = id_to_rownum[ids]
        safe_put(result, rows, values)
    return result
def merge_subset_in_array(output, id_to_rownum, subset, first=False,
                          default_values=None):
    """Merge the rows/columns of the structured array *subset* into the
    structured array *output*, matching rows via subset['id'] through
    id_to_rownum, and return the merged array (which may be *subset*
    itself, or *output* mutated in place).

    first: True when *output* has no meaningful data yet, so rows absent
        from *subset* are filled with defaults instead of being read back
        from *output*.
    default_values: optional {field_name: default} overrides used when
        *first* is True.
    """
    default_values = default_values if default_values is not None else {}

    # fast paths when both arrays share the exact same dtype
    if subset.dtype == output.dtype and len(subset) == len(output):
        # full overwrite: the subset simply replaces the output
        return subset
    elif subset.dtype == output.dtype:
        # same columns, fewer rows: copy whole rows into place by id
        safe_put(output, id_to_rownum[subset['id']], subset)
        return output

    output_names = output.dtype.names
    subset_names = subset.dtype.names
    names_to_copy = set(subset_names) & set(output_names)
    if len(subset) == len(output):
        # same rows, fewer columns: column-wise copy, row order must
        # already match (no id remapping here)
        for fname in names_to_copy:
            output[fname] = subset[fname]
        return output
    else:
        rownums = id_to_rownum[subset['id']]
        # TODO: this is a gross approximation, more research is needed to get
        # a better threshold. It might also depend on "first".
        if len(names_to_copy) > len(output_names) / 2:
            # copying most columns: build full-dtype rows then put them
            # into output in a single row-wise operation
            if first:
                subset_all_cols = np.empty(len(subset), dtype=output.dtype)
                # columns absent from the subset get their default value
                for fname in set(output_names) - set(subset_names):
                    default_value = default_values.get(fname, None)
                    subset_all_cols[fname] = \
                        get_default_value(subset_all_cols[fname],
                                          default_value)
            else:
                # start from the existing output rows for those ids
                subset_all_cols = output[rownums]
                # Note that all rows which correspond to rownums == -1 have
                # wrong values (they have the value of the last row) but it is
                # not necessary to correct them since they will not be copied
                # back into output_array.
                # np.putmask(subset_all_cols, rownums == -1, missing_row)
            for fname in names_to_copy:
                subset_all_cols[fname] = subset[fname]
            safe_put(output, rownums, subset_all_cols)
        else:
            # copying few columns: cheaper to put each column separately
            for fname in names_to_copy:
                safe_put(output[fname], rownums, subset[fname])
        return output
def build_period_array(input_table, fields_to_keep, output_fields,
                       input_rows, input_index, start_period,
                       default_values):
    """Build the starting array for an entity at *start_period* by merging
    the most recent data available for each individual from all periods at
    or before *start_period*.

    input_table: table holding one row per (individual, period)
    input_rows: {period: (start, stop)} row ranges into input_table
    input_index: {period: id_to_rownum} per-period id -> row mappings
    Returns (output_array, id_to_rownum) for the constructed period.
    """
    periods_before = [p for p in input_rows.keys() if p <= start_period]
    if not periods_before:
        # no data at all before start_period: empty entity
        id_to_rownum = np.empty(0, dtype=int)
        output_array = ColumnArray.empty(0, np.dtype(output_fields))
        return output_array, id_to_rownum

    periods_before.sort()
    # take the last period which we have data for
    target_period = periods_before[-1]

    # computing is_present: which ids appear in at least one period
    max_id = len(input_index[target_period]) - 1
    period_id_to_rownum = None
    present_in_period = None
    is_present = np.zeros(max_id + 1, dtype=bool)
    for period in periods_before:
        period_id_to_rownum = input_index[period]
        present_in_period = period_id_to_rownum != -1
        # pad with False up to max_id (earlier periods may have fewer ids)
        present_in_period.resize(max_id + 1, refcheck=False)
        is_present |= present_in_period

    # if all individuals are present in the target period, we are done already!
    if np.array_equal(present_in_period, is_present):
        start, stop = input_rows[target_period]
        input_array = ColumnArray.from_table(input_table, start, stop)
        input_array.add_and_drop_fields(fields_to_keep, output_fields,
                                        default_values)
        return input_array, period_id_to_rownum

    # building id_to_rownum for the target period: present ids get
    # consecutive row numbers in ascending id order, absent ids get -1
    id_to_rownum = np.full(max_id + 1, -1, dtype=int)
    rownum = 0
    for row_id, present in enumerate(is_present):
        if present:
            id_to_rownum[row_id] = rownum
            rownum += 1

    # computing the source row for each destination row
    # we loop over the periods before start_period in reverse order
    # so that each individual's most recent row wins
    output_array_source_rows = np.full(rownum, -1, dtype=int)
    for period in periods_before[::-1]:
        start, stop = input_rows[period]
        input_rownums = np.arange(start, stop)
        input_id_to_rownum = input_index[period]
        id_is_in_period = input_id_to_rownum != -1
        # which output rows are filled by input for this period
        # NOTE(review): this pairs table rows (in storage order) with output
        # rows (in ascending-id order) positionally — presumably rows within
        # a period are stored sorted by id; confirm against the writer.
        output_rownums = id_to_rownum[np.where(id_is_in_period)]
        # get source rows (in the global array) for individuals in this period
        source_rows = output_array_source_rows[output_rownums]
        # if their source row is already known, leave them alone
        need_update = source_rows == -1
        # global indices of rows which are not set yet (for this period)
        rows_to_update = output_rownums[need_update]
        # source row for those rows
        local_source_rows = input_rownums[need_update]
        # update the source row for those rows
        safe_put(output_array_source_rows, rows_to_update,
                 local_source_rows)
        # early exit once every output row has a source
        if np.all(output_array_source_rows != -1):
            break

    # reading data
    output_array = ColumnArray.from_table_coords(input_table,
                                                 output_array_source_rows)
    output_array.add_and_drop_fields(fields_to_keep, output_fields,
                                     default_values)
    return output_array, id_to_rownum