def aslabeledarray(data):
    sequence = (tuple, list)
    if isinstance(data, la.LArray):
        return data
    elif (isinstance(data, sequence) and len(data) and
          isinstance(data[0], la.LArray)):
        # XXX: use la.stack?
        # TODO: check that all arrays have the same axes
        axes = [la.Axis(len(data))] + list(data[0].axes)
        return la.LArray(data, axes)
    else:
        return la.LArray(data)
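
# A minimal usage sketch for aslabeledarray (hypothetical demo helper, not
# part of the original module; it assumes `import larray as la` and
# `import numpy as np` at module level): an LArray passes through unchanged,
# a non-empty sequence of LArrays is wrapped with a new leading axis, and
# anything else is handed to la.LArray as-is.
def _demo_aslabeledarray():
    arr = la.LArray(np.zeros((2, 3)))
    assert aslabeledarray(arr) is arr           # already labeled: no-op
    stacked = aslabeledarray([arr, arr])        # new leading axis (length 2)
    assert stacked.shape == (2, 2, 3)
    plain = aslabeledarray([1, 2, 3])           # plain data: wrapped as-is
    assert plain.shape == (3,)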
geo = la.Axis(belgium, 'geo')

# data1 = np.arange(30).reshape(2, 15)
# arr1 = la.LArray(data1, axes=(sex, lipro))
# edit(arr1)

# data2 = np.arange(116 * 44 * 2 * 15).reshape(116, 44, 2, 15) \
#           .astype(float)
# data2 = np.random.random(116 * 44 * 2 * 15).reshape(116, 44, 2, 15) \
#           .astype(float)
# data2 = (np.random.randint(10, size=(116, 44, 2, 15)) - 5) / 17
# data2 = np.random.randint(10, size=(116, 44, 2, 15)) / 100 + 1567
# data2 = np.random.normal(51000000, 10000000, size=(116, 44, 2, 15))
data2 = np.random.normal(0, 1, size=(116, 44, 2, 15))
arr2 = la.LArray(data2, axes=(age, geo, sex, lipro))
# arr2 = la.ndrange([100, 100, 100, 100, 5])
# arr2 = arr2['F', 'A11', 1]

# view(arr2[0, 'A11', 'F', 'P01'])
# view(arr1)
# view(arr2[0, 'A11'])
# edit(arr1)
# print(arr2[0, 'A11', :, 'P01'])

# edit(arr2.astype(int), minvalue=-99, maxvalue=55.123456)
# edit(arr2.astype(int), minvalue=-99)
# arr2.i[0, 0, 0, 0] = np.inf
# arr2.i[0, 0, 1, 1] = -np.inf
# arr2 = [0.0000111, 0.0000222]
# arr2 = [0.00001, 0.00002]
# edit(arr2, minvalue=-99, maxvalue=25.123456)
def load_ndarray(fpath, celltype=None):
    print(" - reading", fpath)
    # FIXME: implement celltype
    a = la.read_csv(fpath, dialect='liam2')
    # print(a.info)
    return a

    # NOTE: everything below is unreachable; it is the legacy parser, kept
    # for reference until celltype support is reimplemented on top of
    # la.read_csv.
    with open(fpath, "rb") as f:
        reader = csv.reader(f)
        line_stream = skip_comment_cells(strip_rows(reader))
        header = next(line_stream)
        str_table = []
        for line in line_stream:
            if any(value == '' for value in line):
                raise Exception("empty cell found in %s" % fpath)
            str_table.append(line)
    ndim = len(header)

    # handle last dimension header (horizontal values)
    last_d_header = str_table.pop(0)

    # auto-detect type of values for the last d and convert them
    last_d_pvalues = convert_1darray(last_d_header)
    unique_last_d, dupe_last_d = unique_duplicate(last_d_pvalues)
    if dupe_last_d:
        print("Duplicate column header value(s) (for '%s') in '%s': %s"
              % (header[-1], fpath,
                 ", ".join(str(v) for v in dupe_last_d)))
        raise Exception("bad data in '%s': found %d "
                        "duplicate column header value(s)"
                        % (fpath, len(dupe_last_d)))

    # handle other dimensions header

    # strip the ndim-1 first columns
    headers = [[line.pop(0) for line in str_table]
               for _ in range(ndim - 1)]
    headers = [convert_1darray(pvalues_str) for pvalues_str in headers]
    if ndim > 1:
        # having duplicate values is normal when there are more than 2
        # dimensions but we need to test whether there are duplicates of
        # combinations.
        dupe_combos = list(duplicates(zip(*headers)))
        if dupe_combos:
            print("Duplicate row header value(s) in '%s':" % fpath)
            print(PrettyTable(dupe_combos))
            raise Exception("bad alignment data in '%s': found %d "
                            "duplicate row header value(s)"
                            % (fpath, len(dupe_combos)))

    possible_values = [np.array(list(unique(pvalues)))
                       for pvalues in headers]
    possible_values.append(np.array(unique_last_d))
    shape = tuple(len(values) for values in possible_values)
    num_possible_values = prod(shape)

    # transform the 2d table into a 1d list
    str_table = list(chain.from_iterable(str_table))
    if len(str_table) != num_possible_values:
        raise Exception("incoherent data in '%s': %d data cells "
                        "found while it should be %d based on the number "
                        "of possible values in headers (%s)"
                        % (fpath, len(str_table), num_possible_values,
                           ' * '.join(str(len(values))
                                      for values in possible_values)))

    # TODO: compare time with numpy built-in conversion:
    # if dtype is None, numpy tries to detect the best type itself
    # which it does a good job of if the values are already numeric values
    # if dtype is provided, numpy does a good job to convert from string
    # values.
    if celltype is None:
        celltype = detect_column_type(str_table)
    data = convert_1darray(str_table, celltype)
    array = np.array(data, dtype=celltype)
    return la.LArray(array.reshape(shape), header, possible_values)
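
# Illustrative layout (reconstructed from the legacy parser above, shown
# here as documentation only) of the flat CSV format it decodes. For a 2d
# array with dimensions 'age' and 'sex', the file would look like:
#
#     age,sex          <- one column name per dimension
#     male,female      <- labels of the last (horizontal) dimension
#     0,0.51,0.49      <- 'age' label, then one data cell per 'sex' label
#     1,0.52,0.48
#
# Whether la.read_csv(..., dialect='liam2') accepts exactly this layout is
# an assumption; refer to the larray documentation for the live format.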
    def compute(self, context, *expressions, **kwargs):
        if not expressions:
            raise TypeError("groupby() takes at least 1 argument")

        # TODO: allow lists/tuples of arguments to group by the combinations
        # of keys
        for expr in expressions:
            if isinstance(expr, (bool, int, float)):
                raise TypeError("groupby() does not work with constant "
                                "arguments")
            if isinstance(expr, (tuple, list)):
                raise TypeError("groupby() takes expressions as arguments, "
                                "not a list of expressions")

        # On Python 3, we could clean up this code (keyword-only arguments).
        expr = kwargs.pop('expr', None)
        if expr is None:
            expr = Count()
        # by = kwargs.pop('by', None)
        filter_value = kwargs.pop('filter', None)
        percent = kwargs.pop('percent', False)
        possible_values = kwargs.pop('pvalues', None)
        totals = kwargs.pop('totals', True)

        expr_vars = [v.name for v in collect_variables(expr)]
        labels = [str(e) for e in expressions]
        columns = [expr_eval(e, context) for e in expressions]
        columns = [expand(c, context_length(context)) for c in columns]

        if filter_value is not None:
            filtered_columns = [col[filter_value] for col in columns]
            # FIXME: use the actual filter_expr instead of not_hashable
            filtered_context = context.subset(filter_value, expr_vars,
                                              not_hashable)
        else:
            filtered_columns = columns
            filtered_context = context

        if possible_values is None:
            possible_values = [np.unique(col) for col in filtered_columns]

        # We pre-filtered columns instead of passing the filter to
        # partition_nd because it is a bit faster this way. The indices are
        # still correct, because we use them on a filtered_context.
        groups = partition_nd(filtered_columns, True, possible_values)
        if not groups:
            # return la.LArray([], labels, possible_values)
            return la.LArray([])

        # evaluate the expression on each group
        # we use not_hashable to avoid storing the subset in the cache
        contexts = [filtered_context.subset(indices, expr_vars, not_hashable)
                    for indices in groups]
        data = [expr_eval(expr, c) for c in contexts]

        # TODO: use group_indices_nd directly to avoid using np.unique
        # this is twice as fast (unique is very slow) but breaks because
        # the rest of the code assumes all combinations are present
        # if self.filter is not None:
        #     filter_value = expr_eval(self.filter, context)
        # else:
        #     filter_value = True
        #
        # d = group_indices_nd(columns, filter_value)
        # pvalues = sorted(d.keys())
        # ndim = len(columns)
        # possible_values = [[pv[i] for pv in pvalues]
        #                    for i in range(ndim)]
        # groups = [d[k] for k in pvalues]

        # groups is a (flat) list of lists.
        # the first variable is the outermost "loop",
        # the last one the innermost.
        # add total for each row
        len_pvalues = [len(vals) for vals in possible_values]
        if percent:
            totals = True

        if totals:
            width = len_pvalues[-1]
            height = prod(len_pvalues[:-1])
            rows_indices = [np.concatenate([groups[y * width + x]
                                            for x in range(width)])
                            for y in range(height)]
            cols_indices = [np.concatenate([groups[y * width + x]
                                            for y in range(height)])
                            for x in range(width)]
            cols_indices.append(np.concatenate(cols_indices))

            # evaluate the expression on each "combined" group (ie compute
            # totals)
            row_ctxs = [filtered_context.subset(indices, expr_vars,
                                                not_hashable)
                        for indices in rows_indices]
            row_totals = [expr_eval(expr, ctx) for ctx in row_ctxs]
            col_ctxs = [filtered_context.subset(indices, expr_vars,
                                                not_hashable)
                        for indices in cols_indices]
            col_totals = [expr_eval(expr, ctx) for ctx in col_ctxs]
        else:
            row_totals = None
            col_totals = None

        if percent:
            # convert to np.float64 to get +-inf if total_value is int(0)
            # instead of Python's built-in behaviour of raising an exception.
            # This can happen at least when using the default expr (count())
            # and the filter yields empty groups
            total_value = np.float64(col_totals[-1])
            data = [100.0 * value / total_value for value in data]
            row_totals = [100.0 * value / total_value
                          for value in row_totals]
            col_totals = [100.0 * value / total_value
                          for value in col_totals]

        # if self.by or self.percent:
        #     if self.percent:
        #         total_value = data[-1]
        #         divisors = [total_value for _ in data]
        #     else:
        #         num_by = len(self.by)
        #         inc = prod(len_pvalues[-num_by:])
        #         num_groups = len(groups)
        #         num_categories = prod(len_pvalues[:-num_by])
        #
        #         categories_groups_idx = [range(cat_idx, num_groups, inc)
        #                                  for cat_idx
        #                                  in range(num_categories)]
        #
        #         divisors = ...
        #
        #     data = [100.0 * value / divisor
        #             for value, divisor in zip(data, divisors)]

        # convert to a 1d array. We don't simply use data = np.array(data),
        # because if data is a list of ndarrays (for example if we use
        # groupby(a, expr=id)), *and* all the ndarrays have the same length,
        # the result is a 2d array instead of an array of ndarrays like we
        # need (at this point).
        arr = np.empty(len(data), dtype=type(data[0]))
        arr[:] = data
        data = arr

        # and reshape it
        data = data.reshape(len_pvalues)
        axes = [la.Axis(axis_labels, axis_name)
                for axis_name, axis_labels in zip(labels, possible_values)]
        # FIXME: also handle totals
        return la.LArray(data, axes)
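
# A standalone sketch (hypothetical helper, plain numpy only) of the totals
# geometry used in compute() above: groups form a flat row-major list over a
# height x width grid, where width is the size of the last grouping
# dimension; margins are built by concatenating the member indices of each
# row / column before re-evaluating expr on the combined subsets.
def _demo_totals_geometry():
    groups = [np.array([0]), np.array([1, 2]),      # cells (0,0), (0,1)
              np.array([3]), np.array([4, 5, 6])]   # cells (1,0), (1,1)
    width, height = 2, 2
    rows = [np.concatenate([groups[y * width + x] for x in range(width)])
            for y in range(height)]
    cols = [np.concatenate([groups[y * width + x] for y in range(height)])
            for x in range(width)]
    assert [len(r) for r in rows] == [3, 4]   # row margin sizes
    assert [len(c) for c in cols] == [2, 5]   # column margin sizes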
def index_tables(globals_def, entities, fpath):
    print("reading data from %s ..." % fpath)

    input_file = tables.open_file(fpath)
    try:
        input_root = input_file.root

        def must_load_from_input_file(gdef):
            return isinstance(gdef, dict) and 'path' not in gdef

        any_global_from_input_file = any(
            must_load_from_input_file(gdef)
            for gdef in globals_def.values())
        if any_global_from_input_file and 'globals' not in input_root:
            raise Exception(
                'could not find any globals in the input data file '
                '(but some are declared in the simulation file)')

        globals_data = load_path_globals(globals_def)
        constant_globals_data = handle_constant_globals(globals_def)
        globals_data.update(constant_globals_data)

        globals_node = getattr(input_root, 'globals', None)
        for name, global_def in globals_def.items():
            # already loaded from another source (path)
            if name in globals_data:
                continue

            if name not in globals_node:
                raise Exception("could not find 'globals/%s' in the input "
                                "data file" % name)

            global_data = getattr(globals_node, name)
            global_type = global_def.get('type', global_def.get('fields'))
            # TODO: move the checking (assert_valid_type) to a separate
            # function
            assert_valid_type(global_data, global_type, context=name)
            array = global_data.read()
            if isinstance(global_type, list):
                # make sure we do not keep in memory columns which are
                # present in the input file but were not asked for by the
                # modeller. They are not accessible anyway.
                array = add_and_drop_fields(array, global_type)
            attrs = global_data.attrs
            dim_names = getattr(attrs, 'dimensions', None)
            if dim_names is not None:
                # we serialise dim_names as a numpy array so that it is
                # stored as a native hdf type and not a pickle, but we
                # prefer to work with simple lists
                # also files serialized using Python2 are "bytes" not "str"
                dim_names = [str(dim_name) for dim_name in dim_names]
                pvalues = [getattr(attrs, 'dim%d_pvalues' % i)
                           for i in range(len(dim_names))]
                axes = [la.Axis(labels, axis_name)
                        for axis_name, labels in zip(dim_names, pvalues)]
                array = la.LArray(array, axes)
            globals_data[name] = array

        input_entities = input_root.entities

        entities_tables = {}
        print(" * indexing tables")
        for ent_name, entity in entities.items():
            print(" -", ent_name, "...", end=' ')

            table = getattr(input_entities, ent_name)
            assert_valid_type(table, list(entity.fields.in_input.name_types))
            rows_per_period, id_to_rownum_per_period = \
                timed(index_table, table)
            indexed_table = IndexedTable(table, rows_per_period,
                                         id_to_rownum_per_period)
            entities_tables[ent_name] = indexed_table
    except:
        input_file.close()
        raise

    return input_file, {'globals': globals_data,
                        'entities': entities_tables}
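
# A minimal sketch (hypothetical helper, not part of the original module) of
# how a global array with axes metadata could be written so that
# index_tables() reads it back as a labeled array: axis names go in a
# 'dimensions' attribute and the labels of axis i in a 'dim%d_pvalues'
# attribute, which is exactly what the loop above looks up. The 'MIG' global
# and its axes are made-up example values.
def _demo_write_global(h5fpath):
    with tables.open_file(h5fpath, mode='w') as f:
        globals_node = f.create_group(f.root, 'globals')
        data = np.zeros((2, 3))
        node = f.create_array(globals_node, 'MIG', data)
        # serialise axis names as a numpy array (native hdf type, no pickle)
        node.attrs.dimensions = np.array(['period', 'age'], dtype='S')
        node.attrs.dim0_pvalues = np.array([2010, 2011])
        node.attrs.dim1_pvalues = np.array([0, 1, 2])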