def load_base_csv(self, data_file, languages, groups=None, translation_filename=None, translation_extra=None, keys_ex=None, validate=True):
    """Loads a base data map object from a csv.

    groups is a list of additional fields (name is automatically included)
    that nest via groupname_subfield.

    If translation_filename is given, its rows are merged into the base map;
    a missing translation file is a warning, not an error.
    """
    # Use None defaults instead of mutable [] defaults, which are shared
    # across calls and can accumulate state.
    groups = list(groups or [])
    translation_extra = list(translation_extra or [])
    keys_ex = list(keys_ex or [])

    data_file = self.get_data_path(data_file)
    groups = ['name'] + groups

    rows = [group_fields(row, groups=groups) for row in read_csv(data_file)]

    basemap = DataMap(languages=languages, keys_ex=keys_ex)
    basemap.extend(rows)

    if translation_filename:
        try:
            translations = fix_id(self.load_list_csv(translation_filename))
            # Translations always join on name, plus any caller-requested groups.
            translation_groups = set(['name'] + translation_extra)
            merge_list(basemap, translations, groups=translation_groups, many=False)
        except FileNotFoundError:
            print(f"Warning: Could not find translation file {translation_filename}")

    if languages:
        self._validate_base_map(data_file, basemap, languages, error=validate)

    return basemap
def group_fields(self, data):
    """Group the prefixed flat fields of a single row into nested sub-dicts.

    Raises TypeError if data is not a mapping (e.g. a list was passed
    without many=True).
    """
    # collections.Mapping was removed in Python 3.10; collections.abc has
    # been the canonical location since 3.3. Local import keeps this fix
    # self-contained regardless of the file's top-level imports.
    from collections.abc import Mapping
    if not isinstance(data, Mapping):
        raise TypeError("Invalid data type, perhaps you forgot many=true?")

    # Combine declared groups, translation groups, and any auto-detected
    # prefixes before delegating to the module-level group_fields helper.
    groups = (list(self.__groups__ or [])
              + list(self.__translation_groups__ or [])
              + self.identify_prefixes())

    return group_fields(data, groups=groups)
def extend_base(self, filename, *, groups=None):
    """Merge an auxiliary csv into this data map, joined on its first column.

    groups is a list of additional fields that nest via groupname_subfield.
    Returns self for chaining.
    """
    # Use a None default instead of a mutable [] default (shared across calls).
    groups = set(['name'] + list(groups or []))

    filename = self._get_filename(filename)
    dataitems = self.reader.load_list_csv(filename)
    if not dataitems:
        return self

    # todo: have it check the first column name and allow joins on other languages
    # Get first column name, whose values will anchor the data to merge
    first_column_name = next(iter(dataitems[0].keys()))

    results = {}
    for item in dataitems:
        key = item[first_column_name]
        # Remove the join column from the subdata before grouping
        item.pop(first_column_name)
        results[key] = group_fields(item, groups=groups)

    self.data_map.merge(results)
    return self
def unflatten(obj_list, *, nest, groups=None, leaftype):
    """Performs the reverse of flatten.
    Turns a CSV (list of objects) into a nested object.

    Nest is a list of fields used to walk through the nesting.

    TODO: Remove groups and leaftype and leave that to a post-step.
    Wait to see what the post-load abstraction will be before doing that.
    """
    if leaftype not in ('list', 'dict'):
        raise Exception("Unsupported leaf type")

    # Use a None default instead of a mutable [] default (shared across calls).
    groups = groups or []

    # This is a recursive algorithm
    if not nest:
        # BASE CASE: nothing more to nest, performs groups on entries.
        # Compare with ==, not "is": identity comparison against string
        # literals is implementation-dependent (SyntaxWarning in CPython 3.8+).
        if leaftype == 'list':
            return [util.group_fields(obj, groups=groups) for obj in obj_list]
        if leaftype == 'dict':
            return util.group_fields(obj_list[0], groups=groups)
    else:
        current_nest = nest[0]
        remaining_nest = nest[1:]

        # Phase one, start grouping rows
        grouped_rows = {}
        for mapping in obj_list:
            key = mapping.pop(current_nest)
            grouped_rows.setdefault(key, []).append(mapping)

        # Phase 2, unflatten recursively
        results = {}
        for key, items in grouped_rows.items():
            # Validation. Make sure it recurses correctly
            if leaftype != 'list' and len(items) > 1:
                raise Exception(
                    f"Found multiple entries for {current_nest}:{key}, " +
                    "which is invalid in this leaf type")

            # Recursive call
            results[key] = unflatten(
                items, nest=remaining_nest, groups=groups, leaftype=leaftype)

        return results
def load_base_csv(self, data_file, groups=None, validate=True):
    """Loads a base data map object from a csv.

    groups is a list of additional fields (name is automatically included)
    that nest via groupname_subfield.
    """
    # Use a None default instead of a mutable [] default (shared across calls).
    groups = ['name'] + list(groups or [])

    data_file = self.get_data_path(data_file)

    rows = read_csv(data_file)
    rows = [group_fields(row, groups=groups) for row in rows]

    basemap = DataMap()
    basemap.extend(rows)
    self._validate_base_map(data_file, basemap, error=validate)

    return basemap
def group_fields(self, data):
    """Group the prefixed flat fields of a single row into nested sub-dicts.

    Raises TypeError if data is not a mapping (e.g. a list was passed
    without many=True).
    """
    # collections.Mapping was removed in Python 3.10; collections.abc has
    # been the canonical location since 3.3. Local import keeps this fix
    # self-contained regardless of the file's top-level imports.
    from collections.abc import Mapping
    if not isinstance(data, Mapping):
        raise TypeError("Invalid data type, perhaps you forgot many=true?")

    groups = self.__groups__ or []
    return group_fields(data, groups=groups)
def merge_list(base, rows: typing.Iterable[dict], key=None, groups=None, many=False):
    """Routine to merge lists of dictionaries together using one or more keys.

    The keys used are determined by the first sequential key of the first row.
    If the key is an id, it will join on that, but if it is a name, it will
    join on that and key_ex fields.

    Raises ValueError for duplicate keys (when many=False) or a missing key
    argument (when many=True), and Exception when sub rows cannot be joined.
    """

    def create_key_fields(data_map, column_name):
        # Join fields are either ('id',) or ('name_<lang>', *keys_ex);
        # per the docstring, keys_ex only applies to name-based joins.
        lang = derive_lang(column_name)
        key_fields = []
        if lang is None:
            key_fields.append('id')
        else:
            key_fields.append(f'name_{lang}')
            key_fields.extend(data_map.keys_ex)
        return key_fields

    def create_key_fn(key_fields):
        # Renamed the inner parameter from "dict" to avoid shadowing the builtin.
        def derive_key(row):
            items = []
            for k in key_fields:
                # Prefer the base_-prefixed column when both forms are present.
                if f'base_{k}' in row:
                    items.append(row[f'base_{k}'])
                else:
                    items.append(row[k])
            return tuple(str(i) for i in items)
        return derive_key

    # Use a None default instead of a mutable [] default (shared across calls).
    groups = groups or []

    if many and not key:
        raise ValueError('Key must have a value')
    if not rows:
        return

    # Create keying function
    first_column = next(iter(rows[0].keys()))
    key_fields = create_key_fields(base, first_column)
    derive_key = create_key_fn(key_fields)

    # group rows
    keyed_data = {}
    for row in rows:
        row_key = derive_key(row)

        # Delete key fields. Its possible for base_name_en AND name_en to be in the same row.
        # Therefore, prioritize deleting base_ versions first
        for k in key_fields:
            if f'base_{k}' in row:
                del row[f'base_{k}']
            elif k in row:
                del row[k]

        if groups:
            row = util.group_fields(row, groups=groups)

        entry = keyed_data.setdefault(row_key, [])
        entry.append(row)
        if not many and len(entry) > 1:
            raise ValueError(
                f"Key {row_key} has too many matching entries in sub data")

    # Re-key the base entries by the same derived key so joins line up.
    # (Originally this rebound the "base" parameter itself; a distinct name
    # avoids the confusing shadowing.)
    keyed_base = {derive_key(e): e for e in base.values()}

    # Test the keys to see that sub's keys exist in base
    # (was a stray bare string statement, now a proper comment)
    unlinked = [k for k in keyed_data.keys() if k not in keyed_base.keys()]
    if unlinked:
        raise Exception(
            "Several entries in sub data map cannot be joined. " +
            "Their keys are " + ','.join('None' if e is None else str(e) for e in unlinked))

    for data_key, data_entries in keyed_data.items():
        base_entry = keyed_base[data_key]

        if key:
            if many:
                base_entry[key] = data_entries
            else:
                base_entry[key] = data_entries[0]
        elif isinstance(data_entries[0], abc.Mapping):
            util.joindicts(base_entry, data_entries[0])
        else:
            # We cannot merge a dictionary with a non-dictionary
            raise Exception(
                "Invalid data, the data map must be a dictionary for a keyless merge"
            )
def test_group_fields():
    """Language-suffixed columns should nest under their group prefix."""
    source = {'level': 2, 'description_en': 'test', 'description_ja': None}
    result = util.group_fields(source, groups=('description', ))
    expected = {'level': 2, 'description': {'en': 'test', 'ja': None}}
    assert result == expected, "description should have been grouped"