def test_union_with_empty_sub_dict(self):
    """an empty sub-dict on the right hand side must not erase existing content"""
    empty_title = UnionDict({"title": {}})
    merged = UnionDict({"title": {"text": "Alignment Position"}})
    # snapshot taken before the in-place union so we can detect any change
    expected = UnionDict(merged.copy())
    merged |= empty_title
    self.assertEqual(merged, expected)
def test_union_value_dict(self):
    """in-place union between a plain value and a dict for the same key"""
    nested = UnionDict({"A": {"B": "Blah"}})
    flat = UnionDict({"A": "Blah"})
    nested_copy = UnionDict(nested.copy())

    # union of the flat value into the nested copy must change the copy
    nested_copy |= flat
    self.assertNotEqual(nested, nested_copy)

    # union of the nested dict into the flat one must yield the nested dict
    flat |= nested
    self.assertEqual(nested, flat)
class ReadOnlyDataStoreBase:
    """a read only data store

    Subclasses must provide the ``members`` property and the ``open``
    method; both raise NotImplementedError here.
    """

    # subclasses set this (e.g. "zip") to force source to end with it
    store_suffix = None

    def __init__(self, source, suffix=None, limit=None, verbose=False, md5=True):
        """
        Parameters
        ----------
        source
            path to directory / zip file. Forced to end with store_suffix.
        suffix
            only members whose name matches the suffix are considered included
        limit
            the maximum number of members to consider
        verbose
            displays files that don't match search (applies only to the Zipped
            variant)
        md5 : bool
            record md5 hexadecimal checksum of read data when possible
        """
        # assuming delimiter is /

        # todo this approach to caching persistent arguments for reconstruction
        # is fragile. Need an inspect module based approach
        # NOTE: locals() must be captured before any other local is created,
        # so that it holds only the constructor arguments (plus self)
        d = locals()
        self._persistent = UnionDict({k: v for k, v in d.items() if k != "self"})

        source = str(source)
        suffix = suffix or ""
        if suffix != "*":  # wild card search for all
            suffix = re.sub(r"^[\s.*]+", "", suffix)  # tidy the suffix
        source = re.sub(r"/+$", "", source)  # tidy the source

        self.suffix = suffix
        if self.store_suffix and not source.endswith(self.store_suffix):
            source = ".".join([source, self.store_suffix])
        self.source = str(pathlib.Path(source).expanduser())
        self.mode = "r"
        self._members = []
        self.limit = limit
        self._verbose = verbose
        self._md5 = md5
        self._checksums = {}

    def __getstate__(self):
        """returns a copy of the constructor arguments"""
        return self._persistent.copy()

    def __setstate__(self, data):
        """rebuilds the instance by calling the constructor with data"""
        new = self.__class__(**data)
        self.__dict__.update(new.__dict__)
        return self

    def __repr__(self):
        if len(self) > 3:
            sample = str(list(self[:3]))
            # drop the closing bracket and mark the listing as truncated
            sample = f"{sample[:-1]}..."
        else:
            sample = list(self)

        num = len(self)
        name = self.__class__.__name__
        return f"{num}x member {name}(source='{self.source}', members={sample})"

    def __str__(self):
        return str(list(self))

    def head(self, n=5):
        """displays top n members"""
        pprint(self[:n])

    def tail(self, n=5):
        """displays last n members"""
        pprint(self[-n:])

    def __iter__(self):
        """yields each member, converting it to a DataStoreMember if required

        Converted members are written back into self.members.
        """
        for i, member in enumerate(self.members):
            if not isinstance(member, DataStoreMember):
                member = DataStoreMember(self.get_absolute_identifier(member), self)
                self.members[i] = member
            yield member

    def __getitem__(self, index):
        return self.members[index]

    def __len__(self):
        return len(self.members)

    def __contains__(self, identifier):
        """whether relative identifier has been stored"""
        if isinstance(identifier, DataStoreMember):
            return identifier.parent is self

        if not identifier.endswith(self.suffix):
            suffix = pathlib.Path(identifier).suffix
            # possible an "added" file, so search a store built for its suffix
            if self.store_suffix == "zip":
                klass = ReadOnlyZippedDataStore
            else:
                klass = ReadOnlyDirectoryDataStore
            new = klass(self.source, suffix=suffix)
            return identifier in new

        identifier = self.get_relative_identifier(identifier)
        return any(identifier in member for member in self.members)

    def get_member(self, identifier):
        """returns DataStoreMember

        Returns the first member containing the relative identifier, or None
        if there is no such member.
        """
        identifier = self.get_relative_identifier(identifier)
        return next(
            (member for member in self.members if identifier in member), None
        )

    def get_relative_identifier(self, identifier):
        """returns the identifier relative to store root path"""
        if isinstance(identifier, DataStoreMember) and identifier.parent is self:
            return identifier

        source = self.source
        identifier = os.path.basename(identifier)
        if source.endswith(".zip"):
            # we insert the source path into identifier name
            # for zip members to ensure inflation creates a directory
            # containing them
            source = source.replace(".zip", "")
            source = os.path.basename(source)
            identifier = f"{source}{os.sep}{identifier}"
        else:
            identifier = Path(identifier)
            identifier = identifier.name

        return identifier

    def get_absolute_identifier(self, identifier, from_relative=False):
        """returns the identifier prefixed by the store root path

        Parameters
        ----------
        identifier
            name of a member, or a DataStoreMember
        from_relative : bool
            if True, identifier is taken to already be relative and
            get_relative_identifier is not applied

        Notes
        -----
        For a DataStoreMember the value of its name attribute is returned.
        """
        if not from_relative:
            identifier = self.get_relative_identifier(identifier)

        source = self.source
        if source.endswith(".zip"):
            # only zip stores have the suffix removed; a directory whose path
            # merely contains ".zip" must keep its path intact
            source = source.replace(".zip", "")

        if isinstance(identifier, DataStoreMember):
            identifier = identifier.name
        elif not identifier.startswith(source):
            identifier = f"{source}{os.sep}{identifier}"
        return identifier

    def read(self, identifier):
        """reads data corresponding to identifier

        When md5 recording is enabled, the checksum of the data is stored
        against the identifier.
        """
        if isinstance(identifier, DataStoreMember) and identifier.parent is self:
            identifier = identifier.name

        source = self.open(identifier)
        try:
            data = source.read()
        finally:
            # release the handle even if reading fails
            source.close()

        if self._md5:
            self._checksums[identifier] = get_text_hexdigest(data)
        return data

    @property
    def members(self):
        raise NotImplementedError  # override in subclasses

    def open(self, identifier):
        raise NotImplementedError  # override in subclasses

    def filtered(self, pattern=None, callback=None):
        """returns list of members for which callback returns True

        If pattern is provided it takes precedence, and members matching it
        according to fnmatch are returned.
        """
        assert any([callback, pattern]), "Must provide a pattern or a callback"
        if pattern:
            result = [m for m in self if fnmatch(m, pattern)]
        else:
            result = [m for m in self if callback(m)]
        return result

    def md5(self, identifier, force=True):
        """
        Parameters
        ----------
        identifier
            name of data store member
        force : bool
            forces reading of data if not already done

        Returns
        -------
        md5 checksum for the member, if available, None otherwise
        """
        md5_setting = self._md5  # for restoring automatic md5 calc setting
        absoluteid = self.get_absolute_identifier(identifier)
        try:
            if force and absoluteid not in self._checksums:
                self._md5 = True
                _ = self.read(absoluteid)
            result = self._checksums.get(absoluteid, None)
        finally:
            # a failed read must not leave checksum recording switched on
            self._md5 = md5_setting
        return result