def visual_history(self, id_, folder=None):
    """List with all steps/Data objects before the current one.

    The current avatar is also generated when *folder* is given.

    Parameters
    ----------
    id_
        Textual id of the last Data object in the chain.
    folder
        Optional output directory for the generated ``.jpg`` avatars.

    Returns
    -------
    List of dicts, one per transformer in the history, each with keys
    ``label``, ``name``, ``help``, ``stored`` and (if *folder*) ``avatar``.
    """
    uuid = UUID()
    data = None
    lastuuid = UUID(id_)
    firstdata = self.fetch(UUIDData(lastuuid))
    # TODO: solve this check in pjdata
    if firstdata.history is None:
        firstdata.history = []
    # BUGFIX: was `(list(...) == 0) or historystr`, which compares a list
    # to 0 (always False) and therefore always picked `historystr`.
    # Intended semantics: use the real history, falling back to the
    # string form only when the history is empty.
    history = list(firstdata.history) or firstdata.historystr
    if folder:
        lastuuid.generate_avatar(f"{folder}/{id_}.jpg")
    lst = []
    for transformer in history:
        # History entries may be Transformer objects or plain dicts.
        if isinstance(transformer, Transformer):
            name = transformer.name
            transformeruuid = transformer.uuid
        else:
            name = transformer["name"]
            transformeruuid = transformer["uuid"]
        dic = {
            "label": uuid.id,
            "name": name,
            "help": str(transformer),
            "stored": data is not None,
        }
        if folder:
            filename = f"{uuid}.jpg"
            dic["avatar"] = filename
            # BUGFIX: the avatar was written to a literal bogus path;
            # it must match the filename recorded in dic["avatar"].
            uuid.generate_avatar(f"{folder}/{filename}")
        lst.append(dic)
        # Advance the running uuid by this step and look up the
        # intermediate Data object (None when not stored).
        uuid = uuid * transformeruuid
        data = self.fetch(UUIDData(uuid))
    return lst
def _fetch_impl(self, data: Data, lock: bool = False) -> Data:
    """Fetch the stored counterpart of *data* from the database.

    Returns a rebuilt Data object, or None when no row exists (in which
    case the entry is locked first if *lock* is True).

    Raises
    ------
    LockedEntryException
        When the row exists but has an empty ``names`` field, which marks
        an entry locked by another process.
    """
    # Fetch data info.
    uuid = data.uuid
    self.query(f"select * from data where id=?", [uuid.id])
    result = self.get_one()
    if result is None:
        # Nothing stored yet; optionally lock the slot for this process.
        if lock:
            self.lock(data)
        return None
    # values_by_id = {row['id']: row['value'] for row in rall}
    if result["names"] == "":
        # Empty names field is the lock marker written by another process.
        print("W: Previously locked by other process.", data)
        raise LockedEntryException(data)
    # Comma-separated columns: matrix names, matrix ids, history ids.
    names = result["names"].split(",")
    mids = result["matrices"].split(",")
    hids = result["history"].split(",")
    name_by_mid = dict(zip(mids, names))
    # Fetch matrices (lazily, if storage_info is provided).
    # Only ids not already present in the given Data object are fetched.
    new_mids = [mid for mid in mids if mid not in data.ids_lst]
    matrices = data.matrices
    if self.storage_info is None:
        # Eager: pull the actual dumps now.
        matrices_by_mid = self.fetch_dumps(new_mids)
        for mid in new_mids:
            matrices[name_by_mid[mid]] = matrices_by_mid[mid]
    else:
        # Lazy: store only the UUID; content resolved later via storage_info.
        for mid in new_mids:
            matrices[name_by_mid[mid]] = UUID(mid)
    # Fetch history.
    serialized_tranfs = self.fetch_dumps(hids, aslist=True)
    # TODO: deserialize before putting into the history
    history = History(serialized_tranfs)
    # TODO: failure and frozen should be stored/fetched!
    # TODO: would it be worth to update uuid/uuids here, instead of recalculating it from the start at Data.init?
    uuids = data.uuids
    uuids.update(dict(zip(names, map(UUID, mids))))
    return Data(
        uuid=uuid,
        uuids=uuids,
        history=history,
        failure=None,
        frozen=False,
        hollow=False,
        stream=None,
        storage_info=self.storage_info,
        **matrices,
    )
def uuid(self) -> UUID:
    """Lazily calculated unique identifier for this dataset.

    Should be accessed directly as a class member: 'uuid'.

    Returns
    -------
    A unique identifier UUID object.
    """
    if self._uuid is None:
        raw = self._uuid_impl()
        if isinstance(raw, UUID):
            self._uuid = raw
        else:
            self._uuid = UUID(raw.encode())
    return self._uuid
def _uuid_impl(self):
    """Derive this object's UUID from its serialized representation."""
    encoded = self.serialized.encode()
    return UUID(encoded)
def _cfuuid_impl(self, data=None):
    """Config UUID derived from the serialized 'hashes' entry of the config."""
    hashes_serialized = serialize(self.config["hashes"])
    return UUID(hashes_serialized.encode())
import numpy as np

from pjdata.aux.compression import pack
from pjdata.aux.uuid import UUID
from pjdata.content.data import Data

# Tests ############################
from pjdata.history import History

# Small fixture: attribute/target matrices plus their descriptions and types.
matrices = {
    "X": np.array([[1, 2, 3, 4], [5, 6, 7, 8]]),
    "Y": np.array([[1, 2, 3, 4]]),
    "Xd": ['length', 'width'],
    "Yd": ['class'],
    "Xt": ["real", "real", "real", "real"],
    "Yt": [1, 2, 3, 4],
}

# Content-derived UUID per matrix (computed here but not passed to Data below).
uuids = {name: UUID(pack(content)) for name, content in matrices.items()}

data = Data(
    uuid=UUID(),
    uuids={"X": UUID(), "Y": UUID()},
    failure=None,
    frozen=False,
    history=History([]),
    hollow=False,
    stream=None,
    **matrices,
)
print('OK', data)
def _cfuuid_impl(self, data=None):
    """UUID excluding 'model' and 'enhance' flags. Identifies the transformer."""
    config_serialized = self._cfserialized()
    return UUID(config_serialized.encode())
def _uuid_impl(self):
    """Complete UUID; including 'model' and 'enhance' flags. Identifies the component."""
    base = self._cfuuid_impl()
    # Encode the two boolean flags as a zero-padded 14-char digest string.
    flag_digits = str(self.hasenhancer + self.hasmodel).rjust(14, "0")
    return base * UUID(flag_digits)
def f():
    """Benchmark body: two UUID products followed by a transpose access (result discarded)."""
    left = UUID(int2pmat(2 ** 128 - 1))
    right = UUID('12345678901234')
    product = left * right
    product * right.t
from random import random
from timeit import timeit
from pjdata.aux.linalg import int2pmat, print_binmatrix, pmat2int, \
    int2fac, pmat_mult
from pjdata.aux.uuid import UUID

# Show output of operations.
# NOTE(review): timeit, print_binmatrix, pmat2int and pmat_mult are imported
# but unused in this span — presumably used further down the file; verify.
a = UUID(int2pmat(2 ** 128 - 1))
b = UUID('12345678901234')
c = UUID(1)
print(a, b, c)
print()
print((a * b))
print((a * b) * b)
print((a * b) * b.t)
print((a * b) * c)

# Unused in this span — presumably consumed later in the file.
fac = int2fac(2 ** 128 + 3214134)

# Check for collisions.
# Draw random 128-bit-range pairs, never reusing a value already in r.
s = set()
r = set()
aa = bb = 0
for i in range(100000):
    # NOTE(review): on the very first iteration aa and bb are both 0 and not
    # yet in r, so the pair (0, 0) is kept rather than redrawn — confirm
    # this is intended.
    while aa in r:
        aa = round(random() * 2 ** 128)
    while bb in r:
        bb = round(random() * 2 ** 128)
    r.add(aa)
    r.add(bb)
def read_arff(filename):
    """
    Create Data from ARFF file.

    Assume X,y classification task and last attribute as target.
    And that there were no transformations (history) on this Data.

    A short hash will be added to the name, to ensure unique names.
    Actually, the first collision is expected after 1M different datasets
    with the same name ( n = 2**(log(107**6, 2)/2) ).
    Since we already expect unique names like 'iris', and any transformed
    dataset is expected to enter the system through a transformer, 1M
    should be safe enough. Ideally, a single 'iris' will be stored.
    In practice, no more than a dozen are expected.

    Parameters
    ----------
    filename
        path of the dataset

    Returns
    -------
    (dict of matrix hashes, Data object, name, description)
    """
    # Load file.
    # FIX: use a context manager so the handle is closed even when
    # arff.load raises (the original open/close pair leaked on error).
    with open(filename, "r") as file:
        # dic keys: ['description', 'relation', 'attributes', 'data']
        dic = arff.load(file, encode_nominal=False)
    name = dic["relation"]
    description = dic["description"]

    # Extract attributes and targets.
    Arr = np.array(dic["data"])
    Att = dic["attributes"][0:-1]
    TgtAtt = dic["attributes"][-1]

    # Extract X values (numeric when possible), descriptions and types.
    X = Arr[:, 0:-1]
    Xd = [tup[0] for tup in Att]
    Xt = [translate_type(tup[1]) for tup in Att]
    # Only cast to float when no attribute is nominal.
    if len(nominal_idxs(Xt)) == 0:
        X = X.astype(float)

    # Extract Y values (assumes categorical), descriptions and types.
    Y = np.ascontiguousarray(Arr[:, -1].reshape((Arr.shape[0], 1)))
    Yd = [TgtAtt[0]]
    Yt = [translate_type(TgtAtt[1])]

    # Calculate pseudo-unique hash for X and Y, and a pseudo-unique name.
    matrices = {"X": X, "Y": Y, "Xd": Xd, "Yd": Yd, "Xt": Xt, "Yt": Yt}
    uuids = {k: UUID(pack(v)) for k, v in matrices.items()}
    original_hashes = {k: v.id for k, v in uuids.items()}

    # # old, unique, name...
    # name_ = splitted[-1] + '_' + enc(
    #     md5_int(serialize(original_hashes).encode()))[:6]

    # Generate the first transformation of a Data object: being born.
    faketransformer = FakeStep(FakeFile(filename, original_hashes))
    uuid, uuids = li.evolve_id(UUID(), {}, [faketransformer], matrices)

    # Create a temporary Data object (i.e. with a fake history).
    data = Data(
        history=History([faketransformer]),
        failure=None,
        frozen=False,
        hollow=False,
        stream=None,
        storage_info=None,
        uuid=uuid,
        uuids=uuids,
        X=X,
        Y=Y,
        Xt=Xt,
        Yt=Yt,
        Xd=Xd,
        Yd=Yd,
    )

    # Patch the Data object with the real transformer and history.
    transformer = Step(FakeFile(filename, original_hashes))
    data.history = History([transformer])
    return original_hashes, data, name, description
def _uuid_impl(self):
    """UUID taken from the 'component_uuid' field of this object's jsonable form.

    FIX: the original referenced a bare name ``jsonable`` which is not
    defined in this scope (NameError at runtime). Sibling implementations
    read instance state (``self.serialized``, ``self.config``), so this
    presumably means ``self.jsonable`` — confirm the attribute name.
    """
    return UUID(self.jsonable["component_uuid"])