def _delete_transform_from_data_view(
    cls,
    transform: Transform,
    updated_transforms: TransformList,
    updated_labels: LabelSequence,
    data_view: DataView,
) -> Tuple[TransformList, LabelSequence]:
    """Remove ``transform`` and its queued descendants from the given state.

    New ``TransformList`` / ``LabelSequence`` objects are built from the
    arguments and those are the objects that get modified and returned.

    Args:
        transform: The first transform to remove.
        updated_transforms: Transforms accumulated so far.
        updated_labels: Labels accumulated so far.
        data_view: The DataView whose ``transform_tree`` supplies the
            children of each removed transform.

    Returns:
        The ``(transforms, labels)`` pair after all removals.
    """
    log.info(f"Removing transform from {data_view.id}")
    tree = data_view.transform_tree
    remaining_transforms = TransformList(updated_transforms)
    remaining_labels = LabelSequence(updated_labels)

    # First-in, first-out worklist of transforms still to be removed.
    pending: Deque[Transform] = deque([transform])
    while pending:
        log.info(f"about to pop {pending[0]}")
        current = pending.popleft()

        # Enrichment transforms also take their output labels with them.
        if isinstance(current, EnrichmentTransform):
            for output_name in current.output_labels:
                log.info(f"removing by name {output_name}")
                remaining_labels.remove_by_name(output_name)

        # NOTE(review): children are queued for every transform here, not
        # only for enrichment transforms - confirm against the original layout.
        pending.extend(tree.get_children_of_transform(current))

        log.info(f"removing transform: {current.serialize()}")
        remaining_transforms.remove(current)

    return remaining_transforms, remaining_labels
def _add_transform_to_data_view(
    cls,
    transform: Transform,
    updated_transforms: TransformList,
    updated_labels: LabelSequence,
    data_view: DataView,
) -> Tuple[TransformList, LabelSequence]:
    """Append ``transform`` to the given state and register its output labels.

    New ``TransformList`` / ``LabelSequence`` objects are built from the
    arguments and those are the objects that get modified and returned.

    Args:
        transform: The transform to add.
        updated_transforms: Transforms accumulated so far.
        updated_labels: Labels accumulated so far.
        data_view: The DataView being changed (only its id is logged here).

    Returns:
        The ``(transforms, labels)`` pair after the addition.
    """
    log.info(f"Adding transform to {data_view.id}")
    next_transforms = TransformList(updated_transforms)
    next_labels = LabelSequence(updated_labels)
    next_transforms.append(transform)

    # Only enrichment transforms contribute new labels.
    if not isinstance(transform, EnrichmentTransform):
        return next_transforms, next_labels

    new_labels = list(map(Label, transform.output_labels))
    next_labels.extendleft(new_labels)
    return next_transforms, next_labels
def test_set_comparison():
    """Sets built from TransformLists compare equal only for equal members."""
    exact = ExactMatch("aaa", "bbb")
    exact_twin = ExactMatch("aaa", "bbb")
    has_text = HasText("aaa", "bbb")
    no_match = DoesNotMatchAny("aaa", ["bbb", "ccc"])

    lists = [
        TransformList(members)
        for members in (
            [exact, has_text],
            [exact_twin, has_text],
            [exact, no_match],
            [exact],
            [],
        )
    ]
    sets = [set(transform_list) for transform_list in lists]

    # Besides each set equalling itself, only the first two sets (built from
    # equal transforms) are expected to compare equal.
    equal_pairs = {(0, 1)}
    for i, left in enumerate(sets):
        for j in range(i, len(sets)):
            right = sets[j]
            if i == j or (i, j) in equal_pairs:
                assert left == right
            else:
                assert left != right
def test_set_comparison():
    """Transforms placed in set literals de-duplicate and compare by value."""
    transforms = TransformList([
        ExactMatch("aaa", "bbb"),
        ExactMatch("aaa", "bbb"),
        HasText("aaa", "bbb"),
        DoesNotMatchAny("aaa", ["bbb", "ccc"]),
    ])
    exact = transforms[0]
    exact_twin = transforms[1]
    has_text = transforms[2]
    no_match = transforms[3]

    # Equal sets, including ones built from equal-but-distinct instances.
    assert {exact} == {exact}
    assert {exact, has_text} == {exact, has_text}
    assert {exact, has_text} == {exact_twin, has_text}
    assert {exact, exact_twin} == {exact}

    # Sets that differ in at least one member.
    assert {exact, has_text} != {exact, no_match}
    assert {exact} != {has_text}
    assert {exact} != {no_match}
def deserialize(cls, d: Dict) -> DataView:
    """Rebuild a DataView from its serialized dictionary form.

    Args:
        d: Mapping holding the values stored under ``KEY_ID``,
            ``KEY_PARENT_ID``, ``KEY_DATASET_ID``, ``KEY_USER_ID``,
            ``KEY_COLUMN_LABELS`` and ``KEY_TRANSFORMS``.

    Returns:
        A DataView constructed from the decoded values.

    Raises:
        KeyError: If any of the expected keys is missing from ``d``.
    """
    # The parameter was annotated ``Dict[str]``, which is not a valid
    # parameterisation of ``typing.Dict`` (it takes a key and a value type);
    # bare ``Dict`` matches how ``Query.from_dict`` annotates its input.
    data_view_id = DataViewId(d[cls.KEY_ID])
    parent_data_view_id = DataViewId(d[cls.KEY_PARENT_ID])
    dataset_id = DatasetId(d[cls.KEY_DATASET_ID])
    user_id = UserId(d[cls.KEY_USER_ID])
    labels = LabelSequence.deserialize(d[cls.KEY_COLUMN_LABELS])
    transforms = TransformList.deserialize(d[cls.KEY_TRANSFORMS])
    return DataView(
        data_view_id=data_view_id,
        parent_data_view_id=parent_data_view_id,
        dataset_id=dataset_id,
        user_id=user_id,
        labels=labels,
        transforms=transforms,
    )
def __init__(
    self,
    data_view_id: DataViewId,
    parent_data_view_id: DataViewId,
    dataset_id: DatasetId,
    user_id: UserId,
    labels: Optional[LabelSequence] = None,
    transforms: Optional[TransformList] = None,
):
    """Store the view's identifiers, transforms and labels.

    Args:
        data_view_id: Stored as ``id``.
        parent_data_view_id: Stored as ``parent_id``.
        dataset_id: Stored as ``dataset_id``.
        user_id: Stored as ``user_id``.
        labels: Stored as ``_labels``; a falsy value is replaced by a new
            empty ``LabelSequence``.
        transforms: Stored as ``transforms``; a falsy value is replaced by
            a new empty ``TransformList``.
    """
    self.id = data_view_id
    self.parent_id = parent_data_view_id
    self.dataset_id = dataset_id
    self.user_id = user_id

    # Any falsy argument (None or an empty container) gets a fresh default.
    if not transforms:
        transforms = TransformList()
    self.transforms = transforms

    if not labels:
        labels = LabelSequence()
    self._labels = labels

    # Starts out empty; nothing is populated here.
    self._label_by_name: Dict[str, Label] = {}
def transform_data_view(
    self,
    data_view_id: DataViewId,
    add_transforms: Optional[List[Transform]] = None,
    del_transforms: Optional[List[Transform]] = None,
) -> DataView:
    """Derive a DataView from an existing one by removing and adding transforms.

    Deletions are applied before additions. If a DataView with the same
    dataset and resulting transforms is already registered under its cache
    serialization, that view is returned instead of creating a new one.

    Args:
        data_view_id: Id of the DataView to start from.
        add_transforms: Transforms to add; ``None`` means none.
        del_transforms: Transforms to remove; ``None`` means none.

    Returns:
        The cached DataView matching the result, or a newly created one
        whose parent is ``data_view_id``.

    Raises:
        ValueError: If no DataView exists for ``data_view_id``.
    """
    data_view = self.by_id(data_view_id)
    if data_view is None:
        raise ValueError(f"Could not find DataView for id {data_view_id}")

    updated_transforms = TransformList(data_view.transforms)
    updated_labels = LabelSequence(data_view.labels)

    # Deletions run first so that additions are applied to the pruned state.
    for transforms, apply_change in [
        (del_transforms or [], self._delete_transform_from_data_view),
        (add_transforms or [], self._add_transform_to_data_view),
    ]:
        for transform in transforms:
            updated_transforms, updated_labels = apply_change(
                transform,
                updated_transforms,
                updated_labels,
                data_view,
            )

    # see if this DataView already exists
    serialization = self._serialize_for_cache(
        data_view.dataset_id,
        updated_transforms,
    )
    existing_id = self._data_view_id_by_serialization.get(serialization)
    # Compare against None rather than relying on truthiness: a falsy id
    # (for example 0) is still a valid cache hit and must not trigger the
    # creation of a duplicate DataView.
    if existing_id is not None:
        log.info(f"using cached DataView {existing_id}")
        return self.by_id(existing_id)

    log.info("saving new DataView")
    return self.create(
        parent=data_view_id,
        user=data_view.user_id,
        dataset=data_view.dataset_id,
        labels=updated_labels,
        transforms=updated_transforms,
    )
def from_dict(cls, d: Dict) -> Query:
    """Build a Query from its dictionary form.

    Each entry under ``KEY_TRANSFORMS`` names a transform class
    (``KEY_CLASS_NAME``) and the keyword arguments to construct it with
    (``KEY_ARGS``). Entries without a class name are logged and skipped.

    Args:
        d: Dictionary form of the query; a missing ``KEY_TRANSFORMS``
            entry is treated as an empty list.

    Returns:
        A Query holding the constructed transforms in input order.
    """
    transform_dicts = d.get(cls.KEY_TRANSFORMS, [])
    transforms: TransformList = TransformList()
    # The loop variable gets its own name so it no longer shadows the ``d``
    # parameter, and ``class_name`` is no longer annotated as ``List[Dict]``:
    # it is the name handed to ``transform_manager.transform_by_name``.
    for transform_dict in transform_dicts:
        class_name = transform_dict.get(cls.KEY_CLASS_NAME)
        args = transform_dict.get(cls.KEY_ARGS, {})
        if class_name is None:
            log.error(
                "Transform has no class name - skipping: {}".format(
                    transform_dict))
            continue
        transform_cls = transform_manager.transform_by_name(class_name)
        transforms.append(transform_cls(**args))
    return Query(transforms=transforms)
def create(
    self,
    parent: Optional[Union[DataView, DataViewId]],
    user: Union[User, UserId],
    dataset: Union[Dataset, DatasetId],
    labels: LabelSequence,
    transforms: Optional[TransformList] = None,
) -> DataView:
    """Create, register and save a new DataView.

    ``parent``, ``user`` and ``dataset`` may each be given as an object
    exposing an ``id`` attribute or as the id itself.

    Args:
        parent: Parent DataView or its id (may be ``None``).
        user: Owning user or its id.
        dataset: Dataset or its id.
        labels: Labels for the new view.
        transforms: Transforms for the new view; a falsy value is replaced
            by a new empty ``TransformList``.

    Returns:
        The newly created DataView.
    """
    log.debug("DataViewHandler.create")

    # Anything without an ``id`` attribute is taken to be the id already.
    parent_id = getattr(parent, "id", parent)
    dataset_id = getattr(dataset, "id", dataset)
    user_id = getattr(user, "id", user)

    view_transforms = transforms if transforms else TransformList()

    new_view = DataView(
        data_view_id=DataViewId(self._next_id),
        parent_data_view_id=parent_id,
        dataset_id=dataset_id,
        user_id=user_id,
        labels=labels,
        transforms=view_transforms,
    )
    self._data_views.append(new_view)
    self._index_data_view(new_view)

    log.info("saving new DataView: %s", new_view.id)
    self.save()
    return new_view
def _serialize_for_cache(cls, dataset_id: DatasetId,
                         transforms: TransformList) -> str:
    """Return the JSON string used as a cache key for a dataset/transform pair.

    Args:
        dataset_id: Id of the dataset; emitted as the first JSON element.
        transforms: Transforms to serialize; any falsy value contributes
            an empty list.

    Returns:
        ``json.dumps`` of ``[dataset_id, serialized_transforms]``.
    """
    if transforms:
        payload = transforms.serialize()
    else:
        payload = []
    cache_key = [dataset_id, payload]
    return json.dumps(cache_key)