def test_xpathfilter_can_be_applied(self):
    """Check that selecting with an XPath filter keeps only matching items.

    The fixture extractor is built with ``n=4``; the expression
    ``/item[id > 1]`` is expected to leave exactly two items.
    """
    source = self.TestExtractor('', n=4)

    selected = source.select(XPathDatasetFilter('/item[id > 1]'))

    self.assertEqual(2, len(selected))
def set_filter(self, value=None):
    """Store a filter expression in the config, or drop the stored one.

    A falsy ``value`` (``None``, empty string) removes the ``filter`` entry
    from the config. Any other value is first checked by constructing an
    ``XPathDatasetFilter`` from it and is then saved as ``config.filter``.
    """
    if value:
        # check filter: the instance is discarded, it is built only so
        # that a bad expression fails before the config is changed
        XPathDatasetFilter(value)
        self.config.filter = value
        return

    self.config.remove('filter')
def test_item_filter_can_be_applied(self):
    """Check that wrapping an extractor in an XPath filter drops items.

    A local extractor yields four ``train`` items with ids 0..3; the
    expression ``/item[id > 1]`` is expected to leave exactly two of them.
    """
    class TestExtractor(Extractor):
        def __iter__(self):
            yield from (
                DatasetItem(id=index, subset='train') for index in range(4)
            )

    source = TestExtractor()

    selected = XPathDatasetFilter(source, '/item[id > 1]')

    self.assertEqual(2, len(selected))
def export(self, save_dir, output_format, filter_expr=None,
        **converter_kwargs):
    """Convert this dataset to ``output_format`` and write it to ``save_dir``.

    Args:
        save_dir: output directory; made absolute and created if missing.
        output_format: converter name passed to ``env.make_converter``.
        filter_expr: optional XPath expression; when truthy, only the
            items selected by it are handed to the converter.
        **converter_kwargs: forwarded to ``env.make_converter``.
    """
    target_dir = osp.abspath(save_dir)
    os.makedirs(target_dir, exist_ok=True)

    # The directory is created before the filter is built, so a bad
    # expression still leaves the (empty) output directory behind.
    exported = self.select(XPathDatasetFilter(filter_expr)) \
        if filter_expr else self

    run_conversion = self.env.make_converter(output_format,
        **converter_kwargs)
    run_conversion(exported, target_dir)
def extract(self, save_dir, filter_expr=None):
    """Save a project built from this object's config into ``save_dir``.

    When ``filter_expr`` is truthy it is first checked by constructing an
    ``XPathDatasetFilter`` from it, then set as the new project's filter
    before saving.
    """
    derived = Project(self.config)

    if filter_expr:
        # Built only for the check; the filter instance itself is unused.
        XPathDatasetFilter(filter_expr)
        derived.set_filter(filter_expr)

    derived.save(save_dir)
def __init__(self, project):
    """Build the merged dataset view of ``project``.

    Steps, in order:
      1. build the optional item filter from ``config.filter``;
      2. create an extractor for every configured source, plus one for the
         project's own dataset directory if that directory exists;
      3. merge the categories of all sources (own dataset applied last);
      4. merge the items of all sources per subset, then override them
         with the project's own items;
      5. keep only the subsets listed in ``config.subsets`` (if any).

    Args:
        project: the project this dataset belongs to; stored as
            ``self._project``.
    """
    super().__init__()

    self._project = project
    config = self.config
    env = self.env

    dataset_filter = None
    if config.filter:
        dataset_filter = XPathDatasetFilter(config.filter)
    self._filter = dataset_filter

    sources = {}
    for s_name, source in config.sources.items():
        s_format = source.format
        if not s_format:
            s_format = env.PROJECT_EXTRACTOR_NAME
        options = {}
        options.update(source.options)

        url = source.url
        if not source.url:
            # Sources without an explicit url live inside the project.
            url = osp.join(config.project_dir, config.sources_dir, s_name)
        sources[s_name] = env.make_extractor(s_format, url, **options)
    self._sources = sources

    own_source = None
    own_source_dir = osp.join(config.project_dir, config.dataset_dir)
    if osp.isdir(own_source_dir):
        own_source = env.make_extractor(DEFAULT_FORMAT, own_source_dir)

    # merge categories
    # TODO: implement properly with merging and annotations remapping
    categories = {}
    for source in self._sources.values():
        categories.update(source.categories())
    for source in self._sources.values():
        for cat_type, source_cat in source.categories().items():
            # All sources must agree on every category type they define.
            assert categories[cat_type] == source_cat
    if own_source is not None and len(own_source) != 0:
        categories.update(own_source.categories())
    self._categories = categories

    # merge items
    subsets = defaultdict(lambda: Subset(self))
    for source_name, source in self._sources.items():
        for item in source:
            if dataset_filter and not dataset_filter(item):
                continue

            existing_item = subsets[item.subset].items.get(item.id)
            if existing_item is not None:
                image = None
                if existing_item.has_image:
                    # TODO: think of image comparison
                    # `existing_item` is rebound on every iteration, so
                    # the current object is bound as a default argument.
                    # A plain closure would read the variable only when
                    # called and return the image of whichever item was
                    # looked up last.
                    image = lambda src=existing_item: src.image

                path = existing_item.path
                if item.path != path:
                    path = None
                item = DatasetItemWrapper(item=item, path=path,
                    image=image, annotations=self._merge_anno(
                        existing_item.annotations, item.annotations))
            else:
                s_config = config.sources[source_name]
                if s_config and \
                        s_config.format != self.env.PROJECT_EXTRACTOR_NAME:
                    # NOTE: consider imported sources as our own dataset
                    path = None
                else:
                    path = item.path
                    if path is None:
                        path = []
                    path = [source_name] + path
                item = DatasetItemWrapper(item=item, path=path,
                    annotations=item.annotations)

            subsets[item.subset].items[item.id] = item

    # override with our items, fallback to existing images
    if own_source is not None:
        for item in own_source:
            if dataset_filter and not dataset_filter(item):
                continue

            if not item.has_image:
                existing_item = subsets[item.subset].items.get(item.id)
                if existing_item is not None:
                    image = None
                    if existing_item.has_image:
                        # TODO: think of image comparison
                        # Bind the current item now (see the note in the
                        # source-merging loop above).
                        image = lambda src=existing_item: src.image
                    item = DatasetItemWrapper(item=item, path=None,
                        annotations=item.annotations, image=image)

            subsets[item.subset].items[item.id] = item

    # TODO: implement subset remapping when needed
    subsets_filter = config.subsets
    if len(subsets_filter) != 0:
        subsets = {
            k: v for k, v in subsets.items() if k in subsets_filter
        }
    self._subsets = dict(subsets)

    # Length is not computed here; presumably filled in lazily elsewhere.
    self._length = None