示例#1
0
    def test_xpathfilter_can_be_applied(self):
        """Check that selecting with an XPath filter keeps only matching items.

        An extractor built with ``n=4`` is filtered by ``/item[id > 1]``
        and the selection is expected to have length 2.
        """
        source = self.TestExtractor('', n=4)
        id_filter = XPathDatasetFilter('/item[id > 1]')

        selected = source.select(id_filter)

        self.assertEqual(2, len(selected))
示例#2
0
 def set_filter(self, value=None):
     """Store or clear the filter expression in the config.

     A falsy ``value`` removes the 'filter' entry from ``self.config``.
     Otherwise the expression is first passed to ``XPathDatasetFilter``
     (so that construction errors surface before anything is stored)
     and then saved as ``self.config.filter``.
     """
     if value:
         # Constructed only as a validity check; the instance is discarded.
         XPathDatasetFilter(value)
         self.config.filter = value
         return

     self.config.remove('filter')
示例#3
0
    def test_item_filter_can_be_applied(self):
        """Check that wrapping an extractor in an XPath filter drops items.

        The local extractor yields items with ids 0..3 in the 'train'
        subset; after filtering with ``/item[id > 1]`` the result is
        expected to have length 2.
        """
        class TestExtractor(Extractor):
            def __iter__(self):
                yield from (
                    DatasetItem(id=idx, subset='train') for idx in range(4)
                )

        source = TestExtractor()

        selected = XPathDatasetFilter(source, '/item[id > 1]')

        self.assertEqual(2, len(selected))
示例#4
0
    def export(self, save_dir, output_format,
            filter_expr=None, **converter_kwargs):
        """Convert this dataset to ``output_format`` inside ``save_dir``.

        The target directory is made absolute and created if missing.
        When ``filter_expr`` is truthy, only the items selected by the
        corresponding ``XPathDatasetFilter`` are passed to the converter;
        otherwise the dataset itself is passed. Extra keyword arguments
        are forwarded to ``self.env.make_converter``.
        """
        target_dir = osp.abspath(save_dir)
        os.makedirs(target_dir, exist_ok=True)

        exported = self
        if filter_expr:
            xpath_filter = XPathDatasetFilter(filter_expr)
            exported = exported.select(xpath_filter)

        convert = self.env.make_converter(output_format, **converter_kwargs)
        convert(exported, target_dir)
示例#5
0
 def extract(self, save_dir, filter_expr=None):
     """Save a project built from ``self.config`` into ``save_dir``.

     When ``filter_expr`` is truthy it is first passed to
     ``XPathDatasetFilter`` (the instance is discarded, so this only
     surfaces construction errors) and then set as the filter of the
     new project before saving.
     """
     extracted = Project(self.config)
     if filter_expr:
         # Fail early on an expression the filter class rejects.
         XPathDatasetFilter(filter_expr)
         extracted.set_filter(filter_expr)
     extracted.save(save_dir)
示例#6
0
    def __init__(self, project):
        """Build the merged dataset of a project.

        Creates an extractor for every source listed in the config, adds
        the project's own dataset directory when it exists, merges the
        categories and items of all of them, and applies the configured
        item filter and subset filter.

        Args:
            project: the owning project; stored as ``self._project``.
                ``self.config`` and ``self.env`` are read right after it
                is stored (presumably they resolve through the project --
                confirm against the class definition).

        Raises:
            AssertionError: if two sources report different categories
                for the same category type.
        """
        super().__init__()

        self._project = project
        config = self.config
        env = self.env

        # Optional item filter built from the config's filter expression.
        dataset_filter = None
        if config.filter:
            dataset_filter = XPathDatasetFilter(config.filter)
        self._filter = dataset_filter

        # One extractor per configured source.
        sources = {}
        for s_name, source in config.sources.items():
            s_format = source.format
            if not s_format:
                s_format = env.PROJECT_EXTRACTOR_NAME
            options = {}
            options.update(source.options)

            url = source.url
            if not source.url:
                # Sources without a URL live under the project's sources dir.
                url = osp.join(config.project_dir, config.sources_dir, s_name)
            sources[s_name] = env.make_extractor(s_format,
                url, **options)
        self._sources = sources

        # The project's own dataset, if its directory exists.
        own_source = None
        own_source_dir = osp.join(config.project_dir, config.dataset_dir)
        if osp.isdir(own_source_dir):
            own_source = env.make_extractor(DEFAULT_FORMAT, own_source_dir)

        # merge categories
        # TODO: implement properly with merging and annotations remapping
        categories = {}
        for source in self._sources.values():
            categories.update(source.categories())
        for source in self._sources.values():
            for cat_type, source_cat in source.categories().items():
                assert categories[cat_type] == source_cat
        if own_source is not None and len(own_source) != 0:
            categories.update(own_source.categories())
        self._categories = categories

        # merge items
        subsets = defaultdict(lambda: Subset(self))
        for source_name, source in self._sources.items():
            for item in source:
                if dataset_filter and not dataset_filter(item):
                    continue

                existing_item = subsets[item.subset].items.get(item.id)
                if existing_item is not None:
                    image = None
                    if existing_item.has_image:
                        # TODO: think of image comparison
                        # NOTE: bind the item as a default argument; a plain
                        # closure would see whatever `existing_item` holds
                        # when the loader is finally called, not this item.
                        image = lambda src=existing_item: src.image

                    # Keep the path only when both items agree on it.
                    path = existing_item.path
                    if item.path != path:
                        path = None
                    item = DatasetItemWrapper(item=item, path=path,
                        image=image, annotations=self._merge_anno(
                            existing_item.annotations, item.annotations))
                else:
                    s_config = config.sources[source_name]
                    if s_config and \
                            s_config.format != self.env.PROJECT_EXTRACTOR_NAME:
                        # NOTE: consider imported sources as our own dataset
                        path = None
                    else:
                        # Prefix the item path with the source name.
                        path = item.path
                        if path is None:
                            path = []
                        path = [source_name] + path
                    item = DatasetItemWrapper(item=item, path=path,
                        annotations=item.annotations)

                subsets[item.subset].items[item.id] = item

        # override with our items, fallback to existing images
        if own_source is not None:
            for item in own_source:
                if dataset_filter and not dataset_filter(item):
                    continue

                if not item.has_image:
                    existing_item = subsets[item.subset].items.get(item.id)
                    if existing_item is not None:
                        image = None
                        if existing_item.has_image:
                            # TODO: think of image comparison
                            # NOTE: default-argument binding, same reason as
                            # in the sources loop (avoid late binding).
                            image = lambda src=existing_item: src.image
                        item = DatasetItemWrapper(item=item, path=None,
                            annotations=item.annotations, image=image)

                subsets[item.subset].items[item.id] = item

        # TODO: implement subset remapping when needed
        subsets_filter = config.subsets
        if len(subsets_filter) != 0:
            subsets = { k: v for k, v in subsets.items() if k in subsets_filter}
        self._subsets = dict(subsets)

        # Length is not computed here; it starts out unset.
        self._length = None