Example No. 1
    def update_datasets(self, filter=None):
        if filter is None:
            filter = self._filter

        # Only close the handle if this method opened it; a caller-supplied
        # file-like object stays open for the caller.
        close_file = True
        log.info("Updating datasets from file list: %s", self._input_source)
        if hasattr(self._input_source, 'read'):
            input_file = self._input_source
            close_file = False
        elif isinstance(self._input_source,
                        str) and self._input_source.startswith("gs://"):
            log.info("Using tensorflow for IO")
            from tensorflow.python.lib.io.file_io import FileIO
            input_file = FileIO(self._input_source, "r")
            log.info("Tensorflow reported size: %d", input_file.size())
        else:
            input_file = open(self._input_source)

        lines = input_file.readlines()
        for line in lines:
            fpath = line.strip()
            parts = fpath.split("/")
            file_name = parts[-1]
            # Skip any path whose basename does not match the file-name pattern.
            match = self._re.match(file_name)
            if not match:
                continue
            match_components = match.groupdict()
            dataset_path = self._prepend_path + fpath
            dataset_id = self.update_dataset(match_components=match_components,
                                             dataset_path=dataset_path)
            dataset = self.get_dataset_by_id(dataset_id)
            # The filter decides which datasets survive; rejected ones are
            # removed again right after being registered.
            if not filter(dataset_id, match_components, dataset):
                self.remove_dataset_by_id(dataset_id)
        if close_file:
            input_file.close()
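
For context, the loop above reduces each listed path to its basename, matches
it against self._re, and uses the named groups as match_components. A runnable
sketch of that per-line parsing step is below; the file-name pattern is an
illustrative assumption, since the examples never show the actual self._re.

    import re

    # Assumed pattern for illustration only -- the real self._re is not shown.
    pattern = re.compile(r"model-(?P<step>\d+)\.ckpt")

    for line in ["runs/exp1/model-1000.ckpt\n", "runs/exp1/notes.txt\n"]:
        fpath = line.strip()
        file_name = fpath.split("/")[-1]
        match = pattern.match(file_name)
        if not match:
            continue  # non-matching files are skipped, as in the loop above
        match_components = match.groupdict()
        # Prints: runs/exp1/model-1000.ckpt -> {'step': '1000'}
        print(fpath, "->", match_components)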
Example No. 2
    def update_datasets(self, filter=None):
        if filter is None:
            filter = self._filter

        log.info("Updating datasets from file list: %s", self._source_file)
        if self._source_file.startswith("gs://"):
            log.info("Using tensorflow for IO")
            from tensorflow.python.lib.io.file_io import FileIO
            input_file = FileIO(self._source_file, "r")
            log.info("Tensorflow reported size: %d", input_file.size())
        else:
            input_file = open(self._source_file)

        lines = input_file.readlines()
        for line in lines:
            fpath = line.strip()
            parts = fpath.split("/")
            file_name = parts[-1]
            match = self._re.match(file_name)
            if not match:
                continue
            match_components = match.groupdict()
            dataset_path = self._prepend_path + fpath
            dataset_id = self.update_dataset(match_components=match_components,
                                             dataset_path=dataset_path)
            dataset = self.get_dataset_by_id(dataset_id)
            if not filter(dataset_id, match_components, dataset):
                self.remove_dataset_by_id(dataset_id)
        input_file.close()
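
Note the difference in cleanup between the two examples: Example No. 1 tracks
a close_file flag so a caller-supplied file object is not closed, while
Example No. 2 always opens the file itself and closes it unconditionally
(leaking the handle if the loop raises). A context manager is the idiomatic
way to fold the input-source cases together; the sketch below is an
illustrative alternative, not code from either example.

    import contextlib
    import logging

    log = logging.getLogger(__name__)

    @contextlib.contextmanager
    def open_file_list(source):
        """Yield a readable handle for a file object, gs:// path, or local path."""
        if hasattr(source, "read"):
            # Caller-supplied file-like object: the caller keeps ownership.
            yield source
        elif isinstance(source, str) and source.startswith("gs://"):
            log.info("Using tensorflow for IO")
            # Imported lazily so TensorFlow is only needed for gs:// paths.
            from tensorflow.python.lib.io.file_io import FileIO
            f = FileIO(source, "r")
            try:
                yield f
            finally:
                f.close()
        else:
            with open(source) as f:
                yield f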