def update_datasets(self, filter=None):
    """Register datasets listed in ``self._input_source`` and drop filtered ones.

    ``self._input_source`` may be an object with a ``read`` attribute
    (used as-is and left open for the caller), a ``gs://`` path string
    (opened through TensorFlow's ``FileIO``), or anything else accepted
    by the builtin ``open``.

    Each line is stripped and its last ``/``-separated component is
    matched against ``self._re``; lines that do not match are skipped.
    For matching lines, ``self.update_dataset`` is called with the
    match's ``groupdict()`` and ``self._prepend_path + <line>``. The
    dataset is then removed again through ``self.remove_dataset_by_id``
    if the filter returns a falsy value.

    Args:
        filter: Callable invoked as ``filter(dataset_id, match_components,
            dataset)``. Defaults to ``self._filter`` when ``None``.
    """
    if filter is None:
        filter = self._filter

    source = self._input_source
    log.info("Updateing datasets from file list: %s", source)

    # Only close handles opened here; a caller-supplied file object stays
    # under the caller's control.
    owns_file = True
    if hasattr(source, 'read'):
        input_file = source
        owns_file = False
    elif isinstance(source, str) and source.startswith("gs://"):
        log.info("Using tensorflow for IO")
        # Imported lazily so TensorFlow is only required for gs:// sources.
        from tensorflow.python.lib.io.file_io import FileIO
        input_file = FileIO(source, "r")
        log.info("Tensorflow reported size: %d", input_file.size())
    else:
        input_file = open(source)

    # Read everything up front and release the handle right away, so an
    # exception raised while processing a line cannot leak the open file.
    try:
        lines = input_file.readlines()
    finally:
        if owns_file:
            input_file.close()

    for line in lines:
        fpath = line.strip()
        file_name = fpath.rsplit("/", 1)[-1]
        match = self._re.match(file_name)
        if not match:
            continue

        match_components = match.groupdict()
        dataset_id = self.update_dataset(
            match_components=match_components,
            dataset_path=self._prepend_path + fpath,
        )
        dataset = self.get_dataset_by_id(dataset_id)
        if not filter(dataset_id, match_components, dataset):
            self.remove_dataset_by_id(dataset_id)
def update_datasets(self, filter=None):
    """Register datasets listed in ``self._source_file`` and drop filtered ones.

    ``self._source_file`` is a path string. Paths starting with ``gs://``
    are opened through TensorFlow's ``FileIO``; anything else is opened
    with the builtin ``open``.

    Each line is stripped and its last ``/``-separated component is
    matched against ``self._re``; lines that do not match are skipped.
    For matching lines, ``self.update_dataset`` is called with the
    match's ``groupdict()`` and ``self._prepend_path + <line>``. The
    dataset is then removed again through ``self.remove_dataset_by_id``
    if the filter returns a falsy value.

    Args:
        filter: Callable invoked as ``filter(dataset_id, match_components,
            dataset)``. Defaults to ``self._filter`` when ``None``.
    """
    if filter is None:
        filter = self._filter

    source = self._source_file
    log.info("Updateing datasets from file list: %s", source)

    if source.startswith("gs://"):
        log.info("Using tensorflow for IO")
        # Imported lazily so TensorFlow is only required for gs:// sources.
        from tensorflow.python.lib.io.file_io import FileIO
        input_file = FileIO(source, "r")
        log.info("Tensorflow reported size: %d", input_file.size())
    else:
        input_file = open(source)

    # Read everything up front and release the handle right away, so an
    # exception raised while processing a line cannot leak the open file.
    try:
        lines = input_file.readlines()
    finally:
        input_file.close()

    for line in lines:
        fpath = line.strip()
        file_name = fpath.rsplit("/", 1)[-1]
        match = self._re.match(file_name)
        if not match:
            continue

        match_components = match.groupdict()
        dataset_id = self.update_dataset(
            match_components=match_components,
            dataset_path=self._prepend_path + fpath,
        )
        dataset = self.get_dataset_by_id(dataset_id)
        if not filter(dataset_id, match_components, dataset):
            self.remove_dataset_by_id(dataset_id)