def __iter__(self) -> Iterator[DataEntry]:
    """Yield processed data entries for this worker's segment of the list."""
    source_name = "list_data"
    # The dataset is split into roughly equally sized segments with a lower
    # and upper bound; each worker is assigned exactly one segment.
    bounds = util.get_bounds_for_mp_data_loading(len(self))
    for row_number, raw_entry in enumerate(self.list_data):
        if bounds.lower <= row_number < bounds.upper:
            # Copy before processing so the stored list is never mutated.
            entry = self.process(raw_entry.copy())
            entry["source"] = SourceContext(source=source_name, row=row_number)
            yield entry
def __iter__(self):
    """Yield parsed JSON-lines entries for this worker's segment.

    The dataset is split into roughly equally sized segments with a lower
    and upper bound; each worker is assigned one segment. When
    ``self.cache`` is enabled, parsed lines are stored in
    ``self._data_cache`` on the first pass and replayed on later passes.

    Raises
    ------
    GluonTSDataError
        If a line cannot be parsed as JSON.
    """
    bounds = get_bounds_for_mp_data_loading(len(self))
    # Serve from the cache only when caching is on AND it is already
    # populated.  (The original condition `not cache or (cache and not
    # _data_cache)` contained a redundant second `cache` test.)
    if self.cache and self._data_cache:
        yield from self._data_cache
        return
    with self.open(self.path) as jsonl_file:
        for line_number, raw in enumerate(jsonl_file):
            if not bounds.lower <= line_number < bounds.upper:
                continue
            span = Span(path=self.path, line=line_number)
            # Keep the try body minimal: only json.loads should be guarded,
            # so a ValueError from elsewhere (e.g. thrown into the
            # generator by a consumer of the yield) is not misreported as
            # a JSON parse failure.
            try:
                parsed = json.loads(raw)
            except ValueError as err:
                # Chain the original error so the parse failure's cause
                # remains visible in the traceback.
                raise GluonTSDataError(
                    f"Could not read json line {line_number}, {raw}"
                ) from err
            parsed_line = Line(parsed, span=span)
            if self.cache:
                self._data_cache.append(parsed_line)
            yield parsed_line