def test_multiple_cursors(self):
    # pylint: disable=protected-access
    """Interleaved epochs over two datasets must keep independent cursors,
    for both eager (list) and lazy (_LazyInstances) datasets."""

    def one_epoch(iterator, dataset):
        # Materialize a single epoch and return instances grouped per batch.
        batches = list(iterator._create_batches(dataset, shuffle=False))
        return [batch.instances for batch in batches]

    lazy_pair = (_LazyInstances(lambda: (i for i in self.instances)),
                 _LazyInstances(lambda: (i for i in self.instances)))
    eager_pair = (self.instances[:], self.instances[:])

    for dataset1, dataset2 in (eager_pair, lazy_pair):
        iterator = BasicIterator(batch_size=1, instances_per_epoch=2)
        iterator.index_with(self.vocab)
        first_epoch = [[self.instances[0]], [self.instances[1]]]
        second_epoch = [[self.instances[2]], [self.instances[3]]]
        # Alternate datasets; advancing one cursor must not disturb the other.
        assert one_epoch(iterator, dataset1) == first_epoch
        assert one_epoch(iterator, dataset2) == first_epoch
        assert one_epoch(iterator, dataset1) == second_epoch
        assert one_epoch(iterator, dataset2) == second_epoch
def test_multiple_cursors(self):
    # pylint: disable=protected-access
    """Two lazy datasets iterated alternately must each keep their own cursor."""
    dataset_a = _LazyInstances(lambda: (i for i in self.instance_iterable))
    dataset_b = _LazyInstances(lambda: (i for i in self.instance_iterable))

    iterator = LazyBasicIterator(batch_size=1, instances_per_epoch=2)
    iterator.index_with(self.vocab)

    def epoch_groups(dataset):
        # One epoch's worth of batches, as lists of instances.
        return [batch.instances
                for batch in iterator._create_batches(dataset, shuffle=False)]

    first = [[self.instances[0]], [self.instances[1]]]
    second = [[self.instances[2]], [self.instances[3]]]

    # Alternate between the datasets; each must advance independently.
    assert epoch_groups(dataset_a) == first
    assert epoch_groups(dataset_b) == first
    assert epoch_groups(dataset_a) == second
    assert epoch_groups(dataset_b) == second
def test_multiple_cursors(self):
    """Cursors into two datasets (eager or lazy) advance independently."""
    lazy1 = _LazyInstances(lambda: (i for i in self.instances))
    lazy2 = _LazyInstances(lambda: (i for i in self.instances))
    eager1 = list(self.instances)
    eager2 = list(self.instances)

    for pair in ((eager1, eager2), (lazy1, lazy2)):
        iterator = BasicIterator(batch_size=1, instances_per_epoch=2)
        iterator.index_with(self.vocab)
        expected_epochs = (
            [[self.instances[0]], [self.instances[1]]],
            [[self.instances[2]], [self.instances[3]]],
        )
        for expected in expected_epochs:
            # Run one epoch over each dataset in turn; both must see the
            # same slice, proving neither cursor disturbs the other.
            for dataset in pair:
                batches = list(iterator._create_batches(dataset, shuffle=False))
                assert [b.instances for b in batches] == expected
def read(self, *args, **kwargs) -> Iterable[Instance]:
    """
    Returns an ``Iterable`` containing all the instances
    in the specified dataset.

    If ``self.lazy`` is False, this calls ``self._read()``,
    ensures that the result is a list, then returns the resulting list.

    If ``self.lazy`` is True, this returns an object whose
    ``__iter__`` method calls ``self._read()`` each iteration.
    In this case your implementation of ``_read()`` must also be lazy
    (that is, not load all instances into memory at once), otherwise
    you will get a ``ConfigurationError``.

    In either case, the returned ``Iterable`` can be iterated
    over multiple times. It's unlikely you want to override this function,
    but if you do your result should likewise be repeatedly iterable.

    Raises
    ------
    ConfigurationError
        If eager reading produces zero instances.
    """
    lazy = getattr(self, 'lazy', None)
    if lazy is None:
        # getattr guard: subclasses that skip the superclass constructor
        # never set ``lazy`` at all.
        logger.warning(
            "DatasetReader.lazy is not set, "
            "did you forget to call the superclass constructor?")
    if lazy:
        # Defer reading: each __iter__ re-invokes _read with the same arguments.
        return _LazyInstances(lambda: iter(self._read(*args, **kwargs)))

    instances = self._read(*args, **kwargs)
    if not isinstance(instances, list):
        # Materialize generators with a progress bar.
        instances = [instance for instance in Tqdm.tqdm(instances)]
    if not instances:
        # BUG FIX: original message rendered as
        # "...args (...). and kwargs (...)Is the path correct?" — stray
        # period and missing space/sentence break.
        raise ConfigurationError(
            f"No instances were read from the given args ({args}) "
            f"and kwargs ({kwargs}). Is the path correct?")
    return instances
def read(self, file_path: str) -> Iterable[Instance]:
    """
    Returns an ``Iterable`` containing all the instances
    in the specified dataset.

    If ``self.lazy`` is False, this calls ``self._read()``,
    ensures that the result is a list, then returns the resulting list.

    If ``self.lazy`` is True, this returns an object whose
    ``__iter__`` method calls ``self._read()`` each iteration.
    In this case your implementation of ``_read()`` must also be lazy
    (that is, not load all instances into memory at once), otherwise
    you will get a ``ConfigurationError``.

    In either case, the returned ``Iterable`` can be iterated
    over multiple times. It's unlikely you want to override this function,
    but if you do your result should likewise be repeatedly iterable.
    """
    # Pre-load image features once; they are shared by every instance
    # read from this path. Assumes ``file_path`` is a directory that
    # contains ``imgs.tsv`` — TODO confirm against callers.
    img_file = os.path.join(file_path, 'imgs.tsv')
    imgid2img = self.load_feature(img_file)

    lazy = getattr(self, 'lazy', None)
    if lazy is None:
        logger.warning("DatasetReader.lazy is not set, "
                       "did you forget to call the superclass constructor?")

    if self._cache_directory:
        cache_file = self._get_cache_location_for_file_path(file_path)
    else:
        cache_file = None

    if lazy:
        return _LazyInstances(lambda: self._read(file_path, imgid2img),
                              cache_file,
                              self.deserialize_instance,
                              self.serialize_instance)

    # First we read the instances, either from a cache or from the original file.
    if cache_file and os.path.exists(cache_file):
        instances = self._instances_from_cache_file(cache_file)
    else:
        # BUG FIX: the lazy branch calls ``self._read(file_path, imgid2img)``
        # but the eager branch previously dropped ``imgid2img``, discarding
        # the image features it just loaded.
        instances = self._read(file_path, imgid2img)

    # Then some validation.
    if not isinstance(instances, list):
        instances = [instance for instance in Tqdm.tqdm(instances)]
    if not instances:
        raise ConfigurationError("No instances were read from the given filepath {}. "
                                 "Is the path correct?".format(file_path))

    # And finally we write to the cache if we need to.
    if cache_file and not os.path.exists(cache_file):
        logger.info(f"Caching instances to {cache_file}")
        self._instances_to_cache_file(cache_file, instances)
    return instances
def read(self, file_path: str) -> Iterable[Instance]:
    """Return a lazily re-iterable dataset; each iteration replays one epoch.

    Note: ``file_path`` is not used here — the data source is whatever
    ``self._one_epoch`` reads.
    """
    epoch_factory = self._one_epoch
    return _LazyInstances(epoch_factory)
def read(self, file_path: str) -> Iterable[Instance]:
    """
    Returns an ``Iterable`` containing all the instances
    in the specified dataset.

    If ``self.lazy`` is False, this calls ``self._read()``,
    ensures that the result is a list, then returns the resulting list.

    If ``self.lazy`` is True, this returns an object whose
    ``__iter__`` method calls ``self._read()`` each iteration.
    In this case your implementation of ``_read()`` must also be lazy
    (that is, not load all instances into memory at once), otherwise
    you will get a ``ConfigurationError``.

    In either case, the returned ``Iterable`` can be iterated
    over multiple times. It's unlikely you want to override this function,
    but if you do your result should likewise be repeatedly iterable.
    """
    lazy = getattr(self, 'lazy', None)
    if lazy is None:
        logger.warning(
            "DatasetReader.lazy is not set, "
            "did you forget to call the superclass constructor?")
    if lazy:
        # Lazy mode bypasses the on-disk cache entirely.
        return _LazyInstances(lambda: iter(self._read(file_path)))

    if self.cache_path is not None:
        # Key the cache on a hash derived from the file path / reader config.
        hash_ = self.get_hash(file_path)
        pathlib.Path(self.cache_path).mkdir(parents=True, exist_ok=True)
        cache_file = os.path.join(self.cache_path, hash_ + '.cache')
        if not os.path.exists(cache_file) or self.overwrite_cache:
            instances = self._read(file_path)
            if not isinstance(instances, list):
                instances = [instance for instance in Tqdm.tqdm(instances)]
            if not instances:
                # BUG FIX: this string literal was broken across a physical
                # line in the source (syntax-invalid); reconstructed to match
                # the identical message in the uncached branch below.
                raise ConfigurationError(
                    "No instances were read from the given filepath {}. "
                    "Is the path correct?".format(file_path))
            logger.info(f'caching instances to file: {cache_file}')
            with open(cache_file, 'wb') as cache:
                dill.dump(instances, cache)
        else:
            logger.info(f'Reading instances from cache file: {cache_file}')
            # NOTE(review): dill.load executes arbitrary code from the cache
            # file — acceptable only because the cache is written by this
            # reader itself; never point cache_path at untrusted data.
            with open(cache_file, 'rb') as f_in:
                instances = dill.load(f_in)
    else:
        instances = self._read(file_path)
        if not isinstance(instances, list):
            instances = [instance for instance in Tqdm.tqdm(instances)]
        if not instances:
            raise ConfigurationError(
                "No instances were read from the given filepath {}. "
                "Is the path correct?".format(file_path))
    return instances