def filenames(self, filename_list):
    # Note: assumes module-level imports of os, numpy as np, and pyemma's config object.
    if isinstance(filename_list, str):
        filename_list = [filename_list]

    uniq = set(filename_list)
    if len(uniq) != len(filename_list):
        self.logger.warning("duplicate files/arrays detected")
        filename_list = list(uniq)

    from pyemma.coordinates.data.data_in_memory import DataInMemory
    if self._is_reader:
        if isinstance(self, DataInMemory):
            import warnings
            warnings.warn('filenames are not being used for DataInMemory')
            return

        self._ntraj = len(filename_list)
        if self._ntraj == 0:
            raise ValueError("empty file list")

        # validate files
        for f in filename_list:
            try:
                stat = os.stat(f)
            except EnvironmentError:
                self.logger.exception('Error during access of file "%s"' % f)
                raise ValueError('could not read file "%s"' % f)

            if not os.path.isfile(f):  # can be true for symlinks to directories
                raise ValueError('"%s" is not a valid file' % f)

            if stat.st_size == 0:
                raise ValueError('file "%s" is empty' % f)

        # number of trajectories/data sets
        self._filenames = filename_list

        # determine len and dim via cache lookup
        lengths = []
        offsets = []
        ndims = []
        # avoid cyclic imports
        from pyemma.coordinates.data.util.traj_info_cache import TrajectoryInfoCache
        from pyemma._base.progress import ProgressReporter

        pg = ProgressReporter()
        pg.register(len(filename_list), 'Obtaining file info')
        with pg.context():
            for filename in filename_list:
                if config.use_trajectory_lengths_cache:
                    info = TrajectoryInfoCache.instance()[filename, self]
                else:
                    info = self._get_traj_info(filename)
                # nested data set support
                if hasattr(info, 'children'):
                    lengths.append(info.length)
                    offsets.append(info.offsets)
                    ndims.append(info.ndim)
                    for c in info.children:
                        lengths.append(c.length)
                        offsets.append(c.offsets)
                        ndims.append(c.ndim)
                else:
                    lengths.append(info.length)
                    offsets.append(info.offsets)
                    ndims.append(info.ndim)
                if len(filename_list) > 3:
                    pg.update(1)

        # ensure all trajs have the same dimension
        if not np.unique(ndims).size == 1:
            # group files by their dimensions to give the user an indicator
            ndims = np.array(ndims)
            filename_list = np.asarray(filename_list)
            sort_inds = np.argsort(ndims)
            import itertools, operator
            res = {}
            for dim, files in itertools.groupby(zip(ndims[sort_inds], filename_list[sort_inds]),
                                                operator.itemgetter(0)):
                res[dim] = list(f[1] for f in files)

            raise ValueError("Input data has different dimensions ({dims})!"
                             " Files grouped by dimensions: {groups}".format(dims=res.keys(),
                                                                             groups=res))

        self._ndim = ndims[0]
        self._lengths = lengths
        self._offsets = offsets
    else:
        # propagate this until we finally have a reader
        self.data_producer.filenames = filename_list
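For context, a minimal usage sketch of the setter's validation behavior when reached through a reader. pyemma.coordinates.source is real API, but the trajectory and topology file names below are hypothetical placeholders:

import pyemma.coordinates as coor

# hypothetical files; any MD format supported by pyemma works here
reader = coor.source(['traj0.xtc', 'traj1.xtc'], top='structure.pdb')

# a single string is wrapped into a one-element list by the setter
reader.filenames = 'traj0.xtc'

# missing, non-file, or empty paths raise ValueError during validation
try:
    reader.filenames = ['does_not_exist.xtc']
except ValueError as e:
    print(e)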
@classmethod
def setUpClass(cls):
    # remember the current singleton so it can be restored after the tests
    cls.old_instance = TrajectoryInfoCache.instance()
    config.use_trajectory_lengths_cache = True
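Since setUpClass stashes the previous cache singleton, the matching teardown presumably restores it. A sketch, assuming the singleton is held in a _instance class attribute (not shown in the source):

@classmethod
def tearDownClass(cls):
    # restore the singleton saved in setUpClass; the original value of the
    # config flag is not captured in the source, so resetting it to False
    # is an assumption
    TrajectoryInfoCache._instance = cls.old_instance
    config.use_trajectory_lengths_cache = False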
def test_get_instance(self):
    # test for exceptions in singleton creation
    inst = TrajectoryInfoCache.instance()
    # accessing the property must not raise
    inst.current_db_version
def filenames(self, filename_list):
    # Note: assumes module-level imports of os, numpy as np, pyemma's config,
    # and string_types (py2/3 compatibility, e.g. from six).
    if isinstance(filename_list, string_types):
        filename_list = [filename_list]

    uniq = set(filename_list)
    if len(uniq) != len(filename_list):
        self.logger.warning("duplicate files/arrays detected")
        filename_list = list(uniq)

    from pyemma.coordinates.data.data_in_memory import DataInMemory
    if self._is_reader:
        if isinstance(self, DataInMemory):
            import warnings
            warnings.warn('filenames are not being used for DataInMemory')
            return

        self._ntraj = len(filename_list)
        if self._ntraj == 0:
            raise ValueError("empty file list")

        # validate files
        for f in filename_list:
            try:
                stat = os.stat(f)
            except EnvironmentError:
                self.logger.exception('Error during access of file "%s"' % f)
                raise ValueError('could not read file "%s"' % f)

            if not os.path.isfile(f):  # can be true for symlinks to directories
                raise ValueError('"%s" is not a valid file' % f)

            if stat.st_size == 0:
                raise ValueError('file "%s" is empty' % f)

        # number of trajectories/data sets
        self._filenames = filename_list

        # determine len and dim via cache lookup
        lengths = []
        offsets = []
        ndims = []
        # avoid cyclic imports
        from pyemma.coordinates.data.util.traj_info_cache import TrajectoryInfoCache

        if len(filename_list) > 3:
            self._progress_register(len(filename_list), 'Obtaining file info')

        for filename in filename_list:
            if config['use_trajectory_lengths_cache'] == 'True':
                info = TrajectoryInfoCache.instance()[filename, self]
            else:
                info = self._get_traj_info(filename)
            lengths.append(info.length)
            offsets.append(info.offsets)
            ndims.append(info.ndim)
            if len(filename_list) > 3:
                self._progress_update(1)

        # ensure all trajs have the same dimension
        if not np.unique(ndims).size == 1:
            raise ValueError("input data has different dimensions!"
                             " Dimensions are = %s" % list(zip(filename_list, ndims)))

        self._ndim = ndims[0]
        self._lengths = lengths
        self._offsets = offsets
    else:
        # propagate this until we finally have a reader
        self.data_producer.filenames = filename_list