def create_file_reader(input_files, topology, featurizer, chunk_size=1000, **kw):
    r"""
    Create a (possibly featurized) file reader for the given input files.

    The reader class is selected from the file extension of the first input
    file; all input files must end with that same extension.

    Parameters
    ----------
    input_files : str or list/tuple of str
        A single input file or a list of input files. A non-empty list that
        contains nested lists/tuples is treated as a fragmented trajectory.
    topology
        A topology file. If given, the featurizer argument can be None.
    featurizer
        A featurizer. If given, the topology file can be None.
    chunk_size : optional
        The chunk size with which the corresponding reader gets initialized.
    **kw
        Extra keyword arguments; forwarded only to the CSV reader.

    Returns
    -------
    reader
        A FragmentedTrajectoryReader, FeatureReader, NumPyFileReader or
        PyCSVReader, depending on the input.

    Raises
    ------
    ValueError
        If the input is neither a string nor a list of strings, is an empty
        list, mixes strings with other items, mixes file extensions, names
        files that do not exist, or names MD files while neither a
        featurizer nor a topology is given.
    """
    from pyemma.coordinates.data.numpy_filereader import NumPyFileReader
    from pyemma.coordinates.data.py_csv_reader import PyCSVReader
    from pyemma.coordinates.data import FeatureReader
    from pyemma.coordinates.data.fragmented_trajectory_reader import FragmentedTrajectoryReader

    # fragmented trajectories
    if (isinstance(input_files, (list, tuple)) and len(input_files) > 0
            and any(isinstance(item, (list, tuple)) for item in input_files)):
        return FragmentedTrajectoryReader(input_files, topology, chunk_size, featurizer)

    # normal trajectories
    is_single = isinstance(input_files, string_types)
    is_sequence = isinstance(input_files, (list, tuple))
    if not is_single and not (
            is_sequence
            and (any(isinstance(item, string_types) for item in input_files)
                 or len(input_files) == 0)):
        # Wrap in a 1-tuple so that tuple inputs are formatted as one value
        # instead of being unpacked as several format arguments.
        raise ValueError("Input \"%s\" was no string or list of strings." % (input_files,))

    # check: if single string create a one-element list
    if is_single:
        input_list = [input_files]
    elif len(input_files) == 0:
        raise ValueError("The passed input list should not be empty.")
    elif not all(isinstance(item, string_types) for item in input_files):
        raise ValueError(
            "The passed list did not exclusively contain strings or was a list of lists "
            "(fragmented trajectory).")
    else:
        input_list = input_files

    # TODO: this does not handle suffixes like .xyz.gz (rare)
    _, suffix = os.path.splitext(input_list[0])

    # check: do all files have the same file type? If not: raise ValueError.
    if not all(item.endswith(suffix) for item in input_list):
        raise ValueError(
            "Not all elements in the input list were of the type %s!" % suffix)

    # do all the files exist? If not: raise ValueError listing every offender
    missing = [item for item in input_list if not os.path.isfile(item)]
    if missing:
        err_msg = "\n".join(
            "File %s did not exist or was no file" % item for item in missing)
        raise ValueError(
            "Some of the given input files were directories"
            " or did not exist:\n%s" % err_msg)

    from mdtraj.formats.registry import FormatRegistry

    # CASE 1.1: file types are MD files
    if suffix in FormatRegistry.loaders.keys():
        # MD files can only be read with a featurizer or a topology file.
        if not featurizer and not topology:
            raise ValueError(
                "The input files were MD files which makes it mandatory to have either a "
                "featurizer or a topology file.")
        return FeatureReader(input_list, featurizer=featurizer, topologyfile=topology,
                             chunksize=chunk_size)

    if suffix in ('.npy', '.npz'):
        return NumPyFileReader(input_list, chunksize=chunk_size)

    # otherwise we assume that given files are ascii tabulated data
    return PyCSVReader(input_list, chunksize=chunk_size, **kw)
def _get_reader_instance(self, instance_number):
    """Build the reader selected by ``instance_number``.

    ``0`` yields a ``DataInMemory`` wrapping ``self.data``; ``1`` yields a
    ``FeatureReader`` built from ``self.data_feature_reader`` with
    ``self.topfile`` as topology file. Any other value yields ``None``.
    """
    if instance_number == 0:
        return DataInMemory(self.data)

    if instance_number == 1:
        return FeatureReader(self.data_feature_reader, topologyfile=self.topfile)

    # No reader is associated with any other instance number.
    return None
def create_file_reader(input_files, topology, featurizer, chunksize=None, **kw):
    r"""
    Create a (possibly featurized) file reader for the given input files.

    The reader class is selected from the file suffix of the first input
    file; all input files must share that suffix.

    Parameters
    ----------
    input_files : str, Path, or list/tuple of str/Path
        A single input file or a list of input files. A non-empty list that
        contains nested lists/tuples is treated as a fragmented trajectory.
    topology
        A topology file. If given, the featurizer argument can be None.
    featurizer
        A featurizer. If given, the topology file can be None.
    chunksize : optional
        The chunk size with which the corresponding reader gets initialized.
    **kw
        Extra keyword arguments; forwarded only to the HDF5 and CSV readers.

    Returns
    -------
    reader
        A FragmentedTrajectoryReader, H5Reader, FeatureReader,
        NumPyFileReader or PyCSVReader, depending on the input.

    Raises
    ------
    ValueError
        If the input is neither a path nor a list of paths, is an empty
        list, mixes paths with other items, mixes file suffixes, names files
        that do not exist, names MD files while neither a featurizer nor a
        topology is given, or names PDB files.
    """
    # Reader classes are imported in the branch that needs them, so input
    # validation errors are raised without loading every reader module.

    # fragmented trajectories
    if (isinstance(input_files, (list, tuple)) and len(input_files) > 0
            and any(isinstance(item, (list, tuple)) for item in input_files)):
        from pyemma.coordinates.data.fragmented_trajectory_reader import FragmentedTrajectoryReader
        return FragmentedTrajectoryReader(input_files, topology, chunksize, featurizer)

    # normal trajectories: normalize to a list of path-like items
    if isinstance(input_files, (Path, str)):
        input_list = [input_files]
    elif isinstance(input_files, (list, tuple)):
        if len(input_files) == 0:
            raise ValueError("The passed input list should not be empty.")
        if not any(isinstance(item, (Path, str)) for item in input_files):
            raise ValueError('Input "{}" was no string or list of strings.'.format(input_files))
        if not all(isinstance(item, (Path, str)) for item in input_files):
            raise ValueError("The passed list did not exclusively contain strings or was a list of lists "
                             "(fragmented trajectory).")
        input_list = input_files
    else:
        raise ValueError('Input "{}" was no string or list of strings.'.format(input_files))

    # convert to list of paths
    input_list = [Path(f) for f in input_list]

    # TODO: this does not handle suffixes like .xyz.gz (rare)
    suffix = input_list[0].suffix

    # check: do all files have the same file type? If not: raise ValueError.
    if not all(item.suffix == suffix for item in input_list):
        raise ValueError('Not all elements in the input list were of the type %s!' % suffix)

    # do all the files exist? If not: raise ValueError listing every offender
    missing = [item for item in input_list if not item.is_file()]
    if missing:
        err_msg = '\n'.join('File %s did not exist or was no file' % item for item in missing)
        raise ValueError('Some of the given input files were directories'
                         ' or did not exist:\n%s' % err_msg)

    featurizer_or_top_provided = featurizer is not None or topology is not None

    # we need to check for h5 first, because of mdtraj custom HDF5 traj format (which is deprecated).
    if suffix in ('.h5', '.hdf5') and not featurizer_or_top_provided:
        # Inspecting every file to tell the formats apart would be expensive
        # (and would re-open each file), so plain HDF5 is assumed whenever
        # neither a featurizer nor a topology is given.
        from pyemma.coordinates.data.h5_reader import H5Reader
        # NOTE(review): this passes the caller's original input_files, not the
        # normalized input_list -- confirm that this is intended.
        return H5Reader(filenames=input_files, chunk_size=chunksize, **kw)

    from pyemma.coordinates.data import FeatureReader

    # CASE 1.1: file types are MD files
    if FeatureReader.supports_format(suffix):
        # MD files can only be read with a featurizer or a topology file.
        if not featurizer_or_top_provided:
            raise ValueError('The input files were MD files which makes it mandatory to have either a '
                             'Featurizer or a topology file.')
        # NOTE(review): Path.suffix only yields the last extension, so the
        # '.pdb.gz' entry can never match here.
        if suffix in ('.pdb', '.pdb.gz'):
            raise ValueError('PyEMMA can not read PDB-fake-trajectories. '
                             'Please consider using a sane trajectory format (e.g. xtc, dcd).')
        return FeatureReader(input_list, featurizer=featurizer, topologyfile=topology,
                             chunksize=chunksize)

    if suffix in ('.npy', '.npz'):
        from pyemma.coordinates.data.numpy_filereader import NumPyFileReader
        return NumPyFileReader(input_list, chunksize=chunksize)

    # otherwise we assume that given files are ascii tabulated data
    from pyemma.coordinates.data.py_csv_reader import PyCSVReader
    return PyCSVReader(input_list, chunksize=chunksize, **kw)