Пример #1
0
def create_file_reader(input_files,
                       topology,
                       featurizer,
                       chunk_size=1000,
                       **kw):
    r"""
    Create a (possibly featurized) file reader for the given input files.

    The reader class is selected from the file suffix of the first input
    file; all other input files must end with the same suffix.

    Parameters
    ----------
    input_files : str, or list/tuple of str, or list/tuple containing lists
        A single input file or a list of input files. If any element is
        itself a list or tuple, the input is treated as a fragmented
        trajectory.
    topology :
        A topology file. If given, the featurizer argument can be None.
    featurizer :
        A featurizer. If given, the topology file can be None.
    chunk_size : int, optional
        The chunk size with which the corresponding reader gets initialized.
    **kw
        Extra keyword arguments, forwarded only to ``PyCSVReader``.

    Returns
    -------
    The reader: a ``FragmentedTrajectoryReader``, ``FeatureReader``,
    ``NumPyFileReader`` or ``PyCSVReader``.

    Raises
    ------
    ValueError
        If the input is empty, is not a string / list of strings, mixes file
        suffixes, names files that do not exist, or consists of MD files
        while neither a featurizer nor a topology is given.
    """
    from pyemma.coordinates.data.numpy_filereader import NumPyFileReader
    from pyemma.coordinates.data.py_csv_reader import PyCSVReader
    from pyemma.coordinates.data import FeatureReader
    from pyemma.coordinates.data.fragmented_trajectory_reader import FragmentedTrajectoryReader

    is_sequence = isinstance(input_files, (list, tuple))

    # fragmented trajectories: at least one element is itself a list/tuple
    if is_sequence and any(isinstance(item, (list, tuple))
                           for item in input_files):
        return FragmentedTrajectoryReader(input_files, topology, chunk_size,
                                          featurizer)

    # normal trajectories: normalize the input to a list of file names
    if isinstance(input_files, string_types):
        input_list = [input_files]
    elif is_sequence and len(input_files) == 0:
        raise ValueError("The passed input list should not be empty.")
    elif is_sequence and all(isinstance(item, string_types)
                             for item in input_files):
        input_list = input_files
    elif is_sequence and any(isinstance(item, string_types)
                             for item in input_files):
        raise ValueError(
            "The passed list did not exclusively contain strings or was a list of lists "
            "(fragmented trajectory).")
    else:
        # Wrap in a one-element tuple so that a tuple-valued input is shown
        # as a whole instead of being unpacked as %-format arguments.
        raise ValueError("Input \"%s\" was no string or list of strings." %
                         (input_files,))

    # TODO: this does not handle suffixes like .xyz.gz (rare)
    _, suffix = os.path.splitext(input_list[0])

    # check: do all files have the same file type? If not: raise ValueError.
    if not all(item.endswith(suffix) for item in input_list):
        raise ValueError(
            "Not all elements in the input list were of the type %s!" %
            suffix)

    # do all the files exist? If not: raise ValueError listing each of them
    missing = ["File %s did not exist or was no file" % item
               for item in input_list if not os.path.isfile(item)]
    if missing:
        raise ValueError(
            "Some of the given input files were directories"
            " or did not exist:\n%s" % "\n".join(missing))

    from mdtraj.formats.registry import FormatRegistry

    # CASE 1.1: file types are MD files
    if suffix in FormatRegistry.loaders.keys():
        # MD files need either a featurizer or a topology file name
        if not featurizer and not topology:
            raise ValueError(
                "The input files were MD files which makes it mandatory to have either a "
                "featurizer or a topology file.")
        return FeatureReader(input_list,
                             featurizer=featurizer,
                             topologyfile=topology,
                             chunksize=chunk_size)

    if suffix in ['.npy', '.npz']:
        return NumPyFileReader(input_list, chunksize=chunk_size)

    # otherwise we assume that given files are ascii tabulated data
    return PyCSVReader(input_list, chunksize=chunk_size, **kw)
 def _get_reader_instance(self, instance_number):
     if instance_number == 0:
         return DataInMemory(self.data)
     elif instance_number == 1:
         return FeatureReader(self.data_feature_reader,
                              topologyfile=self.topfile)
Пример #3
0
def create_file_reader(input_files, topology, featurizer, chunksize=None, **kw):
    r"""
    Create a (possibly featurized) file reader for the given input files.

    The reader class is selected from the suffix of the first input file;
    all other input files must carry the same suffix. Reader modules are
    imported only in the branch that needs them, so invalid input is
    rejected before any reader module is loaded.

    Parameters
    ----------
    input_files : str or Path, or list/tuple of those, or list/tuple containing lists
        A single input file or a list of input files. If any element is
        itself a list or tuple, the input is treated as a fragmented
        trajectory.
    topology :
        A topology file. If given, the featurizer argument can be None.
    featurizer :
        A featurizer. If given, the topology file can be None.
    chunksize : optional
        The chunk size with which the corresponding reader gets initialized.
    **kw
        Extra keyword arguments, forwarded to ``H5Reader`` and
        ``PyCSVReader`` only.

    Returns
    -------
    The reader: a ``FragmentedTrajectoryReader``, ``H5Reader``,
    ``FeatureReader``, ``NumPyFileReader`` or ``PyCSVReader``.

    Raises
    ------
    ValueError
        If the input is empty, is not a string/path or list of them, mixes
        file suffixes, names files that do not exist, consists of MD files
        while neither featurizer nor topology is given, or consists of PDB
        files.
    """
    is_sequence = isinstance(input_files, (list, tuple))

    # fragmented trajectories: at least one element is itself a list/tuple
    if is_sequence and any(isinstance(item, (list, tuple)) for item in input_files):
        from pyemma.coordinates.data.fragmented_trajectory_reader import FragmentedTrajectoryReader
        return FragmentedTrajectoryReader(input_files, topology, chunksize, featurizer)

    # normal trajectories: normalize the input to a list of file names
    if isinstance(input_files, (Path, str)):
        input_list = [input_files]
    elif is_sequence and len(input_files) == 0:
        raise ValueError("The passed input list should not be empty.")
    elif is_sequence and all(isinstance(item, (Path, str)) for item in input_files):
        input_list = input_files
    elif is_sequence and any(isinstance(item, (Path, str)) for item in input_files):
        raise ValueError("The passed list did not exclusively contain strings or was a list of lists "
                         "(fragmented trajectory).")
    else:
        raise ValueError('Input "{}" was no string or list of strings.'.format(input_files))

    # convert to list of paths
    input_list = [Path(f) for f in input_list]

    # TODO: this does not handle suffixes like .xyz.gz (rare)
    suffix = input_list[0].suffix

    # check: do all files have the same file type? If not: raise ValueError.
    if not all(item.suffix == suffix for item in input_list):
        raise ValueError('Not all elements in the input list were of the type %s!' % suffix)

    # do all the files exist? If not: raise ValueError listing each of them
    missing = ['File %s did not exist or was no file' % item
               for item in input_list if not item.is_file()]
    if missing:
        raise ValueError('Some of the given input files were directories'
                         ' or did not exist:\n%s' % '\n'.join(missing))

    featurizer_or_top_provided = featurizer is not None or topology is not None

    # we need to check for h5 first, because of mdtraj custom HDF5 traj format (which is deprecated).
    if suffix in ('.h5', '.hdf5') and not featurizer_or_top_provided:
        # Inspecting every file to detect the mdtraj HDF5 layout would be
        # expensive and would re-open each file, so the decision is made
        # solely on the absence of a featurizer/topology.
        from pyemma.coordinates.data.h5_reader import H5Reader
        # NOTE(review): this passes the caller's original ``input_files``,
        # not the normalized path list -- confirm H5Reader expects that.
        return H5Reader(filenames=input_files, chunk_size=chunksize, **kw)

    from pyemma.coordinates.data import FeatureReader

    # CASE 1.1: file types are MD files
    if FeatureReader.supports_format(suffix):
        # MD files need either a featurizer or a topology file name
        if not featurizer_or_top_provided:
            raise ValueError('The input files were MD files which makes it mandatory to have either a '
                             'Featurizer or a topology file.')

        # NOTE: ``Path.suffix`` only holds the last extension, so the
        # '.pdb.gz' alternative can never match here (see TODO above).
        if suffix in ('.pdb', '.pdb.gz'):
            raise ValueError('PyEMMA can not read PDB-fake-trajectories. '
                             'Please consider using a sane trajectory format (e.g. xtc, dcd).')

        return FeatureReader(input_list, featurizer=featurizer, topologyfile=topology,
                             chunksize=chunksize)

    if suffix in ('.npy', '.npz'):
        from pyemma.coordinates.data.numpy_filereader import NumPyFileReader
        return NumPyFileReader(input_list, chunksize=chunksize)

    # otherwise we assume that given files are ascii tabulated data
    from pyemma.coordinates.data.py_csv_reader import PyCSVReader
    return PyCSVReader(input_list, chunksize=chunksize, **kw)