Example #1
def _thread_worker_compress(job: int,
                            in_file_paths: List[str],
                            verbose: bool = False) -> int:
    """Worker: gzip every file in in_file_paths (job is only used for log output)."""
    for file_path in in_file_paths:
        if verbose:
            print("JOB" + str(job) + " - gz: " + file_path)
        bash.compress_gzip(in_path=file_path, verbose=verbose)
    return 0
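A minimal sketch of how this worker is typically driven (compare Example #3 below): the round-robin slice in_paths[job::n_processes] hands every n-th file to each job. The file names here are hypothetical:

from multiprocessing import Pool

in_paths = ["md_1.trc", "md_2.trc", "md_3.trc", "md_4.trc"]  # hypothetical inputs
n_processes = 2

# job 0 gets files 0 and 2, job 1 gets files 1 and 3
distribute = [(job, in_paths[job::n_processes], True)
              for job in range(n_processes)]
with Pool(n_processes) as pool:
    pool.starmap(_thread_worker_compress, distribute)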
Example #2
    def __init__(self,
                 input_value,  # str | List[str] | trajectory instance | None
                 auto_save: bool = True,
                 stride: int = 1,
                 skip: int = 0):
        if input_value is None:
            self.TITLE = "Empty Trajectory"
            self.database = pandas.DataFrame({"": []})

        elif isinstance(input_value, str):
            if input_value.endswith(".gz"):
                tmp_input = bash.compress_gzip(input_value, extract=True)
                self._read_from_file(input_path=tmp_input,
                                     auto_save=auto_save,
                                     stride=stride,
                                     skip=skip)
                bash.compress_gzip(tmp_input)
            else:
                self._read_from_file(input_path=input_value,
                                     auto_save=auto_save,
                                     stride=stride,
                                     skip=skip)
            self.path = input_value

        elif isinstance(input_value, list) and all(
                isinstance(x, str) for x in input_value):
            self._read_from_file(input_path=input_value[0],
                                 auto_save=False,
                                 stride=stride,
                                 skip=skip)

            for tmp_traj_file in input_value[1:]:
                self += __class__(tmp_traj_file, auto_save=False)

            if auto_save:
                auto_save_path = input_value[0] + ".h5"
                self.write(auto_save_path)

        elif isinstance(input_value, __class__):  # copy constructor (also covers subclasses)
            for attr in vars(input_value):
                setattr(self, attr, getattr(input_value, attr))
            self.database = input_value.database.copy(deep=True)
        else:
            raise ValueError("No constructor matched input of type: " +
                             str(type(input_value)))
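The class this __init__ belongs to is not shown; assuming a trajectory-like class (named Trajectory here purely for illustration), the dispatch branches above correspond to these call patterns:

traj_a = Trajectory("sim_1.trc")                 # read a single file
traj_b = Trajectory("sim_1.trc.gz")              # gz archive: unpacked, read, re-packed
traj_c = Trajectory(["sim_1.trc", "sim_2.trc"])  # list of files, concatenated via +=
traj_d = Trajectory(traj_a)                      # copy constructor
traj_e = Trajectory(None)                        # empty trajectory placeholder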
Example #3
def compress_files(in_paths: List[str], n_processes: int = 1) -> List[str]:
    """compress a list of files

    Parameters
    ----------
    in_paths :  List[str]

    n_processes: int
        how many processes can be used in parallel?

    Returns
    -------
    List[str]
        outpaths
    """

    if isinstance(in_paths, str):
        in_paths = [in_paths]
    out_paths = []

    # check: only keep paths that actually exist
    existing_paths = []
    for path in in_paths:
        if os.path.exists(path):
            existing_paths.append(path)
            out_paths.append(path + ".gz")
        else:
            warnings.warn("File path: " + path + " was not found!")

    # do:
    print("Gen Gzips:")
    if n_processes == 1:
        for path in existing_paths:
            bash.compress_gzip(in_path=path, out_path=path + ".gz")
    else:  # distribute the files round-robin over the workers
        p = mult.Pool(n_processes)
        distribute = [(job, existing_paths[job::n_processes], True)
                      for job in range(n_processes)]
        p.starmap(_thread_worker_compress, distribute)
        p.close()
        p.join()
    return out_paths
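A usage sketch, assuming both input files exist and that bash.compress_gzip writes <path>.gz next to each input (file names hypothetical):

archives = compress_files(["run_1.tre", "run_2.tre"], n_processes=2)
print(archives)  # ['run_1.tre.gz', 'run_2.tre.gz']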
Example #4
    def __init__(
        self,
        xyz=None,
        topology=None,
        time=None,
        unitcell_lengths=None,
        unitcell_angles=None,
        traj_path=None,
        in_cnf: Union[str, Cnf] = None,
        timestep_duration: float = 0.002,
        _future_file: bool = False,
    ):

        self._future_file = _future_file
        if xyz is None and topology is None and traj_path is None and in_cnf is None:
            self._future_file = None

        if traj_path is not None and (traj_path.endswith(".h5")
                                      or traj_path.endswith(".hf5")):
            trj = self.load(traj_path)
            self.__dict__.update(vars(trj))

        elif traj_path is not None and (traj_path.endswith(".trc")
                                        or traj_path.endswith(".trc.gz")):

            # Parse TRC
            compress = False
            if traj_path.endswith(".gz"):
                traj_path = bash.compress_gzip(in_path=traj_path, extract=True)
                compress = True

            unitcell_angles = None
            unitcell_lengths = None

            if isinstance(traj_path, str):
                xyz, step, time, unitcell_lengths, unitcell_angles = self.parse_trc_efficiently(
                    traj_path)

            if compress:
                traj_path = bash.compress_gzip(in_path=traj_path)

            # Topology from Cnf
            if isinstance(in_cnf, str):
                in_cnf = Cnf(in_cnf)
            elif isinstance(in_cnf, Cnf) and hasattr(in_cnf, "POSITION"):
                pass
            else:
                in_cnf = self.get_dummy_cnf(xyz)

            # get cnf boxDims
            if hasattr(in_cnf, "GENBOX") and (unitcell_lengths is None
                                              and unitcell_angles is None):
                unitcell_angles = np.array(
                    list(in_cnf.GENBOX.angles) * len(xyz)).reshape(
                        len(xyz), len(in_cnf.GENBOX.length))
                unitcell_lengths = np.array(
                    list(in_cnf.GENBOX.length) * len(xyz)).reshape(
                        len(xyz), len(in_cnf.GENBOX.length))

            # Topo tmp file
            tmpFile = tempfile.NamedTemporaryFile(suffix="_tmp.pdb")
            in_cnf.write_pdb(tmpFile.name)
            single = mdtraj.load_pdb(tmpFile.name)
            tmpFile.close()

            super().__init__(
                xyz=xyz,
                topology=single.topology,
                time=time,
                unitcell_lengths=unitcell_lengths,
                unitcell_angles=unitcell_angles,
            )
            self._step = step

        elif not (xyz is None and topology is None):
            super().__init__(
                xyz=xyz,
                topology=topology,
                time=time,
                unitcell_lengths=unitcell_lengths,
                unitcell_angles=unitcell_angles,
            )

            self._step = np.array(np.round(self._time / timestep_duration),
                                  dtype=int)
            self.TITLE = TITLE(
                content=" Generic Title... to be changed by YOU!")

        else:
            self._unitcell_lengths = []
            self._unitcell_angles = []
            self._xyz = np.array([], ndmin=2)
            self._topology = None
            self._future_file = True

        self.path = traj_path
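The super().__init__(...) calls and self.load suggest this class subclasses mdtraj.Trajectory; assuming so (the class name Trc below is hypothetical, as are the paths), construction might look like:

trc = Trc(traj_path="md.trc.gz", in_cnf="md.cnf")  # topology is built from the Cnf via a temporary PDB
print(trc.n_frames, trc._step[:5])

empty = Trc()  # no inputs at all: a "future file" placeholder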
Example #5
def find_and_unarchive_tar_files(trc_files: List[str], verbose: bool = False):
    # archive handling: split the inputs into archived and plain files
    archived_files = list(
        filter(lambda x: (".tar" in x or ".gz" in x), trc_files))
    not_archived_files = list(
        filter(lambda x: not (".tar" in x or ".gz" in x), trc_files))
    unarchived_files = []
    if verbose:
        print("archives: ", archived_files)
        print("non-archives: ", not_archived_files)

    # untar files:
    for tared_file in archived_files:
        if len(not_archived_files) == 0 or not any(
                noAfile in tared_file for noAfile in not_archived_files):
            try:
                out_path = bash.compress_gzip(in_path=tared_file,
                                              out_path=tared_file.replace(
                                                  ".tar",
                                                  "").replace(".gz", ""),
                                              extract=True)
            except SubprocessError:
                if verbose:
                    print("gzip failed, falling back to tar: ", tared_file)
                out_path = bash.extract_tar(
                    in_path=tared_file,
                    out_path=tared_file.replace(".tar", "").replace(".gz", ""),
                    gunzip_compression=True,
                )

            # workaround for archives that unpack into a cluster/ subfolder    # TODO: remove this part
            if any(xfile == "cluster"
                   for xfile in os.listdir(os.path.dirname(tared_file))
                   ) and not os.path.exists(out_path):
                not_found = True
                for cpath, tdir, files in os.walk(
                        os.path.dirname(tared_file) + "/cluster"):
                    if os.path.basename(tared_file).replace(
                            ".tar", "").replace(".gz", "") in files:
                        if verbose:
                            print(
                                "FOUND PATH: ",
                                cpath + "/" +
                                os.path.basename(tared_file).replace(
                                    ".tar", "").replace(".gz", ""),
                            )
                        wrong_path = cpath + "/" + os.path.basename(
                            tared_file).replace(".tar", "").replace(".gz", "")
                        out_file = bash.move_file(
                            wrong_path,
                            tared_file.replace(".tar", "").replace(".gz", ""))
                        unarchived_files.append(out_file)
                        not_found = False
                        break
                if not_found:
                    raise IOError("could not find the unpacked file!")
            else:
                unarchived_files.append(out_path)
        else:
            if verbose:
                print([
                    noAfile for noAfile in not_archived_files
                    if (noAfile in tared_file)
                ])
    new_files = not_archived_files
    new_files.extend(unarchived_files)

    # sort by the trailing number in the file name: <prefix>_<number>.<ext>
    use_file_paths = sorted(
        new_files, key=lambda x: int(x.split("_")[-1].split(".")[0]))
    return use_file_paths, unarchived_files
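A sketch of the expected inputs and outputs; note that the final sort assumes names ending in _<number>.<ext> (all names hypothetical):

files = ["run_2.tre.tar.gz", "run_1.tre"]
all_paths, unpacked = find_and_unarchive_tar_files(files, verbose=True)
# all_paths -> ['run_1.tre', 'run_2.tre']  (sorted by the trailing number)
# unpacked  -> ['run_2.tre']               (only the files that had to be extracted)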
Example #6
def _thread_worker_cat_trc(
    job: int,
    replicaID_range: List[int],
    trc_files: Dict[int, List[str]],
    out_prefix: str,
    topology_path: str,
    out_trcs: dict,
    dt: float,
    time: float = 0,
    verbose: bool = False,
    boundary_conditions: str = "r cog",
    include_all: bool = False,
):
    """_thread_worker_cat_trc
        This thread worker_scripts concatenates all .trc files of one replica into one file.

    Parameters
    ----------
    job :   rank of this thread
    replicaID_range :   x_range - list of all
    trc_files :     Dict[int, List[str]]
        Dictionary containing all replicas, with list of all trc files concerning one trc.
    out_prefix : str
        output prefix
    verbose : bool
        verbosity?

    Returns
    -------
    None
    """

    gromPP = gromosPP.GromosPP()
    start_dir = os.getcwd()
    if verbose:
        print("JOB " + str(job) + ": range " + str(list(replicaID_range)))
    for replicaID in replicaID_range:
        out_path = out_prefix + str(replicaID) + ".trc"
        compress_out_path = out_path + ".gz"

        out_trcs.update({replicaID: compress_out_path})

        if os.path.exists(
                compress_out_path):  # found a compressed trc file already
            warnings.warn("Skipped generating file as I found: " +
                          compress_out_path)
            if os.path.exists(out_path):
                bash.remove_file(out_path)
            continue
        elif os.path.exists(
                out_path):  # uncompressed file found: only the gzip step below is needed
            warnings.warn("Skipped generating file as I found: " + out_path)
        else:  # concatenate the replica's trc files
            if verbose:
                print("JOB " + str(job) + ": " + "write out " + out_path +
                      "\n")
            out_dir = os.path.dirname(out_path)
            tmp_dir = bash.make_folder(out_dir + "/TMP_replica_" +
                                       str(replicaID),
                                       additional_option="-p")
            os.chdir(tmp_dir)
            if include_all:
                out_path = gromPP.frameout(
                    in_top_path=topology_path,
                    in_coord_path=" ".join(trc_files[replicaID]),
                    periodic_boundary_condition=boundary_conditions,
                    single_file=True,
                    out_file_format="trc",
                    out_file_path=out_path,
                    time=time,
                    dt=dt,
                    include="ALL",
                )
            else:
                out_path = gromPP.frameout(
                    in_top_path=topology_path,
                    in_coord_path=" ".join(trc_files[replicaID]),
                    periodic_boundary_condition=boundary_conditions,
                    single_file=True,
                    out_file_format="trc",
                    out_file_path=out_path,
                    time=time,
                    dt=dt,
                )
            os.chdir(start_dir)
            bash.wait_for_fileSystem(out_path)
            bash.remove_folder(tmp_dir)
            if verbose:
                print("JOB " + str(job) + ": " + "write out " + out_path +
                      "\t DONE\n")

        if verbose:
            print("JOB " + str(job) + ": " + "compress " + compress_out_path +
                  "\n")
        compressed_trc = bash.compress_gzip(out_path,
                                            out_path=compress_out_path)

        if verbose:
            print("JOB " + str(job) + ": " + "compress " + compressed_trc +
                  "\t DONE\n")
Example #7
def _thread_worker_cat_tre(
    job: int,
    replicaID_range: List[int],
    tre_files: Dict[int, List[str]],
    out_prefix: str,
    out_tres: dict,
    verbose: bool = False,
):
    if verbose:
        print("JOB " + str(job) + ": range " + str(list(replicaID_range)))

    for replicaID in replicaID_range:
        use_tre_file_paths, unarchived_files = find_and_unarchive_tar_files(
            tre_files[replicaID], verbose=verbose)
        if verbose:
            print("FILES: ", use_tre_file_paths)
        if verbose:
            print("Archs:", unarchived_files)

        out_path = out_prefix + str(replicaID) + ".tre"
        compressed_tre = out_path + ".gz"
        if os.path.exists(compressed_tre):
            warnings.warn("Skipped generating .tre.gz file as I found: " +
                          compressed_tre)
        else:
            if os.path.exists(out_path):
                warnings.warn("Skipped generating .tre file as I found: " +
                              out_path + "\n\t Continue Compressing.")
            else:
                tre_file = tre.Tre(use_tre_file_paths[0])
                if verbose:
                    print("JOB " + str(job) + ": parsing " +
                          os.path.basename(use_tre_file_paths[0]))
                if len(use_tre_file_paths) > 1:
                    for tre_file_path in use_tre_file_paths[1:]:
                        if verbose:
                            print("JOB " + str(job) + ": append " +
                                  os.path.basename(tre_file_path))
                        tre_file += tre.Tre(tre_file_path)
                if verbose:
                    print("JOB " + str(job) + ": write out " +
                          os.path.basename(out_path))
                tre_file.write(out_path)
                bash.wait_for_fileSystem(out_path)
                if verbose:
                    print("JOB " + str(job) + ": done " +
                          os.path.basename(out_path))
                del tre_file

                if verbose:
                    print("JOB " + str(job) + ":  compress " +
                          os.path.basename(out_path))
                compressed_tre = bash.compress_gzip(out_path,
                                                    out_path=compressed_tre)
                if verbose:
                    print("JOB " + str(job) + ": " + "compress " +
                          compressed_tre + "\t DONE\n")

        # file cleaning: re-compress the unpacked source files to save disk space
        for file_path in use_tre_file_paths:
            bash.compress_gzip(file_path)
        out_tres.update({replicaID: compressed_tre})