def _thread_worker_compress(job: int, in_file_paths: List[str], verbose: bool = False) -> int:
    """Gzip-compress every file handed to this worker (process-pool target).

    Parameters
    ----------
    job : int
        rank of this worker; only used to prefix log output
    in_file_paths : List[str]
        the files this worker is responsible for compressing
    verbose : bool
        print a progress line per file?

    Returns
    -------
    int
        always 0, signalling success to the pool
    """
    for path in in_file_paths:
        if verbose:
            print("JOB" + str(job) + " - gz: " + path)
        bash.compress_gzip(in_path=path, verbose=verbose)
    return 0
def __init__(self, input_value: str, auto_save: bool = True, stride: int = 1, skip: int = 0):
    """Build a trajectory from a file path, a list of paths, or another instance.

    Parameters
    ----------
    input_value : str, List[str], None, or an instance of this class
        * None -> empty trajectory with an empty database
        * str -> path to one trajectory file (".gz" is unpacked, read, re-packed)
        * List[str] -> several trajectory files, concatenated in order
        * instance of this class -> copy constructor (deep-copies the database)
    auto_save : bool
        after reading a list of files, also write a combined ".h5" next to the first input
    stride : int
        forwarded to the file reader (presumably: keep every stride-th frame — TODO confirm)
    skip : int
        forwarded to the file reader (presumably: frames skipped at the start — TODO confirm)

    Raises
    ------
    IOError
        if input_value matches none of the supported types
    """
    if input_value is None:
        self.TITLE = "Empty Trajectory"
        self.database = pandas.DataFrame({"": []})
    elif isinstance(input_value, str):
        if input_value.endswith(".gz"):
            # unpack, parse, then re-compress the source file
            tmp_input = bash.compress_gzip(input_value, extract=True)
            self._read_from_file(input_path=tmp_input, auto_save=auto_save, stride=stride, skip=skip)
            bash.compress_gzip(tmp_input)
        else:
            self._read_from_file(input_path=input_value, auto_save=auto_save, stride=stride, skip=skip)
        self.path = input_value
    # BUGFIX: was `x is str`, an identity test against the type object that is
    # False for every actual string — the list branch was unreachable.
    elif isinstance(input_value, list) and all([isinstance(x, str) for x in input_value]):
        self._read_from_file(input_path=input_value[0], auto_save=False, stride=stride, skip=skip)
        for tmp_traj_file in input_value[1:]:
            self += __class__(tmp_traj_file, auto_save=False)
        if auto_save:
            auto_save_path = input_value[0] + ".h5"
            self.write(auto_save_path)
    # BUGFIX: was `isinstance(input_value.__class__, __class__) or
    # issubclass(input_value.__class__, __class__)`; the isinstance half tested
    # the *class object* and was always False. A single isinstance check on the
    # instance itself covers subclasses as well.
    elif isinstance(input_value, __class__):
        # copy constructor: take over all attributes, deep-copy the data table
        for attr in vars(input_value):
            setattr(self, attr, getattr(input_value, attr))
        self.database = input_value.database.copy(deep=True)
    else:
        raise IOError("Constructor not found")
def compress_files(in_paths: List[str], n_processes: int = 1) -> List[str]:
    """compress a list of files

    Parameters
    ----------
    in_paths : List[str]
        file paths to gzip; a single str is accepted and wrapped in a list
    n_processes : int
        how many processes can be used in parallel?

    Returns
    -------
    List[str]
        outpaths (one "<path>.gz" per input file that existed)
    """
    # idiom fix: isinstance instead of `type(...) == str`
    if isinstance(in_paths, str):
        in_paths = [in_paths]

    out_paths = []
    found_paths = []
    # check: warn about (and drop) paths that do not exist.
    # BUGFIX: previously missing files were warned about but still handed to
    # the compressor, which would fail on them.
    for path in in_paths:
        if os.path.exists(path):
            archive_path = path + ".gz"
            out_paths.append(archive_path)
            found_paths.append(path)
        else:
            warnings.warn("File Path: " + path + " was not found!")

    # do:
    print("Gen Gzips:")
    if n_processes == 1:
        for path in found_paths:
            bash.compress_gzip(in_path=path, out_path=path + ".gz")
    else:
        # do parallel: round-robin the files over the workers
        p = mult.Pool(n_processes)
        distribute = [(job, found_paths[job::n_processes], True) for job in range(n_processes)]
        p.starmap(_thread_worker_compress, distribute)
        p.close()
        p.join()
    return out_paths
def __init__(
    self,
    xyz=None,
    topology=None,
    time=None,
    unitcell_lengths=None,
    unitcell_angles=None,
    traj_path=None,
    in_cnf: [str, Cnf] = None,
    timestep_duration: float = 0.002,
    _future_file: bool = False,
):
    """Construct a trajectory either from a file (.h5/.hf5 or .trc/.trc.gz),
    from raw arrays (xyz + topology), or as an empty "future file" placeholder.

    Parameters
    ----------
    xyz :
        coordinate array handed to the mdtraj base constructor
        (assumed frame-major, i.e. len(xyz) == n_frames — TODO confirm)
    topology :
        mdtraj topology; used only when no traj_path is given
    time :
        per-frame times handed to the base constructor
    unitcell_lengths, unitcell_angles :
        box parameters; overwritten from the Cnf GENBOX when parsing a .trc
    traj_path : str
        path to an .h5/.hf5 dump or a .trc/.trc.gz file to parse
    in_cnf : str or Cnf
        coordinate file (or its path) supplying the topology for .trc parsing
    timestep_duration : float
        dt used to reconstruct integer step numbers from frame times
    _future_file : bool
        if True, build an empty placeholder object instead of loading data
    """
    self._future_file = _future_file
    # no input at all -> placeholder flag is cleared to None (tri-state)
    if xyz is None and topology is None and traj_path is None and in_cnf is None:
        self._future_file = None
    if traj_path is not None and (traj_path.endswith(".h5") or traj_path.endswith(".hf5")):
        # previously saved trajectory: load and adopt all of its attributes
        trj = self.load(traj_path)
        self.__dict__.update(vars(trj))
    elif traj_path is not None and (traj_path.endswith(".trc") or traj_path.endswith(".trc.gz")):
        # Parse TRC
        compress = False
        if traj_path.endswith(".gz"):
            # unpack to a plain .trc; re-compress again after parsing
            traj_path = bash.compress_gzip(in_path=traj_path, extract=True)
            compress = True

        # box info comes from the trc/cnf below, not from the arguments
        unitcell_angles = None
        unitcell_lengths = None
        if isinstance(traj_path, str):
            # NOTE(review): if this guard is False, `step` (and xyz/time) stay
            # unbound and the `self._step = step` below would raise — confirm
            # traj_path is always a str at this point.
            xyz, step, time, unitcell_lengths, unitcell_angles = self.parse_trc_efficiently(
                traj_path)
        if compress:
            traj_path = bash.compress_gzip(in_path=traj_path)

        # Topology from Cnf: accept a path, a ready Cnf with coordinates,
        # or fall back to a dummy Cnf derived from xyz
        if isinstance(in_cnf, str):
            in_cnf = Cnf(in_cnf)
        elif isinstance(in_cnf, Cnf) and hasattr(in_cnf, "POSITION"):
            pass
        else:
            in_cnf = self.get_dummy_cnf(xyz)

        # get cnf boxDims: replicate the single GENBOX box over all frames
        if hasattr(in_cnf, "GENBOX") and (unitcell_lengths is None and unitcell_angles is None):
            unitcell_angles = np.array(
                list(in_cnf.GENBOX.angles) * len(xyz)).reshape(
                len(xyz), len(in_cnf.GENBOX.length))
            unitcell_lengths = np.array(
                list(in_cnf.GENBOX.length) * len(xyz)).reshape(
                len(xyz), len(in_cnf.GENBOX.length))

        # Topo tmp file: route the Cnf through a temporary PDB so mdtraj
        # can build the topology object
        tmpFile = tempfile.NamedTemporaryFile(suffix="_tmp.pdb")
        in_cnf.write_pdb(tmpFile.name)
        single = mdtraj.load_pdb(tmpFile.name)
        tmpFile.close()

        super().__init__(
            xyz=xyz,
            topology=single.topology,
            time=time,
            unitcell_lengths=unitcell_lengths,
            unitcell_angles=unitcell_angles,
        )
        self._step = step
    elif not (xyz is None and topology is None):
        # raw-array construction: forward everything to the mdtraj base class
        super().__init__(
            xyz=xyz,
            topology=topology,
            time=time,
            unitcell_lengths=unitcell_lengths,
            unitcell_angles=unitcell_angles,
        )
        # reconstruct integer step numbers from the frame times
        self._step = np.array(np.round(self._time / timestep_duration), dtype=int)
        self.TITLE = TITLE(
            content=" Generic Title... to be changed by YOU!")
    else:
        # empty placeholder ("future file"): data will be attached later
        self._unitcell_lengths = []
        self._unitcell_angles = []
        self._xyz = np.array([], ndmin=2)
        self._topology = None
        self._future_file = True
    self.path = traj_path
def find_and_unarchive_tar_files(trc_files: List[str], verbose: bool = False):
    """Split *trc_files* into archives and plain files and unpack the archives.

    Parameters
    ----------
    trc_files : List[str]
        candidate file paths; ".tar"/".gz" paths are treated as archives
    verbose : bool
        print progress information?

    Returns
    -------
    Tuple[List[str], List[str]]
        (all usable file paths, sorted numerically by the trailing
        ``_<number>`` in the file name; the paths freshly unarchived here)

    Raises
    ------
    IOError
        if an unpacked file cannot be located after extraction
    """
    # archive handling
    # BUGFIX: the not-archived filter used to test the bare substring "tar",
    # so any path merely containing the letters "tar" (e.g. ".../start_1.trc")
    # was dropped from BOTH lists. The two filters are now exact complements.
    archived_files = list(
        filter(lambda x: (".tar" in x or ".gz" in x), trc_files))
    not_archived_files = list(
        filter(lambda x: not (".tar" in x or ".gz" in x), trc_files))
    unarchived_files = []
    if verbose:
        print("archives: ", archived_files)
    if verbose:
        print("narchives: ", not_archived_files)

    # untar files:
    for tared_file in archived_files:
        # skip archives whose unpacked counterpart is already present
        if len(not_archived_files) == 0 or not any(
                [noAfile in tared_file for noAfile in not_archived_files]):
            try:
                # gzip is tried first; most archives here are plain .gz
                out_path = bash.compress_gzip(in_path=tared_file,
                                              out_path=tared_file.replace(
                                                  ".tar", "").replace(".gz", ""),
                                              extract=True)
            except SubprocessError:
                # not a gzip file -> fall back to tar extraction
                out_path = bash.extract_tar(
                    in_path=tared_file,
                    out_path=tared_file.replace(".tar", "").replace(".gz", ""),
                    gunzip_compression=True,
                )

            # fix for stupid taring! #todo: remove part
            # some archives unpack into a nested "cluster/" directory; search it
            # and move the file to the expected flat location.
            if any([
                    "cluster" == xfile
                    for xfile in os.listdir(os.path.dirname(tared_file))
            ]) and not os.path.exists(out_path):
                nfound = True
                for cpath, tdir, files in os.walk(
                        os.path.dirname(tared_file) + "/cluster"):
                    if os.path.basename(tared_file).replace(
                            ".tar", "").replace(".gz", "") in files:
                        if verbose:
                            print(
                                "FOUND PATH: ",
                                cpath + "/" + os.path.basename(tared_file).replace(
                                    ".tar", "").replace(".gz", ""),
                            )
                        wrong_path = cpath + "/" + os.path.basename(
                            tared_file).replace(".tar", "").replace(".gz", "")
                        out_file = bash.move_file(
                            wrong_path,
                            tared_file.replace(".tar", "").replace(".gz", ""))
                        unarchived_files.append(out_file)
                        nfound = False
                        break
                if nfound:
                    raise IOError("could not find untarred file!")
            else:
                unarchived_files.append(out_path)
        else:
            if verbose:
                print([
                    noAfile for noAfile in not_archived_files
                    if (noAfile in tared_file)
                ])

    new_files = not_archived_files
    new_files.extend(unarchived_files)
    # sort by the numeric suffix before the extension, e.g. "..._12.tre" -> 12
    use_tre_file_paths = sorted(
        new_files, key=lambda x: int(x.split("_")[-1].split(".")[0]))
    return use_tre_file_paths, unarchived_files
def _thread_worker_cat_trc(
    job: int,
    replicaID_range: List[int],
    trc_files: Dict[int, List[str]],
    out_prefix: str,
    topology_path: str,
    out_trcs: dict,
    dt: float,
    time: float = 0,
    verbose: bool = False,
    boundary_conditions: str = "r cog",
    include_all: bool = False,
):
    """_thread_worker_cat_trc

    This thread worker_scripts concatenates all .trc files of one replica into one file.

    Parameters
    ----------
    job : int
        rank of this thread
    replicaID_range : List[int]
        x_range - list of all replica IDs this worker handles
    trc_files : Dict[int, List[str]]
        Dictionary containing all replicas, with list of all trc files concerning one trc.
    out_prefix : str
        output prefix
    topology_path : str
        topology handed to gromos frameout
    out_trcs : dict
        shared mapping replicaID -> compressed output path (written here)
    dt : float
        frame time step forwarded to frameout
    time : float
        start time forwarded to frameout
    verbose : bool
        verbosity?
    boundary_conditions : str
        periodic boundary condition string for frameout
    include_all : bool
        forward include="ALL" to frameout

    Returns
    -------
    None
    """
    gromPP = gromosPP.GromosPP()
    start_dir = os.getcwd()
    if verbose:
        print("JOB " + str(job) + ": range " + str(list(replicaID_range)))
    for replicaID in replicaID_range:
        out_path = out_prefix + str(replicaID) + ".trc"
        compress_out_path = out_path + ".gz"
        out_trcs.update({replicaID: compress_out_path})

        if os.path.exists(
                compress_out_path):  # found perfect compressed trc file:)
            warnings.warn("Skipped generating file as I found: " +
                          compress_out_path)
            # drop a leftover uncompressed copy; the .gz is authoritative
            if os.path.exists(out_path):
                bash.remove_file(out_path)
            continue
        elif os.path.exists(
                out_path):  # did not find compressed file. will compress
            warnings.warn("Skipped generating file as I found: " + out_path)
            # BUGFIX: a stray `continue` here skipped the compression step at
            # the bottom of the loop, contradicting the comment above and
            # leaving the file uncompressed forever. Fall through instead.
        else:  # concat files
            if verbose:
                print("JOB " + str(job) + ": " + "write out " + out_path + "\n")
            out_dir = os.path.dirname(out_path)
            # frameout works in a scratch folder; chdir there and back
            tmp_dir = bash.make_folder(out_dir + "/TMP_replica_" + str(replicaID),
                                       additional_option="-p")
            os.chdir(tmp_dir)
            if include_all:
                out_path = gromPP.frameout(
                    in_top_path=topology_path,
                    in_coord_path=" ".join(trc_files[replicaID]),
                    periodic_boundary_condition=boundary_conditions,
                    single_file=True,
                    out_file_format="trc",
                    out_file_path=out_path,
                    time=time,
                    dt=dt,
                    include="ALL",
                )
            else:
                out_path = gromPP.frameout(
                    in_top_path=topology_path,
                    in_coord_path=" ".join(trc_files[replicaID]),
                    periodic_boundary_condition=boundary_conditions,
                    single_file=True,
                    out_file_format="trc",
                    out_file_path=out_path,
                    time=time,
                    dt=dt,
                )
            os.chdir(start_dir)
            bash.wait_for_fileSystem(out_path)
            bash.remove_folder(tmp_dir)
            if verbose:
                print("JOB " + str(job) + ": " + "write out " + out_path +
                      "\t DONE\n")

        # compress the concatenated (or pre-existing) .trc
        if verbose:
            print("JOB " + str(job) + ": " + "compress " + compress_out_path +
                  "\n")
        compressed_trc = bash.compress_gzip(out_path,
                                            out_path=compress_out_path)
        if verbose:
            print("JOB " + str(job) + ": " + "compress " + compressed_trc +
                  "\t DONE\n")
def _thread_worker_cat_tre(
    job: int,
    replicaID_range: List[int],
    tre_files: Dict[int, List[str]],
    out_prefix: str,
    out_tres: dict,
    verbose: bool = False,
):
    """Concatenate all .tre files of each assigned replica into one gzipped file.

    Worker for a thread/process pool: for every replicaID in *replicaID_range*
    it unpacks any archived inputs, appends the Tre files in order, writes
    "<out_prefix><replicaID>.tre", gzips it, and records the .gz path in
    *out_tres*. Existing outputs are reused rather than regenerated.

    Parameters
    ----------
    job : int
        rank of this worker; only used to prefix log output
    replicaID_range : List[int]
        replica IDs this worker handles
    tre_files : Dict[int, List[str]]
        per-replica lists of .tre (possibly archived) input paths
    out_prefix : str
        prefix for the concatenated output files
    out_tres : dict
        shared mapping replicaID -> compressed output path (mutated here)
    verbose : bool
        print progress messages?
    """
    if verbose:
        print("JOB " + str(job) + ": range " + str(list(replicaID_range)))
    for replicaID in replicaID_range:
        # unpack any tar/gz inputs and get the usable, sorted file list
        use_tre_file_paths, unarchived_files = find_and_unarchive_tar_files(
            tre_files[replicaID], verbose=verbose)
        if verbose:
            print("FILES: ", use_tre_file_paths)
        if verbose:
            print("Archs:", unarchived_files)
        out_path = out_prefix + str(replicaID) + ".tre"
        compressed_tre = out_path + ".gz"

        if os.path.exists(compressed_tre):
            # final compressed result already there -> nothing to do
            warnings.warn("Skipped generating .tre.gz file as I found: " +
                          out_path)
        else:
            if os.path.exists(out_path):
                # uncompressed result already there -> only compress below
                warnings.warn("Skipped generating .tre file as I found: " +
                              out_path + "\n\t Continue Compressing.")
            else:
                # parse the first file, then append the rest in order
                tre_file = tre.Tre(use_tre_file_paths[0])
                if verbose:
                    print("JOB " + str(job) + ": parsing " +
                          os.path.basename(use_tre_file_paths[0]))
                if len(use_tre_file_paths) > 1:
                    for tre_file_path in use_tre_file_paths[1:]:
                        if verbose:
                            print("JOB " + str(job) + ": append " +
                                  os.path.basename(tre_file_path))
                        tre_file += tre.Tre(tre_file_path)
                if verbose:
                    print("JOB " + str(job) + ": write out " +
                          os.path.basename(out_path))
                tre_file.write(out_path)
                bash.wait_for_fileSystem(out_path)
                if verbose:
                    print("JOB " + str(job) + ": done " +
                          os.path.basename(out_path))
                # free the (potentially large) parsed trajectory early
                del tre_file
            if verbose:
                print("JOB " + str(job) + ": compress " +
                      os.path.basename(out_path))
            compressed_tre = bash.compress_gzip(out_path,
                                                out_path=compressed_tre)
            if verbose:
                print("JOB " + str(job) + ": " + "compress " + compressed_tre +
                      "\t DONE\n")

        # file cleaning: gzip all input files again
        # NOTE(review): this also compresses inputs that were never archived —
        # presumably intended as disk-space cleanup; confirm with callers.
        for file_path in use_tre_file_paths:
            bash.compress_gzip(file_path)
        out_tres.update({replicaID: compressed_tre})