def load_setup_files(proj_folder, traj_fname):
    # 0_0.hd5 is run 0 clone 0
    run_index = int(traj_fname.split("_")[0])
    print(run_index)
    glob_input = proj_folder + "/RUN%d/CLONE0/payload-*.tar.bz2" % run_index
    print(glob_input)
    payload_files = glob.glob(glob_input)
    if not payload_files:
        raise IOError("Error: Payload files not found")
    payload_file = payload_files[0]
    print(payload_file)
    print(os.path.abspath(payload_file))
    with enter_temp_directory():
        archive = tarfile.open(payload_file, mode='r:bz2')
        archive.extract("system.xml")
        archive.extract("integrator.xml")
        archive.extract("state.xml")
        with open("state.xml") as state_input:
            state = XmlSerializer.deserialize(state_input.read())
        with open("system.xml") as system_input:
            system = XmlSerializer.deserialize(system_input.read())
        with open("integrator.xml") as integrator_input:
            integrator = XmlSerializer.deserialize(integrator_input.read())
    return state, system, integrator
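# Illustrative sketch, not part of the original module: how the deserialized
# OpenMM objects returned by load_setup_files can be used to resume a run.
# The default paths below are hypothetical placeholders, and the example
# assumes the OpenMM `simtk` namespace used by XmlSerializer above.
def _example_resume_simulation(proj_folder="/data/PROJ0000", traj_fname="0_0.hdf5"):
    """Resume an OpenMM simulation from the XML payload (hypothetical paths)."""
    from simtk.openmm import Context
    state, system, integrator = load_setup_files(proj_folder, traj_fname)
    context = Context(system, integrator)  # build a Context from the payload
    context.setState(state)                # restore positions, velocities, box
    integrator.step(1000)                  # advance the resumed simulation briefly
    return context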
def test_project():
    with enter_temp_directory():
        base_dir = os.path.abspath(os.path.curdir)
        print(base_dir)
        print(type(base_dir))
        mdl_dir = os.path.join(base_dir, "mdl_dir")
        feature_dir = "feature_dir"
        series_name = "fake_series"
        protein_list = ["kinase_1", "kinase_2"]
        project_dict = {"kinase_1": ["fake_proj1"],
                        "kinase_2": ["fake_proj2"]}
        mdl_params = {'tica__n_components': 1, 'tica__lag_time': 1,
                      'tica__weighted_transform': True, 'tica__shrinkage': 0.01,
                      'cluster__n_clusters': 2, 'msm__lag_time': 1,
                      'bootstrap__n_samples': 1}

        create_fake_data(base_dir, protein_list, project_dict)
        setup_series_analysis(base_dir, mdl_dir, feature_dir, series_name,
                              protein_list, project_dict, mdl_params)
        fit_pipeline(base_dir)

        prj = ProteinSeries(os.path.join(mdl_dir, "project.yaml"))
        assert isinstance(prj, ProteinSeries)
        assert isinstance(prj.tica_mdl, tICA)
        assert _test_protein_without_project()
        assert _test_protein_with_project(prj)
        assert _test_tic_dict(prj)
        assert _test_obs_mapping(prj)
    return
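# Sketch, not part of the original tests: the mdl_params keys above follow
# scikit-learn's '<step>__<parameter>' naming, so a dict like this can be
# applied to a Pipeline via set_params. The steps shown here are an assumption
# for illustration; the exact pipeline assembled by setup_series_analysis and
# fit_pipeline may differ.
def _example_mdl_params_pipeline():
    """Show how '<step>__<param>' keys map onto a sklearn-style Pipeline."""
    from sklearn.pipeline import Pipeline
    from msmbuilder.decomposition import tICA
    from msmbuilder.cluster import MiniBatchKMeans
    from msmbuilder.msm import MarkovStateModel

    pipe = Pipeline([("tica", tICA()),
                     ("cluster", MiniBatchKMeans()),
                     ("msm", MarkovStateModel())])
    # Each key is split on '__': the prefix picks the step, the suffix the parameter.
    pipe.set_params(tica__n_components=1, tica__lag_time=1,
                    cluster__n_clusters=2, msm__lag_time=1)
    return pipe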
def test_setup_series_analysis():
    base_dir = os.path.join("./fake_series")
    mdl_dir = os.path.join(base_dir, "new_mdl_dir")
    feature_dir = "feature_dir"
    series_name = "fake_series"
    protein_list = ["fake_kinase1", "fake_kinase2"]
    project_dict = {"fake_kinase1": ["fake_proj1", "fake_proj2"],
                    "fake_kinase2": ["fake_proj3"]}
    mdl_params = {'tica__n_components': 1, 'tica__lag_time': 2,
                  'tica__kinetic_mapping': True, 'tica__shrinkage': 0.01,
                  'cluster__n_clusters': 174}
    with enter_temp_directory():
        create_fake_series()
        setup_series_analysis(base_dir, mdl_dir, feature_dir, series_name,
                              protein_list, project_dict, mdl_params)
        assert os.path.isdir(mdl_dir)
        for protein in protein_list:
            assert os.path.isdir(os.path.join(mdl_dir, protein))
        assert os.path.isfile(os.path.join(base_dir, "series.yaml"))
        fin = open(os.path.join(mdl_dir, "project.yaml"), 'r')
        yaml_file = yaml.load(fin)
        assert yaml_file["base_dir"] == base_dir
        assert yaml_file["series_name"] == series_name
        assert yaml_file["protein_list"] == protein_list
        assert yaml_file["project_dict"] == project_dict
        assert yaml_file["mdl_params"] == mdl_params
    return
def hdf5_concatenate_core17(job_tuple):
    """Concatenate tar bzipped XTC files created by Folding@Home Core17.

    Parameters
    ----------
    job_tuple : tuple
        (proj_folder, top_folder, db_name, run, clone), where `proj_folder`
        contains "RUN*/CLONE*/results-*.tar.bz2" archives, `top_folder`
        contains one "<run>.pdb" topology per run, and `run`/`clone` select
        the trajectory to concatenate into
        "<proj_folder>/trajectories/<run>_<clone>.hdf5".

    Notes
    -----
    We use HDF5 because it provides an easy way to store the metadata
    associated with which files have already been processed.
    """
    proj_folder, top_folder, db_name, run, clone = job_tuple
    path = os.path.join(proj_folder, "RUN%d/CLONE%d/" % (run, clone))
    top = md.load(os.path.join(top_folder, "%d.pdb" % run))
    output_filename = os.path.join(proj_folder,
                                   "trajectories/%d_%d.hdf5" % (run, clone))

    glob_input = os.path.join(path, "results-*.tar.bz2")
    filenames = glob.glob(glob_input)
    filenames = sorted(filenames, key=keynat)

    if len(filenames) <= 0:
        return

    trj_file = HDF5TrajectoryFile(output_filename, mode='a')

    try:
        trj_file._create_earray(where='/', name='processed_filenames',
                                atom=trj_file.tables.StringAtom(1024), shape=(0,))
        trj_file.topology = top.topology
    except trj_file.tables.NodeError:
        pass

    for filename in filenames:
        # On Py3, the pytables list of filenames has type bytes (e.g. b"hey"),
        # so we need to deal with this via six.
        if six.b(filename) in trj_file._handle.root.processed_filenames:
            print("Already processed %s" % filename)
            continue
        with enter_temp_directory():
            print("Processing %s" % filename)
            archive = tarfile.open(filename, mode='r:bz2')
            try:
                archive.extract("positions.xtc")
                trj = md.load("positions.xtc", top=top)
                for frame in trj:
                    trj_file.write(coordinates=frame.xyz,
                                   cell_lengths=frame.unitcell_lengths,
                                   cell_angles=frame.unitcell_angles)
                trj_file._handle.root.processed_filenames.append([filename])
            except:
                # Something is wrong with the current trajectory file.
                # Warn and stop processing this clone here.
                warnings.warn("Problem at %s. Stopping trajectory here" % filename)
                return
    return
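# Sketch, not part of the original code: building job tuples in the order the
# function above unpacks them and fanning them out with multiprocessing.
# All paths and the run/clone counts are hypothetical placeholders.
def _example_run_core17_jobs(proj_folder="/data/PROJ0000", top_folder="/data/tops",
                             db_name="munging.db", n_runs=2, n_clones=2):
    """Concatenate every RUN/CLONE of a (hypothetical) project in parallel."""
    from multiprocessing import Pool
    job_tuples = [(proj_folder, top_folder, db_name, run, clone)
                  for run in range(n_runs) for clone in range(n_clones)]
    with Pool() as pool:
        pool.map(hdf5_concatenate_core17, job_tuples)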
def test_change_protein_data_dir():
    with enter_temp_directory():
        create_fake_series()
        yaml_file = {}
        yaml_file["base_dir"] = "./fake_series"
        protein = "fake_kinase1"
        with enter_protein_data_dir(yaml_file, protein):
            current_folder_path, current_folder_name = os.path.split(os.getcwd())
            assert current_folder_name == "fake_kinase1"
    return
def concatenate_core17(path, top, output_filename):
    """Concatenate tar bzipped XTC files created by Folding@Home Core17.

    Parameters
    ----------
    path : str
        Path to directory containing "results-*.tar.bz2".  E.g. a single CLONE directory.
    top : mdtraj.Topology
        Topology for system
    output_filename : str
        Filename of output HDF5 file to generate.

    Notes
    -----
    We use HDF5 because it provides an easy way to store the metadata
    associated with which files have already been processed.
    """
    glob_input = os.path.join(path, "results-*.tar.bz2")
    filenames = glob.glob(glob_input)
    filenames = sorted(filenames, key=keynat)

    if len(filenames) <= 0:
        return

    trj_file = HDF5TrajectoryFile(output_filename, mode='a')

    try:
        trj_file._create_earray(where='/', name='processed_filenames',
                                atom=trj_file.tables.StringAtom(1024), shape=(0,))
        trj_file.topology = top.topology
    except trj_file.tables.NodeError:
        pass

    for filename in filenames:
        # On Py3, the pytables list of filenames has type bytes (e.g. b"hey"),
        # so we need to deal with this via six.
        if six.b(filename) in trj_file._handle.root.processed_filenames:
            print("Already processed %s" % filename)
            continue
        with enter_temp_directory():
            print("Processing %s" % filename)
            archive = tarfile.open(filename, mode='r:bz2')
            archive.extract("positions.xtc")
            trj = md.load("positions.xtc", top=top)
            for frame in trj:
                trj_file.write(coordinates=frame.xyz,
                               cell_lengths=frame.unitcell_lengths,
                               cell_angles=frame.unitcell_angles)
            trj_file._handle.root.processed_filenames.append([filename])
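# Sketch, not part of the original module: concatenating a single CLONE
# directory with the function above. The paths are hypothetical placeholders;
# the loaded topology must match the system stored in positions.xtc.
def _example_concatenate_single_clone(clone_dir="/data/PROJ0000/RUN0/CLONE0",
                                      top_pdb="/data/tops/0.pdb",
                                      out_file="/data/trajectories/0_0.hdf5"):
    """Munge one CLONE directory into an HDF5 trajectory (hypothetical paths)."""
    top = md.load(top_pdb)  # mdtraj object whose .topology is written to the HDF5 file
    concatenate_core17(clone_dir, top, out_file)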
def test_pipeline():
    with enter_temp_directory():
        base_dir = os.path.abspath(os.path.curdir)
        mdl_dir = os.path.join(base_dir, "mdl_dir")
        feature_dir = "feature_dir"
        series_name = "fake_series"
        protein_list = ["kinase_1", "kinase_2"]
        project_dict = {"kinase_1": ["fake_proj1"],
                        "kinase_2": ["fake_proj2"]}
        mdl_params = {'tica__n_components': 1, 'tica__lag_time': 1,
                      'tica__kinetic_mapping': True, 'tica__shrinkage': 0.01,
                      'cluster__n_clusters': 2, 'msm__lag_time': 1,
                      'bootstrap__n_samples': 1}

        create_fake_data(base_dir, protein_list, project_dict)
        yaml_file = setup_series_analysis(base_dir, mdl_dir, feature_dir,
                                          series_name, protein_list,
                                          project_dict, mdl_params)
        fit_protein_tica(yaml_file)
        transform_protein_tica(yaml_file)
        fit_protein_kmeans(yaml_file)
        transform_protein_kmeans(yaml_file)
        fit_msms(yaml_file)
        fit_bootstrap(yaml_file)

        raw_count_obs = 0
        for p in protein_list:
            for j in glob.glob(os.path.join(base_dir, p, feature_dir, "*.jl")):
                raw_count_obs += verboseload(j).shape[0]
        tica_mdl = verboseload(os.path.join(mdl_dir, "tica_mdl.pkl"))
        # Make sure the mdl is seeing all the data; could probably have a far
        # stronger test here.
        assert tica_mdl.n_observations_ == raw_count_obs
        assert os.path.exists(os.path.join(mdl_dir, "kinase_1/tica_data.pkl"))
        assert os.path.exists(os.path.join(mdl_dir, "kinase_2/tica_data.pkl"))
        assert os.path.exists(os.path.join(mdl_dir, "kinase_1/msm_mdl.pkl"))
        assert os.path.exists(os.path.join(mdl_dir, "kinase_2/msm_mdl.pkl"))
        assert os.path.exists(os.path.join(mdl_dir, "kinase_2/bootstrap_msm_mdl.pkl"))
        assert os.path.exists(os.path.join(mdl_dir, "kmeans_mdl.pkl"))
    return
def concatenate_core17(path, top, output_filename):
    """Concatenate tar bzipped XTC files created by Folding@Home Core17.

    Parameters
    ----------
    path : str
        Path to directory containing "results-*.tar.bz2".  E.g. a single CLONE directory.
    top : mdtraj.Topology
        Topology for system
    output_filename : str
        Filename of output HDF5 file to generate.

    Notes
    -----
    We use HDF5 because it provides an easy way to store the metadata
    associated with which files have already been processed.
    """
    glob_input = os.path.join(path, "results-*.tar.bz2")
    filenames = glob.glob(glob_input)
    filenames = sorted(filenames, key=keynat)

    if len(filenames) <= 0:
        return

    trj_file = HDF5TrajectoryFile(output_filename, mode='a')

    try:
        trj_file._create_earray(where='/', name='processed_filenames',
                                atom=trj_file.tables.StringAtom(1024), shape=(0,))
        trj_file.topology = top.topology
    except trj_file.tables.NodeError:
        pass

    for filename in filenames:
        # On Py3, the pytables list of filenames has type bytes (e.g. b"hey"),
        # so we need to deal with this via six.
        if six.b(filename) in trj_file._handle.root.processed_filenames:
            print("Already processed %s" % filename)
            continue
        with enter_temp_directory():
            print("Processing %s" % filename)
            archive = tarfile.open(filename, mode='r:bz2')
            archive.extract("positions.xtc")
            trj = md.load("positions.xtc", top=top)
            for frame in trj:
                trj_file.write(coordinates=frame.xyz,
                               cell_lengths=frame.unitcell_lengths,
                               cell_angles=frame.unitcell_angles,
                               time=frame.time)
            trj_file._handle.root.processed_filenames.append([filename])
def _load_project_clone(protein, project, run, clone):
    main_dir = base_dir
    with enter_temp_directory():
        top = mdt.load(os.path.join(main_dir, protein, project,
                                    "topologies", "%d.pdb" % run))
        t = [_trj_load(f, top)
             for f in sorted(glob.glob(os.path.join(main_dir, protein, project,
                                                    "RUN%d" % run,
                                                    "CLONE%d" % clone,
                                                    "results*")),
                             key=keynat)]
        print("Length of t is :", len(t))
        print(t[0])
        trj = t[0] + t[1:]
    return trj, trj.remove_solvent()
def test_multiple_mdls():
    base_dir = os.path.join("./fake_series")
    mdl_dir = os.path.join(base_dir, "new_mdl_dir")
    feature_dir = "feature_dir"
    series_name = "fake_series"
    protein_list = ["fake_kinase1", "fake_kinase2"]
    project_dict = {"fake_kinase1": ["fake_proj1", "fake_proj2"],
                    "fake_kinase2": ["fake_proj3"]}
    mdl_params = {'tica__n_components': 4, 'tica__lag_time': 223,
                  'tica__kinetic_mapping': True, 'tica__gamma': 0.0121,
                  'cluster__n_clusters': 212}
    with enter_temp_directory():
        create_fake_series()
        for i in range(3):
            setup_series_analysis(base_dir, mdl_dir, feature_dir, series_name,
                                  protein_list, project_dict, mdl_params)
            time.sleep(1)
        assert len(glob.glob("./fake_series/*/project.yaml")) == 3
    return
def test_slicer():
    with enter_temp_directory():
        base_dir = os.path.abspath(os.path.curdir)
        mdl_dir = os.path.join(base_dir, "mdl_dir")
        feature_dir = "feature_dir"
        series_name = "fake_series"
        protein_list = ["kinase_1", "kinase_2"]
        project_dict = {"kinase_1": ["fake_proj1"],
                        "kinase_2": ["fake_proj2"]}
        mdl_params = {'tica__n_components': 1, 'tica__lag_time': 1,
                      'tica__kinetic_mapping': True, 'tica__shrinkage': 0.01,
                      'cluster__n_clusters': 2, 'msm__lag_time': 1,
                      'bootstrap__n_samples': 1}

        create_fake_data(base_dir, protein_list, project_dict)
        yaml_file = setup_series_analysis(base_dir, mdl_dir, feature_dir,
                                          series_name, protein_list,
                                          project_dict, mdl_params)

        dict_feat_ind = {}
        dict_feat_ind["kinase_1"] = [0, 2]
        dict_feat_ind["kinase_2"] = [1, 1, 0, 2]

        series_feature_slicer(yaml_file, dict_feat_ind)

        for protein in protein_list:
            with enter_protein_data_dir(yaml_file, protein):
                assert os.path.isdir("sliced_feature_dir")
                flist = glob.glob("./%s/*.jl" % feature_dir)
                for fname in flist:
                    original_file = verboseload(fname)
                    expected_file = original_file[:, dict_feat_ind[protein]]
                    written_file = verboseload("./%s/%s" % ("sliced_feature_dir",
                                                            os.path.basename(fname)))
                    assert (expected_file == written_file).all()
    return
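# Sketch, not part of the original tests: the column-selection semantics that
# series_feature_slicer is checked against above. dict_feat_ind maps each
# protein to the feature columns to keep; repeats and reordering are allowed.
def _example_feature_slice():
    """Demonstrate the expected slicing with plain numpy fancy indexing."""
    import numpy as np
    features = np.arange(12).reshape(4, 3)  # 4 frames x 3 features
    keep = [0, 2]                           # analogous to dict_feat_ind["kinase_1"]
    sliced = features[:, keep]              # fancy indexing keeps those columns
    assert sliced.shape == (4, 2)
    return sliced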
def test_plotting_utils():
    with enter_temp_directory():
        base_dir = os.path.abspath(os.path.curdir)
        mdl_dir = os.path.join(base_dir, "mdl_dir")
        feature_dir = "feature_dir"
        series_name = "fake_series"
        protein_list = ["kinase_1", "kinase_2"]
        project_dict = {"kinase_1": ["fake_proj1"],
                        "kinase_2": ["fake_proj2"]}
        mdl_params = {'tica__n_components': 1, 'tica__lag_time': 1,
                      'tica__kinetic_mapping': True, 'tica__shrinkage': 0.01,
                      'cluster__n_clusters': 2, 'msm__lag_time': 1,
                      'bootstrap__n_samples': 1}

        create_fake_data(base_dir, protein_list, project_dict)
        setup_series_analysis(base_dir, mdl_dir, feature_dir, series_name,
                              protein_list, project_dict, mdl_params)
        fit_pipeline(base_dir)

        prj = ProteinSeries(os.path.join(mdl_dir, "project.yaml"))
        prt1 = Protein(prj, "kinase_1")
        prt2 = Protein(prj, "kinase_2")
        prt1._mlpt_fct = 0.0
        prt2._mlpt_fct = 0.0
        n_bins = 100
        lin_spaced_tic_dict = global_tic_boundaries([prt1, prt2],
                                                    range(prt1.n_tics_), n_bins)

        def test_bounds():
            locally_calc = {}
            for i in range(prt1.n_tics_):
                locally_calc[i] = []
                global_min = min(min([min(i) for i in prt1.tica_data.values()]),
                                 min([min(i) for i in prt2.tica_data.values()]))
                locally_calc[i].append(global_min)
                global_max = max(max([max(i) for i in prt1.tica_data.values()]),
                                 max([max(i) for i in prt2.tica_data.values()]))
                locally_calc[i].append(global_max)

            for i in range(prt1.n_tics_):
                assert lin_spaced_tic_dict[i][0] == locally_calc[i][0]
                assert lin_spaced_tic_dict[i][-1] == locally_calc[i][-1]
                assert len(lin_spaced_tic_dict[i]) == n_bins
            return True

        def test_histogram_data():
            H_dict, H_calc, _ = tica_histogram(prj, prt1, [0],
                                               x_array=lin_spaced_tic_dict[0],
                                               n_bins=None)
            assert len(H_dict.keys()) == prt1.n_states_
            assert len(H_calc) == len(lin_spaced_tic_dict[0]) - 1
            rnd_state = np.random.randint(0, prt1.n_states_)
            assert np.allclose(H_dict[rnd_state],
                               np.histogram(prt1.tic_dict[0][rnd_state],
                                            bins=lin_spaced_tic_dict[0],
                                            normed=True)[0])
            return True

        def test_one_dim_free_energy():
            df = one_dim_tic_free_energy(prj, prt1, 0, n_bins=None,
                                         lin_spaced_tic=lin_spaced_tic_dict[0],
                                         errorbars=False)
            assert (df.protein_name == prt1.name).all()
            assert (df.mdl_index == "mle").all()
            return True

        assert test_bounds()
        assert test_histogram_data()
        assert test_one_dim_free_energy()
    return
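# Sketch, not part of the original tests: the standard way a one-dimensional
# free-energy profile is derived from a tIC histogram, F = -ln(P) in units of
# kT. one_dim_tic_free_energy may differ in normalization and state weighting;
# this is only the underlying formula.
def _example_free_energy_from_histogram():
    """Estimate F/kT = -ln(P) along a fake 1-D tIC coordinate."""
    import numpy as np
    tic_values = np.random.randn(1000)                      # fake 1-D tIC data
    prob, edges = np.histogram(tic_values, bins=50, density=True)
    free_energy = -np.log(prob, where=prob > 0,
                          out=np.full_like(prob, np.inf))   # inf for empty bins
    return free_energy, edges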
def concatenate_core17(path, top_filename, output_filename, maxtime=None, maxpackets=None):
    """Concatenate tar bzipped XTC files created by Folding@Home Core17.

    This version accepts only filenames and paths.

    Parameters
    ----------
    path : str
        Path to directory containing "results-*.tar.bz2".  E.g. a single CLONE directory.
    top_filename : str
        Filepath to read Topology for system
    output_filename : str
        Filename of output HDF5 file to generate.
    maxpackets : int, optional, default=None
        If specified, will stop processing after `maxpackets` results packets
        have been processed
    maxtime : int, optional, default=None
        If specified, will stop processing after `maxtime` seconds have passed.

    Notes
    -----
    We use HDF5 because it provides an easy way to store the metadata
    associated with which files have already been processed.
    """
    # Open topology file.
    top = md.load(top_filename % vars())

    # Glob file paths and return result files in sequential order.
    glob_input = os.path.join(path, "results-*.tar.bz2")
    filenames = glob.glob(glob_input)
    filenames = natsorted(filenames)
    print("Concatenating XTC files from '%s' into '%s' [%d results packets found]"
          % (path, output_filename, len(filenames)))

    # If no result files are present, return.
    if len(filenames) <= 0:
        del top
        return

    # Check integrity of trajectory if it exists.
    delete_trajectory_if_broken(output_filename)

    # Open trajectory for appending.
    trj_file = HDF5TrajectoryFile(output_filename, mode='a')

    MAX_FILEPATH_LENGTH = 1024  # Is this large enough?
    try:
        # TODO: Store MD5 hashes instead of filenames?
        trj_file._create_earray(where='/', name='processed_filenames',
                                atom=trj_file.tables.StringAtom(MAX_FILEPATH_LENGTH),
                                shape=(0,))
        trj_file.topology = top.topology
    except trj_file.tables.NodeError:
        # Object already exists; skip ahead.
        pass

    result_packets_processed = 0
    initial_time = time.time()
    try:
        for filename in filenames:
            # Check that we haven't violated our filename length assumption.
            if len(filename) > MAX_FILEPATH_LENGTH:
                msg = ("Filename is longer than hard-coded MAX_FILEPATH_LENGTH limit "
                       "(%d > %d). Increase MAX_FILEPATH_LENGTH and rebuild."
                       % (len(filename), MAX_FILEPATH_LENGTH))
                print(msg)
                raise Exception(msg)

            # Check if we have already processed this file.
            # On Py3, the pytables list of filenames has type bytes (e.g. b"hey"),
            # so we need to deal with this via six.
            if six.b(filename) in trj_file._handle.root.processed_filenames:
                print("Already processed %s" % filename)
                continue

            # Extract frames from trajectory in a temporary directory.
            absfilename = os.path.abspath(filename)
            with enter_temp_directory():
                # Extract frames
                archive = tarfile.open(absfilename, mode='r:bz2')
                archive.extract("positions.xtc")
                trj = md.load("positions.xtc", top=top)
                print(" appending %d frames from '%s' to '%s'"
                      % (trj.n_frames, filename, output_filename))
                for frame in trj:
                    trj_file.write(coordinates=frame.xyz,
                                   cell_lengths=frame.unitcell_lengths,
                                   cell_angles=frame.unitcell_angles,
                                   time=frame.time)
                os.unlink("positions.xtc")
                del archive, trj

            # Append list of processed files
            trj_file._handle.root.processed_filenames.append([filename])

            # Flush data
            trj_file.flush()

            # Track statistics on processed packets
            elapsed_time = time.time() - initial_time
            result_packets_processed += 1

            # Return if we have processed the requested number of results packets.
            if maxpackets and (result_packets_processed >= maxpackets):
                break
    except RuntimeError:
        print("Cannot munge %s due to damaged XTC %s or mismatch with topology file."
              % (path, filename))

    # Clean up.
    trj_file.close()
    del top, trj_file
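# Sketch, not part of the original module: incremental munging with the
# filename-based variant above. Paths are hypothetical placeholders; maxpackets
# caps how many results-*.tar.bz2 packets are appended per call, and because
# processed filenames are recorded in the HDF5 file, repeated calls resume
# where the previous one stopped.
def _example_incremental_concatenate(clone_dir="/data/PROJ0000/RUN0/CLONE0",
                                     top_pdb="/data/tops/system.pdb",
                                     out_file="/data/trajectories/0_0.hdf5"):
    """Append at most 10 result packets per invocation (hypothetical paths)."""
    concatenate_core17(clone_dir, top_pdb, out_file, maxpackets=10)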
def ensure_result_packet_is_decompressed(result_packet, topology, atom_indices=None,
                                         chunksize=10, delete_on_unpack=False,
                                         compress_xml=False):
    """Ensure that the specified result packet is decompressed.

    If this is a ws7/ws8 compressed result packet, safely convert it to
    uncompressed:
    * decompress it into a temporary directory
    * move it into place
    * verify integrity of files
    * unlink (delete) the old result packet if everything looks OK [OPTIONAL]

    If this is a directory, this function returns immediately.

    .. warning:: This will irreversibly delete the compressed work packet,
       replacing it with an uncompressed one.

    Parameters
    ----------
    result_packet : str
        Path to original result packet
    topology : mdtraj.Topology
        Topology to use for verifying integrity of trajectory
    atom_indices : list of int, optional, default=None
        Atom indices to read when verifying integrity of trajectory.
        If None, all atoms will be read.
    delete_on_unpack : bool, optional, default=False
        If True, will delete old ws8-style .tar.bz2 files after they have been
        unpacked. WARNING: THIS COULD BE DANGEROUS
    compress_xml : bool, optional, default=False
        If True, will compress XML files after unpacking them.
    chunksize : int, optional, default=10
        Number of frames to read each call to mdtraj.iterload for verifying
        trajectory integrity

    Returns
    -------
    result_packet : str
        Path to new result packet directory
    """
    # Return if this is just a directory.
    if os.path.isdir(result_packet):
        return result_packet

    # If this is a tarball, extract salient information.
    # Format: results-002.tar.bz2
    absfilename = os.path.abspath(result_packet)
    (basepath, filename) = os.path.split(absfilename)
    pattern = r'results-(\d+).tar.bz2'
    if not re.match(pattern, filename):
        raise Exception("Compressed results packet filename '%s' does not match "
                        "expected format (results-001.tar.bz2)" % result_packet)
    frame_number = int(re.match(pattern, filename).group(1))

    # Extract frames from trajectory in a temporary directory.
    print(" Extracting %s" % result_packet)
    with enter_temp_directory():
        # Create target directory
        extracted_archive_directory = tempfile.mkdtemp()

        # Extract all contents
        archive = tarfile.open(absfilename, mode='r:bz2')
        archive.extractall(path=extracted_archive_directory)

        # Compress XML files
        if compress_xml:
            xml_filenames = glob.glob('%s/*.xml' % extracted_archive_directory)
            for filename in xml_filenames:
                print(" Compressing %s" % os.path.basename(filename))
                subprocess.call(['gzip', filename])

        # Create new result packet name
        new_result_packet = os.path.join(basepath, 'results%d' % frame_number)

        # Move directory into place
        shutil.move(extracted_archive_directory, new_result_packet)

        # Verify integrity of archive contents
        xtc_filename = os.path.join(new_result_packet, 'positions.xtc')
        if not os.path.exists(xtc_filename):
            raise Exception("Result packet archive '%s' does not contain "
                            "positions.xtc; aborting unpacking." % result_packet)
        try:
            for chunk in md.iterload(xtc_filename, top=topology,
                                     atom_indices=atom_indices, chunk=chunksize):
                pass
        except Exception as e:
            msg = ("Result packet archive '%s' failed trajectory integrity check; "
                   "aborting unpacking.\n" % result_packet)
            msg += str(e)
            raise Exception(msg)

        # Clean up archive object.
        del archive

    if delete_on_unpack:
        # Remove archive permanently.
        print(" Permanently removing %s" % absfilename)
        os.unlink(absfilename)

    # Return updated result packet directory name.
    return new_result_packet
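# Sketch, not part of the original module: decompress every ws7/ws8 result
# packet in one CLONE directory, verifying each against the topology. The
# paths are hypothetical placeholders; packets that are already directories
# are returned unchanged by the function above.
def _example_decompress_clone_packets(clone_dir="/data/PROJ0000/RUN0/CLONE0",
                                      top_pdb="/data/tops/system.pdb"):
    """Unpack all compressed result packets in a (hypothetical) CLONE directory."""
    topology = md.load(top_pdb).topology
    packets = natsorted(glob.glob(os.path.join(clone_dir, "results-*.tar.bz2")))
    return [ensure_result_packet_is_decompressed(p, topology) for p in packets]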
def hdf5_concatenate(job_tuple):
    """Concatenate tar bzipped or non-bzipped XTC files created by Folding@Home.

    Parameters
    ----------
    job_tuple : tuple
        (proj, protein_folder, proj_folder, top_folder, run, clone,
        protein_only), where `proj_folder` contains "RUN*/CLONE*/results*"
        packets, `top_folder` contains one "<run>.pdb" topology per run, and
        `protein_only` skips writing the fully solvated trajectory.

    Notes
    -----
    We use HDF5 because it provides an easy way to store the metadata
    associated with which files have already been processed.
    """
    proj, protein_folder, proj_folder, top_folder, run, clone, protein_only = job_tuple
    path = os.path.join(proj_folder, "RUN%d/CLONE%d/" % (run, clone))
    top = md.load(os.path.join(top_folder, "%d.pdb" % run))
    str_top = top.remove_solvent()

    glob_input = os.path.join(path, "results*")
    filenames = sorted(glob.glob(glob_input), key=keynat)
    if len(filenames) <= 0:
        return

    # Output path for the solvent-stripped trajectory.
    strip_prot_out_filename = os.path.join(protein_folder,
                                           "protein_traj/%s_%d_%d.hdf5" % (proj, run, clone))
    str_trj_file = HDF5TrajectoryFile(strip_prot_out_filename, mode='a')
    str_trj_file_wrapper = HDF5TrajectoryFileWrapper(str_trj_file)
    str_trj_file_wrapper.setup(str_top.topology)

    if not protein_only:
        # Output path for the full trajectory.
        output_filename = os.path.join(protein_folder,
                                       "trajectories/%s_%d_%d.hdf5" % (proj, run, clone))
        trj_file = HDF5TrajectoryFile(output_filename, mode='a')
        trj_file_wrapper = HDF5TrajectoryFileWrapper(trj_file)
        trj_file_wrapper.setup(top.topology)

    for index, filename in enumerate(filenames):
        # If we find it in both files, no problem: continue to the next filename.
        if (protein_only or trj_file_wrapper.check_filename(filename)) and \
                str_trj_file_wrapper.check_filename(filename):
            print("Already processed %s" % filename)
            continue
        with enter_temp_directory():
            print("Processing %s" % filename)
            # Try loading the file.
            try:
                trj = _traj_loader(filename, top)
            # If that fails, give up on this clone entirely and move on.
            except:
                print("Failed at %s " % filename)
                break
            # If loading is successful, try adding it.
            if (not protein_only) and (not trj_file_wrapper.check_filename(filename)):
                if trj_file_wrapper.validate_filename(index, filename, filenames):
                    trj_file_wrapper.write_file(filename, trj)
            # Now the stripped file.
            if not str_trj_file_wrapper.check_filename(filename):
                if str_trj_file_wrapper.validate_filename(index, filename, filenames):
                    trj = trj.remove_solvent()
                    str_trj_file_wrapper.write_file(filename, trj)
    return