Example #1
def load_setup_files(proj_folder, traj_fname):
    # 0_0.hd5 is run 0 clone 0
    run_index = int(traj_fname.split("_")[0])

    print(run_index)
    glob_input = proj_folder + "/RUN%d/CLONE0/payload-*.tar.bz2" % run_index
    print(glob_input)
    payload_files = glob.glob(glob_input)
    if not payload_files:
        raise IOError("Error: payload files not found")
    payload_file = payload_files[0]

    print(payload_file)

    print(os.path.abspath(payload_file))
    with enter_temp_directory():
        archive = tarfile.open(payload_file, mode='r:bz2')
        archive.extract("system.xml")
        archive.extract("integrator.xml")
        archive.extract("state.xml")

        with open("state.xml") as state_input:
            state = XmlSerializer.deserialize(state_input.read())
        with open("system.xml") as system_input:
            system = XmlSerializer.deserialize(system_input.read())
        with open("integrator.xml") as integrator_input:
            integrator = XmlSerializer.deserialize(integrator_input.read())

    return state, system, integrator
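
A minimal usage sketch for the example above (the project path and trajectory name are hypothetical; it assumes the XML files were written by OpenMM's XmlSerializer, so the returned objects are a State, a System and an Integrator):

from simtk import openmm  # assumes the simtk.openmm namespace used by these examples

state, system, integrator = load_setup_files("/data/PROJ9104", "0_0.hd5")
context = openmm.Context(system, integrator)
context.setState(state)  # restore positions, velocities and box vectors
print(context.getState(getEnergy=True).getPotentialEnergy())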
Example #2
def test_project():
    with enter_temp_directory():
        base_dir = os.path.abspath(os.path.curdir)
        print(base_dir)
        print(type(base_dir))
        mdl_dir = os.path.join(base_dir,"mdl_dir")
        feature_dir = "feature_dir"
        series_name = "fake_series"
        protein_list = ["kinase_1", "kinase_2"]
        project_dict = {"kinase_1": ["fake_proj1",],
                        "kinase_2": ["fake_proj2"]}
        mdl_params = {'tica__n_components': 1, 'tica__lag_time': 1,
                      'tica__weighted_transform': True, 'tica__shrinkage': 0.01,
                      'cluster__n_clusters': 2, 'msm__lag_time': 1,
                      'bootstrap__n_samples': 1}

        create_fake_data(base_dir, protein_list, project_dict)

        setup_series_analysis(base_dir, mdl_dir, feature_dir,
                                  series_name, protein_list,
                                  project_dict, mdl_params)
        fit_pipeline(base_dir)

        prj = ProteinSeries(os.path.join(mdl_dir,"project.yaml"))

        assert isinstance(prj, ProteinSeries)
        assert isinstance(prj.tica_mdl, tICA)

        assert _test_protein_without_project()
        assert _test_protein_with_project(prj)
        assert _test_tic_dict(prj)

        assert _test_obs_mapping(prj)
    return
def test_setup_series_analysis():

    base_dir = os.path.join("./fake_series")
    mdl_dir = os.path.join(base_dir,"new_mdl_dir")
    feature_dir = "feature_dir"
    series_name = "fake_series"

    protein_list = ["fake_kinase1", "fake_kinase2"]
    project_dict = {"fake_kinase1": ["fake_proj1", "fake_proj2"],
                    "fake_kinase2": ["fake_proj3"]}
    mdl_params = {'tica__n_components': 1, 'tica__lag_time': 2,
                  'tica__kinetic_mapping': True, 'tica__shrinkage': 0.01,
                  'cluster__n_clusters': 174}

    with enter_temp_directory():
        create_fake_series()
        setup_series_analysis(base_dir, mdl_dir, feature_dir,
                              series_name, protein_list,
                              project_dict, mdl_params)

        assert os.path.isdir(mdl_dir)
        for protein in protein_list:
            assert os.path.isdir(os.path.join(mdl_dir, protein))

        assert(os.path.isfile(os.path.join(base_dir,"series.yaml")))
        with open(os.path.join(mdl_dir, "project.yaml"), 'r') as fin:
            yaml_file = yaml.load(fin)

        assert yaml_file["base_dir"] == base_dir
        assert yaml_file["series_name"] == series_name
        assert yaml_file["protein_list"] == protein_list
        assert yaml_file["project_dict"] == project_dict
        assert yaml_file["mdl_params"] == mdl_params

    return
Example #5
def hdf5_concatenate_core17(job_tuple):
    """Concatenate tar bzipped XTC files created by Folding@Home Core17.

    Parameters
    ----------
    path : str
        Path to directory containing "results-*.tar.bz2".  E.g. a single CLONE directory.
    top : mdtraj.Topology
        Topology for system
    output_filename : str
        Filename of output HDF5 file to generate.

    Notes
    -----
    We use HDF5 because it provides an easy way to store the metadata associated
    with which files have already been processed.
    """

    proj_folder, top_folder, db_name, run, clone = job_tuple
    path = os.path.join(proj_folder, "RUN%d/CLONE%d/" % (run, clone))
    top = md.load(os.path.join(top_folder, "%d.pdb" % run))
    output_filename = os.path.join(proj_folder, "trajectories/%d_%d.hdf5" % (run, clone))

    glob_input = os.path.join(path, "results-*.tar.bz2")
    filenames = glob.glob(glob_input)
    filenames = sorted(filenames, key=keynat)

    if len(filenames) <= 0:
        return

    trj_file = HDF5TrajectoryFile(output_filename, mode='a')

    try:
        trj_file._create_earray(where='/', name='processed_filenames',
                                atom=trj_file.tables.StringAtom(1024), shape=(0,))
        trj_file.topology = top.topology
    except trj_file.tables.NodeError:
        pass

    for filename in filenames:
        if six.b(filename) in trj_file._handle.root.processed_filenames:  # On Py3, the pytables list of filenames has type byte (e.g. b"hey"), so we need to deal with this via six.
            print("Already processed %s" % filename)
            continue
        with enter_temp_directory():
            print("Processing %s" % filename)
            archive = tarfile.open(filename, mode='r:bz2')
            try:
                archive.extract("positions.xtc")
                trj = md.load("positions.xtc", top=top)
                for frame in trj:
                    trj_file.write(coordinates=frame.xyz,
                                   cell_lengths=frame.unitcell_lengths,
                                   cell_angles=frame.unitcell_angles)
                # record the packet only after all of its frames have been written
                trj_file._handle.root.processed_filenames.append([filename])
            except Exception:
                # something is wrong with the current trajectory file; warn and return immediately
                warnings.warn("Problem at %s. Stopping trajectory here" % filename)
                return
    return
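
A usage sketch for the example above (all paths and the run/clone ranges are hypothetical; it assumes one "<run>.pdb" topology per RUN in top_folder and that hdf5_concatenate_core17 is importable at module level):

import itertools
from multiprocessing import Pool

proj_folder = "/data/PROJ9104"
top_folder = "/data/PROJ9104/topologies"
db_name = "PROJ9104"

# one job per (run, clone) pair
job_tuples = [(proj_folder, top_folder, db_name, run, clone)
              for run, clone in itertools.product(range(2), range(10))]

pool = Pool(processes=4)
pool.map(hdf5_concatenate_core17, job_tuples)
pool.close()
pool.join()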
Example #6
def test_change_protein_data_dir():
    with enter_temp_directory():
        create_fake_series()
        yaml_file = {}
        yaml_file["base_dir"] = "./fake_series"
        protein = "fake_kinase1"
        with enter_protein_data_dir(yaml_file, protein):
            current_folder_path, current_folder_name = os.path.split(os.getcwd())
            assert current_folder_name == "fake_kinase1"
    return
Example #8
def concatenate_core17(path, top, output_filename):
    """Concatenate tar bzipped XTC files created by Folding@Home Core17.
    
    Parameters
    ----------
    path : str
        Path to directory containing "results-*.tar.bz2".  E.g. a single CLONE directory.
    top : mdtraj.Topology
        Topology for system
    output_filename : str
        Filename of output HDF5 file to generate.
    
    Notes
    -----
    We use HDF5 because it provides an easy way to store the metadata associated
    with which files have already been processed.
    """
    glob_input = os.path.join(path, "results-*.tar.bz2")
    filenames = glob.glob(glob_input)
    filenames = sorted(filenames, key=keynat)

    if len(filenames) <= 0:
        return

    trj_file = HDF5TrajectoryFile(output_filename, mode='a')

    try:
        trj_file._create_earray(where='/',
                                name='processed_filenames',
                                atom=trj_file.tables.StringAtom(1024),
                                shape=(0, ))
        trj_file.topology = top.topology
    except trj_file.tables.NodeError:
        pass

    for filename in filenames:
        # On Py3, the pytables list of filenames has type bytes (e.g. b"hey"),
        # so we need to deal with this via six.
        if six.b(filename) in trj_file._handle.root.processed_filenames:
            print("Already processed %s" % filename)
            continue
        with enter_temp_directory():
            print("Processing %s" % filename)
            archive = tarfile.open(filename, mode='r:bz2')
            archive.extract("positions.xtc")
            trj = md.load("positions.xtc", top=top)

            for frame in trj:
                trj_file.write(coordinates=frame.xyz,
                               cell_lengths=frame.unitcell_lengths,
                               cell_angles=frame.unitcell_angles)

            trj_file._handle.root.processed_filenames.append([filename])
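
A usage sketch for the example above (paths are hypothetical; it assumes mdtraj is imported as md and that the CLONE directory contains Core17 "results-*.tar.bz2" packets):

import mdtraj as md

top = md.load("/data/PROJ9104/topologies/0.pdb")
concatenate_core17(path="/data/PROJ9104/RUN0/CLONE0",
                   top=top,
                   output_filename="/data/PROJ9104/trajectories/0_0.hdf5")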
Example #9
def test_pipeline():
    with enter_temp_directory():
        base_dir = os.path.abspath(os.path.curdir)
        mdl_dir = os.path.join(base_dir, "mdl_dir")
        feature_dir = "feature_dir"
        series_name = "fake_series"
        protein_list = ["kinase_1", "kinase_2"]
        project_dict = {
            "kinase_1": [
                "fake_proj1",
            ],
            "kinase_2": ["fake_proj2"]
        }
        mdl_params = {
            'tica__n_components': 1,
            'tica__lag_time': 1,
            'tica__kinetic_mapping': True,
            'tica__shrinkage': 0.01,
            'cluster__n_clusters': 2,
            'msm__lag_time': 1,
            'bootstrap__n_samples': 1
        }

        create_fake_data(base_dir, protein_list, project_dict)

        yaml_file = setup_series_analysis(base_dir, mdl_dir, feature_dir,
                                          series_name, protein_list,
                                          project_dict, mdl_params)
        fit_protein_tica(yaml_file)
        transform_protein_tica(yaml_file)
        fit_protein_kmeans(yaml_file)
        transform_protein_kmeans(yaml_file)
        fit_msms(yaml_file)
        fit_bootstrap(yaml_file)

        raw_count_obs = 0
        for p in protein_list:
            for j in glob.glob(os.path.join(base_dir, p, feature_dir, "*.jl")):
                raw_count_obs += verboseload(j).shape[0]
        tica_mdl = verboseload(os.path.join(mdl_dir, "tica_mdl.pkl"))
        #make sure the mdl is seeing all the data, could probably have a far stronger test here
        assert tica_mdl.n_observations_ == raw_count_obs
        assert os.path.exists(os.path.join(mdl_dir, "kinase_1/tica_data.pkl"))
        assert os.path.exists(os.path.join(mdl_dir, "kinase_2/tica_data.pkl"))
        assert os.path.exists(os.path.join(mdl_dir, "kinase_1/msm_mdl.pkl"))
        assert os.path.exists(os.path.join(mdl_dir, "kinase_2/msm_mdl.pkl"))
        assert os.path.exists(
            os.path.join(mdl_dir, "kinase_2/bootstrap_msm_mdl.pkl"))
        assert os.path.exists(os.path.join(mdl_dir, "kmeans_mdl.pkl"))

        return
Example #10
def concatenate_core17(path, top, output_filename):
    """Concatenate tar bzipped XTC files created by Folding@Home Core17.
    
    Parameters
    ----------
    path : str
        Path to directory containing "results-*.tar.bz2".  E.g. a single CLONE directory.
    top : mdtraj.Topology
        Topology for system
    output_filename : str
        Filename of output HDF5 file to generate.
    
    Notes
    -----
    We use HDF5 because it provides an easy way to store the metadata associated
    with which files have already been processed.
    """
    glob_input = os.path.join(path, "results-*.tar.bz2")
    filenames = glob.glob(glob_input)
    filenames = sorted(filenames, key=keynat)
    
    if len(filenames) <= 0:
        return
    
    trj_file = HDF5TrajectoryFile(output_filename, mode='a')
    
    try:
        trj_file._create_earray(where='/', name='processed_filenames',
                                atom=trj_file.tables.StringAtom(1024), shape=(0,))
        trj_file.topology = top.topology
    except trj_file.tables.NodeError:
        pass

    for filename in filenames:
        if six.b(filename) in trj_file._handle.root.processed_filenames:  # On Py3, the pytables list of filenames has type byte (e.g. b"hey"), so we need to deal with this via six.
            print("Already processed %s" % filename)
            continue
        with enter_temp_directory():
            print("Processing %s" % filename)
            archive = tarfile.open(filename, mode='r:bz2')
            archive.extract("positions.xtc")
            trj = md.load("positions.xtc", top=top)

            for frame in trj:
                trj_file.write(coordinates=frame.xyz, cell_lengths=frame.unitcell_lengths, cell_angles=frame.unitcell_angles, time=frame.time)
            
            trj_file._handle.root.processed_filenames.append([filename])
Example #11
def _load_project_clone(protein, project, run, clone):
    main_dir = base_dir
    with enter_temp_directory():
        top = mdt.load(
            os.path.join(main_dir, protein, project, "topologies",
                         "%d.pdb" % run))
        t = [
            _trj_load(f, top) for f in sorted(glob.glob(
                os.path.join(main_dir, protein, project, "RUN%d" %
                             run, "CLONE%d" % clone, "results*")),
                                              key=keynat)
        ]
        print("Length of t is :", len(t))
        print(t[0])
        trj = t[0] + t[1:]

    return trj, trj.remove_solvent()
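
A usage sketch for the example above (the protein/project names and run/clone indices are hypothetical; the function relies on a module-level base_dir plus the _trj_load and keynat helpers from its own module):

full_trj, protein_trj = _load_project_clone("kinase_1", "fake_proj1", run=0, clone=0)
print(full_trj.n_frames, protein_trj.n_atoms)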
Example #12
def _load_project_clone(protein, project, run, clone):
    main_dir = base_dir
    with enter_temp_directory():
        top = mdt.load(os.path.join(main_dir, protein,
                                    project, "topologies",
                                    "%d.pdb"%run))
        t = [_trj_load(f,top) for f in
             sorted(glob.glob(os.path.join(main_dir,
                                           protein,
                                           project,
                                           "RUN%d"%run,"CLONE%d"%clone,
                                           "results*")),
                        key=keynat)]
        print("Length of t is :", len(t))
        print(t[0])
        trj = t[0] + t[1:]

    return trj, trj.remove_solvent()
Example #13
def test_pipeline():
    with enter_temp_directory():
        base_dir = os.path.abspath(os.path.curdir)
        mdl_dir = os.path.join(base_dir,"mdl_dir")
        feature_dir = "feature_dir"
        series_name = "fake_series"
        protein_list = ["kinase_1", "kinase_2"]
        project_dict = {"kinase_1": ["fake_proj1",],
                        "kinase_2": ["fake_proj2"]}
        mdl_params = {'tica__n_components': 1, 'tica__lag_time': 1,
                      'tica__kinetic_mapping': True, 'tica__shrinkage': 0.01,
                      'cluster__n_clusters': 2,
                      'msm__lag_time': 1, 'bootstrap__n_samples': 1}

        create_fake_data(base_dir, protein_list, project_dict)

        yaml_file = setup_series_analysis(base_dir, mdl_dir, feature_dir,
                                          series_name, protein_list,
                                          project_dict, mdl_params)
        fit_protein_tica(yaml_file)
        transform_protein_tica(yaml_file)
        fit_protein_kmeans(yaml_file)
        transform_protein_kmeans(yaml_file)
        fit_msms(yaml_file)
        fit_bootstrap(yaml_file)

        raw_count_obs = 0
        for p in protein_list:
            for j in glob.glob(os.path.join(base_dir,p,feature_dir,"*.jl")):
                raw_count_obs += verboseload(j).shape[0]
        tica_mdl = verboseload(os.path.join(mdl_dir,"tica_mdl.pkl"))
        #make sure the mdl is seeing all the data, could probably have a far stronger test here
        assert tica_mdl.n_observations_ == raw_count_obs
        assert os.path.exists(os.path.join(mdl_dir,"kinase_1/tica_data.pkl"))
        assert os.path.exists(os.path.join(mdl_dir,"kinase_2/tica_data.pkl"))
        assert os.path.exists(os.path.join(mdl_dir,"kinase_1/msm_mdl.pkl"))
        assert os.path.exists(os.path.join(mdl_dir,"kinase_2/msm_mdl.pkl"))
        assert os.path.exists(os.path.join(mdl_dir,"kinase_2/bootstrap_msm_mdl.pkl"))
        assert os.path.exists(os.path.join(mdl_dir,"kmeans_mdl.pkl"))

        return
Example #14
def test_multiple_mdls():
    base_dir = os.path.join("./fake_series")
    mdl_dir = os.path.join(base_dir,"new_mdl_dir")
    feature_dir = "feature_dir"
    series_name = "fake_series"
    protein_list = ["fake_kinase1", "fake_kinase2"]
    project_dict = {"fake_kinase1": ["fake_proj1", "fake_proj2"],
                    "fake_kinase2": ["fake_proj3"]}
    mdl_params = {'tica__n_components': 4, 'tica__lag_time': 223,
                  'tica__kinetic_mapping': True, 'tica__gamma': 0.0121,
                  'cluster__n_clusters': 212}

    with enter_temp_directory():
        create_fake_series()
        for i in range(3):
            setup_series_analysis(base_dir, mdl_dir, feature_dir,
                                  series_name, protein_list,
                                  project_dict, mdl_params)
            time.sleep(1)
        assert len(glob.glob("./fake_series/*/project.yaml")) == 3
    return
Example #15
def test_slicer():
    with enter_temp_directory():
        base_dir = os.path.abspath(os.path.curdir)
        mdl_dir = os.path.join(base_dir,"mdl_dir")
        feature_dir = "feature_dir"
        series_name = "fake_series"
        protein_list = ["kinase_1", "kinase_2"]
        project_dict = {"kinase_1": ["fake_proj1",],
                        "kinase_2": ["fake_proj2"]}
        mdl_params = {'tica__n_components': 1, 'tica__lag_time': 1,
                      'tica__kinetic_mapping': True, 'tica__shrinkage': 0.01,
                      'cluster__n_clusters': 2,
                      'msm__lag_time': 1, 'bootstrap__n_samples': 1}

        create_fake_data(base_dir, protein_list, project_dict)

        yaml_file = setup_series_analysis(base_dir, mdl_dir, feature_dir,
                                          series_name, protein_list,
                                          project_dict, mdl_params)

        dict_feat_ind = {}
        dict_feat_ind["kinase_1"] = [0, 2]
        dict_feat_ind["kinase_2"] = [1, 1, 0, 2]

        series_feature_slicer(yaml_file, dict_feat_ind)


        for protein in protein_list:
            with enter_protein_data_dir(yaml_file, protein):
                assert (os.path.isdir("sliced_feature_dir"))
                flist = glob.glob("./%s/*.jl"%feature_dir)
                for fname in flist:
                    original_file = verboseload(fname)
                    expected_file = original_file[:, dict_feat_ind[protein]]
                    written_file = verboseload("./%s/%s" % ("sliced_feature_dir",
                                                            os.path.basename(fname)))
                    assert (expected_file == written_file).all()
    return
Example #16
def test_plotting_utils():
    with enter_temp_directory():
        base_dir = os.path.abspath(os.path.curdir)
        mdl_dir = os.path.join(base_dir, "mdl_dir")
        feature_dir = "feature_dir"
        series_name = "fake_series"
        protein_list = ["kinase_1", "kinase_2"]
        project_dict = {
            "kinase_1": [
                "fake_proj1",
            ],
            "kinase_2": ["fake_proj2"]
        }
        mdl_params = {
            'tica__n_components': 1,
            'tica__lag_time': 1,
            'tica__kinetic_mapping': True,
            'tica__shrinkage': 0.01,
            'cluster__n_clusters': 2,
            'msm__lag_time': 1,
            'bootstrap__n_samples': 1
        }

        create_fake_data(base_dir, protein_list, project_dict)
        setup_series_analysis(base_dir, mdl_dir, feature_dir, series_name,
                              protein_list, project_dict, mdl_params)

        fit_pipeline(base_dir)
        prj = ProteinSeries(os.path.join(mdl_dir, "project.yaml"))

        prt1 = Protein(prj, "kinase_1")
        prt2 = Protein(prj, "kinase_2")

        prt1._mlpt_fct = 0.0
        prt2._mlpt_fct = 0.0
        n_bins = 100

        lin_spaced_tic_dict = global_tic_boundaries([prt1, prt2],
                                                    range(prt1.n_tics_),
                                                    n_bins)

        def test_bounds():
            locally_calc = {}
            for i in range(prt1.n_tics_):
                locally_calc[i] = []
                global_min = min(
                    min([min(i) for i in prt1.tica_data.values()]),
                    min([min(i) for i in prt2.tica_data.values()]))

                locally_calc[i].append(global_min)

                global_max = max(
                    max([max(i) for i in prt1.tica_data.values()]),
                    max([max(i) for i in prt2.tica_data.values()]))

                locally_calc[i].append(global_max)

            for i in range(prt1.n_tics_):
                assert (lin_spaced_tic_dict[i][0] == locally_calc[i][0])
                assert (lin_spaced_tic_dict[i][-1] == locally_calc[i][-1])
                assert (len(lin_spaced_tic_dict[i]) == n_bins)

            return True

        def test_histogram_data():
            H_dict, H_calc, _ = tica_histogram(prj,
                                               prt1, [0],
                                               x_array=lin_spaced_tic_dict[0],
                                               n_bins=None)
            assert (len(H_dict.keys()) == prt1.n_states_)
            assert (len(H_calc) == len(lin_spaced_tic_dict[0]) - 1)
            rnd_state = np.random.randint(0, prt1.n_states_)
            assert (np.allclose(
                H_dict[rnd_state],
                np.histogram(prt1.tic_dict[0][rnd_state],
                             bins=lin_spaced_tic_dict[0],
                             normed=True)[0]))
            return True

        def test_one_dim_free_energy():
            df = one_dim_tic_free_energy(prj,
                                         prt1,
                                         0,
                                         n_bins=None,
                                         lin_spaced_tic=lin_spaced_tic_dict[0],
                                         errorbars=False)

            assert ((df.protein_name == prt1.name).all())
            assert ((df.mdl_index == "mle").all())

            return True

        assert (test_bounds())
        assert (test_histogram_data())
        assert (test_one_dim_free_energy())

        return
Example #17
def concatenate_core17(path, top_filename, output_filename, maxtime=None, maxpackets=None):
    """Concatenate tar bzipped XTC files created by Folding@Home Core17.
    This version accepts only filenames and paths.

    Parameters
    ----------
    path : str
        Path to directory containing "results-*.tar.bz2".  E.g. a single CLONE directory.
    top_filename : str
        Filepath to read Topology for system
    output_filename : str
        Filename of output HDF5 file to generate.
    maxpackets : int, optional, default=None
        If specified, will stop processing after `maxpackets` results packets have been processed
    maxtime : int, optional, default=None
        If specified, will stop processing after `maxtime` seconds have passed.

    Notes
    -----
    We use HDF5 because it provides an easy way to store the metadata associated
    with which files have already been processed.
    """

    # Open topology file.
    top = md.load(top_filename % vars())

    # Glob file paths and return result files in sequential order.
    glob_input = os.path.join(path, "results-*.tar.bz2")
    filenames = glob.glob(glob_input)
    filenames = natsorted(filenames)

    print("Concatenating XTC files from '%s' into '%s' [%d results packets found]" % (path, output_filename, len(filenames)))

    # If no result files are present, return.
    if len(filenames) <= 0:
        del top
        return

    # Check integrity of trajectory if it exists.
    delete_trajectory_if_broken(output_filename)

    # Open trajectory for appending.
    trj_file = HDF5TrajectoryFile(output_filename, mode='a')

    MAX_FILEPATH_LENGTH = 1024 # Is this large enough?
    try:
        # TODO: Store MD5 hashes instead of filenames?
        trj_file._create_earray(where='/', name='processed_filenames',
                                atom=trj_file.tables.StringAtom(MAX_FILEPATH_LENGTH), shape=(0,))
        trj_file.topology = top.topology
    except trj_file.tables.NodeError:
        # Object already exists; skip ahead.
        pass

    result_packets_processed = 0
    initial_time = time.time()
    try:
        for filename in filenames:
            # Check that we haven't violated our filename length assumption
            if len(filename) > MAX_FILEPATH_LENGTH:
                msg = "Filename is longer than hard-coded MAX_FILEPATH_LENGTH limit (%d > %d). Increase MAX_FILEPATH_LENGTH and rebuild." % (len(filename), MAX_FILEPATH_LENGTH)
                print(msg)
                raise Exception(msg)
            # Check if we have already processed this file
            if six.b(filename) in trj_file._handle.root.processed_filenames:  # On Py3, the pytables list of filenames has type byte (e.g. b"hey"), so we need to deal with this via six.
                print("Already processed %s" % filename)
                continue
            # Extract frames from trajectory in a temporary directory
            absfilename = os.path.abspath(filename)
            with enter_temp_directory():
                # Extract frames
                archive = tarfile.open(absfilename, mode='r:bz2')
                archive.extract("positions.xtc")
                trj = md.load("positions.xtc", top=top)
                print("   appending %d frames from '%s' to '%s'" % (trj.n_frames, filename, output_filename))
                for frame in trj:
                    trj_file.write(coordinates=frame.xyz, cell_lengths=frame.unitcell_lengths, cell_angles=frame.unitcell_angles, time=frame.time)
                os.unlink("positions.xtc")
                del archive, trj

                # Append list of processed files
                trj_file._handle.root.processed_filenames.append([filename])

                # Flush data
                trj_file.flush()

            # Track statistics on processed packets
            elapsed_time = time.time() - initial_time
            result_packets_processed += 1

            # Stop if we have processed the requested number of results packets.
            if maxpackets and (result_packets_processed >= maxpackets):
                break

            # Stop if we have exceeded the requested processing time.
            if maxtime and (elapsed_time >= maxtime):
                break

    except RuntimeError:
        print("Cannot munge %s due to damaged XTC %s or mismatch with topology file." % (path, filename))

    # Clean up.
    trj_file.close()
    del top, trj_file
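
A usage sketch for the example above (paths are hypothetical; maxtime and maxpackets are the optional processing limits described in the docstring):

concatenate_core17(path="/data/PROJ9104/RUN0/CLONE0",
                   top_filename="/data/PROJ9104/topologies/system.pdb",
                   output_filename="/data/PROJ9104/trajectories/0_0.hdf5",
                   maxtime=3600,    # optional wall-clock budget, in seconds
                   maxpackets=50)   # optionally stop after 50 result packets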
Example #18
def ensure_result_packet_is_decompressed(result_packet, topology, atom_indices=None, chunksize=10, delete_on_unpack=False, compress_xml=False):
    """
    Ensure that the specified result packet is decompressed.

    If this is a ws7/ws8 compressed result packet, safely convert it to uncompressed:
    * decompress it into a temporary directory
    * move it into place
    * verify integrity of files
    * unlink (delete) the old result packet if everything looks OK [OPTIONAL]

    If this is a directory, this function returns immediately.

    .. warning:: This will irreversibly delete the compressed work packet,
       replacing it with an uncompressed one.

    Parameters
    ----------
    result_packet : str
        Path to original result packet
    topology : mdtraj.Topology
        Topology to use for verifying integrity of trajectory
    atom_indices : list of int, optional, default=None
        Atom indices to read when verifying integrity of trajectory
        If None, all atoms will be read.
    delete_on_unpack : bool, optional, default=False
        If True, will delete old ws8-style .tar.bz2 files after they have been unpacked.
        WARNING: THIS COULD BE DANGEROUS
    compress_xml : bool, optional, default=False
        If True, will compress XML files after unpacking them.
    chunksize : int, optional, default=10
        Number of frames to read each call to mdtraj.iterload for verifying trajectory integrity

    Returns
    -------
    result_packet : str
        Path to new result packet directory

    """
    # Return if this is just a directory
    if os.path.isdir(result_packet):
        return result_packet

    # If this is a tarball, extract salient information.
    # Format: results-002.tar.bz2
    absfilename = os.path.abspath(result_packet)
    (basepath, filename) = os.path.split(absfilename)
    pattern = r'results-(\d+).tar.bz2'
    if not re.match(pattern, filename):
        raise Exception("Compressed results packet filename '%s' does not match expected format (results-001.tar.bz2)" % result_packet)
    frame_number = int(re.match(pattern, filename).group(1))

    # Extract frames from trajectory in a temporary directory
    print("      Extracting %s" % result_packet)
    with enter_temp_directory():
        # Create target directory
        extracted_archive_directory = tempfile.mkdtemp()

        # Extract all contents
        archive = tarfile.open(absfilename, mode='r:bz2')
        archive.extractall(path=extracted_archive_directory)

        # Compress XML files
        if compress_xml:
            xml_filenames = glob.glob('%s/*.xml' % extracted_archive_directory)
            for filename in xml_filenames:
                print("      Compressing %s" % os.path.basename(filename))
                subprocess.call(['gzip', filename])

        # Create new result packet name
        new_result_packet = os.path.join(basepath, 'results%d' % frame_number)

        # Move directory into place
        shutil.move(extracted_archive_directory, new_result_packet)

        # Verify integrity of archive contents
        xtc_filename = os.path.join(new_result_packet, 'positions.xtc')
        if not os.path.exists(xtc_filename):
            raise Exception("Result packet archive '%s' does not contain positions.xtc; aborting unpacking.")
        try:
            for chunk in md.iterload(xtc_filename, top=topology, atom_indices=atom_indices, chunk=chunksize):
                pass
        except Exception as e:
            msg = "Result packet archive '%s' failed trajectory integrity check; aborting unpacking.\n"
            msg += str(e)
            raise Exception(msg)

        # Cleanup archive object
        del archive

        if delete_on_unpack:
            # Remove archive permanently
            print("      Permanently removing %s" % absfilename)
            os.unlink(absfilename)

        # Return updated result packet directory name
        return new_result_packet
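
A usage sketch for the example above (paths are hypothetical; it assumes mdtraj is imported as md and that the topology PDB matches the trajectories inside the packets):

import glob
import mdtraj as md

topology = md.load("/data/PROJ9104/topologies/0.pdb").topology
for packet in sorted(glob.glob("/data/PROJ9104/RUN0/CLONE0/results-*.tar.bz2")):
    # returns the path of the unpacked results directory
    packet_dir = ensure_result_packet_is_decompressed(packet, topology,
                                                      delete_on_unpack=False)
    print(packet_dir)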
Example #19
def hdf5_concatenate(job_tuple):
    """Concatenate tar bzipped or nonbized XTC files created by Folding@Home .
    Parameters
    ----------
    path : str
        Path to directory containing "results-*.tar.bz2".  E.g. a single CLONE directory.
    top : mdtraj.Topology
        Topology for system
    output_filename : str
        Filename of output HDF5 file to generate.
    Notes
    -----
    We use HDF5 because it provides an easy way to store the metadata associated
    with which files have already been processed.
    """

    proj, protein_folder, proj_folder, top_folder, run, clone, protein_only = job_tuple

    path = os.path.join(proj_folder,"RUN%d/CLONE%d/"%(run,clone))
    top = md.load(os.path.join(top_folder,"%d.pdb"%run))
    str_top = top.remove_solvent()


    glob_input = os.path.join(path, "results*")
    filenames = sorted(glob.glob(glob_input), key=keynat)

    if len(filenames) <= 0:
        return

    #output path for stripped trajectory
    strip_prot_out_filename = os.path.join(protein_folder,
                                           "protein_traj/%s_%d_%d.hdf5"%(proj,run,clone))
    str_trj_file = HDF5TrajectoryFile(strip_prot_out_filename, mode='a')
    str_trj_file_wrapper = HDF5TrajectoryFileWrapper(str_trj_file)
    str_trj_file_wrapper.setup(str_top.topology)

    if not protein_only:
        #output path for full trajectory
        output_filename =  os.path.join(protein_folder,
                                    "trajectories/%s_%d_%d.hdf5"%(proj,run,clone))
        trj_file = HDF5TrajectoryFile(output_filename, mode='a')
        trj_file_wrapper = HDF5TrajectoryFileWrapper(trj_file)
        trj_file_wrapper.setup(top.topology)


    for index, filename in enumerate(filenames):
        #if we find it in both then no problem we can continue to the next filename
        if ( protein_only or trj_file_wrapper.check_filename(filename)) and \
                str_trj_file_wrapper.check_filename(filename):
            print("Already processed %s" % filename)
            continue
        with enter_temp_directory():
            print("Processing %s" % filename)
            #try loading the file
            try:
                trj = _traj_loader(filename,top)
            # if that fails, give up on this clone entirely and move on
            except Exception:
                print("Failed at %s" % filename)
                break
            #if loading is successful, try adding it
            if (not protein_only) and (not trj_file_wrapper.check_filename(filename)):
                if trj_file_wrapper.validate_filename(index, filename, filenames):
                    trj_file_wrapper.write_file(filename, trj)
            #now the stripped file
            if not str_trj_file_wrapper.check_filename(filename):
                if str_trj_file_wrapper.validate_filename(index, filename, filenames):
                    trj = trj.remove_solvent()
                    str_trj_file_wrapper.write_file(filename, trj)

    return
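
A usage sketch for the example above (all fields of the job tuple are hypothetical; protein_only=True skips writing the full, solvated trajectory):

job = ("PROJ9104",                   # proj
       "/data/munged/kinase_1",      # protein_folder
       "/data/PROJ9104",             # proj_folder
       "/data/PROJ9104/topologies",  # top_folder
       0,                            # run
       3,                            # clone
       True)                         # protein_only
hdf5_concatenate(job)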
Example #20
def concatenate_core17(path,
                       top_filename,
                       output_filename,
                       maxtime=None,
                       maxpackets=None):
    """Concatenate tar bzipped XTC files created by Folding@Home Core17.
    This version accepts only filenames and paths.

    Parameters
    ----------
    path : str
        Path to directory containing "results-*.tar.bz2".  E.g. a single CLONE directory.
    top_filename : str
        Filepath to read Topology for system
    output_filename : str
        Filename of output HDF5 file to generate.
    maxpackets : int, optional, default=None
        If specified, will stop processing after `maxpackets` results packets have been processed
    maxtime : int, optional, default=None
        If specified, will stop processing after `maxtime` seconds have passed.

    Notes
    -----
    We use HDF5 because it provides an easy way to store the metadata associated
    with which files have already been processed.
    """

    # Open topology file.
    top = md.load(top_filename % vars())

    # Glob file paths and return result files in sequential order.
    glob_input = os.path.join(path, "results-*.tar.bz2")
    filenames = glob.glob(glob_input)
    filenames = natsorted(filenames)

    print(
        "Concatenating XTC files from '%s' into '%s' [%d results packets found]"
        % (path, output_filename, len(filenames)))

    # If no result files are present, return.
    if len(filenames) <= 0:
        del top
        return

    # Check integrity of trajectory if it exists.
    delete_trajectory_if_broken(output_filename)

    # Open trajectory for appending.
    trj_file = HDF5TrajectoryFile(output_filename, mode='a')

    MAX_FILEPATH_LENGTH = 1024  # Is this large enough?
    try:
        # TODO: Store MD5 hashes instead of filenames?
        trj_file._create_earray(
            where='/',
            name='processed_filenames',
            atom=trj_file.tables.StringAtom(MAX_FILEPATH_LENGTH),
            shape=(0, ))
        trj_file.topology = top.topology
    except trj_file.tables.NodeError:
        # Object already exists; skip ahead.
        pass

    result_packets_processed = 0
    initial_time = time.time()
    try:
        for filename in filenames:
            # Check that we haven't violated our filename length assumption
            if len(filename) > MAX_FILEPATH_LENGTH:
                msg = "Filename is longer than hard-coded MAX_FILEPATH_LENGTH limit (%d > %d). Increase MAX_FILEPATH_LENGTH and rebuild." % (
                    len(filename), MAX_FILEPATH_LENGTH)
                print(msg)
                raise Exception(msg)
            # Check if we have already processed this file
            # On Py3, the pytables list of filenames has type bytes (e.g. b"hey"),
            # so we need to deal with this via six.
            if six.b(filename) in trj_file._handle.root.processed_filenames:
                print("Already processed %s" % filename)
                continue
            # Extract frames from trajectory in a temporary directory
            absfilename = os.path.abspath(filename)
            with enter_temp_directory():
                # Extract frames
                archive = tarfile.open(absfilename, mode='r:bz2')
                archive.extract("positions.xtc")
                trj = md.load("positions.xtc", top=top)
                print("   appending %d frames from '%s' to '%s'" %
                      (trj.n_frames, filename, output_filename))
                for frame in trj:
                    trj_file.write(coordinates=frame.xyz,
                                   cell_lengths=frame.unitcell_lengths,
                                   cell_angles=frame.unitcell_angles,
                                   time=frame.time)
                os.unlink("positions.xtc")
                del archive, trj

                # Append list of processed files
                trj_file._handle.root.processed_filenames.append([filename])

                # Flush data
                trj_file.flush()

            # Track statistics on processed packets
            elapsed_time = time.time() - initial_time
            result_packets_processed += 1

            # Stop if we have processed the requested number of results packets.
            if maxpackets and (result_packets_processed >= maxpackets):
                break

            # Stop if we have exceeded the requested processing time.
            if maxtime and (elapsed_time >= maxtime):
                break

    except RuntimeError:
        print(
            "Cannot munge %s due to damaged XTC %s or mismatch with topology file."
            % (path, filename))

    # Clean up.
    trj_file.close()
    del top, trj_file
Example #21
def ensure_result_packet_is_decompressed(result_packet,
                                         topology,
                                         atom_indices=None,
                                         chunksize=10,
                                         delete_on_unpack=False,
                                         compress_xml=False):
    """
    Ensure that the specified result packet is decompressed.

    If this is a ws7/ws8 compressed result packet, safely convert it to uncompressed:
    * decompress it into a temporary directory
    * move it into place
    * verify integrity of files
    * unlink (delete) the old result packet if everything looks OK [OPTIONAL]

    If this is a directory, this function returns immediately.

    .. warning:: This will irreversibly delete the compressed work packet,
       replacing it with an uncompressed one.

    Parameters
    ----------
    result_packet : str
        Path to original result packet
    topology : mdtraj.Topology
        Topology to use for verifying integrity of trajectory
    atom_indices : list of int, optional, default=None
        Atom indices to read when verifying integrity of trajectory
        If None, all atoms will be read.
    delete_on_unpack : bool, optional, default=False
        If True, will delete old ws8-style .tar.bz2 files after they have been unpacked.
        WARNING: THIS COULD BE DANGEROUS
    compress_xml : bool, optional, default=False
        If True, will compress XML files after unpacking them.
    chunksize : int, optional, default=10
        Number of frames to read each call to mdtraj.iterload for verifying trajectory integrity

    Returns
    -------
    result_packet : str
        Path to new result packet directory

    """
    # Return if this is just a directory
    if os.path.isdir(result_packet):
        return result_packet

    # If this is a tarball, extract salient information.
    # Format: results-002.tar.bz2
    absfilename = os.path.abspath(result_packet)
    (basepath, filename) = os.path.split(absfilename)
    pattern = r'results-(\d+).tar.bz2'
    if not re.match(pattern, filename):
        raise Exception(
            "Compressed results packet filename '%s' does not match expected format (results-001.tar.bz2)"
            % result_packet)
    frame_number = int(re.match(pattern, filename).group(1))

    # Extract frames from trajectory in a temporary directory
    print("      Extracting %s" % result_packet)
    with enter_temp_directory():
        # Create target directory
        extracted_archive_directory = tempfile.mkdtemp()

        # Extract all contents
        archive = tarfile.open(absfilename, mode='r:bz2')
        archive.extractall(path=extracted_archive_directory)

        # Compress XML files
        if compress_xml:
            xml_filenames = glob.glob('%s/*.xml' % extracted_archive_directory)
            for filename in xml_filenames:
                print("      Compressing %s" % os.path.basename(filename))
                subprocess.call(['gzip', filename])

        # Create new result packet name
        new_result_packet = os.path.join(basepath, 'results%d' % frame_number)

        # Move directory into place
        shutil.move(extracted_archive_directory, new_result_packet)

        # Verify integrity of archive contents
        xtc_filename = os.path.join(new_result_packet, 'positions.xtc')
        if not os.path.exists(xtc_filename):
            raise Exception(
                "Result packet archive '%s' does not contain positions.xtc; aborting unpacking."
                % result_packet)
        try:
            for chunk in md.iterload(xtc_filename,
                                     top=topology,
                                     atom_indices=atom_indices,
                                     chunk=chunksize):
                pass
        except Exception as e:
            msg = "Result packet archive '%s' failed trajectory integrity check; aborting unpacking.\n"
            msg += str(e)
            raise Exception(msg)

        # Cleanup archive object
        del archive

        if delete_on_unpack:
            # Remove archive permanently
            print("      Permanently removing %s" % absfilename)
            os.unlink(absfilename)

        # Return updated result packet directory name
        return new_result_packet
Example #22
def test_plotting_utils():
    with enter_temp_directory():
        base_dir = os.path.abspath(os.path.curdir)
        mdl_dir = os.path.join(base_dir,"mdl_dir")
        feature_dir = "feature_dir"
        series_name = "fake_series"
        protein_list = ["kinase_1", "kinase_2"]
        project_dict = {"kinase_1": ["fake_proj1",],
                            "kinase_2": ["fake_proj2"]}
        mdl_params = {'tica__n_components': 1, 'tica__lag_time': 1,
                      'tica__kinetic_mapping': True, 'tica__shrinkage': 0.01,
                      'cluster__n_clusters': 2,'msm__lag_time': 1,
                      'bootrap__n_samples':1
                      }


        create_fake_data(base_dir, protein_list, project_dict)
        setup_series_analysis(base_dir, mdl_dir, feature_dir,
                                      series_name, protein_list,
                                      project_dict, mdl_params)

        fit_pipeline(base_dir)
        prj = ProteinSeries(os.path.join(mdl_dir,"project.yaml"))

        prt1 = Protein(prj, "kinase_1")
        prt2 = Protein(prj, "kinase_2")

        prt1._mlpt_fct = 0.0
        prt2._mlpt_fct = 0.0
        n_bins = 100

        lin_spaced_tic_dict = global_tic_boundaries([prt1, prt2],
                                                    range(prt1.n_tics_), n_bins)

        def test_bounds():
            locally_calc={}
            for i in range(prt1.n_tics_):
                locally_calc[i] =[]
                global_min = min(min([min(i) for i in prt1.tica_data.values()]),
                    min([min(i) for i in prt2.tica_data.values()]))

                locally_calc[i].append(global_min)

                global_max = max(max([max(i) for i in prt1.tica_data.values()]),
                    max([max(i) for i in prt2.tica_data.values()]))

                locally_calc[i].append(global_max)

            for i in range(prt1.n_tics_):
                assert(lin_spaced_tic_dict[i][0]==locally_calc[i][0])
                assert(lin_spaced_tic_dict[i][-1]==locally_calc[i][-1])
                assert(len(lin_spaced_tic_dict[i])==n_bins)

            return True

        def test_histogram_data():
            H_dict, H_calc, _ = tica_histogram(prj, prt1, [0],
                                               x_array=lin_spaced_tic_dict[0],
                                               n_bins=None)
            assert(len(H_dict.keys()) == prt1.n_states_)
            assert(len(H_calc) == len(lin_spaced_tic_dict[0])-1)
            rnd_state = np.random.randint(0,prt1.n_states_)
            assert(np.allclose(H_dict[rnd_state], np.histogram(prt1.tic_dict[0][rnd_state],
                                                       bins = lin_spaced_tic_dict[0],
                                                       normed=True)[0]))
            return True


        def test_one_dim_free_energy():
            df = one_dim_tic_free_energy(prj, prt1, 0, n_bins=None ,
                        lin_spaced_tic=lin_spaced_tic_dict[0], errorbars=False)

            assert((df.protein_name==prt1.name).all())
            assert((df.mdl_index=="mle").all())

            return True

        assert(test_bounds())
        assert(test_histogram_data())
        assert(test_one_dim_free_energy())


        return