예제 #1
0
    def __init__(self, Metric, projfn="my_project.yaml",
                 clcenterfn="centers.txt",
                 clusterfn="clusters.txt",
                 stepsize=None, timestep=None,
                 flag_nopreprocess=False):
        '''
        Load the project information and the clustering output.

        Parameters
        ----------
        Metric
            Stored unchanged on ``self.Metric``; it is not called here.
        projfn : str
            Path of the existing project file handed to ``Project``.
        clcenterfn : str
            File read with ``txtreader.readcols``; its second column is
            kept as ``self.centids``.
        clusterfn : str
            File read with ``txtreader.readcols`` and kept as
            ``self.cltags``.
        stepsize, timestep : optional
            Stored unchanged on the instance.
        flag_nopreprocess : bool, optional
            Accepted for interface compatibility; this constructor does
            not use it.
        '''
        prj = Project(existing_project_file=projfn)

        # Frame numbers always correspond to the full trajectory,
        # irrespective of stride.
        # The builtin ``int`` is used because the ``np.int`` alias (which was
        # just ``int``) was removed from NumPy in 1.24.
        trajs_lengths = np.array(prj.get_trajectory_lengths())
        self.trajs_lengths = trajs_lengths.astype(int)
        self.trajnames = prj.get_trajectory_filepaths()
        self.ntrajs = len(self.trajnames)
        self.stride = prj.get_stride()
        # Multiplying by 1. forces true division before rounding up.
        self.frames_per_traj = np.ceil(self.trajs_lengths / (self.stride * 1.))

        self.ndim = prj.get_number_dimensions()
        self.trajectory_type = prj.get_trajectory_type()

        self.grof = prj.gro_filepath
        self.tprf = prj.tpr_filepath
        self.ndxf = prj.ndx_filepath

        self.Metric = Metric

        # Clustering output: only the second column of the centers file is
        # kept.
        centids = txtreader.readcols(clcenterfn)
        self.centids = centids[:, 1]

        # ``cltags`` is set before ``_get_assignments`` is called; that helper
        # is defined outside this view and presumably reads it (TODO confirm).
        self.cltags = txtreader.readcols(clusterfn)
        self.assignments = self._get_assignments()
        # Negative assignment values are dropped before counting.
        self.nodesizes = np.bincount(self.assignments[self.assignments > -1])

        self.stepsize = stepsize
        self.timestep = timestep
        logger.debug("Dimensionality %d",self.ndim)
def assign(Metric, project_filepath, cutoff, centerfile, flag_nopreprocess):
    ''' Add the rest of the data to already known cluster centers.

    Every MPI rank loads its share of the project's trajectories (with
    stride 1) and, for each center listed in ``centerfile`` in file order,
    computes the distance from the center frame to all of its local frames.
    A frame joins a center's cluster when its distance lies in
    ``[0.0, cutoff]`` and it has not already been claimed by an earlier
    center. Frames left over at the end become single-member clusters.

    Parameters
    ----------
    Metric : class
        Instantiated on rank 0 with ``tpr_filepath``, ``stx_filepath``,
        ``ndx_filepath`` and ``number_dimensions`` and then broadcast.
    project_filepath : str
        Path of the existing project file handed to ``Project``.
    cutoff : float
        Largest distance at which a frame is assigned to a center.
    centerfile : str
        File read with ``txtreader.readcols``; its second column holds the
        center frame IDs.
    flag_nopreprocess : bool
        When true, ``metric.preprocess`` is skipped.

    Returns
    -------
    clusters : dict
        Maps each center frame ID to the list of its member frame IDs;
        every unclustered frame ID maps to a list containing only itself.

    Raises
    ------
    KeyError
        If ``manager.find_node_of_frame`` returns ``None`` for a center.
    '''
    # Initialize MPI.
    comm = MPI.COMM_WORLD
    mpi_size = comm.Get_size()
    my_rank = comm.Get_rank()
    comm.Barrier()

    logger.info("refine start")
    print0(rank=my_rank,msg=" Initialized MPI.")
    logger.debug("Hello, from node %s",my_rank)

    # Load project file.
    print0(rank=my_rank,msg=" Reading project yaml file.")
    project = Project(existing_project_file = project_filepath)

    logger.debug("Initializing manager at node %s",my_rank)
    manager = Loadmanager(project.get_trajectory_lengths(),
                          project.get_trajectory_filepaths(),
                          mpi_size,my_rank)

    # The metric is built on rank 0 only and then broadcast. Its pointers are
    # destroyed before the broadcast and recreated afterwards on every rank.
    if my_rank == 0:
        metric = Metric(tpr_filepath = project.get_tpr_filepath(),
                        stx_filepath = project.get_gro_filepath(),
                        ndx_filepath = project.get_ndx_filepath(),
                        number_dimensions = project.get_number_dimensions())
        metric.destroy_pointers()
    else:
        metric = None

    metric = comm.bcast(metric, root=0)
    print0(rank=my_rank,msg="metric object broadcasted.")
    metric.create_pointers()

    # Take work share.
    manager.do_partition()
    (my_trajectory_filepaths, my_trajectory_lengths,
     my_trajectory_ID_offsets, _my_trajectory_ID_ranges) = \
        [list(part) for part in manager.myworkshare]

    logger.info("Reading trajectories at %s",my_rank)
    my_frames = Framecollection.from_files(
            stride = 1,
            trajectory_type = project.get_trajectory_type(),
            trajectory_globalID_offsets = my_trajectory_ID_offsets,
            trajectory_filepath_list = my_trajectory_filepaths,
            trajectory_length_list = my_trajectory_lengths, )

    if not flag_nopreprocess:
        # Metric preprocessing modifies the trajectories in place.
        logger.debug(" Preprocessing trajectories at rank %s",my_rank)
        metric.preprocess( frame_array_pointer = my_frames.get_first_frame_pointer(),
                           number_frames = my_frames.number_frames,
                           number_atoms = my_frames.number_atoms)
    else:
        print0(rank=my_rank,msg=" Will not preprocess trajectories")

    clustercenters = txtreader.readcols(centerfile)[:,1]

    clusters = {} # :: FrameID (cluster center) -> [FrameID] (its list of frames)
    my_unclustered = set(my_frames.globalIDs_iter)
    removed_vertices = set()  # frames already claimed by an earlier center

    for center_id in clustercenters:
        center_host_node = manager.find_node_of_frame(center_id)
        if center_host_node is None:
            raise KeyError("Next cluster center ID not found within any node.")

        # Broadcasting of center: the hosting rank supplies the frame, every
        # other rank provides an empty receive buffer of matching dtype.
        if my_rank == center_host_node:
            center_frame = my_frames.get_frame(center_id)
        else:
            center_frame = np.empty((my_frames.number_atoms, 3),
                                    dtype=my_frames.frames.dtype)

        comm.Bcast([center_frame, my_frames.mpi_frametype], root=center_host_node)
        if my_rank == 0:
            logger.debug("Searching for members for center id %s", center_id)

        center_frame_pointer = center_frame.ctypes.data_as(ctypes.POINTER(gp_grompy.rvec))
        rmsd_buffer = np.empty(my_frames.number_frames, dtype=my_frames.frames.dtype)

        metric.compute_distances(
            reference_frame_pointer = center_frame_pointer,
            frame_array_pointer = my_frames.get_first_frame_pointer(),
            number_frames = my_frames.number_frames,
            number_atoms = my_frames.number_atoms,
            real_output_buffer = rmsd_buffer, # writes results to this buffer.
            mask_ptr = None,
            mask_dummy_value = -1.0,
            )

        # Build a real list: on Python 3 a lazy ``map``/``filter`` over
        # lambdas cannot be pickled by ``allgather``.
        my_members = [
            frame_id
            for frame_id, distance in zip(my_frames.globalIDs_iter, rmsd_buffer)
            if frame_id not in removed_vertices and 0.0 <= distance <= cutoff
        ]

        # Broadcasting of members.
        members_gathered = comm.allgather(my_members)
        members = list(itertools.chain.from_iterable(members_gathered))
        if my_rank == 0:
            logger.debug("Found %s members",len(members))
        removed_vertices.update(members)
        clusters[center_id] = list(members)

        my_unclustered.difference_update(members)

    unclustered_gathered = comm.allgather(my_unclustered)
    unclustered = list(itertools.chain.from_iterable(unclustered_gathered))
    logger.debug("Unclustered %s", unclustered)

    # Frames not claimed by any center become clusters of their own.
    for frame_id in unclustered:
        clusters[frame_id] = [frame_id]

    logger.info("refine end")

    return clusters
def cluster(Metric, project_filepath, cutoff, checkpoint_filepath=None,
            flag_nopreprocess = False):
    """ Cluster the frames of a project across all MPI ranks.

    Parameters
    ----------
    Metric : class
        Instantiated on rank 0 only and then broadcast to the other ranks.

    project_filepath : string
        The path to the YAML project file.

    cutoff : float
        The cutoff distance used for neighbour counting and clustering.

    checkpoint_filepath : string, optional
        When given, the initial neighbour counting is skipped and the path
        is handed on to ``daura_clustering`` together with ``None`` as the
        neighbour counts.

    flag_nopreprocess : bool, optional
        Switches off preprocessing of the trajectories.

    Returns
    -------
    clusters : dict
        Whatever ``daura_clustering`` returns: a map from cluster center
        vertices C to lists of vertices belonging to the cluster with
        center C, i.e. FrameID -> [FrameID].

    """

    # ---- MPI set-up -------------------------------------------------
    world = MPI.COMM_WORLD
    n_ranks = world.Get_size()
    rank = world.Get_rank()
    world.Barrier()

    print0(rank=rank,msg="Initialized MPI.")
    logger.debug("Hello, from node %s",rank)

    # ---- Project and load manager -----------------------------------
    logger.info("Reading project file at node %s",rank)
    project = Project(existing_project_file = project_filepath)

    logger.debug("Initializing manager at node %s",rank)
    traj_lengths = project.get_trajectory_lengths()
    traj_paths = project.get_trajectory_filepaths()
    manager = Loadmanager(traj_lengths, traj_paths, n_ranks, rank)

    # ---- Metric ------------------------------------------------------
    # Only one process may instantiate the metric because it needs user
    # input (index groups); the other ranks receive it by broadcast.
    metric = None
    if rank == 0:
        metric = Metric(tpr_filepath = project.get_tpr_filepath(),
                        stx_filepath = project.get_gro_filepath(),
                        ndx_filepath = project.get_ndx_filepath(),
                        number_dimensions = project.get_number_dimensions())
        # Array pointers have to be destroyed before the object is broadcast.
        metric.destroy_pointers()
        logger.debug("Metric initialized at node 0")
    metric = world.bcast(metric, root = 0)
    print0(rank=rank,msg="metric object broadcasted.")
    # Rebuild the pointers inside the received instance.
    metric.create_pointers()

    # ---- Work share and trajectories --------------------------------
    manager.do_partition()
    share = [list(part) for part in manager.myworkshare]
    (my_trajectory_filepaths, my_trajectory_lengths,
     my_trajectory_ID_offsets, my_trajectory_ID_ranges) = share

    logger.info("Reading trajectories at %s",rank)
    my_frames = Framecollection.from_files(
            stride = project.get_stride(),
            trajectory_type = project.get_trajectory_type(),
            trajectory_globalID_offsets = my_trajectory_ID_offsets,
            trajectory_filepath_list = my_trajectory_filepaths,
            trajectory_length_list = my_trajectory_lengths, )

    if flag_nopreprocess == False:
        # Metric preprocessing modifies the trajectories in place.
        logger.debug(" Preprocessing trajectories at rank %s",rank)
        first_frame = my_frames.get_first_frame_pointer()
        metric.preprocess( frame_array_pointer = first_frame,
                           number_frames = my_frames.number_frames,
                           number_atoms = my_frames.number_atoms)
    else:
        print0(rank=rank,msg=" Will not preprocess trajectories")

    # ---- Initial all-to-all neighbour counting ----------------------
    # With frames as vertices and edges joining frames whose distance is
    # within the cutoff, this records the degree of every vertex.
    if checkpoint_filepath is not None:
        print0(rank=rank,msg="Using checkpoint file.")
        neighbour_counts = None
    else:
        print0(rank=rank,msg="Counting 'neighbours' for all frames.")

        local_counts = allToAll_neighbourCount(cutoff, world, n_ranks, rank,
                                               metric, my_frames, manager)

        print0(rank=rank,msg="Synchronizing neighbour counts.")
        gathered_counts = world.allgather(local_counts)

        # Sum the per-rank counts frame by frame.
        neighbour_counts = {}
        for rank_counts in gathered_counts:
            for frame_id, count in rank_counts.items():
                if frame_id in neighbour_counts:
                    neighbour_counts[frame_id] += count
                else:
                    neighbour_counts[frame_id] = count

    # ---- Clustering --------------------------------------------------
    print0(rank=rank,msg="Start clustering.")

    started = time()

    result = daura_clustering(neighbour_counts,
                              cutoff, world, n_ranks, rank, manager,
                              metric, my_frames, checkpoint_filepath)

    print0(rank=rank,msg=" Finished ... Total time: {0}".format(time()-started))

    return result