Example #1
    def __init__(self, Metric, projfn="my_project.yaml", 
                 clcenterfn="centers.txt",
                 clusterfn="clusters.txt",
                 stepsize=None, timestep=None, 
                 flag_nopreprocess=False):
        '''
        Load the project information and the cluster output.
        '''
        prj = Project(existing_project_file=projfn)
        trajs_lengths = np.array(prj.get_trajectory_lengths())
        
        # Frame numbers always correspond to the full trajectory,
        # irrespective of the stride.
        # (np.int was removed in modern NumPy; the builtin int is the drop-in fix.)
        self.trajs_lengths = trajs_lengths.astype(int)
        self.trajnames = prj.get_trajectory_filepaths()
        self.ntrajs = len(self.trajnames)
        self.stride = prj.get_stride()
        self.frames_per_traj = np.ceil(self.trajs_lengths / float(self.stride))
        
        self.ndim = prj.get_number_dimensions()
        self.trajectory_type = prj.get_trajectory_type()

        self.grof = prj.gro_filepath
        self.tprf = prj.tpr_filepath
        self.ndxf = prj.ndx_filepath
        
        self.Metric = Metric
        # Get input data: cluster-center IDs are taken from the second
        # column of the centers file; the clusters file holds the
        # per-frame cluster tags.
        centids = txtreader.readcols(clcenterfn)
        self.centids = centids[:, 1]

        self.cltags = txtreader.readcols(clusterfn)
        self.assignments = self._get_assignments()
        self.nodesizes = np.bincount(self.assignments[self.assignments > -1])
        
        self.stepsize = stepsize
        self.timestep = timestep
        logger.debug("Dimensionality %d",self.ndim)
def cluster(Metric, project_filepath, cutoff, checkpoint_filepath=None,
            flag_nopreprocess=False):
    """ Cluster data
    
    Parameters
    ---------- 
    project_filepath : String
       The path to the YAML project file.
       
    cutoff           : Floating 
        The cutoff distance passed to the Metric class.
        
    chekpoint_filepath : string,optional
    flag_nopreprocess : bool,optional
        switches off preprocessing
    
    Returns
    -------
    clusters: dict
        a map (dictionary) from cluster center vertices C to lists of vertices,
        where the lists represent the set of vertices belonging to the cluster with center C.
        i.e. a map: FrameID -> [FrameID]
        
    """

    # ================================================================
    # Instantiation of helper classes.
    
    # Initialize MPI.
    comm = MPI.COMM_WORLD
    mpi_size = comm.Get_size()
    my_rank = comm.Get_rank()
    comm.Barrier()

    # Say hello.
    print0(rank=my_rank,msg="Initialized MPI.")
    logger.debug("Hello, from node %s",my_rank)

    logger.info("Reading project file at node %s",my_rank)
    project = Project(existing_project_file = project_filepath)
    
    logger.debug("Initializing manager at node %s",my_rank)
    manager = Loadmanager(project.get_trajectory_lengths(),
                          project.get_trajectory_filepaths(),
                          mpi_size,my_rank)
    # The metric has to be instantiated by only one MPI process, as it needs
    # user input (i.e. index groups).
    if my_rank == 0:
        metric = Metric(tpr_filepath=project.get_tpr_filepath(),
                        stx_filepath=project.get_gro_filepath(),
                        ndx_filepath=project.get_ndx_filepath(),
                        number_dimensions=project.get_number_dimensions())
        # Since we have to broadcast the metric, we first need to destroy all
        # pointers to arrays.
        metric.destroy_pointers()
        logger.debug("Metric initialized at node 0")
    else:
        metric = None
    metric = comm.bcast(metric, root=0)
    print0(rank=my_rank,msg="metric object broadcasted.")
    # recreate all pointers in the object's instance
    metric.create_pointers()
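    # (mpi4py's lowercase comm.bcast serializes generic Python objects with
    # pickle, which is why the raw array pointers are dropped before the
    # broadcast and rebuilt on every rank afterwards.)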
    
    manager.do_partition()

    # Take this rank's share of the work.
    my_partition = manager.myworkshare
    
    (my_trajectory_filepaths, my_trajectory_lengths,
     my_trajectory_ID_offsets, my_trajectory_ID_ranges) = map(list, my_partition)
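    # Judging by the names, each rank now holds the file paths and lengths of
    # its trajectories, plus per-trajectory global frame-ID offsets and
    # ranges, so frame IDs stay globally consistent across ranks.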


    logger.info("Reading trajectories at %s",my_rank)
    my_frames = Framecollection.from_files(
            stride=project.get_stride(),
            trajectory_type=project.get_trajectory_type(),
            trajectory_globalID_offsets=my_trajectory_ID_offsets,
            trajectory_filepath_list=my_trajectory_filepaths,
            trajectory_length_list=my_trajectory_lengths)

        
    if not flag_nopreprocess:
        # ----------------------------------------------------------------
        # Metric preprocessing: preprocess trajectories, modifying them
        # in-place.
        logger.debug("Preprocessing trajectories at rank %s", my_rank)
        metric.preprocess(frame_array_pointer=my_frames.get_first_frame_pointer(),
                          number_frames=my_frames.number_frames,
                          number_atoms=my_frames.number_atoms)
    else:
        print0(rank=my_rank, msg="Will not preprocess trajectories")


    # ================================================================
    # Initial round of all-to-all neighbour counting.


    # Count the number of neighbours for all frames.
    # If frames are vertices and edges join frames having rmsd within the cutoff,
    # then we compute and record the degree of each vertex.

    if checkpoint_filepath is None:
        print0(rank=my_rank, msg="Counting 'neighbours' for all frames.")

        # Each rank counts neighbours for its own share of the frame pairs.
        my_neighbour_count = allToAll_neighbourCount(cutoff, comm, mpi_size,
                                my_rank, metric, my_frames, manager)  # :: Map Integer Integer

        print0(rank=my_rank, msg="Synchronizing neighbour counts.")
        neighbour_count_recvList = comm.allgather(my_neighbour_count)

        # Merge the per-rank partial counts into one global map.
        neighbour_counts = {}
        for node_neighbour_counts in neighbour_count_recvList:
            for frameID, count in node_neighbour_counts.items():
                neighbour_counts[frameID] = neighbour_counts.get(frameID, 0) + count
    else:
        print0(rank=my_rank, msg="Using checkpoint file.")
        neighbour_counts = None
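    # (Equivalently, the merge in the branch above could use a
    # collections.Counter:
    #
    #     from collections import Counter
    #     neighbour_counts = Counter()
    #     for d in neighbour_count_recvList:
    #         neighbour_counts.update(d)
    #
    # Counter.update sums the values of keys present in both mappings, which
    # is exactly the merge performed above.)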



    print0(rank=my_rank,msg="Start clustering.")
    
    T=time()

    clusters = daura_clustering(neighbour_counts,
                    cutoff, comm, mpi_size, my_rank, manager, 
                    metric, my_frames, checkpoint_filepath)
    
    print0(rank=my_rank,msg=" Finished ... Total time: {0}".format(time()-T))

                    
    return clusters
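

# A minimal usage sketch (assumptions: the class name `RMSDMetric` and its
# import path are hypothetical placeholders, as are the project file name and
# the 0.1 cutoff; any concrete Metric implementation would do). Run under
# MPI, e.g.:
#     mpirun -np 4 python this_module.py
if __name__ == "__main__":
    from metrics import RMSDMetric  # hypothetical import path

    clusters = cluster(RMSDMetric, "my_project.yaml", cutoff=0.1)

    # Report only on rank 0 to avoid duplicated output from every process.
    if MPI.COMM_WORLD.Get_rank() == 0:
        for center, members in sorted(clusters.items()):
            print("center frame {0}: {1} members".format(center, len(members)))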