def readModelFile(model_file, train_sets=False, display=False):
    """Open the model file and extract all relevant information on the model from it.

    Argument(s):
        model_file {str} -- Path to the model file to read. It is handed over
                            unchanged to openModelFile.
        train_sets {bool} -- (Opt.) Also return the training sets (arrays)
                             stored in the file.
                             Default is False.
        display {bool} -- (Opt.) Print the metadata in the terminal using a
                          custom formatting.
                          Default is False.

    Output(s):
        metadata {dict of int, float & str} -- Metadata used and collected
                                               during the training.
        coordinates {np.ndarray} -- (Opt.) Array of the coordinates of the
                                    atoms of the molecules. Dimension(s) are in
                                    (n_frames, n_molecules, n_atoms_per_molecule, 2).
                                    Only returned if train_sets is True.
        distances {np.ndarray} -- (Opt.) Array of the distances of the atoms
                                  of the molecules. Dimension(s) are in
                                  (n_frames, n_molecules, n_distances).
                                  Only returned if train_sets is True.
        phases {np.ndarray} -- (Opt.) Array of all the molecule phases labeled
                               in the system. Dimension(s) are in
                               (n_frames, n_molecules).
                               Only returned if train_sets is True.
    """

    # Check input (only train_sets is validated; display is used as a plain
    # truth value below)
    if not _is_boolean(train_sets):
        _error_input_type("Return training sets", "Boolean")

    # Load the information stored in the model file
    coordinates, distances, phases, metadata_content = openModelFile(model_file)

    # Display the content if requested
    if display:
        _display_metadata(metadata_content)

    # Return the metadata alone, or together with the training sets
    if train_sets:
        return metadata_content, coordinates, distances, phases
    return metadata_content
def assignLeaflets(systems, geometry='bilayer'):
    """Assign the membrane molecules of the system(s) to their leaflets.

    The systems are merged into a single Tessellation instance, whose
    getLeaflets() method is then called with the requested geometry.

    Argument(s):
        systems {list of class System} -- Instances of the System classes
                                          containing the molecules to assign.
                                          A single System is also accepted.
        geometry {str} -- (Opt.) Geometry of the system, forwarded to
                          Tessellation.getLeaflets().
                          The current geometries available are:
                            *) bilayer - 2D lipid bilayer.
                            *) bilayer_3d - 3D lipid bilayer.
                            *) vesicle - Lipid vesicle, analysed per leaflet.
                            *) vesicle_3d - 3D lipid vesicle.
                          By default, the geometry is set to a (2D) bilayer.

    Output(s):
        leaflets {np.ndarray} -- Array of the leaflets assigned to the
                                 membrane molecules.
    """

    # Convert single system in list
    if _is_system(systems):
        systems = [systems]

    # Check and convert input
    if not _is_list_of(systems, type='system', check_array=True, recursive=False):
        _error_input_type('Systems', 'List of System (or single System)')

    # Extract the information from the system(s)
    representation = _system_to_tessellation(systems)

    # Get the leaflets; the result is stored on the representation itself
    representation.getLeaflets(geometry=geometry)

    return representation.leaflets
def predictPhases(coordinates, distances, models, final=True):
    """Predict the phases of the molecules based on the coordinates and distances spaces provided.

    Argument(s):
        coordinates {np.ndarray} -- Array of the coordinates of the atoms of
                                    the molecules, merged between all systems
                                    and all frames. Dimension(s) should be in
                                    (n_frames * n_molecules, 2 * n_atoms_per_molecule).
        distances {np.ndarray} -- Array of the distances of the atoms of the
                                  molecules, merged between all systems and
                                  all frames. Dimension(s) should be in
                                  (n_frames * n_molecules, n_distances).
        models {str or dict of models} -- Path to the model file to load or
                                          dictionary of the Scikit-Learn models
                                          to use to predict the phases of the
                                          molecules.
        final {bool} -- (Opt.) Do the final prediction. Returns the
                        predictions of the 1st 4 models if False.
                        Default is True.

    Output(s):
        phases {np.ndarray} -- Array of all the molecule phases predicted in
                               the system. Dimension(s) are in
                               (n_frames, n_molecules).
    """

    # Check the inputs
    if not _is_array(coordinates):
        _error_input_type("Coordinates", "NumPy array")
    if not _is_array(distances):
        _error_input_type("Distances", "NumPy array")

    # Format the array
    fmt_coordinates, fmt_distances = _format_input(coordinates, distances)

    # Check and extract the models; the training information returned along
    # with them is not needed for the prediction
    trained_models, _ = _get_models_from_source(models)

    # Predict the states of the lipids
    phases = _prediction_array(fmt_coordinates, fmt_distances, trained_models)

    # Do the final prediction if required
    if final:
        phases = _final_decision(phases, trained_models)

    return phases
def _format_training_set(systems, phases=None):
    """Merge the coordinates, distances and phase labels of several systems.

    Each system is formatted with _format_input, and every sample it yields is
    labelled with the phase found at the same index in phases.

    Argument(s):
        systems {list of class System} -- Systems providing the coordinates
                                          and distances attributes to merge.
        phases {list of str} -- (Opt.) Phase label of each system, given in
                                the same order as systems.
                                Default is ['gel', 'fluid'].

    Output(s):
        all_coordinates {np.ndarray} -- Formatted coordinates of all the systems.
        all_distances {np.ndarray} -- Formatted distances of all the systems.
        all_states {np.ndarray} -- Phase label of each formatted sample.
    """

    # Build the default here rather than in the signature, so that no list
    # object is shared between calls
    if phases is None:
        phases = ['gel', 'fluid']

    # Check the phases format
    if not _is_list_of(phases, type='string', check_array=True, recursive=False):
        _error_input_type('Phases', 'list of ' + str(str))

    # Process all the systems
    all_coordinates = []
    all_distances = []
    all_states = []
    for i, system in enumerate(systems):

        # Every system needs its own label; fail with an explicit message
        # instead of a bare "list index out of range"
        if i >= len(phases):
            raise IndexError(
                "No phase provided for system #" + str(i) + ": only "
                + str(len(phases)) + " phase(s) were given.")

        # Format the input
        current_coordinates, current_distances = _format_input(
            system.coordinates, system.distances)

        # One label per formatted sample of the current system
        current_states = np.array([phases[i]] * current_coordinates.shape[0])

        # Add the positions to the array
        all_coordinates.append(current_coordinates)
        all_distances.append(current_distances)
        all_states.append(current_states)

    # Merge all arrays
    all_coordinates = np.concatenate(all_coordinates)
    all_distances = np.concatenate(all_distances)
    all_states = np.concatenate(all_states)

    return all_coordinates, all_distances, all_states
def __init__(self, names, ids, positions, boxes, phases):
    """Store the description of the molecules and reset the computed attributes.

    Argument(s):
        names -- Molecule names, checked as an array of strings.
        ids -- Molecule IDs, checked as an array of integers.
        positions -- Molecule positions, checked as an array of floats.
        boxes -- Simulation boxes, checked as an array of floats.
        phases -- Molecule phases, checked as an array of strings.
    """

    # Check the input of the function, one (value, type, label, message) at a time
    expectations = (
        (names, 'string', 'Molecule names', "Array of string"),
        (ids, 'int', 'Molecule IDs', "Array of int"),
        (positions, 'float', 'Molecule positions', "Array of float"),
        (boxes, 'float', 'Simulation boxes', "Array of float"),
        (phases, 'string', 'Molecule phases', "Array of string"),
    )
    for value, expected_type, label, description in expectations:
        if not _is_array_of(value, type=expected_type, recursive=True):
            _error_input_type(label, description)

    # Save the parameters
    self.names, self.ids = names, ids
    self.positions, self.boxes = positions, boxes
    self.phases = phases

    # Everything below starts empty (None)
    for attribute in ('leaflets', 'ghosts', 'volumes', 'vertices',
                      'neighbors', 'geometry', 'threshold',
                      'neighbors_phases', 'phases_list'):
        setattr(self, attribute, None)
def doVoro(systems, geometry='bilayer', threshold=0.01, exclude_ghosts=None, read_neighbors=True):
    """Compute the tessellations of the system for neighbour analysis.

    Argument(s):
        systems {list of class System} -- Instances of the System classes
                                          containing the molecules to
                                          tessellate. A single System is also
                                          accepted.
        geometry {str} -- (Opt.) Geometry of the system to perform the
                          tessellations on.
                          The current geometries available are:
                            *) bilayer - Analyse the 2D tessellations of a
                               lipid bilayer on each leaflets.
                            *) bilayer_3d - Analyse the 3D tessellations of a
                               lipid bilayer.
                            *) vesicle - Analyse the "2D" tessellations of a
                               lipid vesicle by only keeping neighbours within
                               the leaflet.
                            *) vesicle_3d - Analyse the 3D tessellations of a
                               lipid vesicle.
                            *) solution - Analyse the 3D tessellations of a
                               solution of molecules. Leaflets and ghosts are
                               not computed for this geometry.
                          By default, the geometry is set to a (2D) bilayer.
        threshold {float} -- (Opt.) Relative area/volume threshold at which
                             neighbours starts to be considered. Value is given
                             as a percentage of the total area/volume.
                             Default is 0.01 (1%).
        exclude_ghosts {list of int} -- (Opt.) List of systems indices,
                                        provided with the same order than in
                                        the argument systems, that should be
                                        excluded from ghost generation.
                                        Default is None.
        read_neighbors {bool} -- (Opt.) Automatically map the local
                                 environment during the tessellation.
                                 Default is True.

    Output(s):
        representation {class Tessellation} -- Instance of the class
                                               Tessellation including the
                                               representation on the system
                                               and its Voronoi tessellation.
    """

    # A lone System is wrapped so that the rest only deals with a list
    systems = [systems] if _is_system(systems) else systems

    # Validate the inputs
    systems_are_valid = _is_list_of(
        systems, type='system', check_array=True, recursive=False)
    if not systems_are_valid:
        _error_input_type('Systems', 'List of System (or single System)')

    if not _is_boolean(read_neighbors):
        _error_input_type('Read neighbors', "Boolean")

    # Merge the system(s) into a single Tessellation instance
    tessellation = _system_to_tessellation(systems)

    # Every geometry but the solution needs leaflets and ghosts
    is_membrane = geometry != "solution"
    if is_membrane:
        tessellation.getLeaflets(geometry=geometry)
        tessellation.ghosts = summonGhosts(
            systems, geometry=geometry, exclude_ghosts=exclude_ghosts)

    # Run the tessellation to find the neighbors
    tessellation.doVoronoi(geometry=geometry, threshold=threshold)

    # Optionally read the local environment right away
    if read_neighbors:
        tessellation.checkNeighbors()

    return tessellation
def _system_to_tessellation(systems):
    """Merge one or several System instances into a single Tessellation.

    For each system, the residue IDs, the molecule type (repeated once per
    ID), the centers of mass and the phases are collected. The merged arrays
    are then sorted by molecule ID before building the Tessellation. The
    simulation boxes are taken from the first system only.

    Argument(s):
        systems {list of class System} -- Systems to merge. A single System is
                                          also accepted.

    Output(s):
        representation {class Tessellation} -- Instance built from the merged
                                               and sorted arrays.
    """

    # A lone System is wrapped in a list
    if _is_system(systems):
        systems = [systems]

    # Check the format
    if not _is_list_of(systems, type='system', check_array=True, recursive=False):
        _error_input_type('Systems', 'List of System (or single System)')

    # Collect the per-system pieces
    name_chunks, id_chunks, com_chunks, phase_chunks = [], [], [], []
    for system in systems:

        # IDs first: their count sets how many times the name is repeated
        resids = system.infos['resids']
        type_names = np.array([system.type] * resids.shape[0])

        # Centers of mass, weighted with the heavy atom masses
        centers = getCOM(system.positions, system.infos['heavy_atoms']['masses'])

        # Phases currently stored on the system
        system_phases = system.phases

        name_chunks.append(type_names)
        id_chunks.append(resids)
        com_chunks.append(centers)
        phase_chunks.append(system_phases)

    # Names and IDs are joined along their first axis, COMs and phases along
    # their second one
    merged_names = np.concatenate(name_chunks)
    merged_ids = np.concatenate(id_chunks)
    merged_coms = np.concatenate(com_chunks, axis=1)
    merged_phases = np.concatenate(phase_chunks, axis=1)

    # Reorder everything by increasing molecule ID
    order = merged_ids.argsort()

    return Tessellation(
        merged_names[order],
        merged_ids[order],
        merged_coms[:, order, :],
        systems[0].boxes,
        merged_phases[:, order],
    )
def getCoordinates(self, **kwargs):
    """Compute the coordinates of the molecules and store them on the instance.

    The positions are processed by rotateMolecules, then converted with
    cartesian2Polar. The result is saved in self.coordinates.

    Argument(s):
        up {bool} -- (Opt., keyword only) Flag forwarded to rotateMolecules.
                     Default is True.

    Output(s):
        coordinates -- Value returned by cartesian2Polar, also stored in
                       self.coordinates.
    """

    # Read the only supported keyword argument
    up = kwargs['up'] if 'up' in kwargs else True
    if not _is_boolean(up):
        _error_input_type('Up', str(bool))

    # Centre and rotate the positions, then convert them to polar coordinates
    self.coordinates = cartesian2Polar(
        rotateMolecules(self.positions, self.infos, up=up))

    return self.coordinates
def _get_models_from_source(models):
    """Return the trained models from either a model file or a dictionary.

    Argument(s):
        models {str or dict of models} -- Path to an existing .lpm model file,
                                          or dictionary of models.

    Output(s):
        trained_models -- Models loaded from the file, or the dictionary given
                          as input.
        training_infos -- Second value returned by load_models_file, or None
                          when a dictionary was given.
    """

    # Source is a file: load it
    is_model_file = _is_file_is(
        models, extensions=['.lpm'], exist=True, no_extension=False)
    if is_model_file:
        loaded_models, loaded_infos = load_models_file(models)
        return loaded_models, loaded_infos

    # Source is already a dictionary: nothing to load
    if _is_dict(models):
        return models, None

    # Anything else is reported as a wrong input type
    _error_input_type("Models", "model file or model dictionary")
def getDistances(self, **kwargs):
    """Compute the distances between atom pairs and store them on the instance.

    The pairs are listed by listPairs at the requested rank, and the distances
    computed by computeDistances from self.positions. The distances are saved
    in self.distances and the rank used in self.rank.

    Argument(s):
        rank {int} -- (Opt., keyword only) Rank forwarded to listPairs.
                      Default is 6.

    Output(s):
        distances -- Value returned by computeDistances, also stored in
                     self.distances.
    """

    # Read the only supported keyword argument
    requested_rank = kwargs['rank'] if 'rank' in kwargs else 6
    if not _is_int(requested_rank):
        _error_input_type('Rank', str(int))

    # List the pairs at the given rank and measure their distances
    pair_indices = listPairs(self.infos, rank=requested_rank)
    self.distances = computeDistances(self.positions, pair_indices)

    # Remember which rank the stored distances correspond to
    self.rank = requested_rank

    return self.distances
def saveVoro(representation, file_path=None, format='.csv'):
    """Save a representation in a file.

    Thin wrapper: the input is checked, then everything is delegated to
    saveRepresentation.

    Argument(s):
        representation {class Tessellation} -- Instance of the class
                                               Tessellation including the
                                               representation on the system
                                               and its Voronoi tessellation.
        file_path {str} -- (Opt.) Path and name of the file to generate.
                           File extension should be .xml, .h5 or .csv
                           By default, the name is autogenerated as
                           "date_hour.csv" (e.g. 20201201_012345.csv)
        format {str} -- (Opt.) File extension and format to use for the
                        output file. Should be ".xml", ".h5" or ".csv"
                        Default is .csv
    """

    # Only Tessellation instances can be saved
    is_valid = _is_tessellation(representation)
    if not is_valid:
        _error_input_type('Tessellation', "instance of Tessellation class")

    # Write the file
    saveRepresentation(representation, file_path=file_path, format=format)
def _select_input_type(input, type_A, type_B): # Check if the input type is of type A if isinstance(input, type_A): is_A = True # Check if the input type is of type B elif isinstance(input, type_B): is_A = False # Raise an error else: is_A = False _error_input_type() return is_A
def setPhases(self, phases):
    """Assign manually the phases of the molecules of the system.

    Argument(s):
        phases {str or np.ndarray} -- Single phase given to every molecule in
                                      every frame, or array of phases whose
                                      shape is compared with the first two
                                      dimensions of self.positions.

    Output(s):
        phases -- Content of self.phases after the assignment.
    """

    # One name for everybody: repeat it over the first two dimensions of the
    # positions
    if _is_string(phases):
        frame_labels = [phases] * self.positions.shape[1]
        self.phases = np.array([frame_labels] * self.positions.shape[0])
        return self.phases

    # Ready-made array: check its shape before storing it as is
    if _is_array(phases):
        expected_shape = (self.positions.shape[0], self.positions.shape[1])
        _error_array_shape_match(phases, expected_shape)
        self.phases = phases
        return self.phases

    # Anything else is reported as a wrong input type
    _error_input_type("Phases","string or an array of strings")
    return self.phases
def readNeighbors(representation):
    """Read the phases of the neighbors of each molecule in a tessellation.

    Thin wrapper around Tessellation.checkNeighbors().

    Argument(s):
        representation {class Tessellation} -- Instance of the class
                                               Tessellation including the
                                               representation on the system
                                               and its Voronoi tessellation.

    Output(s):
        neighbors_phases {np.ndarray} -- Array of the phase of the neighbors
                                         of each molecule. Dimension(s) are in
                                         (n_frames, n_molecules, n_states).
        phases_list {np.ndarray} -- Array listing the phases analysed and the
                                    order used for the neighbors_phases array.
    """

    # Check that the input is a Tessellation
    if not _is_tessellation(representation):
        _error_input_type('Tessellation', "instance of Tessellation class")

    # Read the composition
    neighbors_phases, phases_list = representation.checkNeighbors()

    return neighbors_phases, phases_list
def getPhases(system, models):
    """Predict the phases of the molecules in a system based on the ML models trained previously.

    Thin wrapper around System.getPhases().

    Argument(s):
        system {class System} -- Instance of the system classes containing all
                                 the informations on the system as well as the
                                 positions and configurations.
        models {str or dict of models} -- Path to the model file to load or
                                          dictionary of the Scikit-Learn models
                                          to use to predict the states of the
                                          molecules.

    Output(s):
        phases {np.ndarray} -- Array of all the molecule phases predicted in
                               the system. Dimension(s) are in
                               (n_frames, n_molecules).
    """

    # Only System instances can be processed
    is_valid = _is_system(system)
    if not is_valid:
        _error_input_type("System", "instance of the System class")

    # Delegate the prediction to the system itself
    return system.getPhases(models)
def setPhases(system, phases):
    """Set manually the phases of the molecules in a system.

    Thin wrapper around System.setPhases().

    Argument(s):
        system {class System} -- Instance of the system classes containing all
                                 the informations on the system as well as the
                                 positions and configurations.
        phases {str or np.ndarray} -- Phases to assign manually to the
                                      molecules.

    Output(s):
        phases {np.ndarray} -- Array of all the molecule phases assigned in
                               the system. Dimension(s) are in
                               (n_frames, n_molecules).
    """

    # Only System instances can be processed
    is_valid = _is_system(system)
    if not is_valid:
        _error_input_type("System", "instance of the System class")

    # Delegate the assignment to the system itself
    return system.setPhases(phases)
def findTessellations(center_of_masses, boxes, ids, leaflets=None, ghosts=None, geometry='bilayer', threshold=0.01):
    """Run the tessellation routine matching the requested geometry.

    Argument(s):
        center_of_masses -- Centers of mass of the molecules, forwarded to the
                            tessellation routine.
        boxes -- Simulation boxes, forwarded to the tessellation routine.
        ids -- Molecule IDs, forwarded to the tessellation routine.
        leaflets -- (Opt.) Leaflets of the molecules. Only used by the
                    'bilayer' and 'vesicle' geometries. Default is None.
        ghosts -- (Opt.) Ghosts of the molecules. Only used by the
                  'bilayer_3d', 'vesicle_3d' and 'vesicle' geometries.
                  Default is None.
        geometry {str} -- (Opt.) One of 'bilayer', 'bilayer_3d', 'vesicle_3d',
                          'vesicle' or 'solution'. Default is 'bilayer'.
        threshold {float} -- (Opt.) Threshold forwarded to the tessellation
                             routine. Default is 0.01.

    Output(s):
        volumes, vertices, neighbours -- The three values returned by the
                                         selected tessellation routine.

    Raise(s):
        ValueError -- If the geometry is not one of the names listed above.
    """

    # Check the input
    if not _is_string(geometry):
        _error_input_type('Geometry', str(str))
    if not _is_float(threshold):
        _error_input_type('Threshold', str(float))

    # 2D bilayer: tessellation made with the leaflets
    if geometry == 'bilayer':
        outcome = _tessellation_bilayer(
            center_of_masses, boxes, ids, leaflets, threshold=threshold)

    # 3D bilayer or vesicle: tessellation made with the ghosts
    elif geometry in ('bilayer_3d', 'vesicle_3d'):
        outcome = _tessellation_3d_system(
            center_of_masses, boxes, ids, ghosts, threshold=threshold)

    # Leaflets of a vesicle: both leaflets and ghosts are needed
    elif geometry == 'vesicle':
        outcome = _tessellation_vesicle(
            center_of_masses, boxes, ids, leaflets, ghosts, threshold=threshold)

    # Solution: neither leaflets nor ghosts
    elif geometry == 'solution':
        outcome = _tessellation_solution(
            center_of_masses, boxes, ids, threshold=threshold)

    else:
        raise ValueError("The selected geometry is not valid.")

    volumes, vertices, neighbours = outcome
    return volumes, vertices, neighbours
def findLeaflets(center_of_masses, geometry='bilayer'):
    """Find the leaflets of the molecules with the routine matching the geometry.

    The geometry is matched by substring: any name containing 'bilayer' uses
    the bilayer routine, any other name containing 'vesicle' uses the vesicle
    one.

    Argument(s):
        center_of_masses {np.ndarray} -- Centers of mass of the molecules.
        geometry {str} -- (Opt.) Geometry of the system.
                          Default is 'bilayer'.

    Output(s):
        leaflets -- Value returned by the selected leaflet routine.

    Raise(s):
        ValueError -- If the geometry contains neither 'bilayer' nor 'vesicle'.
    """

    # Check the input
    if not _is_array(center_of_masses):
        _error_input_type('Positions', str(np.ndarray))
    if not _is_string(geometry):
        _error_input_type('Geometry', str(str))

    # Bilayer geometries (2D and 3D)
    if 'bilayer' in geometry:
        return _leaflets_bilayer(center_of_masses)

    # Vesicle geometries (2D and 3D)
    if 'vesicle' in geometry:
        return _leaflets_vesicle(center_of_masses)

    # Nothing matched
    raise ValueError("The selected geometry is not valid.")
def _coerce_trajectory(begin, end, step, max_length):
    """Check the frame selection and replace a missing last frame.

    Argument(s):
        begin {int} -- First frame. Expected between 0 and max_length - 1.
        end {int or None} -- Last frame. Expected between begin + 1 and
                             max_length. None is replaced by max_length.
        step {int} -- Frame step. Expected to be 1 or more.
        max_length {int} -- Upper limit used for the checks on begin and end.

    Output(s):
        begin, end, step -- The selection, with end replaced by max_length if
                            it was None.
    """

    # First frame: an integer inside [0, max_length - 1]
    if not _is_int(begin):
        _error_input_type('First frame', str(int))
    elif not (0 <= begin < max_length):
        _error_out_of_range(begin, 'First frame', 0, max_length - 1)

    # Last frame: either defaulted to the upper limit, or an integer inside
    # [begin + 1, max_length]
    if end is None:
        end = max_length
    elif not _is_int(end):
        _error_input_type('Last frame', str(int))
    elif not (begin < end <= max_length):
        _error_out_of_range(end, 'Last frame', begin + 1, max_length)

    # Step: a strictly positive integer
    if not _is_int(step):
        _error_input_type('Frame step', str(int))
    elif not step > 0:
        _error_out_of_range(step, 'Frame step', 1, "inf")

    return begin, end, step
def __init__(self, type, positions, infos, boxes):
    """Store the description of a molecule type and reset the computed attributes.

    Argument(s):
        type {str} -- Molecule type.
        positions {np.ndarray} -- Atom positions.
        infos {dict} -- Molecule type information dictionary.
        boxes {np.ndarray} -- Simulation box dimensions.
    """

    # Check the input of the function, one (is valid, label, expected) at a time
    checks = (
        (_is_string, type, 'Molecule type', str(str)),
        (_is_array, positions, 'Atom positions', str(np.ndarray)),
        (_is_dict, infos, 'Molecule type information dictionary', str(dict)),
        (_is_array, boxes, 'Simulation box dimensions', str(np.ndarray)),
    )
    for is_valid, value, label, expected in checks:
        if not is_valid(value):
            _error_input_type(label, expected)

    # Save the parameters
    self.type, self.positions = type, positions
    self.boxes, self.infos = boxes, infos

    # Nothing computed yet; -1 stands for "no rank set"
    self.coordinates = self.distances = self.phases = None
    self.rank = -1
def trainModel(coordinates, distances, states, validationSize=0.20, seed=7, nSplits=10):
    """Train the models on several random splits and keep the best scoring set.

    For each split, the samples are divided in three subsets: a first training
    set (1 - 2 * validationSize of the samples), a second training set and a
    verification set (validationSize each). Four classifiers are trained on
    the first set, a decision tree is trained on their predictions for the
    second set, and all of them are scored on the verification set.

    Argument(s):
        coordinates {np.ndarray} -- Coordinates of the samples, indexed by
                                    sample along the first dimension.
        distances {np.ndarray} -- Distances of the samples, indexed by sample
                                  along the first dimension.
        states {np.ndarray} -- State (label) of each sample.
        validationSize {float} -- (Opt.) Fraction of the samples used for the
                                  verification set. Expected in [0.01, 0.33).
                                  Default is 0.20.
        seed {int} -- (Opt.) Base random state; split number i uses seed + i.
                      Default is 7.
        nSplits {int} -- (Opt.) Number of splits to train and score.
                         Expected to be 1 or more. Default is 10.

    Output(s):
        best_models {dict} -- Models of the split with the highest total final
                              score, stored under the keys 'SVM_Coordinates',
                              'KNN_Coordinates', 'SVM_Distances',
                              'NB_Distances' and 'ClassificationTree'.
        training_scores -- First value returned by _merge_scores on the scores
                           of all the splits.
        training_errors -- Second value returned by _merge_scores.
    """

    # Check the inputs
    if not _is_float(validationSize, strict=True):
        _error_input_type('Validation size', str(float))
    elif validationSize < 0.01 or validationSize >= 0.33:
        _error_out_of_range(validationSize, 'Validation size', 0.01, 0.33)

    if not _is_int(seed):
        _error_input_type('Seed', str(int))

    if not _is_int(nSplits):
        _error_input_type('Number repetitions', str(int))
    elif nSplits <= 0:
        _error_out_of_range(nSplits, 'Number repetitions', 1, "inf")

    # np.unique returns the sorted unique states; they are the same for every
    # split, so they are computed only once
    state_names = np.unique(states)

    # Check the validation subset size
    _error_training_size(coordinates.shape[0] / state_names.shape[0], validationSize)

    # Make an ID array, used to split all the arrays in the same way
    systems_IDs = np.arange(states.shape[0])

    # Prepare the scoring and model selections
    best_total_score = 0
    best_models = {}
    all_scores = []

    # Do several trainings and verifications to get an average score
    for split in tqdm(range(nSplits), desc="Training models..."):
        split_seed = seed + split

        # Split the ID and label into training and verification subsets:
        # first 2 * validationSize is set aside, then cut in two halves
        ids_training_1, remaining_ids, states_training_1, remaining_states = model_selection.train_test_split(
            systems_IDs, states, test_size=2 * validationSize, random_state=split_seed)
        ids_training_2, ids_verification, states_training_2, states_verification = model_selection.train_test_split(
            remaining_ids, remaining_states, test_size=0.5, random_state=split_seed)

        # Split all the dataset
        coordinates_training_1 = coordinates[ids_training_1]
        coordinates_training_2 = coordinates[ids_training_2]
        coordinates_verification = coordinates[ids_verification]

        distances_training_1 = distances[ids_training_1]
        distances_training_2 = distances[ids_training_2]
        distances_verification = distances[ids_verification]

        # Train each model on the first training set
        models = {
            # SVM on coordinates
            'SVM_Coordinates': SVC(gamma='scale').fit(coordinates_training_1, states_training_1),
            # KNN on coordinates
            'KNN_Coordinates': KNeighborsClassifier().fit(coordinates_training_1, states_training_1),
            # SVM on distances
            'SVM_Distances': SVC(gamma='scale').fit(distances_training_1, states_training_1),
            # NB on distances
            'NB_Distances': GaussianNB().fit(distances_training_1, states_training_1),
        }

        # Use the models to make the prediction on the second training set
        final_training = _prediction_array(coordinates_training_2, distances_training_2, models)

        # Convert the array of state names into an array of state indices
        final_training_binary = np.zeros(final_training.shape)
        for state_index, state_name in enumerate(state_names):
            final_training_binary[final_training == state_name] = state_index

        # Train the classification tree on the predictions of the other models
        models['ClassificationTree'] = DecisionTreeClassifier().fit(
            final_training_binary, states_training_2)

        # Do all the scores
        model_scores = _make_detailed_score(
            models, coordinates_verification, distances_verification, states_verification)

        # Save the scores for general measurement
        all_scores.append(model_scores)

        # Keep the best set of models. The first set is always kept, so that
        # models are returned even if no split scores above 0
        split_score = model_scores['final_score']['total']
        if not best_models or split_score > best_total_score:
            best_total_score = split_score
            best_models = models

    # Merge the scores to generate the average score
    training_scores, training_errors = _merge_scores(all_scores)

    return best_models, training_scores, training_errors
def summonGhosts(systems, geometry='bilayer', exclude_ghosts=None):
    """Generate the ghosts of the molecules used for the Voronoi tessellation.

    The systems are merged into a single Tessellation instance to assign the
    leaflets, then generateGhosts is called once per system that is not
    excluded. The ghosts of all the systems are joined along their second
    dimension.

    Argument(s):
        systems {list of class System} -- Instances of the System classes
                                          containing the molecules to generate
                                          the ghosts for. A single System is
                                          also accepted.
        geometry {str} -- (Opt.) Geometry of the system, forwarded to
                          Tessellation.getLeaflets() and generateGhosts().
                          The current geometries available are:
                            *) bilayer - 2D lipid bilayer.
                            *) bilayer_3d - 3D lipid bilayer.
                            *) vesicle - Lipid vesicle, analysed per leaflet.
                            *) vesicle_3d - 3D lipid vesicle.
                          By default, the geometry is set to a (2D) bilayer.
        exclude_ghosts {list of int} -- (Opt.) List of systems indices,
                                        provided with the same order than in
                                        the argument systems, that should be
                                        excluded from ghost generation. A
                                        single index is also accepted.
                                        Default is None.

    Output(s):
        ghosts {np.ndarray} -- Position array of all the molecule ghosts
                               generated for the Voronoi tessellation.

    Raise(s):
        ValueError -- If every system is excluded, leaving no ghosts to merge.
    """

    # Convert single system in list
    if _is_system(systems):
        systems = [systems]

    # Check and convert input
    if not _is_list_of(systems, type='system', check_array=True, recursive=False):
        _error_input_type('Systems', 'List of System (or single System)')

    if exclude_ghosts is not None:
        if not _is_list(exclude_ghosts):
            exclude_ghosts = [exclude_ghosts]
        if not _is_list_of(exclude_ghosts, type='int', check_array=True, recursive=False):
            _error_input_type('Ghost exclusions', "List of integers")

    # No exclusion list means that every system is processed
    excluded_IDs = () if exclude_ghosts is None else exclude_ghosts

    # Extract the information from the system(s)
    representation = _system_to_tessellation(systems)

    # Get the leaflets
    representation.getLeaflets(geometry=geometry)

    # Generate the ghosts of every system that is not excluded
    all_ghosts = [
        generateGhosts(representation.positions, mol_type.positions,
                       mol_type.infos['resids'], representation.leaflets,
                       geometry=geometry)
        for system_ID, mol_type in enumerate(systems)
        if system_ID not in excluded_IDs
    ]

    # np.concatenate refuses an empty list; fail with an explicit message
    if not all_ghosts:
        raise ValueError(
            "No ghosts to generate: all the systems are excluded from ghost generation.")

    # Concatenate the ghosts. np.concatenate already returns a new array, so
    # no additional copy is needed
    return np.concatenate(all_ghosts, axis=1)
def _get_positions(coordinates_file, type=None, trj=None, heavy=True, type_info=None, begin=0, end=None, step=1):
    """Extract the positions of one molecule type and the box sizes of each frame.

    Argument(s):
        coordinates_file {str} -- Path to the coordinates file (.gro).
        type {str} -- Residue name of the molecules to extract. Checked
                      against the residue names found in the system.
        trj {str} -- (Opt.) Path to the trajectory file (.xtc or .trr).
                     If None, only the frame of the coordinates file is read
                     and begin, end and step are ignored. Default is None.
        heavy {bool} -- (Opt.) Discard the atoms of type H from the selection.
                        Default is True.
        type_info {dict} -- (Opt.) Molecule type information dictionary. If
                            provided, its 'n_molecules' entry is used instead
                            of counting the residue IDs of the selection.
                            Default is None.
        begin {int} -- (Opt.) First frame to read. Default is 0.
        end {int} -- (Opt.) Last frame to read (excluded). None stands for the
                     end of the trajectory. Default is None.
        step {int} -- (Opt.) Step between two frames read. Default is 1.

    Output(s):
        positions {np.ndarray} -- Positions of the selected atoms. Dimension(s)
                                  are in (n_frames, n_molecules, n_atoms, 3).
        boxes {np.ndarray} -- First three dimensions of the box at each frame.
                              Dimension(s) are in (n_frames, 3).
    """

    # ---------------------
    # CHECK THE USER INPUTS

    # Check the files extensions
    _check_input_file(coordinates_file, extensions=[".gro"])
    if trj is not None:
        _check_input_file(trj, extensions=[".xtc", ".trr"])

    # Load the system and set the time limits
    if trj is None:
        system = Universe(coordinates_file)
        # Without trajectory, only one frame is read
        begin, end, step = 0, 1, 1
    else:
        system = Universe(coordinates_file, trj)
        begin, end, step = _coerce_trajectory(begin, end, step, len(system.trajectory))

    # Check if the molecule type exists
    _error_molecule_type(type, np.unique(system.select_atoms("all").resnames))

    # Check if the other kwargs have the good format
    if not _is_boolean(heavy):
        _error_input_type('Heavy atom selection', str(bool))
    if type_info is not None and not _is_dict(type_info):
        _error_input_type('Molecule type information dictionary', str(dict))

    # ----------------
    # RUN THE FUNCTION

    # Create the selection
    selection_text = "resname " + type
    if heavy:
        selection_text += " and not type H"
    selected_molecules = system.select_atoms(selection_text)

    # Get the number of molecules, from the selection or the dictionary
    if type_info is None:
        n_molecules = np.unique(selected_molecules.resids).shape[0]
    else:
        n_molecules = type_info['n_molecules']

    # Read all the frames
    all_frames = []
    all_boxes = []
    for i_frame in tqdm(range(begin, end, step), desc='Extracting ' + type + ' positions...'):

        # Move to the selected frame; the value returned is not needed
        system.trajectory[i_frame]

        # Extract the positions
        current_positions = selected_molecules.positions

        # Reshape the flat list of atoms into (molecule, atom, xyz)
        n_atoms = int(current_positions.shape[0] / n_molecules)
        current_positions = np.reshape(current_positions, (n_molecules, n_atoms, 3))

        # Save the positions and a copy of the box dimensions
        all_frames.append(current_positions)
        all_boxes.append(np.copy(system.dimensions[0:3]))

    # Get the arrays
    positions = np.array(all_frames)
    boxes = np.array(all_boxes)

    return positions, boxes