def test_priority_queue():
    """ Test torsiondrive.priority_queue.PriorityQueue """
    queue = PriorityQueue()
    # a single pushed item should come back out unchanged
    single = 'task'
    queue.push(single)
    assert queue.pop() == single
    # priority ordering: a smaller priority value pops first,
    # and items with equal priority keep their insertion order
    task1 = 'task_fjei'
    task2 = 'task_ioue'
    task3 = 'task_asdb'
    for item, prio in [(task1, 2), (task2, 1), (task3, 2)]:
        queue.push(item, priority=prio)
    # task2 (priority 1) first, then task1 before task3 (both priority 2, FIFO)
    assert [queue.pop() for _ in range(3)] == [task2, task1, task3]
    # popping an empty queue raises IndexError
    with pytest.raises(IndexError):
        queue.pop()
    # push_many accepts a whole iterable of tasks at once
    queue = PriorityQueue()
    tasks = [f'task{i}' for i in range(20)]
    queue.push_many(tasks)
    # __len__ reflects the number of queued tasks
    assert len(queue) == 20
    # iterating the queue yields the tasks in pushed order
    for got, expected in zip(queue, tasks):
        assert got == expected
def repeat_scan_process(self):
    """
    Mimic DihedralScanner.master, but stop as soon as new optimization jobs
    need to be run (i.e. jobs not found in the task cache).

    On exit, self.next_jobs maps each target grid_id to a list of starting
    geometries that still need to be optimized externally; if it is empty,
    the whole scan has converged.
    """
    # make sure we're in the rootpath
    os.chdir(self.rootpath)
    # grid ids refined in the current iteration (green tiles in the plot)
    self.refined_grid_ids = set()
    # grid_id -> list of starting geometries that need new optimizations
    self.next_jobs = defaultdict(list)
    # results found in cache for this iteration; a PriorityQueue keeps the
    # parsing order deterministic across restarts
    self.current_finished_job_results = PriorityQueue()
    # push the initial tasks
    self.push_initial_opt_tasks()
    if len(self.opt_queue) == 0:
        print("No tasks in opt_queue! Exiting..")
        return
    # start the iteration from beginning
    while True:
        # print current status
        if self.verbose:
            if len(self.dihedrals) == 2:
                print(self.draw_ramachandran_plot())
            else:
                print(self.draw_ansi_image())
        # this function will try to read cache and decide if new jobs need to run
        self.launch_opt_jobs()
        # Break if any job was not found in the current cache
        if len(self.next_jobs) > 0:
            break
        # If all jobs were found in the current iteration, parse the results:
        # keep only the lowest-energy result per grid point
        current_best_grid_m = {}
        while len(self.current_finished_job_results) > 0:
            m, grid_id = self.current_finished_job_results.pop()
            if grid_id not in current_best_grid_m or m.qm_energies[0] < current_best_grid_m[grid_id].qm_energies[0]:
                current_best_grid_m[grid_id] = m
        # we only want refined results of the current iteration to show in
        # draw_ramachandran_plot(), so reset the set each round
        self.refined_grid_ids = set()
        # compare the best results between current iteration and all previous iterations
        newly_updated_grid_m = []
        for grid_id, m in current_best_grid_m.items():
            energy = m.qm_energies[0]
            # update current global minimum
            if self.global_minimum_energy is None or energy < self.global_minimum_energy:
                self.global_minimum_energy = energy
            if grid_id not in self.grid_energies:
                # first result ever seen for this grid point: always accept
                if self.verbose:
                    print("First energy for grid_id %s = %f" % (str(grid_id), m.qm_energies[0]))
                self.grid_energies[grid_id] = energy
                self.grid_final_geometries[grid_id] = m.xyzs[0]
                newly_updated_grid_m.append((grid_id, m))
            elif m.qm_energies[0] < self.grid_energies[grid_id] - self.energy_decrease_thresh:
                # accept only if the energy dropped by more than the threshold,
                # to avoid endless re-refinement from numerical noise
                if self.verbose:
                    print(f"Energy for grid_id {grid_id} decreased from {self.grid_energies[grid_id]} to {energy}")
                self.grid_energies[grid_id] = energy
                self.grid_final_geometries[grid_id] = m.xyzs[0]
                newly_updated_grid_m.append((grid_id, m))
                # we record the refined_grid_ids here to be printed as green tiles in draw_ramachandran_plot()
                self.refined_grid_ids.add(grid_id)
        # create new tasks for each newly_updated_grid_m
        for grid_id, m in newly_updated_grid_m:
            # every neighbor grid point will get one new task
            for neighbor_gid in self.grid_neighbors(grid_id):
                task = m, grid_id, neighbor_gid
                # validate task before pushing
                if self.validate_task(task):
                    # all jobs are pushed with the same priority for now, can be adjusted here
                    self.opt_queue.push(task)
        # check if all jobs finished
        if len(self.opt_queue) == 0 and len(self.next_jobs) == 0:
            print("All optimizations converged at lowest energy. Job Finished!")
            break
class DihedralScanRepeater(DihedralScanner):
    """ Child class of dihedral scanner, specifically designed to accommodate
    the requirements of the torsiondrive-API: instead of running optimizations
    itself, it replays the scan from a cache of finished jobs and reports the
    next set of jobs that still need to be computed externally. """

    def repeat_scan_process(self):
        """
        Mimic DihedralScanner.master, but stop as soon as new optimization jobs
        need to be run (i.e. jobs not found in the task cache).

        On exit, self.next_jobs maps each target grid_id to a list of starting
        geometries that still need to be optimized externally; if it is empty,
        the whole scan has converged.
        """
        # make sure we're in the rootpath
        os.chdir(self.rootpath)
        # grid ids refined in the current iteration (green tiles in the plot)
        self.refined_grid_ids = set()
        # grid_id -> list of starting geometries that need new optimizations
        self.next_jobs = defaultdict(list)
        # results found in cache for this iteration; a PriorityQueue keeps the
        # parsing order deterministic across restarts
        self.current_finished_job_results = PriorityQueue()
        # push the initial tasks
        self.push_initial_opt_tasks()
        if len(self.opt_queue) == 0:
            print("No tasks in opt_queue! Exiting..")
            return
        # start the iteration from beginning
        while True:
            # print current status
            if self.verbose:
                if len(self.dihedrals) == 2:
                    print(self.draw_ramachandran_plot())
                else:
                    print(self.draw_ansi_image())
            # this function will try to read cache and decide if new jobs need to run
            self.launch_opt_jobs()
            # Break if any job was not found in the current cache
            if len(self.next_jobs) > 0:
                break
            # If all jobs were found in the current iteration, parse the results:
            # keep only the lowest-energy result per grid point
            current_best_grid_m = {}
            while len(self.current_finished_job_results) > 0:
                m, grid_id = self.current_finished_job_results.pop()
                if grid_id not in current_best_grid_m or m.qm_energies[0] < current_best_grid_m[grid_id].qm_energies[0]:
                    current_best_grid_m[grid_id] = m
            # we only want refined results of the current iteration to show in
            # draw_ramachandran_plot(), so reset the set each round
            self.refined_grid_ids = set()
            # compare the best results between current iteration and all previous iterations
            newly_updated_grid_m = []
            for grid_id, m in current_best_grid_m.items():
                energy = m.qm_energies[0]
                # update current global minimum
                if self.global_minimum_energy is None or energy < self.global_minimum_energy:
                    self.global_minimum_energy = energy
                if grid_id not in self.grid_energies:
                    # first result ever seen for this grid point: always accept
                    if self.verbose:
                        print("First energy for grid_id %s = %f" % (str(grid_id), m.qm_energies[0]))
                    self.grid_energies[grid_id] = energy
                    self.grid_final_geometries[grid_id] = m.xyzs[0]
                    newly_updated_grid_m.append((grid_id, m))
                elif m.qm_energies[0] < self.grid_energies[grid_id] - self.energy_decrease_thresh:
                    # accept only if the energy dropped by more than the threshold,
                    # to avoid endless re-refinement from numerical noise
                    if self.verbose:
                        print(f"Energy for grid_id {grid_id} decreased from {self.grid_energies[grid_id]} to {energy}")
                    self.grid_energies[grid_id] = energy
                    self.grid_final_geometries[grid_id] = m.xyzs[0]
                    newly_updated_grid_m.append((grid_id, m))
                    # we record the refined_grid_ids here to be printed as green tiles in draw_ramachandran_plot()
                    self.refined_grid_ids.add(grid_id)
            # create new tasks for each newly_updated_grid_m
            for grid_id, m in newly_updated_grid_m:
                # every neighbor grid point will get one new task
                for neighbor_gid in self.grid_neighbors(grid_id):
                    task = m, grid_id, neighbor_gid
                    # validate task before pushing
                    if self.validate_task(task):
                        # all jobs are pushed with the same priority for now, can be adjusted here
                        self.opt_queue.push(task)
            # check if all jobs finished
            if len(self.opt_queue) == 0 and len(self.next_jobs) == 0:
                print("All optimizations converged at lowest energy. Job Finished!")
                break

    def rebuild_task_cache(self, grid_status):
        """
        Take a dictionary of finished optimizations and rebuild the
        self.task_cache dictionary. This mimics DihedralScanner.restore_task_cache().

        Parameters
        ----------
        grid_status: dict
            Key is the grid_id, value is a list of job_info. Each job_info is a
            tuple of (start_geo, end_geo, end_energy).
            Note: the order of the job_info entries matters when reproducing the
            same scan procedure, because job folder names are assigned by index.

        Returns
        -------
        None
            self.task_cache is populated with the information needed to replay
            the entire scan process. Folder paths point under self.tmp_folder_name
            (e.g. 'opt_tmp'), corresponding to the finished jobs.

        NOTE(review): entries here are 3-tuples (end_geo, end_energy, job_path),
        matching this class's launch_opt_jobs; the parent class's cache format
        differs — confirm the two are never mixed.
        """
        for grid_id, job_info_list in grid_status.items():
            # folder name pattern matches DihedralScanner, e.g. 'gid_+030_-060'
            tname = 'gid_' + '_'.join('%+04d' % gid for gid in grid_id)
            tmp_folder_path = os.path.join(self.tmp_folder_name, tname)
            for i_job, job_info in enumerate(job_info_list):
                # job folders are numbered from 1, in the given order
                job_path = os.path.join(tmp_folder_path, str(i_job + 1))
                (start_geo, end_geo, end_energy) = job_info
                # the starting geometry keys the cache lookup in launch_opt_jobs
                job_geo_key = get_geo_key(start_geo)
                self.task_cache[grid_id][job_geo_key] = (end_geo, end_energy, job_path)

    def launch_opt_jobs(self):
        """
        Mimic DihedralScanner.launch_opt_jobs, but never run an optimization:
        pop every task from self.opt_queue, and either
        (a) resolve it from self.task_cache into self.current_finished_job_results, or
        (b) record its starting geometry in self.next_jobs (the torsiondrive-API
            output of jobs that still need to be computed).
        """
        # both attributes are (re)set by repeat_scan_process before each call
        assert hasattr(self, 'next_jobs') and hasattr(self, 'current_finished_job_results')
        while len(self.opt_queue) > 0:
            m, from_grid_id, to_grid_id = self.opt_queue.pop()
            # check if this job already exists in the cache
            m_geo_key = get_geo_key(m.xyzs[0])
            if m_geo_key in self.task_cache[to_grid_id]:
                final_geo, final_energy, job_folder = self.task_cache[to_grid_id][m_geo_key]
                # rebuild a result molecule from the cached final geometry/energy
                result_m = Molecule()
                result_m.elem = list(m.elem)
                result_m.xyzs = [final_geo]
                result_m.qm_energies = [final_energy]
                result_m.build_topology()
                # returns None if the optimized geometry drifted off the target grid point
                grid_id = self.get_dihedral_id(result_m, check_grid_id=to_grid_id)
                if grid_id is None:
                    print(f"Cached result from {job_folder} is ignored because optimized geometry is far from grid id {to_grid_id}")
                else:
                    # priority = job_folder path keeps result-parsing order deterministic
                    self.current_finished_job_results.push((result_m, grid_id), priority=job_folder)
            else:
                # append the job to self.next_jobs, which is the output of torsiondrive-API
                self.next_jobs[to_grid_id].append(m.xyzs[0].copy())
def master(self):
    """
    The master function that calls all other functions.
    This function will run the following steps:
    1. Launch a new set of jobs from self.opt_queue, add their job path to a dictionary
    2. Check if any running job has finished
    3. For each finished job, check if energy is lower than existing one, if so,
       add its neighbor grid points to opt_queue
    4. Go back to the 1st step, loop until all jobs finished, indicated by
       opt_queue and running jobs both empty.
    """
    print("Master process started!")
    self.push_initial_opt_tasks()
    if len(self.opt_queue) == 0:
        print("No tasks in opt_queue! Exiting..")
        return
    # make sure we're in the rootpath
    os.chdir(self.rootpath)
    # check if the tmp folder exists
    if os.path.isdir(self.tmp_folder_name):
        # use existing tmp folder and read task cache
        self.restore_task_cache()
    else:
        # setup new tmp folders
        self.create_tmp_folder()
    # dictionary that saves the information for each running job, like orig m, orig grid_id, target grid_id
    self.running_job_path_info = dict()
    # Queue that saves the finished job results for each iteration
    # In each iteration, this will be populated by job results from task cache, and from new calculations
    # After parsing the finished jobs, this will be emptied for the next iteration
    # We used a PriorityQueue here so the order of parsing finished jobs will be kept
    self.current_finished_job_results = PriorityQueue()
    # print scan status interval
    start_time = last_print_time = time.time()
    # the minimum time interval between prints
    min_print_interval = -1  # Disabled for now
    # store the grid ids that have found lower energy than existing one, for draw_ramachandran_plot()
    self.refined_grid_ids = set()
    # save the status of grid from beginning of run, useful when generating state files
    # self.grid_status = collections.defaultdict(list)
    while True:
        # check if it's time to show the status
        current_time = time.time()
        if self.verbose and current_time - last_print_time > min_print_interval:
            print("Scan Status at %d s" % (current_time - start_time))
            try:
                if len(self.dihedrals) == 2:
                    print(self.draw_ramachandran_plot())
                else:
                    print(self.draw_ansi_image())
            except UnicodeEncodeError:
                # status map uses unicode tiles; skip it on limited terminals
                print("Warning: UnicodeEncodeError occured, status map not printed.")
            last_print_time = current_time
        # Launch all jobs in self.opt_queue
        # new jobs will be put into self.running_job_path_info
        # job results found in self.task_cache will be added to self.current_finished_job_results
        self.launch_opt_jobs()
        # wait until all jobs finish, take out from self.running_job_path_info
        while len(self.running_job_path_info) > 0:
            self.wait_extract_finished_jobs()
        # check all finished jobs and keep the best (lowest-energy) one per grid point
        current_best_grid_m = dict()
        while len(self.current_finished_job_results) > 0:
            m, grid_id = self.current_finished_job_results.pop()
            if grid_id not in current_best_grid_m or m.qm_energies[0] < current_best_grid_m[grid_id].qm_energies[0]:
                current_best_grid_m[grid_id] = m
        # we only want refined results in current iteration to show in draw_ramachandran_plot()
        self.refined_grid_ids = set()
        # compare the best results between current iteration and all previous iterations
        newly_updated_grid_m = []
        for grid_id, m in current_best_grid_m.items():
            energy = m.qm_energies[0]
            # update current global minimum
            if self.global_minimum_energy is None or energy < self.global_minimum_energy:
                self.global_minimum_energy = energy
            updating_grid_point = False
            if grid_id not in self.grid_energies:
                # first result ever seen for this grid point: always accept
                if self.verbose:
                    print(f"First energy for grid_id {grid_id} = {energy}")
                updating_grid_point = True
            elif energy < self.grid_energies[grid_id] - self.energy_decrease_thresh:
                # accept only if the energy dropped by more than the threshold,
                # to avoid endless re-refinement from numerical noise
                if self.verbose:
                    print(f"Energy for grid_id {grid_id} decreased from {self.grid_energies[grid_id]} to {energy}")
                updating_grid_point = True
                # we record the refined_grid_ids here to be printed as green tiles in draw_ramachandran_plot()
                self.refined_grid_ids.add(grid_id)
            if updating_grid_point:
                self.grid_energies[grid_id] = energy
                self.grid_final_geometries[grid_id] = m.xyzs[0]
                # gradients are optional on the result molecule
                if hasattr(m, 'qm_grads'):
                    self.grid_final_gradients[grid_id] = m.qm_grads[0]
                newly_updated_grid_m.append((grid_id, m))
        # create new tasks for each newly_updated_grid_m
        for grid_id, m in newly_updated_grid_m:
            # every neighbor grid point will get one new task
            for neighbor_gid in self.grid_neighbors(grid_id):
                task = m, grid_id, neighbor_gid
                # validate task before pushing
                if self.validate_task(task):
                    # all jobs are pushed with the same priority for now, can be adjusted here
                    self.opt_queue.push(task)
        # check if all jobs finished
        if len(self.opt_queue) == 0 and len(self.running_job_path_info) == 0:
            print("All optimizations converged at lowest energy. Job Finished!")
            break
    # the finish function will write files like scan.xyz, qdata.txt to disk
    self.finish()
def __init__(self,
             engine,
             dihedrals,
             grid_spacing,
             init_coords_M=None,
             energy_decrease_thresh=None,
             dihedral_ranges=None,
             energy_upper_limit=None,
             extra_constraints=None,
             verbose=False):
    """ Set up the scanner: validate dihedral/grid inputs, build the grid and
    range mask, and initialize all bookkeeping containers for the scan. """
    self.engine = engine
    # store verbose flag for later printing
    self.verbose = verbose
    # normalize each dihedral to a tuple of 4 ints and reject duplicates
    # (a dihedral and its reverse describe the same torsion)
    self.dihedrals = []
    for raw_dihedral in dihedrals:
        assert len(raw_dihedral) == 4, "each dihedral in dihedrals should have 4 indices, e.g. (1,2,3,4)"
        dihedral_tuple = tuple(map(int, raw_dihedral))
        assert dihedral_tuple not in self.dihedrals and dihedral_tuple[::-1] not in self.dihedrals, "All dihedrals should be unique"
        self.dihedrals.append(dihedral_tuple)
    self.grid_dim = len(self.dihedrals)
    # every spacing must evenly divide 360 degrees
    for spacing in grid_spacing:
        assert (0 < spacing < 360) and (360 % spacing == 0), f"grid_spacing {grid_spacing} is not valid, all values should be a divisor of 360"
    assert len(grid_spacing) == self.grid_dim, f"Number of grid spacings {len(grid_spacing)} is not consistent with number of dihedrals {self.grid_dim}"
    self.grid_spacing = tuple(map(int, grid_spacing))
    self.setup_grid()
    # dihedral range limits (kept for sanity check; empty list means unrestricted)
    if dihedral_ranges is not None:
        self.dihedral_ranges = dihedral_ranges
    else:
        self.dihedral_ranges = []
    self.dihedral_mask = self.build_dihedral_mask(dihedral_ranges)
    # energy limit for skipping high energy starting points
    self.global_minimum_energy = None
    if energy_upper_limit is not None:
        self.energy_upper_limit = float(energy_upper_limit)
    else:
        self.energy_upper_limit = None
    # extra_constraints does not change, will be passed to engine for generating input files
    self.extra_constraints = extra_constraints
    # queue of constrained optimization tasks
    self.opt_queue = PriorityQueue()
    # prefer init_coords_M; otherwise fall back to the engine's template molecule
    # indexing is used because `for m in init_coords_M` fails:
    # m.measure_dihedrals breaks on the different m.xyzs shape
    if init_coords_M is not None:
        self.init_coords_M = [init_coords_M[i] for i in range(len(init_coords_M))]
    else:
        self.init_coords_M = [self.engine.M]
    # lowest energy found so far at each grid point
    self.grid_energies = dict()
    # geometry corresponding to the lowest energy at each grid point
    self.grid_final_geometries = dict()
    # gradient at the lowest-energy geometry for each grid point (optional)
    self.grid_final_gradients = dict()
    # save current path as the rootpath (shared with the engine)
    self.rootpath = self.engine.rootpath = os.getcwd()
    # path for temporary optimization files to be saved
    self.tmp_folder_name = 'opt_tmp'
    # task cache for restoring: {grid_id: {geo_key: result}}
    self.task_cache = collections.defaultdict(dict)
    # filename for storing finished task result
    self.task_result_fname = 'dihedral_scanner_task_result.p'
    # threshold for determining the energy decrease
    if energy_decrease_thresh is not None:
        self.energy_decrease_thresh = energy_decrease_thresh
    else:
        self.energy_decrease_thresh = 1e-5
class DihedralScanner: """ DihedralScanner class is designed to create a dihedral grid, and fill in optimized geometries and energies into the grid, by running wavefront propagations of constrained optimizations parameters ---------- engine: QMEngine() instance An QMEngine object, e.g. EnginePsi4, EngineQChem or EngineTerachem dihedrals: List[(d1, d2, d3, d4), ..] list of dihedral index tuples (d1, d2, d3, d4). The length of list determines the dimension of the grid i.e. dihedrals = [(0,1,2,3)] --> 1-D scan, dihedrals = [(0,1,2,3),(1,2,3,4)] --> 2-D Scan grid_spacing: Int Distance (in Degrees) between grid points, correspond to each dihedral, every value must be a divisor of 360 init_coords_M: geometric.molecule.Molecule() instance A Molecule constains a series of initial geometries to start with energy_decrease_thresh: Float The threshold of the smallest energy decrease amount to trigger activating optimizations from grid point. dihedral_ranges: List[(lower, upper), ..] A list of dihedral range limits as a pair (lower, upper), each range corresponds to the dihedrals in input. energy_upper_limit: Float or None The threshold if the energy of a grid point that is higher than the current global minimum, to start new optimizations, in unit of a.u. i.e. if energy_upper_limit = 0.05, current global minimum energy is -9.9 , then a new task starting with energy -9.8 will be skipped. extra_constraints: Dict A nested dictionary specifing extra constraints in geomeTRIC format. 
Details in extra_constraints.py verbose: bool let methods print more information when running """ def __init__(self, engine, dihedrals, grid_spacing, init_coords_M=None, energy_decrease_thresh=None, dihedral_ranges=None, energy_upper_limit=None, extra_constraints=None, verbose=False): self.engine = engine # store verbose flag for later printing self.verbose = verbose # validate input dihedral format self.dihedrals = [] for dihedral in dihedrals: assert len( dihedral ) == 4, "each dihedral in dihedrals should have 4 indices, e.g. (1,2,3,4)" dihedral_tuple = tuple(map(int, dihedral)) assert dihedral_tuple not in self.dihedrals and dihedral_tuple[:: -1] not in self.dihedrals, "All dihedrals should be unique" self.dihedrals.append(dihedral_tuple) self.grid_dim = len(self.dihedrals) for gs in grid_spacing: assert (0 < gs < 360) and ( 360 % gs == 0 ), f"grid_spacing {grid_spacing} is not valid, all values should be a divisor of 360" assert len( grid_spacing ) == self.grid_dim, f"Number of grid spacings {len(grid_spacing)} is not consistent with number of dihedrals {self.grid_dim}" self.grid_spacing = tuple(map(int, grid_spacing)) self.setup_grid() # validate dihedral ranges and build mask self.dihedral_ranges = dihedral_ranges if dihedral_ranges is not None else [ ] # for sanity check self.dihedral_mask = self.build_dihedral_mask(dihedral_ranges) # energy limit for high energy points self.global_minimum_energy = None self.energy_upper_limit = float( energy_upper_limit) if energy_upper_limit is not None else None # extra_constraints does not change, will be passed to engine for generating input files self.extra_constraints = extra_constraints # create a optiimization job queue self.opt_queue = PriorityQueue() # try to use init_coords_M first, if not given, use M in engine's template # `for m in init_coords_M` doesn't work since m.measure_dihedrals will fail because it has different m.xyzs shape self.init_coords_M = [ init_coords_M[i] for i in range(len(init_coords_M)) ] 
if init_coords_M is not None else [self.engine.M] # dictionary that stores the lowest energy for each grid point self.grid_energies = dict() # dictionary that stores the geometries corresponding to lowest energy for each grid point self.grid_final_geometries = dict() # dictionary that stores the gradients to lowest energy for each grid point (optional) self.grid_final_gradients = dict() # save current path as the rootpath self.rootpath = self.engine.rootpath = os.getcwd() # path for temporary optimization files to be saved self.tmp_folder_name = 'opt_tmp' # task cache for restoring self.task_cache = collections.defaultdict(dict) # filename for storing finished task result self.task_result_fname = 'dihedral_scanner_task_result.p' # threshold for determining the energy decrease self.energy_decrease_thresh = energy_decrease_thresh if energy_decrease_thresh is not None else 1e-5 #---------------------- # Initializing methods #---------------------- def setup_grid(self): """ Set up grid ids, each as a tuple with size corresponding to grid dimension. i.e. 1-D: grid_ids = ( (-165, ), (-150, ), ... (180, ) ) 2-D: grid_ids = ( (-165,-165), (-165,-150), ... (180,180) ) This function is called by the initializer. self.grid_axes is also initialized, to be a full range of grid values for each dihedral, i.e., 1-D: grid_axes = [range(-165, 195, 15)] 2-D: grid_axes = [range(-165, 195, 15), range(-165, 195, 15)] """ self.grid_axes = [] for gs in self.grid_spacing: self.grid_axes.append(range(-180 + gs, 180 + gs, gs)) self.grid_ids = tuple(itertools.product(*self.grid_axes)) def build_dihedral_mask(self, dihedral_ranges): """ Build a dihedral mask based on specified ranges Parameters ---------- dihedral_ranges: List[(lower: Int, upper: Int), ..] The range limits corresponding to each dihedral angle A full dihedral range is [-180, 180] The upper limit up to 360 is supported for the purpose of specifying range limits crossing the boundary, e.g. 
[80, 240], which effectively become [-180, 120] + [80, 180] Returns ------- dihedral_mask: List[set(), ..] The dihedral mask is a list of sets, each set contains all available values for one dihedral angle Notes ----- This function should be called after self.setup_grid() """ if not dihedral_ranges: return None assert all(l >= -180 and u <= 360 and l < u for l, u in dihedral_ranges), \ f'Dihedral ranges {dihedral_ranges} mistaken, range should be within [-180, 360]' assert len(dihedral_ranges) == len( self.dihedrals ), f'Dihedral ranges {dihedral_ranges} do not have consistent length to dihedrals {self.dihedrals}' if self.verbose: print( f"Dihedral scan initialized with range limit {dihedral_ranges}" ) dihedral_mask = [] for (l, u), ax in zip(dihedral_ranges, self.grid_axes): if u > 180: # the "split range" case dmask = {g for g in ax if g >= l or g <= u - 360} else: # the normal case dmask = {g for g in ax if l <= g <= u} # handle special case of l == -180 # needed so that 180 actually exists in dmask, and is computed if l == -180: dmask.add(180) dihedral_mask.append(dmask) return dihedral_mask #-------------------- # General methods #-------------------- def get_dihedral_id(self, molecule, check_grid_id=None): """ Compute the closest grid ID for molecule (only first frame) If check_grid_id is given, will perform a check if the computed dihedral_values are close to the grid_id provided If the check is not passed, this function will return None """ dihedral_values = measure_dihedrals(molecule, self.dihedrals) if check_grid_id is not None: assert len(check_grid_id) == len( dihedral_values), "Grid dimensions should be the same!" for dv, dref in zip(dihedral_values, check_grid_id): diff = abs(dv - dref) if min(diff, abs(360 - diff)) > 0.5: print( "Warning! 
dihedral values inconsistent with check_grid_id" ) print( f'dihedral_values {dihedral_values}; check_grid_id {check_grid_id}' ) return None # here we shift the dihedral by +180 then shift back because -180 is the actual origin of the grid # this allows grid_spacing of 24 dihedral_id = (np.round( (dihedral_values + 180) / self.grid_spacing) * self.grid_spacing - 180).astype(int) # we return a tuples as the grid_id return tuple([normalize_dihedral(d) for d in dihedral_id]) def grid_neighbors(self, grid_id): """ Take a center grid id, return all the neighboring grid ids, in each dimension """ neighbor_gridids = [] for i_dim in range(len(grid_id)): gs = self.grid_spacing[i_dim] lower_neighbor = list(grid_id) lower_neighbor[i_dim] = normalize_dihedral(grid_id[i_dim] - gs) neighbor_gridids.append(tuple(lower_neighbor)) higher_neighbor = list(grid_id) higher_neighbor[i_dim] = normalize_dihedral(grid_id[i_dim] + gs) neighbor_gridids.append(tuple(higher_neighbor)) return tuple(neighbor_gridids) def grid_full_neighbors(self, grid_id): """ Take a center grid id, return all the neighboring grid ids, in all dimensions """ # Note: This function is not in use now, because it's very expensive (and probably unnecessary) neighbor_gids_each_dim = [] for gid_each_dim, gs in zip(grid_id, self.grid_spacing): lower_neighbor = normalize_dihedral(gid_each_dim - gs) higher_neighbor = normalize_dihedral(gid_each_dim + gs) neighbor_gids_each_dim.append((lower_neighbor, higher_neighbor)) return tuple(itertools.product(*neighbor_gids_each_dim)) #---------------------- # Master method #---------------------- def master(self): """ The master function that calls all other functions. This function will run the following steps: 1. Launch a new set of jobs from self.opt_queue, add their job path to a dictionary 2. Check if any running job has finished 3. For each finished job, check if energy is lower than existing one, if so, add its neighbor grid points to opt_queue 4. 
Go back to the 1st step, loop until all jobs finished, indicated by opt_queue and running jobs both empty. """ print("Master process started!") self.push_initial_opt_tasks() if len(self.opt_queue) == 0: print("No tasks in opt_queue! Exiting..") return # make sure we're in the rootpath os.chdir(self.rootpath) # check if the tmp folder exists if os.path.isdir(self.tmp_folder_name): # use existing tmp folder and read task cache self.restore_task_cache() else: # setup new tmp folders self.create_tmp_folder() # dictionary that saves the information for each running job, like orig m, orig grid_id, target grid_id self.running_job_path_info = dict() # Queue that saves the finished job results for each iteration # In each iteration, this will be populated by job results from task cache, and from new calculations # After parsing the finished jobs, this will emptied for the next iteration # We used a PriorityQueue here so the order of parsing finished jobs will be kept self.current_finished_job_results = PriorityQueue() # print scan status interval start_time = last_print_time = time.time() # the minimum time interval between prints min_print_interval = -1 # Disabled for now # store the grid ids that have found lower energy than existing one, for draw_ramachandran_plot() self.refined_grid_ids = set() # save the status of grid from beginning of run, useful when generating state files # self.grid_status = collections.defaultdict(list) while True: # check if it's time to show the status current_time = time.time() if self.verbose and current_time - last_print_time > min_print_interval: print("Scan Status at %d s" % (current_time - start_time)) try: if len(self.dihedrals) == 2: print(self.draw_ramachandran_plot()) else: print(self.draw_ansi_image()) except UnicodeEncodeError: print( "Warning: UnicodeEncodeError occured, status map not printed." 
) last_print_time = current_time # Launch all jobs in self.opt_queue # new jobs will be put into self.running_job_path_info # job results found in self.task_cache will be added to self.current_finished_job_results self.launch_opt_jobs() # wait until all jobs finish, take out from self.running_job_path_info while len(self.running_job_path_info) > 0: self.wait_extract_finished_jobs() # check all finished jobs and keep the best ones for the current iteration current_best_grid_m = dict() while len(self.current_finished_job_results) > 0: m, grid_id = self.current_finished_job_results.pop() if grid_id not in current_best_grid_m or m.qm_energies[ 0] < current_best_grid_m[grid_id].qm_energies[0]: current_best_grid_m[grid_id] = m # we only want refined results in current iteration to show in draw_ramachandran_plot() self.refined_grid_ids = set() # compare the best results between current iteration and all previous iterations newly_updated_grid_m = [] for grid_id, m in current_best_grid_m.items(): energy = m.qm_energies[0] # update current global minimum if self.global_minimum_energy is None or energy < self.global_minimum_energy: self.global_minimum_energy = energy updating_grid_point = False if grid_id not in self.grid_energies: if self.verbose: print(f"First energy for grid_id {grid_id} = {energy}") updating_grid_point = True elif energy < self.grid_energies[ grid_id] - self.energy_decrease_thresh: if self.verbose: print( f"Energy for grid_id {grid_id} decreased from {self.grid_energies[grid_id]} to {energy}" ) updating_grid_point = True # we record the refined_grid_ids here to be printed as green tiles in draw_ramachandran_plot() self.refined_grid_ids.add(grid_id) if updating_grid_point: self.grid_energies[grid_id] = energy self.grid_final_geometries[grid_id] = m.xyzs[0] if hasattr(m, 'qm_grads'): self.grid_final_gradients[grid_id] = m.qm_grads[0] newly_updated_grid_m.append((grid_id, m)) # create new tasks for each newly_updated_grid_m for grid_id, m in 
newly_updated_grid_m: # every neighbor grid point will get one new task for neighbor_gid in self.grid_neighbors(grid_id): task = m, grid_id, neighbor_gid # validate task before pushing if self.validate_task(task): # all jobs are pushed with the same priority for now, can be adjusted here self.opt_queue.push(task) # check if all jobs finished if len(self.opt_queue) == 0 and len( self.running_job_path_info) == 0: print( "All optimizations converged at lowest energy. Job Finished!" ) break # the finish function will write files like scan.xyz, qdata.txt to disk self.finish() #---------------------------------- # Utility methods Called by Master #---------------------------------- def validate_task(self, task): """ Validate a constrained optimization task before pushing to the queue. This is useful to limit the dihedrals into a range of interest. Parameters ---------- task: (m, from_grid_id, to_grid_id) A constrained optimization task Returns ------- isValid: bool True if the task is valid """ m, from_grid_id, to_grid_id = task # check if dihedral is in mask if self.dihedral_mask is not None: for d, dmask in zip(to_grid_id, self.dihedral_mask): if d not in dmask: if self.verbose: print( f"Task with target grid_id {to_grid_id} skipped because {d} doesn't fit in range limit" ) return False # check if energy is higher than limit if self.energy_upper_limit is not None: if self.global_minimum_energy is not None and hasattr( m, 'qm_energies') and len(m.qm_energies) > 0: abs_energy_upper_limit = self.global_minimum_energy + self.energy_upper_limit if m.qm_energies[0] > abs_energy_upper_limit: if self.verbose: print(f"Task {from_grid_id} => {to_grid_id} skipped") print( f"Reason: starting energy {m.qm_energies[0]} is more than {self.energy_upper_limit} higher than current global minimum {self.global_minimum_energy} in a.u." 
) return False return True def push_initial_opt_tasks(self): """ Push a set of initial tasks to self.opt_queue A task is defined as (m, from_grid_id, to_grid_id) tuple, where geometry is stored in m """ for m in self.init_coords_M: from_grid_id = to_grid_id = self.get_dihedral_id(m) task = (m, from_grid_id, to_grid_id) if self.validate_task(task): self.opt_queue.push(task) if self.verbose: print( f"{len(self.init_coords_M)} initial tasks pushed to opt_queue") def save_task_cache(self, job_path, m_init, m_final): """ Save a file containing the finished job information to a pickle file on disk. The format should be consistent with self.restore_task_cache() """ final_energy = m_final.qm_energies[0] task_result = { 'initial_geo': m_init.xyzs[0], 'final_geo': m_final.xyzs[0], 'final_energy': final_energy } if hasattr(m_final, 'qm_grads'): task_result['final_gradient'] = m_final.qm_grads[0] with open( os.path.join(self.rootpath, job_path, self.task_result_fname), 'wb') as pickleout: pickle.dump(task_result, pickleout) def restore_task_cache(self): """ Restore previous finished tasks from tmp folder. 1. Look into tmp folder and read scanner_settings.json, check if it matches current setting 2. Read the result pickle file from each leaf folder, into task_cache If successful, self.tmp_folder_dict will be initialized, same as self.create_tmp_folder(), and self.task_cache will be populated, with task caches, defined in this way: self.task_cache = {(30,-60): {geo_key: (final_geo, final_energy, final_gradient, job_folder)}} final_gradient will be None if it's not available. 
""" if self.verbose: print("Restoring from %s" % self.tmp_folder_name) # check if this scan matches the previous scan settings_fname = os.path.join(self.tmp_folder_name, 'scanner_settings.json') with open(settings_fname) as jsonfile: scanner_settings = json.load(jsonfile) err_msg = " does not match current one, please delete %s to restart" % self.tmp_folder_name assert len(self.dihedrals) == len( scanner_settings['dihedrals']), 'Setting [dihedrals] ' + err_msg assert np.array_equal(np.array(self.dihedrals), np.array(scanner_settings['dihedrals']) ), 'Setting [dihedrals] ' + err_msg assert np.array_equal(self.grid_spacing, scanner_settings['grid_spacing'] ), 'Setting [grid_spacing] ' + err_msg if 'energy_decrease_thresh' in scanner_settings: assert self.energy_decrease_thresh == scanner_settings[ 'energy_decrease_thresh'], 'Setting [energy_decrease_thresh] ' + err_msg if 'dihedral_ranges' in scanner_settings: assert np.array_equal(self.dihedral_ranges, scanner_settings['dihedral_ranges'] ), 'Setting [dihedral_ranges] ' + err_msg if 'extra_constraints' in scanner_settings: assert json.dumps( self.extra_constraints, sort_keys=True) == json.dumps( scanner_settings['extra_constraints'], sort_keys=True), 'Setting [extra_constraints] ' + err_msg if 'energy_upper_limit' in scanner_settings: assert self.energy_upper_limit == scanner_settings[ 'energy_upper_limit'], 'Setting [energy_upper_limit] ' + err_msg # read all finished jobs in tmp folder self.tmp_folder_dict = dict() n_cache = 0 for grid_id in self.grid_ids: tname = 'gid_' + '_'.join('%+04d' % gid for gid in grid_id) tmp_folder_path = os.path.join(self.tmp_folder_name, tname) self.tmp_folder_dict[grid_id] = tmp_folder_path existing_job_folders = [ os.path.join(tmp_folder_path, f) for f in os.listdir(tmp_folder_path) ] for job_folder in existing_job_folders: result_fname = os.path.join(job_folder, self.task_result_fname) if os.path.isfile(result_fname): try: task_result = pickle.load(open(result_fname, 'rb')) 
task_geo_key = get_geo_key(task_result['initial_geo']) self.task_cache[grid_id][task_geo_key] = ( task_result['final_geo'], task_result['final_energy'], task_result.get('final_gradient', None), job_folder) n_cache += 1 except Exception as e: print(f"Error while loading {result_fname}:" + str(e)) pass if self.verbose: print("Successfully loaded %s cached results" % n_cache) def create_tmp_folder(self): """ Create an empty tmp folder structure, save the paths for each grid point into self.tmp_folder_dict Examples -------- self.tmp_folder_dict = {(30,-70): "opt_tmp/gid_+030_-070", ..} """ assert hasattr(self, 'grid_ids'), 'Call self.setup_grid() first' os.mkdir(self.tmp_folder_name) # save current scan settings scanner_settings = { 'dihedrals': self.dihedrals, 'grid_spacing': self.grid_spacing, 'energy_decrease_thresh': self.energy_decrease_thresh, 'dihedral_ranges': self.dihedral_ranges, 'extra_constraints': self.extra_constraints, 'energy_upper_limit': self.energy_upper_limit, } settings_fname = os.path.join(self.rootpath, self.tmp_folder_name, 'scanner_settings.json') with open(settings_fname, 'w') as jsonfile: json.dump(scanner_settings, jsonfile) # create folders and save their path to self.tmp_folder_dict tmp_folder_dict = dict() for grid_id in self.grid_ids: tname = 'gid_' + '_'.join('%+04d' % gid for gid in grid_id) tmp_folder_path = os.path.join(self.tmp_folder_name, tname) os.mkdir(tmp_folder_path) tmp_folder_dict[grid_id] = tmp_folder_path self.tmp_folder_dict = tmp_folder_dict def launch_opt_jobs(self): """ Launch constrained optimizations for molecules in opt_queue Tasks current opt_queue will be popped in order. 
If a task exist in self.task_cache, the cached result will be checked, then put into self.current_finished_job_results Else, the task will be launched by self.launch_constrained_opt, and information is saved as self.running_job_path_info[job_path] = m, from_grid_id, to_grid_id """ assert hasattr(self, 'running_job_path_info') and hasattr( self, 'current_finished_job_results') while len(self.opt_queue) > 0: m, from_grid_id, to_grid_id = self.opt_queue.pop() # check if this job already exists m_geo_key = get_geo_key(m.xyzs[0]) if m_geo_key in self.task_cache[to_grid_id]: final_geo, final_energy, final_gradient, job_folder = self.task_cache[ to_grid_id][m_geo_key] result_m = Molecule() result_m.elem = list(m.elem) result_m.xyzs = [final_geo] result_m.qm_energies = [final_energy] if final_gradient is not None: result_m.qm_grads = [final_gradient] result_m.build_topology() grid_id = self.get_dihedral_id(result_m, check_grid_id=to_grid_id) if grid_id is None: print( f"Cached result from {job_folder} is ignored because optimized geometry is far from grid id {to_grid_id}" ) else: self.current_finished_job_results.push((result_m, grid_id), priority=job_folder) #self.grid_status[to_grid_id].append((m.xyzs[0], final_geo, final_energy)) else: job_path = self.launch_constrained_opt(m, to_grid_id) self.running_job_path_info[ job_path] = m, from_grid_id, to_grid_id def launch_constrained_opt(self, molecule, grid_id): """ Called by launch_opt_jobs() to launch one opt job in a new scr folder Return the new folder path """ dihedral_idx_values = [] for dihedral_idxs, dihedral_value in zip(self.dihedrals, grid_id): dihedral_idx_values.append(list(dihedral_idxs) + [dihedral_value]) # get a new folder new_job_path = self.get_new_scr_folder(grid_id) if self.verbose: print("Launching new job at %s" % new_job_path) # launch optimization job inside scratch folder self.engine.M = copy.deepcopy(molecule) self.engine.extra_constraints = self.extra_constraints 
self.engine.set_dihedral_constraints(dihedral_idx_values) self.engine.launch_optimize(new_job_path) return new_job_path def get_new_scr_folder(self, grid_id): """ create a job scratch folder inside tmp_folder_dict[grid_id] name starting from '1', and will use larger numbers if exist return the new folder name that's been created """ tmp_path = self.tmp_folder_dict[grid_id] existing_jobs = os.listdir(tmp_path) next_job_number = len(existing_jobs) + 1 job_path = os.path.join(tmp_path, str(next_job_number)) os.mkdir(job_path) return job_path def wait_extract_finished_jobs(self): """ Interface with engine to check if any job finished. Will wait infinitely here until at least one job finished. The finished job paths will be removed from self.running_job_path_info. The finished job results (m, grid_id) will be checked, if the result geometry is not close enough to target grid id, the result will be ignored. Results passed the check will be added to self.current_finished_job_results. """ if len(self.running_job_path_info) == 0: print("No job running, returning") return while True: finished_path_set = self.engine.find_finished_jobs( self.running_job_path_info, wait_time=3) if len(finished_path_set) > 0: break if self.verbose: print("Find finished jobs:", finished_path_set) for job_path in finished_path_set: m_init, from_grid_id, to_grid_id = self.running_job_path_info.pop( job_path) # call the engine to parse output file and return final geometry/energy in a new molecule m = self.engine.load_task_result_m(job_path) # save the parsed task result to disk self.save_task_cache(job_path, m_init, m) # we will check here if the optimized structure has the desired dihedral ids grid_id = self.get_dihedral_id(m, check_grid_id=to_grid_id) if grid_id is None: print( f"Constrained optimization result at {job_path} is skipped, because final geometry is far from grid id {to_grid_id}" ) else: # each finished job result is a tuple of (m, grid_id) self.current_finished_job_results.push((m, 
grid_id), priority=job_path) def finish(self): """ Write qdata.txt and scan.xyz file based on converged scan results """ m = Molecule() m.elem = list(self.engine.M.elem) m.qm_energies, m.xyzs, m.comms = [], [], [] # optionally writing qm gradients into qdata.txt if avilable writing_gradients = False if len(self.grid_final_gradients) == len(self.grid_final_geometries): m.qm_grads = [] writing_gradients = True # only print grid with energies for gid in sorted(self.grid_energies.keys()): m.qm_energies.append(self.grid_energies[gid]) m.xyzs.append(self.grid_final_geometries[gid]) if writing_gradients: m.qm_grads.append(self.grid_final_gradients[gid]) m.comms.append("Dihedral %s Energy %.9f" % (str(gid), self.grid_energies[gid])) m.write('qdata.txt') print( f"Final scan energies{' and gradients' if writing_gradients else ''} are written to qdata.txt" ) m.write('scan.xyz') print("Final scan energies are written to scan.xyz") #---------------------------------- # Status Drawing Utilites #---------------------------------- def draw_ansi_image(self): """ Return a string with ANSI colors showing current running status """ if not hasattr(self, 'grid_energies') or not hasattr( self, 'opt_queue'): return "draw_ansi_image failed: grid_energies or opt_queue not available" result_str = "" count = 0 running_to_job_ids = set( to_grid_id for m, from_grid_id, to_grid_id in self.opt_queue) for grid_id in self.grid_ids: symbol = ' -' if grid_id in running_to_job_ids: symbol = ' \033[0;33m+\033[0m' # orange for running jobs elif grid_id in self.grid_energies: symbol = ' \033[0;36mo\033[0m' # cyan for finished jobs result_str += symbol count += 1 end_number = 1 for i_dim in range(self.grid_dim): end_number *= len(self.grid_axes[i_dim]) if count % end_number == 0: result_str += '\n' return result_str def draw_ramachandran_plot(self): """ Return a string of Ramachandran plot showing current running status """ assert self.grid_dim == 2, "Ramachandran plot only works for 2-D scans" gsx, gsy = 
self.grid_spacing grid_x, grid_y = self.grid_axes # add labels of status for each grid point grid_status = collections.defaultdict(str) gid_direction = { (gsx, 0): 'r', (gsx - 360, 0): 'r', (-gsx, 0): 'l', (360 - gsx, 0): 'l', (0, gsy): 'u', (0, gsy - 360): 'u', (0, -gsy): 'd', (0, 360 - gsy): 'd', (0, 0): 'o' } # print the status of jobs that are about to be launched for m, from_grid_id, to_grid_id in self.opt_queue: from_x, from_y = from_grid_id to_x, to_y = to_grid_id direction = gid_direction[(to_x - from_x, to_y - from_y)] grid_status[to_grid_id] += direction # if no job launching for this grid point, print the previous result for grid_id in self.refined_grid_ids: if grid_id not in grid_status: grid_status[ grid_id] = 'f' # green tiles for just refined results for grid_id in self.grid_energies: if grid_id not in grid_status: grid_status[grid_id] = 'e' # blue tiles for finished results # format string status_symbols = collections.defaultdict( lambda: '\x1b[1;41m><\x1b[0m', { '': ' ', 'c': '\x1b[46m--\x1b[0m', 'e': '\x1b[44m--\x1b[0m', 'f': '\x1b[42m--\x1b[0m', 'r': '\x1b[1;41m>\x1b[0m', 'l': '\x1b[1;41m<\x1b[0m', 'd': '\x1b[1;41m\\/\x1b[0m', 'u': '\x1b[1;41m/\\\x1b[0m', 'dl': '\x1b[41m\\x1b[0m', 'dr': '\x1b[41m/\x1b[0m', 'ld': '\x1b[41m\\x1b[0m', 'rd': '\x1b[41m/\x1b[0m', 'ul': '\x1b[41m/\x1b[0m', 'ur': '\x1b[41m\\x1b[0m', 'lu': '\x1b[41m/\x1b[0m', 'ru': '\x1b[41m\\x1b[0m', }) result_str = "--== Ramachandran Plot of Optimization Status ==--\n" result_str += "--== Blue: Optimized, Green: Found Lower, Red: Next ==--\n" result_str += " " + ''.join("%6d" % x for x in grid_x[::3]) + '\n' for y in grid_y[::-1]: line = '%4d ' % y + ''.join(status_symbols[grid_status[(x, y)]] for x in grid_x) + '\n' result_str += line return result_str