def get_base_training_set(self, database: MoleculePropertyDB) -> Dict[str, float]:
    """Fetch the training data for the base "molecule structure" -> "property" model

    Args:
        database: Connection to a collection of molecular properties
    Returns:
        Mapping of SMILES string to the value of the target property
    """
    # Pull every molecule that has both a SMILES identifier and the target property
    records = database.get_training_set(['identifier.smiles'], [self.target_property])
    smiles_strings = records['identifier.smiles']
    property_values = records[self.target_property]
    return {smiles: value for smiles, value in zip(smiles_strings, property_values)}
def get_calibration_training_set(self, level: int, database: MoleculePropertyDB) -> Dict[str, float]:
    """Assemble the training set for the calibration model at one level of fidelity

    Args:
        level: Index of the desired level of fidelity
        database: Connection to a collection of molecular properties
    Returns:
        Training set useful for that calibration model: a mapping of molecule
        description to the difference between the target property and its
        value at the base level of fidelity
    """
    spec = self.model_levels[level]

    # Recipe describing the base level of fidelity used by this calibration model
    base_recipe = get_recipe_by_name(spec.base_fidelity)

    # Choose the molecule representation used as model input:
    # SchNet consumes a 3D geometry, every other model type consumes the SMILES string
    if spec.model_type == ModelType.SCHNET:
        # Use the geometry at the base level of fidelity; prefer the charged
        # geometry only when the recipe relaxes the charged state (adiabatic)
        charge_state = self.oxidation_state if base_recipe.adiabatic else "neutral"
        input_key = f'data.{base_recipe.geometry_level}.{charge_state}.xyz'
    else:
        input_key = 'identifier.smiles'

    # Name of the property as computed at the base ("low resolution") fidelity
    prefix = 'oxidation_potential' if self.oxidation_state == OxidationState.OXIDIZED else 'reduction_potential'
    low_res_key = f'{prefix}.{base_recipe.name}'

    # Query molecules that have both the base-fidelity and the target values
    records = database.get_training_set([input_key, low_res_key], [self.target_property])

    # The calibration model learns the residual between target and base fidelity
    residual = np.subtract(records[self.target_property], records[low_res_key])
    return dict(zip(records[input_key], residual))
def db() -> MoleculePropertyDB:
    """Yield a MoleculePropertyDB backed by a throwaway MongoDB test database

    The 'molecules' collection and the 'edw-pytest' database are both
    removed once the consumer is finished with the fixture.
    """
    client = MongoClient()
    test_db = client['edw-pytest']
    yield MoleculePropertyDB(test_db['molecules'])
    # Teardown: drop the collection, then the whole test database
    test_db.drop_collection('molecules')
    client.drop_database('edw-pytest')
"""Load summary of data from QCFractal into MongoDB""" from tqdm import tqdm from moldesign.simulate.qcfractal import GeometryDataset, SolvationEnergyDataset, HessianDataset, SinglePointDataset from moldesign.store.models import UnmatchedGeometry from moldesign.store.mongo import MoleculePropertyDB # Log in to MongoDB mongo = MoleculePropertyDB.from_connection_info() # Get the QCFractal datasets relax_datasets = [ GeometryDataset('Electrolyte Geometry XTB', 'xtb'), GeometryDataset('Electrolyte Geometry NWChem', 'small_basis'), GeometryDataset('Electrolyte Geometry NWChem, 6-31G(2df,p)', 'normal_basis') ] single_point_energy_datasets = [ # Verticals using XTB geometry SinglePointDataset( 'Electrolyte XTB Neutral Geometry, Small-Basis Energy', 'nwchem', 'small_basis', ), SinglePointDataset('Electrolyte XTB Neutral Geometry, Normal-Basis Energy', 'nwchem', 'normal_basis'), SinglePointDataset( 'Electrolyte XTB Neutral Geometry, Diffuse-Basis Energy', 'nwchem', 'diffuse_basis'),
help="Number molecules per inference task") parser.add_argument("--beta", default=1, help="Degree of exploration for active learning. " "This is the beta from the UCB acquistion function", type=float) # Execution system related parser.add_argument('--dilation-factor', default=1, type=float, help='Factor by which to artificially increase simulation time') parser.add_argument('--num-workers', default=1, type=int, help='Number of workers') # Parse the arguments args = parser.parse_args() run_params = args.__dict__ # Connect to MongoDB mongo_url = parse.urlparse(args.mongo_url) mongo = MoleculePropertyDB.from_connection_info(mongo_url.hostname, mongo_url.port) full_search = pd.read_csv(args.search_space, delim_whitespace=True) search_space = full_search['inchi'].values # Create an output directory with the time and run parameters start_time = datetime.utcnow() params_hash = hashlib.sha256(json.dumps(run_params).encode()).hexdigest()[:6] out_dir = Path('runs').joinpath(f'ensemble-{start_time.strftime("%d%b%y-%H%M%S")}-{params_hash}') out_dir.mkdir(exist_ok=False, parents=True) # Save the run parameters to disk with open(os.path.join(out_dir, 'run_params.json'), 'w') as fp: json.dump(run_params, fp, indent=2) with open(os.path.join(out_dir, 'environment.json'), 'w') as fp: json.dump(dict(os.environ), fp, indent=2)
learning_rate=args.learning_rate, bootstrap=True) my_retrain_mpnn = update_wrapper(my_retrain_mpnn, retrain_mpnn) # Create the method server and task generator inf_cfg = {'executors': ['ml-inference']} tra_cfg = {'executors': ['ml-train']} dft_cfg = {'executors': ['qc']} doer = ParslMethodServer([(my_evaluate_mpnn, inf_cfg), (run_simulation, dft_cfg), (my_update_mpnn, tra_cfg), (my_retrain_mpnn, tra_cfg)], server_queues, config) # Connect to MongoDB database = MoleculePropertyDB.from_connection_info(args.mongohost, args.mongoport) # Configure the "thinker" application thinker = Thinker(client_queues, database, args.search_space, args.search_size, args.retrain_frequency, args.retrain_from_scratch, models, args.molecules_per_ml_task, nnodes, args.nodes_per_task, out_dir, args.beta) logging.info('Created the method server and task generator') try: # Launch the servers # The method server is a Thread, so that it can access the Parsl DFK # The task generator is a Thread, so that all debugging methods get cast to screen doer.start() thinker.start()
help= 'Globus Endpoint config file to use with the ProxyStore Globus backend' ) group.add_argument( '--ml-ps-globus-config', default=None, help= 'Globus Endpoint config file to use with the ProxyStore Globus backend' ) # Parse the arguments args = parser.parse_args() run_params = args.__dict__ # Connect to MongoDB database = MoleculePropertyDB.from_connection_info(hostname=args.mongohost, port=args.mongoport) # Get the target level of accuracy with open(args.simulation_spec) as fp: simulation_spec = MultiFidelitySearchSpecification.parse_obj( yaml.safe_load(fp)) # Create an output directory with the time and run parameters start_time = datetime.utcnow() params_hash = hashlib.sha256( json.dumps(run_params).encode()).hexdigest()[:6] out_dir = os.path.join( 'runs', f'{simulation_spec.target_property}-N{args.num_qc_workers}-n{args.nodes_per_task}-' f'{params_hash}-{start_time.strftime("%d%b%y-%H%M%S")}') os.makedirs(out_dir, exist_ok=False)