Пример #1
0
    def get_base_training_set(
            self, database: MoleculePropertyDB) -> Dict[str, float]:
        """Collect the data used to fit the base model

        Args:
            database: Connection to a collection of molecular properties
        Returns:
            Map of SMILES string to the value of the target property,
            built from the records returned by ``database.get_training_set``
        """

        # Field names: the molecule identifier is the input, the target property is the output
        smiles_field = 'identifier.smiles'
        target_field = self.target_property

        # Ask the database for records holding both fields
        records = database.get_training_set([smiles_field], [target_field])

        # Pair each SMILES string with its property value
        return dict(zip(records[smiles_field], records[target_field]))
Пример #2
0
    def get_calibration_training_set(
            self, level: int,
            database: MoleculePropertyDB) -> Dict[str, float]:
        """Assemble the training data for the calibration model at one level of fidelity

        Each label is the target property minus the oxidation or reduction potential
        stored under the name of the recipe that this level uses as its base.

        Args:
            level: Index of the desired level of fidelity
            database: Connection to a collection of molecular properties
        Returns:
            Map of model input (the geometry field for SchNet models, the SMILES string otherwise)
            to the difference between the target and base values
        """

        level_spec = self.model_levels[level]

        # Look up the recipe that serves as the baseline for this level
        base_recipe = get_recipe_by_name(level_spec.base_fidelity)

        # Pick the database field that holds the model input
        if level_spec.model_type == ModelType.SCHNET:
            # Geometry comes from the base recipe; the charged geometry is used only for adiabatic recipes
            charge_state = self.oxidation_state if base_recipe.adiabatic else "neutral"
            input_field = f'data.{base_recipe.geometry_level}.{charge_state}.xyz'
        else:
            # Every other model type takes the SMILES string
            input_field = 'identifier.smiles'

        # Build the name of the field holding the baseline value
        if self.oxidation_state == OxidationState.OXIDIZED:
            base_field = 'oxidation_potential'
        else:
            base_field = 'reduction_potential'
        base_field = base_field + '.' + base_recipe.name

        # Fetch records that contain the input, the baseline and the target
        records = database.get_training_set([input_field, base_field],
                                            [self.target_property])

        # Labels are "target minus baseline"
        offsets = np.subtract(records[self.target_property], records[base_field])

        # Key each label by its model input
        return dict(zip(records[input_field], offsets))
Пример #3
0
def db() -> MoleculePropertyDB:
    """Provide a ``MoleculePropertyDB`` backed by a scratch MongoDB database.

    A client is opened with ``MongoClient()`` and its default connection settings.
    The wrapper around the ``molecules`` collection of the ``edw-pytest`` database
    is yielded, and once the consumer is done the collection and the database are
    dropped and the client connection is closed.

    Yields:
        Property database wrapping the ``edw-pytest`` / ``molecules`` collection
    """
    # NOTE(review): written as a generator, presumably to serve as a pytest
    #  yield-fixture -- the decorator is not visible here, confirm at the call site
    with MongoClient() as client:  # Closes the connection when the generator finishes
        scratch_db = client['edw-pytest']
        try:
            yield MoleculePropertyDB(scratch_db['molecules'])
        finally:
            # Always remove the scratch data, even if the generator is closed early
            scratch_db.drop_collection('molecules')
            client.drop_database('edw-pytest')
"""Load summary of data from QCFractal into MongoDB"""
from tqdm import tqdm

from moldesign.simulate.qcfractal import GeometryDataset, SolvationEnergyDataset, HessianDataset, SinglePointDataset
from moldesign.store.models import UnmatchedGeometry
from moldesign.store.mongo import MoleculePropertyDB

# Open the MongoDB-backed property store (no arguments, so the method's defaults apply)
mongo = MoleculePropertyDB.from_connection_info()

# Geometry datasets to pull from QCFractal, as (dataset name, specification name) pairs
relax_datasets = [
    GeometryDataset(dataset_name, spec_name)
    for dataset_name, spec_name in (
        ('Electrolyte Geometry XTB', 'xtb'),
        ('Electrolyte Geometry NWChem', 'small_basis'),
        ('Electrolyte Geometry NWChem, 6-31G(2df,p)', 'normal_basis'),
    )
]

single_point_energy_datasets = [
    # Verticals using XTB geometry
    SinglePointDataset(
        'Electrolyte XTB Neutral Geometry, Small-Basis Energy',
        'nwchem',
        'small_basis',
    ),
    SinglePointDataset('Electrolyte XTB Neutral Geometry, Normal-Basis Energy',
                       'nwchem', 'normal_basis'),
    SinglePointDataset(
        'Electrolyte XTB Neutral Geometry, Diffuse-Basis Energy', 'nwchem',
        'diffuse_basis'),
Пример #5
0
                        help="Number molecules per inference task")
    parser.add_argument("--beta", default=1, help="Degree of exploration for active learning. "
                                                  "This is the beta from the UCB acquistion function", type=float)

    # Execution system related
    parser.add_argument('--dilation-factor', default=1, type=float,
                        help='Factor by which to artificially increase simulation time')
    parser.add_argument('--num-workers', default=1, type=int, help='Number of workers')

    # Parse the arguments
    args = parser.parse_args()
    run_params = args.__dict__

    # Connect to MongoDB
    mongo_url = parse.urlparse(args.mongo_url)
    mongo = MoleculePropertyDB.from_connection_info(mongo_url.hostname, mongo_url.port)

    full_search = pd.read_csv(args.search_space, delim_whitespace=True)
    search_space = full_search['inchi'].values

    # Create an output directory with the time and run parameters
    start_time = datetime.utcnow()
    params_hash = hashlib.sha256(json.dumps(run_params).encode()).hexdigest()[:6]
    out_dir = Path('runs').joinpath(f'ensemble-{start_time.strftime("%d%b%y-%H%M%S")}-{params_hash}')
    out_dir.mkdir(exist_ok=False, parents=True)

    # Save the run parameters to disk
    with open(os.path.join(out_dir, 'run_params.json'), 'w') as fp:
        json.dump(run_params, fp, indent=2)
    with open(os.path.join(out_dir, 'environment.json'), 'w') as fp:
        json.dump(dict(os.environ), fp, indent=2)
Пример #6
0
                              learning_rate=args.learning_rate,
                              bootstrap=True)
    my_retrain_mpnn = update_wrapper(my_retrain_mpnn, retrain_mpnn)

    # Create the method server and task generator
    inf_cfg = {'executors': ['ml-inference']}
    tra_cfg = {'executors': ['ml-train']}
    dft_cfg = {'executors': ['qc']}
    doer = ParslMethodServer([(my_evaluate_mpnn, inf_cfg),
                              (run_simulation, dft_cfg),
                              (my_update_mpnn, tra_cfg),
                              (my_retrain_mpnn, tra_cfg)], server_queues,
                             config)

    # Connect to MongoDB
    database = MoleculePropertyDB.from_connection_info(args.mongohost,
                                                       args.mongoport)

    # Configure the "thinker" application
    thinker = Thinker(client_queues, database, args.search_space,
                      args.search_size, args.retrain_frequency,
                      args.retrain_from_scratch, models,
                      args.molecules_per_ml_task, nnodes, args.nodes_per_task,
                      out_dir, args.beta)
    logging.info('Created the method server and task generator')

    try:
        # Launch the servers
        #  The method server is a Thread, so that it can access the Parsl DFK
        #  The task generator is a Thread, so that all debugging methods get cast to screen
        doer.start()
        thinker.start()
Пример #7
0
        help=
        'Globus Endpoint config file to use with the ProxyStore Globus backend'
    )
    group.add_argument(
        '--ml-ps-globus-config',
        default=None,
        help=
        'Globus Endpoint config file to use with the ProxyStore Globus backend'
    )

    # Parse the arguments
    args = parser.parse_args()
    run_params = args.__dict__

    # Connect to MongoDB
    database = MoleculePropertyDB.from_connection_info(hostname=args.mongohost,
                                                       port=args.mongoport)

    # Get the target level of accuracy
    with open(args.simulation_spec) as fp:
        simulation_spec = MultiFidelitySearchSpecification.parse_obj(
            yaml.safe_load(fp))

    # Create an output directory with the time and run parameters
    start_time = datetime.utcnow()
    params_hash = hashlib.sha256(
        json.dumps(run_params).encode()).hexdigest()[:6]
    out_dir = os.path.join(
        'runs',
        f'{simulation_spec.target_property}-N{args.num_qc_workers}-n{args.nodes_per_task}-'
        f'{params_hash}-{start_time.strftime("%d%b%y-%H%M%S")}')
    os.makedirs(out_dir, exist_ok=False)