Example #1
def dump_bulks(output_file_name='bulks.db'):
    '''
    Pull all of our relaxed bulk structures from Mongo and dump them into an
    ASE database, tagging each row with its Materials Project ID.
    '''
    with get_mongo_collection('atoms') as collection:
        docs = list(tqdm(collection.find({'fwname.calculation_type': 'unit cell optimization'}),
                         desc='pulling from FireWorks'))

    db = ase.db.connect(output_file_name)
    for doc in tqdm(docs, desc='writing to database'):
        atoms = make_atoms_from_doc(doc)
        _ = db.write(atoms, mpid=doc['fwname']['mpid'])
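
For reference, here is a minimal sketch of how the resulting ASE database
could be read back; `select` and `toatoms` are standard `ase.db` API, and
the file name simply matches the default argument above.

import ase.db

dump_bulks('bulks.db')

# Iterate over the saved bulks; each row carries the `mpid` key we wrote
db = ase.db.connect('bulks.db')
for row in db.select():
    atoms = row.toatoms()
    print(row.mpid, atoms.get_chemical_formula())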
Example #2
def save_predictions():
    '''
    Wrapper to read our pickled prediction caches and then save them to
    Mongo all at once. Writing in one batch means we hit Mongo once instead
    of multiple times per day, which reduces the strain on the catalog
    collection; it is big and doesn't need any more strain.

    Returns:
        mongo_result    The `result` object that Mongo returns after we
                        write to it.
    '''
    # Read the caches
    print('[%s] Loading cached predictions now...' % datetime.utcnow())
    all_predictions = {}
    for (model_name, adsorbate), cache_name in PREDICTION_CACHES.items():
        with open(cache_name, 'rb') as file_handle:
            predictions = pickle.load(file_handle)
        all_predictions[(model_name, adsorbate)] = predictions

    # Parse the predictions into `$push` commands
    print('[%s] Parsing predictions into commands now...' % datetime.utcnow())
    adsorption_push_commands = __create_adsorption_energy_push_commands(
        all_predictions)
    orr_push_commands = __create_4e_orr_onset_potential_push_commands(
        all_predictions)

    # We'll be using pymongo's `bulk_write`, which takes a list of commands.
    # We'll be making a list of `UpdateOne` commands.
    mongo_commands = []
    mongo_ids = set(adsorption_push_commands.keys()) | set(
        orr_push_commands.keys())
    for mongo_id in mongo_ids:
        # Use `.get` because a given document may have only adsorption
        # commands or only ORR commands to push
        push_commands = {
            **adsorption_push_commands.get(mongo_id, {}),
            **orr_push_commands.get(mongo_id, {})
        }
        command = UpdateOne({'_id': mongo_id}, {
            '$push': push_commands,
            '$set': {
                'mtime': datetime.utcnow()
            }
        })
        mongo_commands.append(command)

    # Write the results
    print('[%s] Writing predictions into catalog now...' % datetime.utcnow())
    with get_mongo_collection('catalog') as collection:
        mongo_result = collection.bulk_write(mongo_commands, ordered=False)
    print('[%s] Updated %i catalog documents with new predictions' %
          (datetime.utcnow(), len(mongo_commands)))

    return mongo_result
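
The two `__create_*_push_commands` helpers are not shown here, so as a
rough, hedged illustration, this is the shape of command that `bulk_write`
consumes. The field names, database name, and energy value below are made
up for the sketch and are not the catalog's actual schema.

from datetime import datetime
from pymongo import MongoClient, UpdateOne

# Hypothetical push command for one document; the dotted field path is
# illustrative only
push_commands = {'predictions.adsorption_energy.CO': {'time': datetime.utcnow(),
                                                      'energy': -0.67}}
command = UpdateOne({'_id': 'some_mongo_id'},
                    {'$push': push_commands,
                     '$set': {'mtime': datetime.utcnow()}})

# `bulk_write` sends all commands in one round trip; `ordered=False` lets
# Mongo keep processing even if one update fails
collection = MongoClient()['gasdb']['catalog']
result = collection.bulk_write([command], ordered=False)
print(result.modified_count)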
Example #3
def _prepare_grouped_df(adsorbate1, adsorbate2):
    """
    This function takes the adsorbates of interest
    and make a dataframe that will be used for making 2D plots.
    Each row in the dataframe is grouped by unique surface.
    A surface is defined with mpid, Miller index, top, and shift.

    Args:
        adsorbate1      adsorbate1, who's energy is going on x-axis
        adsorbate1      adsorbate2, who's energy is going on y-axis

    Returns:
        adsorbate1_df   dataframe of adsorbate1
        adsorbate2_df   dataframe of adsorbate2
        grouped_df      dataframe used for plotting
    """
    surface_fp = ['mpid', 'millerx', 'millery', 'millerz', 'top', 'shift']
    adsorbate1_df = _make_df_from_docs(adsorbate1, surface_fp)
    adsorbate2_df = _make_df_from_docs(adsorbate2, surface_fp)

    # Merge the two dataframes together based on unique surface
    grouped_results = pd.merge(adsorbate1_df, adsorbate2_df,
                               on=surface_fp).dropna()
    # Drop rows where both adsorbates' energies are ML predictions
    # (i.e., neither has a DFT calculation)
    grouped_results = grouped_results.drop(
        grouped_results[(grouped_results['{}_DFT'.format(adsorbate1)] == False)
                        & (grouped_results['{}_DFT'.format(adsorbate2)] ==
                           False)].index).reset_index()

    # Add formula to the dataframe based on mpid
    mpids = set(grouped_results['mpid'])
    compositions_by_mpid = {}
    print('Beginning to pull data from the Materials Project...')
    with MPRester(read_rc()['matproj_api_key']) as mat_proj:
        for mpid in tqdm.tqdm_notebook(mpids):
            try:
                entry = mat_proj.get_entry_by_material_id({'task_ids': mpid})
                formula = entry.composition.get_reduced_formula_and_factor()[0]
                compositions_by_mpid[mpid] = formula
            except IndexError:
                compositions_by_mpid[mpid] = ""
    data = list(compositions_by_mpid.items())
    df_new = pd.DataFrame(data, columns=['mpid', 'formula'])
    grouped_df = pd.merge(grouped_results, df_new, on='mpid')

    return grouped_df
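
As a hedged usage sketch: the function's own comments mention OH and CO, so
one might build the 2D plot like this. The energy column names below are
assumptions about what `_make_df_from_docs` produces, not confirmed schema.

import matplotlib.pyplot as plt

# OH energies go on the x-axis, CO energies on the y-axis
grouped_df = _prepare_grouped_df('OH', 'CO')
plt.scatter(grouped_df['OH_energy'], grouped_df['CO_energy'], s=5)
plt.xlabel('OH adsorption energy [eV]')
plt.ylabel('CO adsorption energy [eV]')
plt.show()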
Example #4
'''
This submodule contains the scripts that the Ulissi group used to pull the
relaxed bulk structures from our database.
'''

__author__ = 'Kevin Tran'
__email__ = '*****@*****.**'

import warnings
from tqdm import tqdm
import ase.db
from gaspy.gasdb import get_mongo_collection
from gaspy.mongo import make_atoms_from_doc

# Pull every bulk relaxation that used our standard settings: the RPBE
# functional (GGA = 'RP'), VASP 5.4 PBE pseudopotentials, a 500 eV cutoff,
# and symmetry turned off
with get_mongo_collection('atoms') as collection:
    docs = list(
        tqdm(collection.find({
            'fwname.calculation_type': 'unit cell optimization',
            'fwname.vasp_settings.gga': 'RP',
            'fwname.vasp_settings.pp': 'PBE',
            'fwname.vasp_settings.xc': {
                '$exists': False
            },
            'fwname.vasp_settings.pp_version': '5.4',
            'fwname.vasp_settings.encut': 500,
            'fwname.vasp_settings.isym': 0
        }),
             desc='pulling from FireWorks'))

mpids = set()
db = ase.db.connect('bulks.db')
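
The snippet cuts off before the bulks are actually written out. Based on
`dump_bulks` in Example #1, the missing tail presumably looked something
like the following sketch (the deduplication via `mpids` is an assumption
drawn from the otherwise-unused set above).

# Sketch, not original code: write each pulled bulk once per mpid
for doc in tqdm(docs, desc='writing to database'):
    mpid = doc['fwname']['mpid']
    if mpid not in mpids:
        mpids.add(mpid)
        db.write(make_atoms_from_doc(doc), mpid=mpid)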
Example #5
import os
from tqdm import tqdm
import ase.db
from scipy.stats import norm
from gaspy.gasdb import get_mongo_collection
from gaspy.mongo import make_atoms_from_doc

# NOTE: `samples`, `n_sites`, and the `*_mean`/`*_std` parameters below come
# from earlier in the original script and are not shown here.

# Initialize synthesized dataset
db_name = 'CO.db'
try:
    os.remove(db_name)
except FileNotFoundError:
    pass
db = ase.db.connect(db_name)

# Grab all the sites from chosen bulks
mongo_ids = [
    site['mongo_id'] for bulks in samples.values()
    for surfaces in bulks.values() for site in surfaces
]
query = {'_id': {'$in': mongo_ids}}
projection = {'atoms': 1, 'calc': 1, 'results': 1}
with get_mongo_collection('catalog') as collection:
    all_docs = list(
        tqdm(collection.find(query, projection),
             desc='pulling docs',
             total=n_sites))
docs_by_id = {doc['_id']: doc for doc in all_docs}

# Make up energies hierarchically: sample a mean energy for each bulk, then
# a mean for each of its surfaces, then an energy for each individual site
for mpid, _samples in tqdm(samples.items(), desc='bulks'):
    sampled_bulk_mean = norm.rvs(loc=bulk_mean, scale=bulk_std)
    for surface, sites in tqdm(_samples.items(), desc='surfaces'):
        sampled_surface_mean = norm.rvs(loc=sampled_bulk_mean,
                                        scale=surface_std)
        for site in tqdm(sites, desc='sites'):
            sampled_energy = norm.rvs(loc=sampled_surface_mean, scale=site_std)
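            # Hedged sketch of the presumed continuation (not the original
            # code): attach the made-up energy to the site's atoms object and
            # save it to the synthesized CO.db initialized above. Keying the
            # energy under `data` is an assumption about the schema.
            doc = docs_by_id[site['mongo_id']]
            atoms = make_atoms_from_doc(doc)
            db.write(atoms, data={'sampled_energy': sampled_energy})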