def dump_bulks(output_file_name='bulks.db'):
    '''
    Pull every relaxed unit cell out of the `atoms` Mongo collection and dump
    the structures into an ASE database file, tagging each row with its
    Materials Project ID.

    Args:
        output_file_name    String name of the ASE database file to write to.
                            Defaults to 'bulks.db'.
    '''
    # Fetch all of the unit-cell-optimization documents up front so the Mongo
    # connection can be closed before we start writing.
    query = {'fwname.calculation_type': 'unit cell optimization'}
    with get_mongo_collection('atoms') as collection:
        cursor = collection.find(query)
        documents = list(tqdm(cursor, desc='pulling from FireWorks'))

    # Re-create the `ase.Atoms` objects and stash them in the ASE database
    database = ase.db.connect(output_file_name)
    for document in tqdm(documents, desc='writing to database'):
        structure = make_atoms_from_doc(document)
        database.write(structure, mpid=document['fwname']['mpid'])
def save_predictions():
    '''
    Wrapper to read our pickled prediction cache, and then save them to Mongo
    all at once. We save them all at once so that we can write to Mongo all at
    once instead of multiple times per day. This reduces the strain on the
    catalog collection, which is big and doesn't need any more strain.

    Returns:
        mongo_result    Mongo returns a `result` object after we write to it.
                        This is that object.
    '''
    # Read the caches
    print('[%s] Loading cached predictions now...' % datetime.utcnow())
    all_predictions = {}
    for (model_name, adsorbate), cache_name in PREDICTION_CACHES.items():
        with open(cache_name, 'rb') as file_handle:
            predictions = pickle.load(file_handle)
        all_predictions[(model_name, adsorbate)] = predictions

    # Parse the predictions into `$push` commands
    print('[%s] Parsing predictions into commands now...' % datetime.utcnow())
    adsorption_push_commands = __create_adsorption_energy_push_commands(
        all_predictions)
    orr_push_commands = __create_4e_orr_onset_potential_push_commands(
        all_predictions)

    # We'll be using pymongo's `bulk_write`, which takes a list of commands.
    # We'll be making a list of `UpdateOne` commands.
    mongo_commands = []
    mongo_ids = set(adsorption_push_commands.keys()) | set(
        orr_push_commands.keys())
    for mongo_id in mongo_ids:
        # `mongo_ids` is the *union* of the two key sets, so a given document
        # may have commands from only one of the two sources. Fall back to an
        # empty dict instead of raising a KeyError on the missing one.
        push_commands = {
            **adsorption_push_commands.get(mongo_id, {}),
            **orr_push_commands.get(mongo_id, {})
        }
        command = UpdateOne({'_id': mongo_id}, {
            '$push': push_commands,
            '$set': {
                'mtime': datetime.utcnow()
            }
        })
        mongo_commands.append(command)

    # Write the results. `ordered=False` lets Mongo keep going past any
    # individual failed update.
    print('[%s] Writing predictions into catalog now...' % datetime.utcnow())
    with get_mongo_collection('catalog') as collection:
        mongo_result = collection.bulk_write(mongo_commands, ordered=False)
    print('[%s] Updated %i predictions in the catalog' %
          (datetime.utcnow(), len(mongo_commands)))

    return mongo_result
def _prepare_grouped_df(adsorbate1, adsorbate2):
    """
    This function takes the adsorbates of interest and makes a dataframe that
    will be used for making 2D plots. Each row in the dataframe is grouped by
    unique surface. A surface is defined with mpid, Miller index, top, and
    shift.

    Args:
        adsorbate1      adsorbate1, whose energy is going on x-axis
        adsorbate2      adsorbate2, whose energy is going on y-axis
    Returns:
        grouped_df      dataframe used for plotting, with one row per unique
                        surface and a `formula` column pulled from the
                        Materials Project
    """
    # The fields that uniquely identify a surface
    surface_fp = ['mpid', 'millerx', 'millery', 'millerz', 'top', 'shift']
    adsorbate1_df = _make_df_from_docs(adsorbate1, surface_fp)
    adsorbate2_df = _make_df_from_docs(adsorbate2, surface_fp)

    # Merge them together based on unique surface
    grouped_results = pd.merge(adsorbate1_df, adsorbate2_df,
                               on=surface_fp).dropna()

    # Drop rows that have ML predictions (i.e., no DFT data) for BOTH
    # adsorbates
    both_ml = ((grouped_results['{}_DFT'.format(adsorbate1)] == False) &
               (grouped_results['{}_DFT'.format(adsorbate2)] == False))
    grouped_results = grouped_results.drop(
        grouped_results[both_ml].index).reset_index()

    # Add the reduced chemical formula to the dataframe based on mpid.
    # Materials that raise an `IndexError` during lookup get an empty formula.
    mpids = set(grouped_results['mpid'])
    compositions_by_mpid = {}
    print('Beginning to pull data from the Materials Project...')
    with MPRester(read_rc()['matproj_api_key']) as mat_proj:
        for mpid in tqdm.tqdm_notebook(mpids):
            try:
                entry = mat_proj.get_entry_by_material_id({'task_ids': mpid})
                compositions_by_mpid[mpid] = \
                    entry.composition.get_reduced_formula_and_factor()[0]
            except IndexError:
                compositions_by_mpid[mpid] = ""

    df_new = pd.DataFrame(list(compositions_by_mpid.items()),
                          columns=['mpid', 'formula'])
    grouped_df = pd.merge(grouped_results, df_new, on='mpid')
    return grouped_df
'''
This submodule contains the scripts that the Ulissi group used to pull the
relaxed bulk structures from our database.
'''

__author__ = 'Kevin Tran'
__email__ = '*****@*****.**'

import warnings
from tqdm import tqdm
import ase.db
from gaspy.gasdb import get_mongo_collection
from gaspy.mongo import make_atoms_from_doc


# Pull every FireWorks document for a relaxed unit cell that was calculated
# with this specific set of VASP settings (RPBE GGA, PBE pseudopotentials
# v5.4, 500 eV cutoff, symmetry off, and no explicit `xc` tag).
with get_mongo_collection('atoms') as collection:
    docs = list(
        tqdm(collection.find({
            'fwname.calculation_type': 'unit cell optimization',
            'fwname.vasp_settings.gga': 'RP',
            'fwname.vasp_settings.pp': 'PBE',
            'fwname.vasp_settings.xc': {
                '$exists': False
            },
            'fwname.vasp_settings.pp_version': '5.4',
            'fwname.vasp_settings.encut': 500,
            'fwname.vasp_settings.isym': 0
        }),
            desc='pulling from FireWorks'))

# `mpids` and `db` are presumably filled/used by the code that follows this
# chunk (not visible here) — `mpids` to de-duplicate by Materials Project ID
# and `db` as the ASE database the structures get written into. TODO confirm.
mpids = set()
db = ase.db.connect('bulks.db')
# Initialize synthesized dataset db_name = 'CO.db' try: os.remove(db_name) except FileNotFoundError: pass db = ase.db.connect(db_name) # Grab all the sites from chosen bulks mongo_ids = [ site['mongo_id'] for bulks in samples.values() for surfaces in bulks.values() for site in surfaces ] query = {'_id': {'$in': mongo_ids}} projection = {'atoms': 1, 'calc': 1, 'results': 1} with get_mongo_collection('catalog') as collection: all_docs = list( tqdm(collection.find(query, projection), desc='pulling docs', total=n_sites)) docs_by_id = {doc['_id']: doc for doc in all_docs} # Make up an energy for mpid, _samples in tqdm(samples.items(), desc='bulks'): sampled_bulk_mean = norm.rvs(loc=bulk_mean, scale=bulk_std) for surface, sites in tqdm(_samples.items(), desc='surfaces'): sampled_surface_mean = norm.rvs(loc=sampled_bulk_mean, scale=surface_std) for site in tqdm(sites, desc='sites'): sampled_energy = norm.rvs(loc=sampled_surface_mean, scale=site_std)