예제 #1
0
class MPDataRetrievalTest(unittest.TestCase):
    """Smoke test: query a single material through MPDataRetrieval."""

    def setUp(self):
        # mapi_key must be defined elsewhere in the module.
        self.mpdr = MPDataRetrieval(mapi_key)

    def test_get_data(self):
        """Request the structure of mp-23; passes if the call does not raise."""
        query = {"material_id": "mp-23"}
        wanted = ["structure"]
        df = self.mpdr.get_dataframe(criteria=query, properties=wanted)
예제 #2
0
class MPDataRetrievalTest(unittest.TestCase):
    """Checks that a dataframe can be retrieved for material mp-23."""

    def setUp(self):
        # The API key comes from a module-level name that is not shown here.
        retriever = MPDataRetrieval(mapi_key)
        self.mpdr = retriever

    def test_get_data(self):
        """Run one get_dataframe query; no assertions are made on the result."""
        df = self.mpdr.get_dataframe(
            properties=["structure"],
            criteria={"material_id": "mp-23"},
        )
예제 #3
0
    def test_featurize_bsdos(self, refresh_df_init=False, limit=1):
        """
        Tests featurize_dos and featurize_bandstructure.

        Args:
            refresh_df_init (bool): for developers, if the test needs to be
                updated set to True (queries MPDataRetrieval and rewrites the
                pickled fixture). Otherwise set to False to read the pickled
                fixture, which makes the test independent of MPRester and
                faster.
            limit (int): the expected number of rows in the featurized
                dataframe.

        Returns (None):
        """
        target = "color"
        df_bsdos_pickled = "mp_data_with_dos_bandstructure.pickle"
        if refresh_df_init:
            mpdr = MPDataRetrieval()
            df = mpdr.get_dataframe(criteria={"material_id": "mp-149"},
                                    properties=[
                                        "pretty_formula", "dos",
                                        "bandstructure",
                                        "bandstructure_uniform"
                                    ])
            df.to_pickle(os.path.join(TEST_DIR, df_bsdos_pickled))
        else:
            df = pd.read_pickle(os.path.join(TEST_DIR, df_bsdos_pickled))
        df = df.dropna(axis=0)
        # The uniform band structure takes the "bandstructure" column name;
        # the original "bandstructure" column is kept as "line bandstructure".
        df = df.rename(
            columns={
                "bandstructure_uniform": "bandstructure",
                "bandstructure": "line bandstructure"
            })
        # Dummy target value so that fit_transform has a target column.
        df[target] = [["red"]]
        n_cols_init = df.shape[1]

        featurizer = AutoFeaturizer(preset="express",
                                    ignore_errors=False,
                                    multiindex=False)
        df = featurizer.fit_transform(df, target)

        # sanity checks
        # assertEqual, not assertTrue: assertTrue(len(df), limit) treats
        # `limit` as the failure message and never compares the two values.
        self.assertEqual(len(df), limit)
        self.assertGreater(len(df.columns), n_cols_init)

        # DOSFeaturizer:
        self.assertEqual(df["cbm_character_1"][0], "p")

        # DopingFermi:
        self.assertAlmostEqual(df["fermi_c1e+20T300"][0], -0.539, 3)

        # Hybridization:
        self.assertAlmostEqual(df["vbm_sp"][0], 0.181, 3)
        self.assertAlmostEqual(df["cbm_s"][0], 0.4416, 3)
        self.assertAlmostEqual(df["cbm_sp"][0], 0.9864, 3)

        # BandFeaturizer:
        self.assertAlmostEqual(df["direct_gap"][0], 2.556, 3)
        self.assertAlmostEqual(df["n_ex1_norm"][0], 0.6285, 4)

        # BranchPointEnergy:
        self.assertAlmostEqual(df["branch_point_energy"][0], 5.7677, 4)
예제 #4
0
class MPDataRetrievalTest(unittest.TestCase):
    """Retrieves mp-23 when an API key is configured, otherwise skips."""

    def setUp(self):
        self.mpdr = MPDataRetrieval()

    def test_get_data(self):
        """Expect exactly one structure entry for material mp-23."""
        # Guard clause: without a key the query cannot be made, so skip.
        if not self.mpdr.mprester.api_key:
            raise SkipTest(
                "Skipped MPDataRetrieval test; no MAPI_KEY detected")

        query = {"material_id": "mp-23"}
        df = self.mpdr.get_dataframe(criteria=query,
                                     properties=["structure"])
        self.assertEqual(len(df["structure"]), 1)
예제 #5
0
class MPDataRetrievalTest(unittest.TestCase):
    """Checks the objects returned by MPDataRetrieval for material mp-23."""

    def setUp(self):
        self.mpdr = MPDataRetrieval()

    def test_get_data(self):
        """Query structure, band structures and DOS, then check their types."""
        df = self.mpdr.get_dataframe(criteria={"material_id": "mp-23"},
                                     properties=["structure",
                                                 "bandstructure",
                                                 "bandstructure_uniform",
                                                 "dos"])
        self.assertEqual(len(df["structure"]), 1)

        # .iloc[0] selects the first row by position, so the check does not
        # depend on what kind of index the dataframe carries.
        line_bs = df["bandstructure"].iloc[0]
        self.assertEqual(line_bs.get_band_gap()["energy"], 0)

        # assertIsInstance reports the actual type on failure, unlike
        # assertTrue(isinstance(...)).
        self.assertIsInstance(line_bs, BandStructureSymmLine)
        self.assertIsInstance(df["bandstructure_uniform"].iloc[0],
                              BandStructure)
        self.assertIsInstance(df["dos"].iloc[0], CompleteDos)
예제 #6
0
class MPDataRetrievalTest(unittest.TestCase):
    """Verifies result types of an MPDataRetrieval query for mp-23."""

    def setUp(self):
        self.mpdr = MPDataRetrieval()

    def test_get_data(self):
        """Fetch four properties of mp-23 and validate each returned object."""
        df = self.mpdr.get_dataframe(criteria={"material_id": "mp-23"},
                                     properties=["structure",
                                                 "bandstructure",
                                                 "bandstructure_uniform",
                                                 "dos"])
        self.assertEqual(len(df["structure"]), 1)

        # Positional access (.iloc) works whether the index holds integers
        # or material-id strings.
        first_bs = df["bandstructure"].iloc[0]
        first_uniform_bs = df["bandstructure_uniform"].iloc[0]
        first_dos = df["dos"].iloc[0]

        self.assertEqual(first_bs.get_band_gap()["energy"], 0)
        # assertIsInstance gives a descriptive failure message.
        self.assertIsInstance(first_bs, BandStructureSymmLine)
        self.assertIsInstance(first_uniform_bs, BandStructure)
        self.assertIsInstance(first_dos, CompleteDos)
print("NUMBER OF JOBS:", NJOBS)
print("DEBUG MODE:", args.debug)


# Set up dataset
if FABER:
    df = load_flla()
else:
    # Initialize data retrieval class
    from matminer.data_retrieval.retrieve_MP import MPDataRetrieval
    mpr = MPDataRetrieval()
    # Wildcard criteria string (presumably: any three-element system
    # containing oxygen - confirm against the MP query syntax).
    criteria = "*-*-O"
    # Choose list of properties to retrieve
    properties = ['structure', 'nsites', 'formation_energy_per_atom', 'e_above_hull']
    # Get the dataframe with the matching structure from the Materials Project
    df = mpr.get_dataframe(criteria=criteria, properties=properties)
    # Create the formation_energy feature for the SCM regression, since the SCM
    # model learns formation energy per unit cell rather than per atom.
    df['formation_energy'] = df['formation_energy_per_atom'] * df['nsites']
    # Structures are retrieved as dictionaries but can easily be converted to
    # pymatgen.core.Structure objects as shown. Iterate over the values
    # rather than indexing with df['structure'][i]: integer lookups on a
    # non-integer (e.g. material-id) index rely on a deprecated positional
    # fallback.
    df['structure'] = pd.Series(
        [Structure.from_dict(struct_dict) for struct_dict in df['structure']],
        df.index)
    # Keep only entries with e_above_hull below 0.1 and at most 30 sites
    df = df[df['e_above_hull'] < 0.1]
    df = df[df['nsites'] <= 30]

# For debug mode only use 100 entries
if args.debug:
    df = df.head(100)
예제 #8
0
# pd.set_option('display.height', 1000)
# Widen the pandas console display so large frames print in full.
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)
pd.set_option("display.width", 1000)

mpdr = MPDataRetrieval()

# df = load_dataset("dielectric_constant")

# Pull every entry that has dielectric data, keeping a plain integer index.
df = mpdr.get_dataframe(
    index_mpid=False,
    criteria={"has": "diel"},
    properties=[
        "material_id",
        "diel.n",
        "formation_energy_per_atom",
        "e_above_hull",
        "structure",
    ],
)

# Keep rows whose hull distance and formation energy are both below 0.150.
below_hull_cutoff = df["e_above_hull"] < 0.150
below_formation_cutoff = df["formation_energy_per_atom"] < 0.150
df = df.loc[below_hull_cutoff & below_formation_cutoff]

# Shorten the column name, then drop entries with n < 1 and rows with NaNs.
df = df.rename(columns={"diel.n": "n"})
df = df.loc[df["n"] >= 1]
df = df.dropna()

# Only the structure and the target column are needed from here on.
df = df[["structure", "n"]]

# See if there is anything wrong with the Lu containing entries.
numLu = 0
예제 #9
0
# Tick labels and legend text are drawn slightly smaller than the base font.
base_font_size = font['size']
plt.rcParams['ytick.labelsize'] = base_font_size - 2
plt.rcParams['legend.fontsize'] = base_font_size - 2.5

# Placeholder: replace with a real Materials Project API key.
mat_api_key = 'YourPymatgenAPI'
mpdr = MPDataRetrieval(mat_api_key)

# Entries with fewer than 41 sites, e_above_hull below 0.08, and an element
# count strictly between 2 and 5 (i.e. 3 or 4 elements).
df_terqua = mpdr.get_dataframe(
    criteria={
        'nsites': {'$lt': 41},
        'e_above_hull': {'$lt': 0.08},
        'nelements': {'$gt': 2, '$lt': 5},
    },
    properties=[
        'material_id',
        'formation_energy_per_atom',
        'band_gap',
        'e_above_hull',
        'pretty_formula',
        'cif',
    ],
)

df_ter = mpdr.get_dataframe(criteria={
    'nsites': {
        '$lt': 21
    },
    'e_above_hull': {
예제 #10
0
def query_data(pname, api_key, path=''):
    """Query a property plus crystal structures from MP and pickle the result.

    Args:
        pname (str): name of the property to query; only entries where it
            exists and where "<pname>.warnings" is None are requested.
        api_key (str): API key passed to MPDataRetrieval.
        path (str): directory in which "<pname>.pkl" is written. The default
            '' writes into the current working directory.

    Returns:
        pandas.DataFrame: the property columns ('pretty_formula', pname,
        'e_above_hull') concatenated column-wise with a 'structure' column.
    """
    # Local import so the function does not depend on a module-level `os`.
    import os

    mpdr = MPDataRetrieval(api_key)

    # query properties
    props = mpdr.get_dataframe(
        criteria={
            pname: {"$exists": True},
            # "elements": {"$all": ["Li", "Fe", "O"]},
            "{}.warnings".format(pname): None,
        },
        properties=['pretty_formula', pname, 'e_above_hull'])
    print("There are {} entries satisfying criteria".format(props[pname].count()))

    # Load crystal structures in chunks of material ids, so that no single
    # query asks for more than chunk_size entries.
    chunk_size = 1000
    mp_ids = props.index.tolist()
    sublists = [mp_ids[i:i + chunk_size]
                for i in range(0, len(mp_ids), chunk_size)]

    # query structures; collect the chunks and concatenate once
    # (DataFrame.append was removed in pandas 2.0).
    frames = [
        mpdr.get_dataframe({"material_id": {"$in": sublist}}, ['structure'])
        for sublist in sublists
    ]
    if frames:
        structures = pd.concat(frames)
    else:
        structures = pd.DataFrame(columns=['structure'])

    data = pd.concat([props, structures], axis=1)

    # os.path.join avoids producing "/<pname>.pkl" (filesystem root) when
    # path is the empty default.
    fname = os.path.join(path, '%s.pkl' % pname)

    data.to_pickle(fname)
    print('Saved file to ', fname)

    return data

def filter_data(df, elems, pname, pmin=None, pmax=None, stab=None):
    """Filter data by chemistry, property range and stability.

    Args:
        df (pandas.DataFrame): must contain 'pretty_formula', the `pname`
            column when pmin/pmax is given, and 'e_above_hull' when `stab`
            is given.
        elems (list of str): every item must be found in 'pretty_formula'.
        pname (str): name of the property column filtered by pmin/pmax.
        pmin: keep rows with df[pname] >= pmin; None disables the bound.
        pmax: keep rows with df[pname] <= pmax; None disables the bound.
        stab: keep rows with e_above_hull <= stab; None disables the filter.

    Returns:
        pandas.DataFrame: the rows that passed every active filter.
    """
    print('# entries before filters: ', len(df))

    # filter by chemistry: one row of flags per requested element
    # NOTE(review): str.contains is a substring/regex match, so 'O' also
    # matches formulas containing e.g. 'Os' - confirm this is intended.
    inds = np.zeros((len(elems), len(df)))
    for i, item in enumerate(elems):
        inds[i, :] = (df['pretty_formula'].str.contains(item))

    # A row survives only if every element flag is 1.
    idx = np.prod(inds, axis=0)
    df = df[idx == 1]
    print('# entries after chemistry: ', len(df))

    # filter by property values
    # Compare against None explicitly so that a bound of 0 is still applied.
    if pmin is not None:
        df = df[df[pname] >= pmin]
    if pmax is not None:
        df = df[df[pname] <= pmax]
    print('# entries after property: ', len(df))

    # filter by stability (stab=0 means "on the hull only", so it must not
    # be treated as "no filter")
    if stab is not None:
        df = df[df['e_above_hull'] <= stab]
    print('# entries after stability: ', len(df))

    return df

def get_xy(df, elems, pname, pmin, pmax, stab):
    """Get the feature matrix x and target vector y from data.

    Rows containing NaN are dropped, the remaining rows are filtered with
    filter_data, and both outputs are ordered by the sorted index.

    Args:
        df (pandas.DataFrame): featurized data including the `pname` column.
        elems, pname, pmin, pmax, stab: forwarded to filter_data.

    Returns:
        tuple: (x, y) where x is a DataFrame without the non-input columns
        and y is the array of df[pname] values.
    """
    # filter NaNs and entries based on criteria
    df = df.dropna()
    df = filter_data(df, elems, pname, pmin=pmin, pmax=pmax, stab=stab)

    # exclude non-input columns
    exclude = ['pretty_formula', pname, 'e_above_hull', 'structure',
               'composition', 'composition_oxid',
               'radial distribution function']

    # get X and Y
    # errors='ignore': some of these columns are optional (e.g. the radial
    # distribution function is only present when it was featurized), so a
    # missing one must not abort the whole call.
    x = df.sort_index().drop(exclude, axis=1, errors='ignore')
    y = df[pname].sort_index().values

    return x, y

def fit_forest(x, y, lbl='Full'):
    """Grid-search and fit a random forest regressor, printing its scores.

    Args:
        x: feature matrix.
        y: target values.
        lbl (str): label used as a prefix in the printed error lines.

    Returns:
        The RandomForestRegressor refit on the training split with the best
        parameters found by the grid search.
    """
    # Hold out 20% of the data with a fixed seed.
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.2, random_state=42)

    # Exhaustive 5-fold search over the forest hyper-parameters.
    forest = RandomForestRegressor()
    search_space = dict(
        n_estimators=[10, 25, 50, 100, 250],
        max_features=['auto', 'sqrt', 'log2'],
        min_samples_split=[2, 4, 8],
        min_samples_leaf=[1, 2, 5],
    )
    search = GridSearchCV(forest, search_space, n_jobs=-1, cv=5)
    search.fit(x_train, y_train)

    print(search.best_score_)
    print(search.best_params_)
    print(search.score(x_test, y_test))

    # Refit a plain forest using the winning parameters.
    forest.set_params(**search.best_params_)
    forest.fit(x_train, y_train)

    pred_train = forest.predict(x_train)
    pred_test = forest.predict(x_test)

    # Mean absolute error, normalised by the mean of the true targets.
    rel_err_train = np.mean(abs(pred_train - y_train)) / np.mean(y_train)
    print('%s RF, train error: %.3f' % (lbl, rel_err_train))

    rel_err_test = np.mean(abs(pred_test - y_test)) / np.mean(y_test)
    print('%s RF, test error : %.3f' % (lbl, rel_err_test))

    return forest

def fit_model(x, y, show_flag=False):
    """Fit a full random forest, prune weak features, and refit.

    Args:
        x (pandas.DataFrame): feature matrix.
        y: target values.
        show_flag (bool): when True, show a bar chart of the top importances
            and a parity plot of the pruned model.

    Returns:
        The random forest fitted on the pruned feature set.
    """
    # First pass: forest on every available variable.
    print('Fitting full random forest...')
    rf = fit_forest(x, y, lbl='Full')

    # Rank variables by importance, most important first.
    nvar = 10
    imp = rf.feature_importances_
    ranking = np.argsort(imp)[::-1]
    top_names = x.columns.values[ranking][0:nvar]
    print('%d most important variables:' % nvar)
    print(top_names)

    # Drop every variable whose importance is below half the median.
    cutoff = 0.5 * np.median(imp)
    weak = imp < cutoff
    x_sel = x.drop(list(x.columns.values[weak]), axis=1)

    # Second pass: forest on the surviving variables only.
    print('\nFitting pruned random forest...')
    rf = fit_forest(x_sel, y, lbl='Pruned')

    print('%d pruned variables:' % len(x_sel.columns))
    print(x_sel.columns.values)

    if not show_flag:
        return rf

    # plt.figure(figsize=(7, 4))

    # Left panel: importances of the full model's top variables.
    plt.subplot(121)
    plt.bar(x=top_names, height=imp[ranking][0:nvar], color=(0.3, 0.3, 0.9))
    plt.xticks(top_names, top_names, rotation='vertical')
    plt.xlabel('Variables')
    plt.ylabel('Importance')

    # Right panel: parity plot of the pruned model.
    ax = plt.subplot(122)
    ax.set_aspect(1)

    plt.scatter(y, rf.predict(x_sel), marker='s', alpha=.25, c=(0.9, 0.3, 0.3))
    plt.plot(np.arange(np.max(y)), c='gray')
    plt.xlabel('Ground truth')
    plt.ylabel('RF prediction')

    plt.subplots_adjust(bottom=0.25, top=0.75)
    plt.draw()
    plt.show()

    return rf

def add_atom_feats(df):
    """Add composition-weighted average elemental properties to df.

    For every row the 'pretty_formula' is parsed into a Composition and each
    elemental property is averaged using the fractional composition as
    weights. A property whose computation raises TypeError is recorded as
    NaN for that row.

    Args:
        df (pandas.DataFrame): must contain a 'pretty_formula' column; the
            new columns are added to this frame in place.

    Returns:
        tuple: (df, feat_labels) where feat_labels lists the added columns.
    """
    # (output column, accessor for the per-element property), in the order
    # in which the columns are added.
    feature_specs = [
        ('avg row', lambda el: el.row),
        ('avg column', lambda el: el.group),
        ('avg num', lambda el: el.number),
        ('avg el-neg', lambda el: el.X),
        ('avg atom mass', lambda el: el.data['Atomic mass']),
        ('avg atom radius', lambda el: el.data['Atomic radius']),
        ('avg ionic radius', lambda el: el.average_ionic_radius),
    ]
    collected = {label: [] for label, _ in feature_specs}

    # loop through entries
    for _, entry in df.iterrows():
        comp = Composition(entry['pretty_formula'])
        elem, fracs = zip(*comp.fractional_composition.items())

        for label, accessor in feature_specs:
            try:
                weighted = sum([accessor(el) * fr
                                for (el, fr) in zip(elem, fracs)])
            except TypeError:
                weighted = float('nan')
            collected[label].append(weighted)

    feat_labels = [label for label, _ in feature_specs]
    for label in feat_labels:
        df[label] = pd.Series(collected[label], index=df.index)

    return df, feat_labels

def add_cs_features(df, rdf_flag=False):
    """Add composition- and structure-based features to df.

    Args:
        df (pandas.DataFrame): must contain 'pretty_formula' and 'structure'
            columns.
        rdf_flag (bool): when True, also featurize with
            RadialDistributionFunction.

    Returns:
        pandas.DataFrame: the featurized dataframe.
    """
    # Convert the raw columns into the objects the featurizers work on.
    df["composition"] = str_to_composition(df["pretty_formula"])
    df["composition_oxid"] = composition_to_oxidcomposition(df["composition"])
    df["structure"] = dict_to_object(df["structure"])

    # Each featurizer is instantiated right before use and applied to its
    # own input column, in this order.
    pipeline = (
        (ValenceOrbital, "composition"),
        (OxidationStates, "composition_oxid"),
        (DensityFeatures, "structure"),
    )
    for make_featurizer, column in pipeline:
        df = make_featurizer().featurize_dataframe(df, column)

    if not rdf_flag:
        return df

    rdf = RadialDistributionFunction(cutoff=15.0, bin_size=0.2)
    return rdf.featurize_dataframe(df, "structure")
예제 #11
0
mpdr = MPDataRetrieval()

# Entries with hull distance and formation energy both below 0.150 that
# carry a non-null elasticity record; the index is left as plain integers.
df = mpdr.get_dataframe(
    index_mpid=False,
    criteria={
        "e_above_hull": {"$lt": 0.150},
        "formation_energy_per_atom": {"$lt": 0.150},
        "elasticity": {"$exists": 1, "$ne": None},
    },
    # "elements": },
    properties=[
        "material_id",
        "structure",
        "elasticity.K_VRH",
        "elasticity.G_VRH",
        "elasticity.G_Voigt",
        "elasticity.K_Voigt",
        "elasticity.G_Reuss",
        "elasticity.K_Reuss",
        "warnings",
    ],
)

df = df.rename(
예제 #12
0
# MPRester instantiated with default arguments; it is not referenced in the
# visible remainder of this snippet.
mpr = MPRester()


def chunks(l, n):
    """Yield consecutive slices of l, each holding at most n items.

    The final slice is shorter when len(l) is not a multiple of n.
    """
    starts = range(0, len(l), n)
    yield from (l[begin:begin + n] for begin in starts)


# Ids and warnings of all entries that have a non-null band gap and whose
# hull distance and formation energy are both below 0.150.
df = mpdr.get_dataframe(
    criteria={
        "e_above_hull": {"$lt": 0.150},
        "formation_energy_per_atom": {"$lt": 0.150},
        "band_gap": {"$exists": 1, "$ne": None},
    },
    properties=["material_id", "warnings"],
    index_mpid=False,
)

# How often does each distinct warnings value occur?
print(df["warnings"].astype(str).value_counts())

# Empty frame with the three columns that will be filled in later.
structures = pd.DataFrame(
    {column: [] for column in ("structure", "material_id", "band_gap")}
)

for chunk in tqdm(chunks(range(len(df)), chunksize)):
    print(chunk[0], chunk[-1])
    mpids = df.loc[chunk[0]:chunk[-1], "material_id"].tolist()
    stchunk = mpdr.get_dataframe(
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
import os
from sklearn.model_selection import train_test_split
from automatminer import MatPipe
from matminer.data_retrieval.retrieve_MP import MPDataRetrieval
# mpr=MPDataRetrieval()
# mpdr=MPDataRetrieval()
# Never commit a real API key to source control. The key is read from the
# PMG_MAPI_KEY environment variable; when it is unset, None is passed and
# key resolution is left to MPDataRetrieval (presumably pymatgen's own
# configuration - confirm for the installed version).
api_key = os.environ.get("PMG_MAPI_KEY")
mpr = MPDataRetrieval(api_key)
# api_key = None   # Set your MP API key here.
# mpr = MPDataRetrieval(api_key)
# Entries that have elasticity data and an empty elasticity warnings list.
df = mpr.get_dataframe(
    {
        "elasticity": {
            "$exists": True
        },
        "elasticity.warnings": []
    }, ['pretty_formula', 'elasticity.K_VRH', 'elasticity.G_VRH'])
#/ criteria = {'elasticity.K_VRH': {'$ne': None}}
#/ properties = ['pretty_formula', 'elasticity.K_VRH', 'elasticity.G_VRH']
# get the data
# df=mpr.get_dataframe(criteria=criteria, properties=properties)
# Keep only entries with positive bulk (K_VRH) and shear (G_VRH) moduli
df = df[df['elasticity.K_VRH'] > 0]
df = df[df['elasticity.G_VRH'] > 0]
from matminer.featurizers.composition import ElementProperty
from pymatgen import Composition

# Parse each formula string into a Composition object, then drop rows that
# contain NaN.
df["composition"] = df['pretty_formula'].map(lambda x: Composition(x))
df = df.dropna()
예제 #14
0
from matminer.data_retrieval.retrieve_MP import MPDataRetrieval
from pymatgen.electronic_structure.plotter import BSDOSPlotter
from matminer.data_retrieval.retrieve_Citrine import CitrineDataRetrieval
from matminer.data_retrieval.retrieve_MDF import MDFDataRetrieval

mpdr = MPDataRetrieval()

# Example 1: density of every single-element entry.
df = mpdr.get_dataframe(
    properties=['density', 'pretty_formula'],
    criteria={"nelements": 1},
)
n_single_element = df['density'].count()
print("There are {} entries on MP with 1 element".format(n_single_element))
print(df.head())

# Example 2: entries whose band gap exceeds 4.0, saved to CSV.
df = mpdr.get_dataframe(
    {"band_gap": {"$gt": 4.0}},
    ['pretty_formula', 'band_gap'],
)
n_wide_gap = df['band_gap'].count()
print("There are {} entries on MP with a band gap larger than 4.0".format(
    n_wide_gap))
df.to_csv('gt4.csv')

# Example 3: entries with elasticity data and an empty warnings list.
df = mpdr.get_dataframe(
    {"elasticity": {"$exists": True}, "elasticity.warnings": []},
    ['pretty_formula', 'elasticity.K_VRH', 'elasticity.G_VRH'],
)
n_elastic = df['elasticity.K_VRH'].count()
print("There are {} elastic entries on MP with no warnings".format(
    n_elastic))
df = mpdr.get_dataframe(
    criteria={
        "elasticity": {
            "$exists": True
print("DEBUG MODE:", args.debug)

# Set up dataset
if FABER:
    df = load_dataset("flla")
else:
    # Initialize data retrieval class
    from matminer.data_retrieval.retrieve_MP import MPDataRetrieval
    mpr = MPDataRetrieval()
    # Wildcard criteria string (presumably: any three-element system
    # containing oxygen - confirm against the MP query syntax).
    criteria = "*-*-O"
    # Choose list of properties to retrieve
    properties = [
        'structure', 'nsites', 'formation_energy_per_atom', 'e_above_hull'
    ]
    # Get the dataframe with the matching structure from the Materials Project
    df = mpr.get_dataframe(criteria=criteria, properties=properties)
    # Create the formation_energy feature for the SCM regression, since the SCM
    # model learns formation energy per unit cell rather than per atom.
    df['formation_energy'] = df['formation_energy_per_atom'] * df['nsites']
    # Structures are retrieved as dictionaries but can easily be converted to
    # pymatgen.core.Structure objects as shown. The values are iterated
    # directly because df['structure'][i] with an integer i depends on a
    # deprecated positional fallback when the index is not integer-based.
    df['structure'] = pd.Series(
        [Structure.from_dict(struct_dict) for struct_dict in df['structure']],
        df.index)
    # Keep only entries with e_above_hull below 0.1 and at most 30 sites
    df = df[df['e_above_hull'] < 0.1]
    df = df[df['nsites'] <= 30]

# For debug mode only use 100 entries
if args.debug:
    df = df.head(100)
예제 #16
0
파일: data.py 프로젝트: PV-Lab/FTCP
def data_query(mp_api_key,
               max_elms=3,
               min_elms=3,
               max_sites=20,
               include_te=False):
    """
    The function queries data from Materials Project.

    Parameters
    ----------
    mp_api_key : str
        The API key for Materials Project.
    max_elms : int, optional
        Maximum number of components/elements for crystals to be queried.
        The default is 3.
    min_elms : int, optional
        Minimum number of components/elements for crystals to be queried.
        The default is 3.
    max_sites : int, optional
        Maximum number of sites for crystals to be queried.
        The default is 20.
    include_te : bool, optional
        If True, thermoelectric properties read from
        'data/thermoelectric_prop.csv' are concatenated column-wise onto the
        queried dataframe and the 'Seebeck' column is replaced by its
        absolute value. The default is False.

    Returns
    -------
    dataframe : pandas dataframe
        Dataframe returned by MPDataRetrieval with an added 'ind' column
        holding each row's position, plus the thermoelectric columns when
        include_te is True.

    """
    mpdr = MPDataRetrieval(mp_api_key)
    # Specify query criteria in MongoDB style
    query_criteria = {
        'e_above_hull': {
            '$lte': 0.08
        },  # eV/atom
        'nelements': {
            '$gte': min_elms,
            '$lte': max_elms
        },
        'nsites': {
            '$lte': max_sites
        },
    }
    # Specify properties to be queried, properties available are at https://github.com/materialsproject/mapidoc/tree/master/materials
    query_properties = [
        'material_id', 'formation_energy_per_atom', 'band_gap',
        'pretty_formula', 'e_above_hull', 'elements', 'cif',
        'spacegroup.number'
    ]
    # Obtain queried dataframe containing CIFs and ground-state property labels
    dataframe = mpdr.get_dataframe(
        criteria=query_criteria,
        properties=query_properties,
    )
    # Positional row counter (assigned once; the thermoelectric branch used
    # to recompute the identical column).
    dataframe['ind'] = np.arange(len(dataframe))

    if include_te:
        # Read thermoelectric properties from https://datadryad.org/stash/dataset/doi:10.5061/dryad.gn001
        te = pd.read_csv('data/thermoelectric_prop.csv', index_col=0)
        te = te.dropna()
        # Get compound index that has both ground-state and thermoelectric properties
        ind = dataframe.index.intersection(te.index)
        # Concatenate thermoelectric properties to corresponding compounds
        dataframe = pd.concat([dataframe, te.loc[ind, :]], axis=1)
        dataframe['Seebeck'] = dataframe['Seebeck'].apply(np.abs)

    return dataframe
예제 #17
0
def generate_mp(max_nsites=None, properties=None, write_to_csv=False,
                write_to_compressed_json=True):
    """
    Grabs all mp materials. This will return two csv/json.gz files:
        * mp_nostruct: All MP materials, not including structures
        * mp_all: All MP materials, including structures

    Args:
        max_nsites (int): The maximum number of sites to include in the query.
            If None, site counts 1 through 100 are queried one by one,
            followed by a single query for more than 100 sites.

        properties (iterable of strings): list of properties supported by
            MPDataRetrieval

        write_to_csv (bool): whether to write resulting dataframe to csv

        write_to_compressed_json (bool): whether to write resulting
            dataframe to json.gz file

    Returns (pandas.DataFrame):
        retrieved/generated data. When either write flag is set, the
        returned frame has the structure columns dropped; otherwise it is
        the full frame.
    """

    # Set default properties if None and ensure is a list
    if properties is None:
        properties = ['pretty_formula', 'e_above_hull', 'band_gap',
                      'total_magnetization', 'elasticity.elastic_anisotropy',
                      'elasticity.K_VRH', 'elasticity.G_VRH', 'structure',
                      'energy', 'energy_per_atom', 'formation_energy_per_atom']
    elif not isinstance(properties, list):
        properties = list(properties)

    # Pick columns to drop structure data from
    drop_cols = [col_name
                 for col_name in ("structure", "initial_structure")
                 if col_name in properties]

    mpdr = MPDataRetrieval()
    if max_nsites is not None:
        sites_list = list(range(1, max_nsites + 1))
    else:
        sites_list = list(range(1, 101)) + [{"$gt": 100}]

    # One query per site count; results are collected and concatenated once
    # at the end (DataFrame.append was removed in pandas 2.0).
    responses = []
    for site_specifier in tqdm(sites_list, desc="Querying Materials Project"):
        # While loop to repeat queries if server request fails
        while True:
            try:
                site_response = mpdr.get_dataframe(
                    criteria={"nsites": site_specifier},
                    properties=properties, index_mpid=True
                )
                break

            except MPRestError:
                tqdm.write("Error querying materials project, "
                           "trying again after 5 sec")
                sleep(5)

        responses.append(site_response)

    # pd.concat raises on an empty list, so fall back to an empty frame.
    df = pd.concat(responses) if responses else pd.DataFrame()

    tqdm.write("DataFrame with {} entries created".format(len(df)))

    # Write data out to file if user so chooses
    if write_to_csv or write_to_compressed_json:
        # Build the structure-free copy once and leave `df` untouched, so
        # that every "mp_all" file really contains the structures even when
        # both output formats are requested.
        df_nostruct = df.drop(drop_cols, axis=1, errors="ignore")

        if write_to_csv:
            df.to_csv("mp_all.csv")
            df_nostruct.to_csv("mp_nostruct.csv")

        if write_to_compressed_json:
            store_dataframe_as_json(df, "mp_all.json.gz", compression="gz")
            store_dataframe_as_json(df_nostruct, "mp_nostruct.json.gz",
                                    compression="gz")

        df = df_nostruct

    return df
예제 #18
0
# Number of material ids requested per query when structures are fetched
# in batches further down.
chunksize = 1000

# Retrieval helper used for the get_dataframe queries in this script.
mpdr = MPDataRetrieval()
# Instantiated with default arguments; not referenced in the visible part
# of this script.
mpr = MPRester()


def chunks(l, n):
    """Generate back-to-back pieces of l that are n items long.

    The last piece holds whatever remains and may be shorter than n.
    """
    for offset in range(0, len(l), n):
        stop = offset + n
        yield l[offset:stop]


# Ids and warnings of every entry whose formation energy per atom is
# below 2.5, indexed by plain integers.
df = mpdr.get_dataframe(
    index_mpid=False,
    properties=["material_id", "warnings"],
    criteria={"formation_energy_per_atom": {"$lt": 2.5}},
)

# How often does each distinct warnings value occur?
print(df["warnings"].astype(str).value_counts())

# Empty frame with the three columns that are fetched chunk by chunk later.
structures = pd.DataFrame(
    {
        column: []
        for column in ("structure", "material_id", "formation_energy_per_atom")
    }
)

for chunk in tqdm(chunks(range(len(df)), chunksize)):
    print(chunk[0], chunk[-1])
    mpids = df.loc[chunk[0] : chunk[-1], "material_id"].tolist()
    stchunk = mpdr.get_dataframe(
        criteria={"material_id": {"$in": mpids}},
        properties=["structure", "material_id", "formation_energy_per_atom"],
예제 #19
0
import pandas as pd
# pd.set_option('display.height', 1000)
# Widen the pandas console display so the frames below print in full.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

mpdr = MPDataRetrieval()

df = load_dataset("phonon_dielectric_mp")

print(df)

# Look up the energetics of every material that appears in the dataset.
mpids = df["mpid"].tolist()
dfe = mpdr.get_dataframe(
    index_mpid=False,
    criteria={"material_id": {"$in": mpids}},
    properties=["e_above_hull", "formation_energy_per_atom", "material_id"],
)
# Align the id column name with the dataset so the merge can join on it.
dfe = dfe.rename(columns={"material_id": "mpid"})

df = pd.merge(df, dfe, how='inner')

# Keep rows whose hull distance and formation energy are both below 0.150.
hull_ok = df["e_above_hull"] < .150
formation_ok = df["formation_energy_per_atom"] < 0.150
df = df.loc[hull_ok & formation_ok]

# Reduce to the input and target columns and renumber the rows.
df = df[["structure", "last phdos peak"]]
df = df.reset_index(drop=True)

print(df)

df.to_pickle("phonons.pickle.gz")