def main():
    """Extract data from the CatApp database and pickle sklearn-ready lists.

    Runs the selected SQL query against ``catapp/catapp.db``, converts the
    result with ``get_data_from_query`` and writes three pickle files under
    ``datasets/``: ``data_<dataset>.pckl``, ``energetics_<dataset>.pckl`` and
    ``key_<dataset>.pckl``.

    TODO: abstract out SQL queries

    Returns:
        0 on completion.
    """
    check_species_in_dict = False
    mol_dict = bond_order_dict()
    # alloy_formation = pd.read_table('alloys_formation.txt', index_col=0)

    dataset = 'CO2+NO_larger'

    # Filter shared by the species sanity check below.
    species_filter = ('(ab NOT LIKE "%Si%") AND (ab NOT LIKE "%Cl%") '
                      'AND (termination LIKE "___") '
                      'AND (surface like "%3%" or surface like "__")')

    # load the CatApp database
    conn = sqlite3.connect('catapp/catapp.db')
    try:
        c = conn.cursor()

        # ------------------------------------------------------------------
        # Alternative queries (swap in for `table` below as needed).
        #
        # select chemisorption energies
        # e.g. NO --> N* + O*
        # reaction = ['NO*','N*','O*']
        # table = c.execute('SELECT * FROM catapp WHERE ab = \"'+reaction[0]+'\" AND a = \"'+reaction[1]+'\" AND b = \"'+reaction[2]+'\"')
        #
        # CO2 intermediates
        # table = c.execute('SELECT * FROM catapp WHERE ((ab LIKE "%C%O%" AND a LIKE "%C%O%") OR (ab LIKE "CH_*" AND a LIKE "CH_") OR (ab LIKE "O%" AND a LIKE "O%") )AND (ab NOT LIKE "%Si%") AND (ab NOT LIKE "%Cl%") AND (termination LIKE "___") AND (surface like "%3%" or surface like "__") AND (surface <> "ZnO");')
        #
        # CO2+NO intermediates
        # reaction = ['NO*','N*','O*']
        # table = c.execute('SELECT * FROM catapp WHERE ' +
        #                   '(((ab LIKE "%C%O%" AND a LIKE "%C%O%") OR ' +
        #                   '(ab LIKE "CH_*" AND a LIKE "CH_") OR ' +
        #                   '(ab LIKE "O%" AND a LIKE "O%")) ' +
        #                   'AND (ab NOT LIKE "%Si%") ' +
        #                   'AND (ab NOT LIKE "%Cl%") ' +
        #                   'AND (termination LIKE "___") ' +
        #                   'AND (surface like "%3%" or surface like "__") ' +
        #                   'AND (surface <> "ZnO")) OR ' +
        #                   '(ab = \"'+reaction[0]+'\" AND a = \"'+reaction[1]+'\" AND b = \"'+reaction[2]+'\");'
        #                   )
        #
        # A3B alloys with atomic adsorption energies of C, O, H, CHO, and CO
        # table = c.execute('select * from catapp where (ab like "_*" or ab = "CO*" or ab = "CHO*") and (termination like "___") and (surface like "%3%" or surface like "__") ;')
        # table = c.execute('SELECT * FROM catapp WHERE (ab like "%*") AND (a NOT LIKE "%*" OR b NOT LIKE "%*") AND (ab NOT LIKE "%Si%") AND (termination LIKE "___") AND (surface like "%3%" or surface like "__");')
        #
        # A3B alloys and pure metals with most atomic adsorption energies
        # table = c.execute('SELECT * FROM catapp WHERE (ab NOT LIKE "%Si%") AND (ab NOT LIKE "%Cl%") AND (termination LIKE "___") AND (surface like "%3%" or surface like "__");')
        # ------------------------------------------------------------------

        # CO2+NO intermediates + more
        reaction = ['NO*', 'N*', 'O*']
        # The reaction species are bound as parameters instead of being
        # spliced into the SQL text.
        table = c.execute(
            'SELECT * FROM catapp WHERE '
            '(((ab LIKE "%C%O%") OR '
            '(ab LIKE "CH_*") OR '
            '(ab LIKE "O%")) '
            'AND (ab NOT LIKE "%Si%") '
            'AND (ab NOT LIKE "%Cl%") '
            'AND (termination LIKE "___") '
            'AND (surface like "%3%" or surface like "__") '
            'AND (surface <> "ZnO")) OR '
            '(ab = ? AND a = ? AND b = ?);',
            reaction
        )

        if check_species_in_dict:
            species = []
            for spec in ['AB', 'A', 'B']:
                # `table` is the cursor `c` itself, so running another
                # statement on `c` would throw away the pending result set.
                # conn.execute() runs the check on a separate cursor.
                result = conn.execute(
                    'SELECT ' + spec + ' FROM catapp WHERE ' +
                    species_filter + ';')
                species += [unicode_convert(s[0]) for s in result]
            # report every species that has no entry in mol_dict
            for spec in set(species):
                if spec not in mol_dict:
                    print(spec)

        # Column names of the design matrix, kept for reference only:
        # 'h', 'k', 'l', 'stoichiometry_M1', 'stoichiometry_M2', 'E_form',
        # 'density', 'a', 'b', 'c', 'alpha', 'beta', 'gamma',
        # 's_M1', 'p_M1', 'd_M1', 'f_M1', 's_M2', 'p_M2', 'd_M2', 'f_M2',
        # 'max_bonds_central', 'molecule_bonds', 'surface_bonds',
        # 'C_count', 'H_count', 'O_count', 'N_count'

        data, energetics, data_key = get_data_from_query(table)
    finally:
        # release the database handle even if the query or conversion fails
        conn.close()

    # Write each result list to its own pickle file. Files are opened in
    # binary mode and closed deterministically by the context manager.
    outputs = (('data', data), ('energetics', energetics), ('key', data_key))
    for prefix, payload in outputs:
        path = 'datasets/' + prefix + '_' + dataset + '.pckl'
        with open(path, 'wb') as handle:
            pickle.dump(payload, handle)

    return 0
def get_data_from_query(table): """Returns the matrix of data, the response, and an array form of the table Input: table -- Table from SQL query e.g. `c.execute(...)` Returns: data -- data in the design matrix energetics -- response variable, the energetics of the reaction data_key -- corresponding row from SQL table """ mol_dict = bond_order_dict() # set up data and response lists data = [] energetics = [] data_key = [] # populate design matrix and response vector for row in table: # h|k|l|stoichiometry_M1|stoichiometry_M2|E_form|density|a|b|c|alpha|beta|gamma|s_M1|p_M1|d_M1|f_M1|s_M2|p_M2|d_M2|f_M2|max bonds of the central atom|bonds in the molecule|bonds in the surface|C count|H count|O count|N count # surface termination - determined by Miller indices observation = [int(row[3][0]), int(row[3][1]), int(row[3][2])] # Split formula and determine stoichiometry formula = split_formula(row[2]) if len(set(formula)) > 1: stoich = [s for s in formula if s.isdigit()][0] if formula.index(stoich) == 1: observation += [float(stoich),1] material = ''.join([formula[2],formula[0],formula[1]]) else: observation += [1,float(stoich)] material = row[2] formula.remove(stoich) else: observation += [1,3] material = formula[0] # make sure MP has the required data mp_data = get_mp_crystal_data(material) if mp_data: for bulk_prop in mp_data: observation.append(bulk_prop) # repeat for pure elements to match columns for binary alloys if len(formula) == 1: formula *= 2 for element in formula: for elemental_prop in get_outer_elec_data(element): observation.append(elemental_prop) # bond orders of adsorbate for state in row[4:7]: for bond_order in mol_dict[state]['b-order']: observation.append(bond_order) # element counts for each adsorbate for element_count in mol_dict[state]['e-count']: observation.append(element_count) # add all columns as an entry data.append(observation) # output: E_rxn energetics.append(float(row[0])) # add data row to key data_key += [row]