def test_root_source(self):
    source = ["KP.cell", "KP.param", "KP.castep"]
    src = "KP"
    self.assertEqual(src, get_root_source(source))

    source = ["KP.cell", "KP.param", "KP-1234-abcd.castep"]
    src = "KP-1234-abcd"
    self.assertEqual(src, get_root_source(source))

    source = [
        "KP.cell",
        "KP.param",
        "abcd-123.fdasf/efgf/KP-0.02.-1234-abcd.castep",
        "KP-0.02.-1234-abcd.res",
    ]
    src = "KP-0.02.-1234-abcd"
    self.assertEqual(src, get_root_source(source))

    source = [
        "KP.cell",
        "KP.param",
        "abcd-123.fdasf/efgf/KP-0.02.-1234-abcd.history",
    ]
    src = "KP-0.02.-1234-abcd"
    self.assertEqual(src, get_root_source(source))

    source = ["KP.cell", "KP.param", "PK-OQMD_12345.history"]
    src = "PK-OQMD_12345"
    self.assertEqual(src, get_root_source(source))

    source = ["OQMD 12345"]
    src = "OQMD 12345"
    self.assertEqual(src, get_root_source(source))

    source = ["OQMD-12345"]
    src = "OQMD-12345"
    self.assertEqual(src, get_root_source(source))

    source = [
        "KP.cell",
        "KP.param",
        "abcd-123.fdasf/efgf/KP-0.02.-1234-abcd.castep",
        "KP-1234-abcde.res",
    ]
    with self.assertRaises(RuntimeError):
        src = get_root_source(source)

    source = ["not a file name"]
    src = "not a file name"
    self.assertEqual(src, get_root_source(source))

    source = "not even a list"
    src = "not even a list"
    self.assertEqual(src, get_root_source(source))
def root_source(self):
    from matador.utils.chem_utils import get_root_source
    try:
        if 'source' in self._data:
            self._root_source = get_root_source(self._data['source'])
    except RuntimeError:
        pass

    return self._root_source
def add_root_source(self):
    """ Add the "root_source" key to a document in the database,
    i.e. the name of the structure, minus file extension.

    """
    from matador.utils.chem_utils import get_root_source
    for doc in self.cursor:
        try:
            if 'root_source' in doc:
                continue
            doc['root_source'] = get_root_source(doc['source'])
            self.diff_cursor.append(doc)
            self.changed_count += 1
        except Exception as error:
            print(repr(error))
            self.failed_count += 1
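# Usage sketch (illustrative, not part of the module): how a 'root_source' value is
# derived for a single document. The filenames are hypothetical and the expected
# result follows the behaviour exercised in test_root_source above.
#
#     from matador.utils.chem_utils import get_root_source
#
#     doc = {'source': ['KP.cell', 'KP.param', 'KP-1234-abcd.castep']}
#     doc['root_source'] = get_root_source(doc['source'])
#     # doc['root_source'] == 'KP-1234-abcd'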
def generate_stability_statistics(self, group_by='structure'):
    """ Creates a histogram that counts how many times each structure
    is found to be stable in the ensemble.

    Keyword arguments:
        group_by (str): either 'structure' or 'formula' for bar groupings.

    """
    from collections import defaultdict
    histogram = defaultdict(int)
    for pd in self.phase_diagrams:
        for doc in pd.stable_structures:
            if group_by == 'formula':
                histogram[get_formula_from_stoich(doc['stoichiometry'])] += 1
            else:
                histogram[get_root_source(doc)] += 1

    return histogram
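# Usage sketch (illustrative): `ensemble` is assumed to be an object exposing this method,
# i.e. one holding several phase diagrams in `self.phase_diagrams`. Grouping by formula
# merges polymorphs of the same composition into a single count.
#
#     by_structure = ensemble.generate_stability_statistics()
#     by_formula = ensemble.generate_stability_statistics(group_by='formula')
#     # e.g. defaultdict(int, {'KP': 12, 'K3P': 7, ...})  (values purely illustrative)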
def query2files(cursor, dirname=None, max_files=10000, top=None, prefix=None,
                cell=None, param=None, res=None, pdb=None, json=None, xsf=None,
                markdown=True, latex=False, subcmd=None, argstr=None, **kwargs):
    """ Many-to-many convenience function for many structures being written
    to many file types.

    Parameters:
        cursor (:obj:`list` of :obj:`dict`/:class:`AtomicSwapper`): list of
            matador dictionaries to write out.

    Keyword arguments:
        dirname (str): the folder to save the results into. Will be created if
            non-existent. Will have an integer appended to it if already existing.
        max_files (int): if the number of files to be written exceeds this number,
            then raise RuntimeError.
        **kwargs (dict): dictionary of {filetype: bool(whether to write)}. Accepted
            file types are cell, param, res, pdb, json, xsf, markdown and latex.

    """
    multiple_files = any((cell, param, res, pdb, xsf))
    prefix = prefix + '-' if prefix is not None else ''

    if isinstance(cursor, AtomicSwapper):
        cursor = cursor.cursor
        subcmd = "swaps"

    if subcmd in ['polish', 'swaps']:
        info = False
        hash_dupe = False
    else:
        info = True
        hash_dupe = False

    if isinstance(cursor, list):
        num = len(cursor)
    else:
        num = cursor.count()

    if top is not None:
        if top < num:
            num = top

    num_files = num * sum(1 for ext in [cell, param, res, pdb, xsf] if ext)

    if multiple_files:
        print('Intending to write', num, 'structures to file...')
        if num_files > max_files:
            raise RuntimeError(
                "Not writing {} files as it exceeds argument `max_files` limit of {}"
                .format(num_files, max_files))

    if dirname is None:
        dirname = generate_relevant_path(subcmd=subcmd, **kwargs)

    _dir = False
    dir_counter = 0
    # postfix integer on end of directory name if it exists
    while not _dir:
        if dir_counter != 0:
            directory = dirname + str(dir_counter)
        else:
            directory = dirname
        if not os.path.isdir(directory):
            os.makedirs(directory)
            _dir = True
        else:
            dir_counter += 1

    for doc in cursor[:num]:
        # generate an appropriate filename for the structure
        root_source = get_root_source(doc)

        if '_swapped_stoichiometry' in doc:
            formula = get_formula_from_stoich(doc['_swapped_stoichiometry'])
        else:
            formula = get_formula_from_stoich(doc['stoichiometry'])

        if subcmd == 'swaps':
            root_source = root_source.replace('-swap-', '-')

        name = root_source

        if 'OQMD ' in root_source:
            name = '{formula}-OQMD_{src}'.format(formula=formula, src=root_source.split(' ')[-1])
        elif 'mp-' in root_source:
            name = '{formula}-MP_{src}'.format(formula=formula, src=root_source.split('-')[-1])

        if 'icsd' in doc and 'CollCode' not in name:
            name += '-CollCode{}'.format(doc['icsd'])
        else:
            pf_id = None
            for source in doc['source']:
                if 'pf-' in source:
                    pf_id = source.split('-')[-1]
                    break
            else:
                if 'pf_ids' in doc:
                    pf_id = doc['pf_ids'][0]
            if pf_id is not None:
                name += '-PF-{}'.format(pf_id)

        # if swaps, prepend new composition
        if subcmd == 'swaps':
            new_formula = get_formula_from_stoich(get_stoich(doc['atom_types']))
            name = '{}-swap-{}'.format(new_formula, name)

        path = "{directory}/{prefix}{name}".format(directory=directory, prefix=prefix, name=name)

        if param:
            doc2param(doc, path, hash_dupe=hash_dupe)
        if cell:
            doc2cell(doc, path, hash_dupe=hash_dupe)
        if res:
            doc2res(doc, path, info=info, hash_dupe=hash_dupe)
        if json:
            doc2json(doc, path, hash_dupe=hash_dupe)
        if pdb:
            doc2pdb(doc, path, hash_dupe=hash_dupe)
        if xsf:
            doc2xsf(doc, path)

    hull = subcmd in ['hull', 'voltage']

    if markdown:
        if isinstance(cursor, pm.cursor.Cursor):
            cursor.rewind()
        md_path = "{directory}/{directory}.md".format(directory=directory)
        md_kwargs = {}
        md_kwargs.update(kwargs)
        md_kwargs.update({'markdown': True, 'latex': False, 'argstr': argstr, 'hull': hull})
        md_string = display_results(cursor, **md_kwargs)
        with open(md_path, 'w') as f:
            f.write(md_string)

    if latex:
        if isinstance(cursor, pm.cursor.Cursor):
            cursor.rewind()
        tex_path = "{directory}/{directory}.tex".format(directory=directory)
        print('Writing LaTeX file', tex_path + '...')
        tex_kwargs = {}
        tex_kwargs.update(kwargs)
        tex_kwargs.update({'latex': True, 'markdown': False, 'argstr': argstr, 'hull': hull})
        tex_string = display_results(cursor, **tex_kwargs)
        with open(tex_path, 'w') as f:
            f.write(tex_string)

    print('Done!')
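# Usage sketch (illustrative): write every document in a list of matador dictionaries to
# both .res and .cell files in a named folder, skipping the summary markdown file. The
# directory name and `cursor` contents are hypothetical.
#
#     query2files(cursor, dirname='my-structures', res=True, cell=True, markdown=False)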
def _construct_structure_string(
    doc, ind, formula_substring, gs_enthalpy, use_source, colour,
    hull, additions, deletions, add_index_mode, del_index_mode,
    energy_key, per_atom, eform, markdown, latex
):
    """ Construct the pretty output for an individual structure.

    Options passed from `matador.utils.cursor_utils.display_results`.

    Returns:
        str: the pretty output.

    """
    # start with two spaces, replaced by the prefix from hull/add/del
    this_struct_string = '  '
    prefix = ''
    suffix = ''

    # apply appropriate prefixes and suffixes to structure
    if hull and np.abs(doc.get('hull_distance')) <= 0.0 + 1e-12:
        if colour:
            prefix = '\033[92m'
            suffix = '\033[0m'
        this_struct_string = '* '

    if additions is not None:
        if (add_index_mode and ind in additions) or doc.get('text_id', '_') in additions:
            this_struct_string = '+ '
            if colour:
                prefix = '\033[92m'
                suffix = '\033[0m'
    if deletions is not None:
        if (del_index_mode and ind in deletions) or doc.get('text_id', '_') in deletions:
            this_struct_string = '- '
            if colour:
                prefix = '\033[91m'
                suffix = '\033[0m'

    # display the canonical name for the structure
    if use_source:
        src = get_root_source(doc['source'])
        max_len = 34
        this_struct_string += "{:<36.{max_len}}".format(
            src if len(src) <= max_len else src[:max_len - 4] + '[..]', max_len=max_len
        )
    else:
        this_struct_string += "{:^24.22}".format(' '.join(doc.get('text_id', ['xxx', 'yyy'])))

    # again, if we're not outputting to markdown, then flag warnings in the quality column
    try:
        if doc.get('prototype'):
            this_struct_string += "{:^5}".format('*p*')
        elif doc.get('quality', 5) == 0:
            this_struct_string += "{:^5}".format('!!!')
        else:
            this_struct_string += "{:^5}".format((5 - doc.get('quality', 5)) * '?')
    except KeyError:
        this_struct_string += "{:^5}".format(' ')

    # loop over header names and print the appropriate values
    if 'pressure' in doc and doc['pressure'] != 'xxx':
        this_struct_string += "{: >9.2f} ".format(doc['pressure'])
    else:
        this_struct_string += "{:^9} ".format('xxx')

    try:
        if per_atom and 'cell_volume' in doc and 'num_atoms' in doc:
            this_struct_string += "{:>12.1f} ".format(doc['cell_volume'] / doc['num_atoms'])
        elif 'cell_volume' in doc and 'num_fu' in doc:
            this_struct_string += "{:>12.1f} ".format(doc['cell_volume'] / doc['num_fu'])
        else:
            this_struct_string += "{:^12} ".format('xxx')
    except Exception:
        this_struct_string += "{:^10} ".format('xxx')

    try:
        if hull and eform:
            this_struct_string += "{:>12.3f} ".format(doc['formation_' + energy_key])
        elif hull:
            this_struct_string += "{:>12.1f} ".format(1000 * doc['hull_distance'])
        elif per_atom:
            this_struct_string += "{:>16.4f} ".format(recursive_get(doc, energy_key) - gs_enthalpy)
        else:
            this_struct_string += "{:>16.4f} ".format(
                recursive_get(doc, energy_key) * doc['num_atoms'] / doc['num_fu'] - gs_enthalpy
            )
    except KeyError:
        this_struct_string += "{:^18}".format('xxx')

    if latex:
        from matador.utils.cell_utils import get_space_group_label_latex
        this_struct_string += " {:^13} ".format(get_space_group_label_latex(doc.get('space_group', 'xxx')))
    else:
        this_struct_string += " {:^13} ".format(doc.get('space_group', 'xxx'))

    # now we add the formula column
    this_struct_string += " {:^13} ".format(formula_substring)

    if 'num_fu' in doc:
        this_struct_string += " {:^6} ".format(int(doc['num_fu']))
    else:
        this_struct_string += " {:^6} ".format('xxx')

    if 'source' in doc:
        prov = get_guess_doc_provenance(doc['source'], doc.get('icsd'))
        this_struct_string += "{:^8}".format(prov)
    else:
        this_struct_string += "{:^8}".format('xxx')

    this_struct_string = prefix + this_struct_string + suffix

    return this_struct_string
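# Minimal illustration (not part of the module) of the source-name column formatting used
# above: names longer than `max_len` are clipped to `max_len - 4` characters, suffixed
# with '[..]' and padded to a 36-character column. The example name is hypothetical.
#
#     max_len = 34
#     src = 'a-hypothetical-very-long-airss-structure-name-0001'
#     print("{:<36.{max_len}}".format(src[:max_len - 4] + '[..]', max_len=max_len))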
def adapt(
    possible_parents,
    mutation_rate,
    crossover_rate,
    mutations=None,
    max_num_mutations=3,
    max_num_atoms=40,
    structure_filter=None,
    minsep_dict=None,
    debug=False,
):
    """ Take a list of possible parents and randomly adapt
    according to given mutation weightings.

    Parameters:
        possible_parents (list(dict)): list of all breeding stock,
        mutation_rate (float): rate of mutations relative to crossover,
        crossover_rate (float): see `mutation_rate`.

    Keyword arguments:
        mutations (list(str)): list of desired mutations to choose from (as strings),
        max_num_mutations (int): rand(1, this) mutations will be performed,
        max_num_atoms (int): any structures with more than this many atoms will be
            filtered out.
        structure_filter (callable(dict)): custom filter to pass to check_feasible.
        minsep_dict (dict): dictionary containing element-specific minimum separations,
            e.g. `{('K', 'K'): 2.5, ('K', 'P'): 2.0}`.

    Returns:
        dict: the mutated/newborn structure.

    """
    total_rate = mutation_rate + crossover_rate
    if total_rate != 1.0:
        LOG.debug("Total mutation rate not 1 ({}), rescaling...".format(total_rate))
        mutation_rate /= total_rate
        crossover_rate /= total_rate

    assert mutation_rate + crossover_rate == 1.0

    mutation_rand_seed = np.random.rand()

    # turn specified mutation strings into corresponding functions
    if mutations is not None:
        _mutations = []
        from .mutate import nudge_positions, null_nudge_positions, permute_atoms
        from .mutate import random_strain, vacancy, voronoi_shuffle, transmute_atoms
        for mutation in mutations:
            if mutation == "nudge_positions":
                _mutations.append(nudge_positions)
            elif mutation == "null_nudge_positions":
                _mutations.append(null_nudge_positions)
            elif mutation == "permute_atoms":
                _mutations.append(permute_atoms)
            elif mutation == "random_strain":
                _mutations.append(random_strain)
            elif mutation == "voronoi":
                _mutations.append(voronoi_shuffle)
            elif mutation == "vacancy":
                _mutations.append(vacancy)
            elif mutation == "transmute_atoms":
                _mutations.append(transmute_atoms)
    else:
        _mutations = None

    # loop over the *SAME* branch (i.e. crossover vs mutation) until a valid cell is
    # produced, with a maximum of 1000 attempts, at which point it will continue with
    # a terrible cell
    valid_cell = False
    max_restarts = 1000
    num_iter = 0
    while not valid_cell and num_iter < max_restarts:
        # if random number is less than mutant rate, then mutate
        if mutation_rand_seed < mutation_rate:
            parent = strip_useless(np.random.choice(possible_parents), to_run=True)
            try:
                newborn = mutate(
                    parent,
                    mutations=_mutations,
                    max_num_mutations=max_num_mutations,
                    debug=debug,
                )
                parents = [parent]
                valid_cell = check_feasible(
                    newborn,
                    parents,
                    max_num_atoms,
                    structure_filter=structure_filter,
                    minsep_dict=minsep_dict,
                )
            # this will be raised if the mutation fails for a good reason
            except RuntimeError:
                valid_cell = False
            except Exception as oops:
                if debug:
                    print_exc()
                LOG.warning("Mutation failed with error {}".format(oops))
                valid_cell = False

        # otherwise, do crossover
        else:
            if len(possible_parents) > 2:
                parents = [
                    strip_useless(parent, to_run=True)
                    for parent in np.random.choice(possible_parents, size=2, replace=False)
                ]
            elif len(possible_parents) == 2:
                parents = copy.deepcopy(possible_parents)
            elif len(possible_parents) == 1:
                parents = 2 * [copy.deepcopy(possible_parents[0])]
                LOG.warning("Only one possible parent: performing self-crossover...")
            try:
                newborn = crossover(parents, debug=debug)
                valid_cell = check_feasible(
                    newborn,
                    parents,
                    max_num_atoms,
                    structure_filter=structure_filter,
                    minsep_dict=minsep_dict,
                )
            except RuntimeError:
                valid_cell = False
            except Exception as oops:
                if debug:
                    print_exc()
                LOG.warning("Crossover failed with error {}".format(oops))
                valid_cell = False

        num_iter += 1

    LOG.debug("Initialised newborn after {} trials".format(num_iter))

    if num_iter == max_restarts:
        LOG.warning(
            "Max restarts reached in mutations, something has gone wrong... "
            "running with possibly unphysical cell")
        newborn = adapt(
            possible_parents,
            mutation_rate,
            crossover_rate,
            mutations=mutations,
            max_num_mutations=max_num_mutations,
            max_num_atoms=max_num_atoms,
            minsep_dict=minsep_dict,
            debug=debug,
        )

    # set parents in newborn dict
    if "parents" not in newborn:
        newborn["parents"] = []
        for parent in parents:
            parent_source = get_root_source(parent["source"])
            newborn["parents"].append(parent_source)

    return newborn
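# Usage sketch (illustrative): breed one new structure from a population of matador
# documents with a 50/50 mutation/crossover split. `population` is assumed to be a list
# of structure dictionaries containing at least the 'source' key plus the usual
# crystallographic fields expected by mutate/crossover.
#
#     newborn = adapt(
#         population,
#         mutation_rate=0.5,
#         crossover_rate=0.5,
#         mutations=['permute_atoms', 'random_strain', 'vacancy'],
#         max_num_atoms=40,
#     )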
def test_integration(self):
    """ Test import and query. """
    print("IMPORT CASTEP 1")
    query = import_castep()
    self.assertEqual(len(query.cursor), 3, msg="Failed to import structures correctly")

    # run again and hopefully nothing will change, i.e. no duplication
    print("IMPORT CASTEP 2")
    query = import_castep()
    self.assertEqual(len(query.cursor), 3, msg="Failed to import structures correctly")

    print("IMPORT CASTEP 3")
    query = import_castep(extra_flags="--recent_only")
    self.assertEqual(len(query.cursor), 3, msg="Failed to import structures correctly")

    print("IMPORT RES")
    query_1, query_2 = import_res()
    self.assertEqual(len(query_1.cursor), 7, msg="Failed to import res files")
    self.assertEqual(len(query_2.cursor), 4, msg="Failed to import res files")
    self.assertEqual(
        query_2.cursor[0]["species_pot"]["K"],
        "2|1.5|9|10|11|30U:40:31(qc=6)",
        msg="Failed to scrape OTF with weird name",
    )
    self.assertEqual(
        query_2.cursor[0]["species_pot"]["Sn"],
        "2|2|2|1.6|9.6|10.8|11.7|50U=-0.395U=+0.25:51U=-0.14U=+0.25",
        msg="Failed to scrape OTF with linebreak",
    )
    self.assertFalse(
        any(["Sb" in doc["species_pot"] for doc in query_2.cursor]),
        msg="pspots over-scraped!",
    )

    print("SWAP")
    output_folder_exists, successes, elem_successes = swaps()
    self.assertTrue(output_folder_exists, msg="No folder created")
    self.assertTrue(all(successes), msg="Failed to even read files")
    self.assertFalse(all(elem_successes), msg="Swaps had wrong elements")

    print("CHANGES")
    query_1, query_2, changes_count = changes()
    self.assertEqual(len(query_1.cursor), 3, msg="matador changes did not remove files")
    self.assertEqual(len(query_2.cursor), 0, msg="matador changes did not remove files")
    # unclear that mongomock can handle this; just check the queries instead
    # self.assertEqual(changes_count, 1, msg='matador changes did not changelog')

    expected_dir = "query-ci_test"
    if os.path.isdir(expected_dir):
        for _file in glob.glob(expected_dir + "/*"):
            os.remove(_file)
        os.removedirs(expected_dir)

    print("EXPORT")
    export()
    expected_files = [
        "query-ci_test/query-ci_test.md",
        "query-ci_test/query-ci_test.tex",
        "query-ci_test/Na3Zn4-swap-ReOs-OQMD_759599.pdb",
        "query-ci_test/Na3Zn4-swap-ReOs-OQMD_759599.xsf",
        "query-ci_test/Na3Zn4-swap-ReOs-OQMD_759599.json",
        "query-ci_test/Na-edgecase-CollCode10101.pdb",
        "query-ci_test/Na-edgecase-CollCode10101.xsf",
        "query-ci_test/Na-edgecase-CollCode10101.json",
        "query-ci_test/NaP_intermediates.pdb",
        "query-ci_test/NaP_intermediates.xsf",
        "query-ci_test/NaP_intermediates.json",
    ]
    dir_exists = os.path.isdir(expected_dir)
    files_exist = all([os.path.isfile(_file) for _file in expected_files])
    for _file in expected_files:
        if os.path.isfile(_file):
            os.remove(_file)
    if os.path.isdir(expected_dir):
        for _file in glob.glob(expected_dir + "/*"):
            os.remove(_file)
        os.removedirs(expected_dir)
    self.assertTrue(dir_exists, msg="Failed to create output directory")
    self.assertTrue(files_exist, msg="Some files missing from export")

    print("PSEUDOTERNARY HULL")
    query, hull = pseudoternary_hull()
    self.assertTrue(query.args.get("intersection"))
    self.assertTrue(query._non_elemental)
    self.assertTrue(query._create_hull)
    self.assertEqual(len(query.cursor), 7)
    self.assertEqual(len(hull.cursor), 7)
    self.assertEqual(len(hull.hull_cursor), 5)
    expected_dir = "query-LaLiZrO-ci_test"
    if os.path.isdir(expected_dir):
        for _file in glob.glob(expected_dir + "/*"):
            os.remove(_file)
        os.removedirs(expected_dir)

    print("PSEUDOTERNARY HULL 2")
    hull = pseudoternary_hull_no_query()
    self.assertTrue(hull._query.args.get("intersection"))
    self.assertTrue(hull._query._non_elemental)
    self.assertTrue(hull._query._create_hull)
    self.assertEqual(len(query.cursor), 7)
    self.assertEqual(len(hull.cursor), 7)
    self.assertEqual(len(hull.hull_cursor), 5)
    expected_dir = "query-LaLiZrO-ci_test"
    if os.path.isdir(expected_dir):
        for _file in glob.glob(expected_dir + "/*"):
            os.remove(_file)
        os.removedirs(expected_dir)

    print("UNIQ")
    uniq()
    expected_files = [expected_dir + "/cubic-LLZO-CollCode999999.res"]
    dir_exists = os.path.isdir(expected_dir)
    files_exist = all([os.path.isfile(_file) for _file in expected_files])
    correct_num = len(glob.glob(expected_dir + "/*.res"))
    if os.path.isdir(expected_dir):
        for _file in glob.glob(expected_dir + "/*"):
            os.remove(_file)
        os.removedirs(expected_dir)
    self.assertTrue(dir_exists, msg="Failed to create output directory, uniq")
    self.assertTrue(files_exist, msg="Some files missing from export, uniq")
    self.assertTrue(correct_num == len(expected_files), msg="Incorrect filter")

    print("STATS")
    try:
        stats()
    except TypeError:
        print("Unable to test stats module due to mongomock limitations.")

    print("ID QUERY")
    query = id_query()
    self.assertEqual(len(query.cursor), 0)

    print("REFINE")
    cursor = refine().cursor
    self.assertTrue(all([doc["doi"] == ["10/12345"] for doc in cursor]))
    self.assertTrue(all([doc["tags"] == ["integration_test"] for doc in cursor]))
    self.assertTrue(all([doc["root_source"] == get_root_source(doc) for doc in cursor]))
    self.assertTrue(all([isinstance(doc["_raw"], list) for doc in cursor]))
def _struct2db(self, struct):
    """ Insert completed Python dictionary into chosen
    database, with generated text_id. Add quality factor
    for any missing data.

    Parameters:
        struct (dict): dictionary containing structure.

    Returns:
        int: 1 if successfully inputted into database, 0 otherwise.

    """
    try:
        plain_text_id = [
            WORDS[random.randint(0, self.num_words - 1)].strip(),
            NOUNS[random.randint(0, self.num_nouns - 1)].strip()
        ]
        struct['text_id'] = plain_text_id
        if 'tags' in self.tag_dict:
            struct['tags'] = self.tag_dict['tags']
        struct['quality'] = 5  # if any missing info at all, score = 0

        # include elem set for faster querying
        if 'elems' not in struct:
            struct['elems'] = sorted(list(set(struct['atom_types'])))

        # check basic DFT params if we're not in a prototype DB
        if not self.args.get('prototype'):
            failed_checks = []
            if 'species_pot' not in struct:
                struct['quality'] = 0
                failed_checks.append('missing all pspots')
            else:
                specified = []
                for elem in struct['stoichiometry']:
                    # remove all points for a missing pseudo
                    if elem[0] not in struct['species_pot']:
                        struct['quality'] = 0
                        failed_checks.append('missing pspot for {}'.format(elem[0]))
                    else:
                        specified.append(elem[0])
                        # remove a point for a generic OTF pspot
                        if 'OTF' in struct['species_pot'][elem[0]].upper():
                            struct['quality'] -= 1
                            failed_checks.append('pspot not fully specified for {}'.format(elem[0]))
                struct['species_pot'] = {
                    species: struct['species_pot'][species] for species in specified
                }

            if 'xc_functional' not in struct:
                struct['quality'] = 0
                failed_checks.append('missing xc functional')

            if 'cut_off_energy' not in struct:
                struct['quality'] -= 1
            if 'kpoints_mp_spacing' not in struct:
                struct['quality'] -= 1
        else:
            struct['prototype'] = True
            struct['xc_functional'] = 'xxx'
            struct['enthalpy_per_atom'] = 0
            struct['enthalpy'] = 0
            struct['pressure'] = 0
            struct['species_pot'] = {}

        root_src = get_root_source(struct)
        struct['root_source'] = root_src
        exts = ['.castep', '.res', '.history', '.history.gz']
        for ext in exts:
            for src in struct['source']:
                if src.endswith(ext):
                    expanded_root_src = src

        if struct['quality'] == 5:
            struct_id = self.repo.insert_one(struct).inserted_id
            self.struct_list.append((struct_id, root_src))
            self.manifest.write('+ {}\n'.format(expanded_root_src))
            if self.debug:
                print('Inserted', struct_id)
        else:
            self.logfile.write('? {} failed quality checks: {}\n'.format(
                expanded_root_src, failed_checks))
            if self.debug:
                print('Error with', root_src)
            return 0

    except Exception as exc:
        # this shouldn't fail, but if it does, fail loudly but cleanly
        self.logfile.write(
            '! Importer produced an unexpected error {}. Final state of struct: {}\n'
            .format(exc, struct))
        tb.print_exc()
        return 0

    return 1