def get_affected_chemsys(self, chemical_systems: Set) -> Set:
    """
    Finds the chemical systems touched by changes to the given systems

    Args:
        chemical_systems: dash-separated chemical system strings
            (e.g. "Sr-Hf-O") that have changed

    Returns:
        set: every chemsys known to the materials store that has one of
            the supplied systems among its chemsys_permutations
    """
    # Collect each element symbol appearing in any changed system
    elements = set()
    for system in chemical_systems:
        elements.update(system.split("-"))

    # Candidate systems: distinct chemsys values of materials sharing at
    # least one of those elements (MongoDB-style "$in" criteria)
    candidates = self.materials.distinct(
        "chemsys", {"elements": {"$in": list(elements)}}
    )

    # Index each candidate under every value chemsys_permutations gives
    # for it (presumably its sub-systems, itself included), so a changed
    # system can be mapped back to the systems that contain it
    supers_by_sub = defaultdict(list)
    for candidate in candidates:
        for sub in chemsys_permutations(candidate):
            supers_by_sub[sub].append(candidate)

    # Merge the containing systems of every changed system
    affected = set()
    for system in chemical_systems:
        affected.update(supers_by_sub[system])

    self.logger.debug(
        f"Found {len(affected)} chemical systems affected by this build"
    )
    return affected
def get_items(self) -> Iterator[List[Dict]]:
    """
    Yields the entries of each chemical system that needs processing

    The systems to process are the updated, new and affected chemical
    systems, each expanded with chemsys_permutations. One list of
    entries is yielded per system, smallest systems first.
    """
    self.logger.info("Thermo Builder Started")

    self.logger.info("Setting indexes")
    self.ensure_indexes()

    changed = self.get_updated_chemsys() | self.get_new_chemsys()
    candidates = changed | self.get_affected_chemsys(changed)

    # Expand each candidate with chemsys_permutations; a candidate that
    # an earlier expansion already added is skipped
    pending = set()
    for candidate in candidates:
        if candidate in pending:
            continue
        pending |= chemsys_permutations(candidate)

    self.logger.info(
        f"Found {len(pending)} chemical systems with new/updated materials to process"
    )
    self.total = len(pending)

    # Order by element count (dashes + 1) so that small systems are
    # built first, in a similar manner to fast Pourbaix
    for system in sorted(pending, key=lambda name: name.count("-") + 1):
        yield self.get_entries(system)
def get_items(self) -> Iterator[List[Dict]]:
    """
    Gets whole chemical systems of entries to process, grouped by sandbox

    The systems to process are the updated, new and affected chemical
    systems, each expanded with chemsys_permutations, and handled in
    order of increasing number of elements.

    Yields:
        (sandboxes, sandbox_entries) pairs: one sandbox group returned by
        maximal_spanning_non_intersecting_subsets and the entries of the
        chemical system whose "sandboxes" data contains every sandbox in
        that group
    """
    self.logger.info("Thermo Builder Started")

    self.logger.info("Setting indexes")
    self.ensure_indexes()

    updated_chemsys = self.get_updated_chemsys()
    new_chemsys = self.get_new_chemsys()

    affected_chemsys = self.get_affected_chemsys(updated_chemsys | new_chemsys)

    # Remove overlapping chemical systems.
    # This has to be set(): {} is an empty dict, and `|=` between a dict
    # and a set of chemsys strings is a dict update, not a set union.
    to_process_chemsys = set()
    for chemsys in updated_chemsys | new_chemsys | affected_chemsys:
        if chemsys not in to_process_chemsys:
            to_process_chemsys |= chemsys_permutations(chemsys)

    self.logger.info(
        f"Found {len(to_process_chemsys)} chemical systems with new/updated materials to process"
    )
    # NOTE: this counts chemical systems; one item is yielded per sandbox
    # group, so the number of yielded items can differ from the total
    self.total = len(to_process_chemsys)

    # Yield the chemical systems in order of increasing size
    # Will build them in a similar manner to fast Pourbaix
    for chemsys in sorted(to_process_chemsys, key=lambda x: len(x.split("-"))):
        entries = self.get_entries(chemsys)

        # build sandbox sets: ["a"] , ["a","b"], ["core","a","b"]
        sandbox_sets = {
            frozenset(entry.data.get("sandboxes", {})) for entry in entries
        }
        sandbox_sets = maximal_spanning_non_intersecting_subsets(sandbox_sets)
        self.logger.debug(f"Found {len(sandbox_sets)}: {sandbox_sets}")

        for sandboxes in sandbox_sets:
            # only yield maximal subsets so that we can process equivalent
            # sandbox combinations at a time; membership is checked against
            # the same "sandboxes" key the groups were built from
            sandbox_entries = [
                entry
                for entry in entries
                if all(
                    sandbox in entry.data.get("sandboxes", [])
                    for sandbox in sandboxes
                )
            ]

            yield sandboxes, sandbox_entries
def get_entries(self, chemsys: str) -> List[ComputedEntry]:
    """
    Collects the entries for a chemical system and all of its
    chemsys_permutations, using the entries cache where possible

    Args:
        chemsys(str): a chemical system represented by string elements
            separated by a dash (-)

    Returns:
        list: the cached entries plus the entries read from the materials
            collection for this system
    """
    self.logger.info(f"Getting entries for: {chemsys}")

    # Split the requested systems into those already cached and those
    # that still have to be fetched
    wanted = chemsys_permutations(chemsys)
    in_cache = wanted & set(self._entries_cache)
    to_fetch = wanted - in_cache

    collected = []
    for cached_key in in_cache:
        collected.extend(self._entries_cache[cached_key])

    self.logger.debug(
        f"Getting {len(in_cache)} sub-chemsys from cache for {chemsys}"
    )
    self.logger.debug(
        f"Getting {len(to_fetch)} sub-chemsys from DB for {chemsys}"
    )

    # Fetch the non-deprecated materials docs of the uncached systems
    criteria = dict(self.query)
    criteria.update({"chemsys": {"$in": list(to_fetch)}, "deprecated": False})
    fetched_docs = list(
        self.materials.query(
            criteria=criteria,
            properties=[self.materials.key, "entries", "sandboxes"],
        )
    )

    self.logger.debug(
        f"Got {len(fetched_docs)} entries from DB for {len(to_fetch)} sub-chemsys for {chemsys}"
    )

    # Tag every entry with its document's sandboxes and cache it under the
    # chemsys spelled from its own composition (sorted, dash-joined).
    # NOTE(review): assumes self._entries_cache creates missing lists on
    # access (e.g. defaultdict(list)) - confirm where it is initialised
    for doc in fetched_docs:
        for entry in doc.get("entries", {}):
            entry["data"]["sandboxes"] = doc["sandboxes"]
            cache_key = "-".join(sorted(set(entry["composition"])))
            self._entries_cache[cache_key].append(entry)
            collected.append(entry)

    self.logger.info(f"Total entries in {chemsys} : {len(collected)}")

    return collected
def test_chemsys_permutations(test_dir):
    """Checks how many items chemsys_permutations returns for 1-, 2- and 3-element systems"""
    # (chemical system, expected number of returned items);
    # the test_dir argument is requested but not used here
    expected_counts = (
        ("Sr", 1),
        ("Sr-Hf", 3),
        ("Sr-Hf-O", 7),
    )
    for chemsys, count in expected_counts:
        assert len(chemsys_permutations(chemsys)) == count