def __call__(self, struct1, struct2):
    """
    Decide whether two structures match under a species-aware comparison.

    Args:
        struct1: First structure; must provide ``get_pymatgen_structure()``.
        struct2: Second structure; must provide ``get_pymatgen_structure()``.

    Returns:
        Whatever ``StructureMatcher.fit`` returns for the two converted
        structures.
    """
    # self.kw supplies the matcher options; the comparator is always a
    # fresh SpeciesComparator.
    matcher = StructureMatcher(**self.kw, comparator=SpeciesComparator())
    return matcher.fit(
        struct1.get_pymatgen_structure(),
        struct2.get_pymatgen_structure(),
    )
def add_if_belongs(self, cand_snl):
    """
    Add ``cand_snl`` to this group if it structurally matches the
    canonical structure.

    A series of cheap rejections runs first (group key, ordered/disordered
    mismatch, oversized C-Ce structures, already-registered SNL id); only
    then is a StructureMatcher fit against the canonical structure tried.
    On success the SNL id is recorded in ``self.all_snl_ids`` and, when the
    candidate carries species properties, it is also assigned to a species
    group (an existing one if a SpeciesComparator fit succeeds, otherwise a
    new one keyed by the candidate's own id).

    Args:
        cand_snl: Candidate SNL; must expose ``snlgroup_key``, ``snl_id``
            and ``structure``.

    Returns:
        tuple: ``(False, None)`` if the candidate was not added, otherwise
        ``(True, spec_group)`` where ``spec_group`` is the snl_id naming
        the species group the candidate joined, or ``None`` when the
        candidate has no species properties.
    """
    # no need to compare if different formulas or spacegroups
    if cand_snl.snlgroup_key != self.canonical_snl.snlgroup_key:
        return False, None

    cand_structure = cand_snl.structure

    # no need to compare if one is ordered, the other disordered
    if cand_structure.is_ordered != self.canonical_structure.is_ordered:
        return False, None

    # filter out large C-Ce structures; the chemical system string is only
    # built once the size threshold has actually been exceeded
    if (cand_structure.num_sites > 1500
            or self.canonical_structure.num_sites > 1500):
        symbols = set(e.symbol for e in cand_structure.composition.elements)
        if '-'.join(sorted(symbols)) == 'C-Ce':
            print('SKIPPING LARGE C-Ce')
            return False, None

    # make sure the structure is not already in all_structures
    if cand_snl.snl_id in self.all_snl_ids:
        print('WARNING: add_if_belongs() has detected that you are trying to add the same SNL id twice!')
        return False, None

    # use default Structure Matcher params from April 24, 2013, as
    # suggested by Shyue; shared by both matchers below
    matcher_kwargs = dict(ltol=0.2, stol=0.3, angle_tol=5,
                          primitive_cell=True, scale=True,
                          attempt_supercell=False)

    # try a structure fit to the canonical structure
    # we are using the ElementComparator() because this is how we want to
    # group results
    sm = StructureMatcher(comparator=ElementComparator(), **matcher_kwargs)
    if not sm.fit(cand_structure, self.canonical_structure):
        return False, None

    # everything checks out, add to the group
    self.all_snl_ids.append(cand_snl.snl_id)

    # now that we are in the group, if there are site properties we need
    # to check species_groups, e.g., if there is another SNL in the group
    # with the same site properties, e.g. MAGMOM
    spec_group = None
    if has_species_properties(cand_structure):
        # the matcher does not depend on the loop variable: build it once
        sms = StructureMatcher(comparator=SpeciesComparator(),
                               **matcher_kwargs)
        for snl in self.species_snl:
            if sms.fit(cand_structure, snl.structure):
                spec_group = snl.snl_id
                self.species_groups[snl.snl_id].append(cand_snl.snl_id)
                break

        # add a new species group; compare against None so that a matched
        # group whose snl_id is falsy (e.g. 0) is not registered twice
        if spec_group is None:
            self.species_groups[cand_snl.snl_id] = [cand_snl.snl_id]
            self.species_snl.append(cand_snl)
            spec_group = cand_snl.snl_id

    self.updated_at = datetime.datetime.utcnow()

    return True, spec_group
def compute_pymatgen_fit(self, s1, s2):
    """
    Check two structures for similarity with pymatgen's StructureMatcher.

    The matcher is configured from ``self.L_tol``, ``self.S_tol`` and
    ``self.Angle_tol``, reduces to primitive cells, does not rescale
    volumes, does not attempt supercells, and compares by species.

    Args:
        s1: First structure; must provide ``get_pymatgen_structure()``.
        s2: Second structure; must provide ``get_pymatgen_structure()``
            and ``struct_id``.

    Returns:
        ``s2.struct_id`` when the structures match (i.e. ``s2`` is a
        duplicate); otherwise the falsy result of the fit itself.
    """
    matcher = StructureMatcher(
        ltol=self.L_tol,
        stol=self.S_tol,
        angle_tol=self.Angle_tol,
        primitive_cell=True,
        scale=False,
        attempt_supercell=False,
        comparator=SpeciesComparator(),
    )
    is_duplicate = matcher.fit(
        s1.get_pymatgen_structure(),
        s2.get_pymatgen_structure(),
    )
    # Hand back the duplicate's ID on a match, else the raw fit result.
    return s2.struct_id if is_duplicate else is_duplicate
def group_entries_by_structure(entries, species_to_remove=None, ltol=0.2,
                               stol=.4, angle_tol=5, primitive_cell=True,
                               scale=True, comparator=None, ncpus=None):
    """
    Given a sequence of ComputedStructureEntries, use structure fitter to
    group them by structural similarity.

    Args:
        entries: Sequence of ComputedStructureEntries.
        species_to_remove: Sometimes you want to compare a host framework
            (e.g., in Li-ion battery analysis). This allows you to specify
            species to remove before structural comparison.
        ltol (float): Fractional length tolerance. Default is 0.2.
        stol (float): Site tolerance in Angstrom. Default is 0.4 Angstrom.
        angle_tol (float): Angle tolerance in degrees. Default is 5 degrees.
        primitive_cell (bool): If true: input structures will be reduced to
            primitive cells prior to matching. Defaults to True.
        scale: Input structures are scaled to equivalent volume if true;
            For exact matching, set to False.
        comparator: A comparator object implementing an equals method that
            declares equivalency of sites. Default of None means a new
            SpeciesComparator is created for the call, which implies rigid
            species mapping.
        ncpus: Number of cpus to use. Use of multiple cpus can greatly
            improve fitting speed. Default of None means serial processing.

    Returns:
        Sequence of sequence of entries by structural similarity. e.g,
        [[ entry1, entry2], [entry3, entry4, entry5]]
    """
    # Resolve the default per call rather than sharing one instance created
    # when the function was defined.
    if comparator is None:
        comparator = SpeciesComparator()

    start = datetime.datetime.now()
    logger.info("Started at {}".format(start))
    entries_host = [(entry, _get_host(entry.structure, species_to_remove))
                    for entry in entries]
    if ncpus:
        # Pre-bucket by structure hash so each worker only has to compare
        # entries that could possibly match.
        symm_entries = collections.defaultdict(list)
        for entry, host in entries_host:
            symm_entries[comparator.get_structure_hash(host)].append(
                (entry, host))
        import multiprocessing as mp
        logger.info("Using {} cpus".format(ncpus))
        manager = mp.Manager()
        groups = manager.list()
        pool = mp.Pool(ncpus)
        try:
            # Parallel processing only supports Python primitives and not
            # objects, hence the JSON round trip.
            pool.map(_perform_grouping,
                     [(json.dumps([e[0] for e in eh], cls=MontyEncoder),
                       json.dumps([e[1] for e in eh], cls=MontyEncoder),
                       ltol, stol, angle_tol, primitive_cell, scale,
                       comparator, groups)
                      for eh in symm_entries.values()])
        finally:
            # Always release the worker processes, even if a task failed.
            pool.close()
            pool.join()
    else:
        groups = []
        # Use the already-materialised pairs so that one-shot iterables
        # passed as ``entries`` are not read a second time.
        all_entries = [entry for entry, _ in entries_host]
        hosts = [host for _, host in entries_host]
        _perform_grouping((json.dumps(all_entries, cls=MontyEncoder),
                           json.dumps(hosts, cls=MontyEncoder),
                           ltol, stol, angle_tol, primitive_cell, scale,
                           comparator, groups))
    entry_groups = [json.loads(g, cls=MontyDecoder) for g in groups]
    finish = datetime.datetime.now()
    logger.info("Finished at {}".format(finish))
    logger.info("Took {}".format(finish - start))
    return entry_groups