def test_fully_saturated(self):
  """Checks the stoichiometry string for four single-atom topologies.

  Each case builds a bond topology from one atom symbol, an empty bond
  string and a one-digit hydrogen string, then compares the canonical
  stoichiometry (with hydrogens) against the expected text.
  """
  cases = (
      ('C', '4', '(ch4)'),
      ('N', '3', '(nh3)'),
      ('O', '2', '(oh2)'),
      ('F', '1', '(fh)'),
  )
  for atom_str, hydrogens_str, expected in cases:
    topology = smu_utils_lib.create_bond_topology(atom_str, '', hydrogens_str)
    self.assertEqual(
        smu_utils_lib.get_canonical_stoichiometry_with_hydrogens(topology),
        expected)
def topology_query(db, smiles):
  """Yields conformers whose re-detected bond topologies include `smiles`.

  Topology detection is *rerun* here for every candidate conformer. If the
  default detected topologies are good enough, a plain query by SMILES string
  is sufficient; this function is only worthwhile when the distance
  thresholds that define a matching bond have been adjusted.

  Every yielded conformer has had its bond_topologies replaced by the freshly
  detected ones, each with bond_topology_id filled in from the geometry
  data's SMILES-to-id mapping when an entry exists (a missing entry is logged
  as an error and the id is left as detection produced it).

  Args:
    db: smu_sqlite.SMUSQLite
    smiles: smiles string for the target bond topology

  Yields:
    dataset_pb2.Conformer
  """
  query_mol = Chem.MolFromSmiles(smiles, sanitize=False)
  Chem.SanitizeMol(query_mol, Chem.rdmolops.SanitizeFlags.SANITIZE_ADJUSTHS)
  query_mol = Chem.AddHs(query_mol)
  target_stoich = smu_utils_lib.get_canonical_stoichiometry_with_hydrogens(
      utilities.molecule_to_bond_topology(query_mol))

  matching_parameters = _get_geometry_matching_parameters()
  geometry_data = GeometryData.get_singleton()

  num_eligible = 0
  num_matched = 0
  logging.info('Starting query for %s with stoich %s', smiles, target_stoich)

  for conformer in db.find_by_expanded_stoichiometry(target_stoich):
    if not smu_utils_lib.conformer_eligible_for_topology_detection(conformer):
      continue
    num_eligible += 1

    # Detection starts from the conformer's first stored topology, so this
    # call has to happen before the stored topologies are replaced below.
    detected = topology_from_geom.bond_topologies_from_geom(
        bond_lengths=geometry_data.bond_lengths,
        conformer_id=conformer.conformer_id,
        fate=conformer.fate,
        bond_topology=conformer.bond_topologies[0],
        geometry=conformer.optimized_geometry,
        matching_parameters=matching_parameters)

    if not any(found.smiles == smiles for found in detected.bond_topology):
      continue
    num_matched += 1

    # Swap the stored topologies for the newly detected set.
    del conformer.bond_topologies[:]
    conformer.bond_topologies.extend(detected.bond_topology)
    for topology in conformer.bond_topologies:
      try:
        topology.bond_topology_id = geometry_data.smiles_id_dict[
            topology.smiles]
      except KeyError:
        logging.error('Did not find bond topology id for smiles %s',
                      topology.smiles)
    yield conformer

  logging.info('Topology query for %s matched %d / %d', smiles,
               num_matched, num_eligible)
def test_ethylene(self):
  """Checks that the 'CC' / '2' / '22' topology gives '(ch2)2'."""
  stoich = smu_utils_lib.get_canonical_stoichiometry_with_hydrogens(
      smu_utils_lib.create_bond_topology('CC', '2', '22'))
  self.assertEqual(stoich, '(ch2)2')
def test_cyclobutane(self):
  """Checks that the 'CCCC' / '110011' / '2222' topology gives '(ch2)4'."""
  stoich = smu_utils_lib.get_canonical_stoichiometry_with_hydrogens(
      smu_utils_lib.create_bond_topology('CCCC', '110011', '2222'))
  self.assertEqual(stoich, '(ch2)4')
def test_nplus_oneg(self):
  """Checks that the 'NO' / '1' / '30' topology gives '(nh3)(o)'."""
  stoich = smu_utils_lib.get_canonical_stoichiometry_with_hydrogens(
      smu_utils_lib.create_bond_topology('NO', '1', '30'))
  self.assertEqual(stoich, '(nh3)(o)')
def test_fluorine(self):
  """Checks that the 'OFF' / '110' / '000' topology gives '(o)(f)2'."""
  stoich = smu_utils_lib.get_canonical_stoichiometry_with_hydrogens(
      smu_utils_lib.create_bond_topology('OFF', '110', '000'))
  self.assertEqual(stoich, '(o)(f)2')
def test_acrylic_acid(self):
  """Checks a five-atom topology with a mix of hydrogen counts.

  The 'CCCOO' / '2000100210' / '21001' topology is expected to produce
  '(c)(ch)(ch2)(o)(oh)'.
  """
  stoich = smu_utils_lib.get_canonical_stoichiometry_with_hydrogens(
      smu_utils_lib.create_bond_topology('CCCOO', '2000100210', '21001'))
  self.assertEqual(stoich, '(c)(ch)(ch2)(o)(oh)')
def bulk_insert(self, encoded_conformers, batch_size=10000, limit=None):
  """Inserts conformers into the database.

  Each record is decoded, then queued as one row for the conformer table
  (conformer id, expanded stoichiometry, snappy-compressed bytes) plus one
  row per bond topology for both the bond-topology-id table and the SMILES
  table. Queued rows are flushed and committed every `batch_size` records
  and once more at the end.

  Args:
    encoded_conformers: iterable for encoded dataset_pb2.Conformer
    batch_size: insert performance is greatly improved by putting multiple
      insert into one transaction. 10k was a reasonable default from some
      early exploration. A falsy value (0 or None) disables the intermediate
      commits so that everything is committed once at the end.
    limit: maximum number of records to insert. A falsy value (None or 0)
      means no limit.

  Raises:
    ReadOnlyError: if the database was opened read only.
    ValueError: If encoded_conformers is empty, including an iterator or
      generator that yields no records.
  """
  if self._read_only:
    raise ReadOnlyError()
  if not encoded_conformers:
    raise ValueError('encoded_conformers is empty')

  insert_conformer = (f'INSERT INTO {_CONFORMER_TABLE_NAME} '
                      'VALUES (?, ?, ?)')
  insert_btid = f'INSERT INTO {_BTID_TABLE_NAME} VALUES (?, ?)'
  insert_smiles = (f'INSERT INTO {_SMILES_TABLE_NAME} VALUES (?, ?) '
                   f'ON CONFLICT(smiles) DO NOTHING')

  cur = self._conn.cursor()

  start_time = datetime.datetime.now()

  pending_conformer_args = []
  pending_btid_args = []
  pending_smiles_args = []

  def commit_pending():
    """Writes all queued rows, empties the queues and commits."""
    cur.executemany(insert_conformer, pending_conformer_args)
    cur.executemany(insert_btid, pending_btid_args)
    cur.executemany(insert_smiles, pending_smiles_args)
    pending_conformer_args.clear()
    pending_btid_args.clear()
    pending_smiles_args.clear()
    self._conn.commit()

  # Stays 0 only if the iterable yields nothing at all.
  idx = 0
  for idx, encoded_conformer in enumerate(encoded_conformers, 1):
    conformer = dataset_pb2.Conformer.FromString(encoded_conformer)
    # A small efficiency hack: the expanded stoich is only intended for use
    # with topology_detection, so we only put a real value for those so that
    # we don't even have to return the entries we don't want.
    if smu_utils_lib.conformer_eligible_for_topology_detection(conformer):
      expanded_stoich = (
          smu_utils_lib.get_canonical_stoichiometry_with_hydrogens(
              conformer.bond_topologies[0]))
    else:
      expanded_stoich = ''
    pending_conformer_args.append((conformer.conformer_id, expanded_stoich,
                                   snappy.compress(encoded_conformer)))
    for bond_topology in conformer.bond_topologies:
      pending_btid_args.append(
          (bond_topology.bond_topology_id, conformer.conformer_id))
      pending_smiles_args.append(
          (bond_topology.smiles, bond_topology.bond_topology_id))
    if batch_size and idx % batch_size == 0:
      commit_pending()
      elapsed = datetime.datetime.now() - start_time
      logging.info(
          'bulk_insert: committed at index %d, %f s total, %.6f s/record',
          idx, elapsed.total_seconds(),
          elapsed.total_seconds() / idx)

    if limit and idx >= limit:
      break

  if idx == 0:
    # Iterators and generators are always truthy, so the emptiness check at
    # the top cannot catch one that yields nothing. Nothing has been queued
    # or written at this point; report it as documented instead of failing
    # on the per-record division in the summary log below.
    raise ValueError('encoded_conformers yielded no records')

  # Commit a final time
  commit_pending()
  elapsed = datetime.datetime.now() - start_time
  logging.info('bulk_insert: Total records %d, %f s, %.6f s/record', idx,
               elapsed.total_seconds(),
               elapsed.total_seconds() / idx)