def main(unused_argv):
  """Prints bond topology counts for a hash-selected sample of molecules.

  Opens the SMU SQLite database, loads the standard bond length
  distributions, runs standard topology sensing on every selected molecule
  and prints one comma separated line per molecule: the total number of bond
  topologies, followed by how many of them carry the SOURCE_ITC, SOURCE_MLCR
  and SOURCE_CSD bits in their `source` field.

  Args:
    unused_argv: ignored.
  """
  db = smu_sqlite.SMUSQLite('20220128_standard_v2.sqlite')

  bond_lengths = bond_length_distribution.AllAtomPairLengthDistributions()
  bond_lengths.add_from_sparse_dataframe_file(
      '20220128_bond_lengths.csv',
      bond_length_distribution.STANDARD_UNBONDED_RIGHT_TAIL_MASS,
      bond_length_distribution.STANDARD_SIG_DIGITS)

  # Any SMILES looked up in this dict resolves to -1.
  fake_smiles_id_dict = collections.defaultdict(lambda: -1)

  print('molecule_id, count_all, count_smu, count_covalent, count_allen')
  for molecule in db:
    # Keep only molecules whose id hashes into bucket 1 of 1000.
    # NOTE: str hashes are salted per interpreter process unless
    # PYTHONHASHSEED is set, so the selected sample can differ between runs.
    bucket = abs(hash(str(molecule.molecule_id))) % 1000
    if bucket != 1:
      continue

    topology_from_geom.standard_topology_sensing(molecule, bond_lengths,
                                                 fake_smiles_id_dict)

    topologies = molecule.bond_topologies
    count_all = len(topologies)
    # One count per source bit, in the column order of the header above.
    count_smu, count_covalent, count_allen = [
        sum(bt.source & source_bit != 0 for bt in topologies)
        for source_bit in (dataset_pb2.BondTopology.SOURCE_ITC,
                           dataset_pb2.BondTopology.SOURCE_MLCR,
                           dataset_pb2.BondTopology.SOURCE_CSD)
    ]
    print(
        f'{molecule.molecule_id}, {count_all}, {count_smu}, {count_covalent}, {count_allen}'
    )
def test_atom_ordering(self):
  """Lookups give the same answer regardless of the order of the two atoms."""
  both_orders = ((ATOM_N, ATOM_O), (ATOM_O, ATOM_N))

  all_dists = bond_length_distribution.AllAtomPairLengthDistributions()
  all_dists.add(ATOM_N, ATOM_O, BOND_SINGLE,
                bond_length_distribution.FixedWindow(1, 2, None))

  for length, expected in ((1.5, 1), (999, 0)):
    for first, second in both_orders:
      self.assertEqual(
          all_dists.pdf_length_given_type(first, second, BOND_SINGLE, length),
          expected)

  # A second bond type added for the same pair must behave the same way.
  all_dists.add(ATOM_N, ATOM_O, BOND_DOUBLE,
                bond_length_distribution.FixedWindow(2, 3, None))
  for first, second in both_orders:
    self.assertEqual(
        all_dists.pdf_length_given_type(first, second, BOND_DOUBLE, 2.5), 1)
def __init__(self, bond_lengths_csv, bond_lengths_arg, bond_topology_csv):
  """Loads the bond length distributions and the SMILES to id table.

  Args:
    bond_lengths_csv: path of the CSV read into a dataframe (with the
      'length_str' column kept as str) and added to self.bond_lengths.
    bond_lengths_arg: handed to self._parse_bond_lengths_arg once the CSV has
      been loaded.
    bond_topology_csv: path of a CSV with one header line followed by rows of
      exactly six fields; the first field is the integer bond topology id and
      the last one is the SMILES string.

  Raises:
    ValueError: if bond_lengths_csv or bond_topology_csv is None.
  """
  # Check both required inputs before touching any file so that a missing
  # flag is reported immediately instead of after the bond lengths are loaded.
  if bond_lengths_csv is None:
    raise ValueError('--bond_lengths_csv required')
  if bond_topology_csv is None:
    raise ValueError('--bond_topology_csv required')

  logging.info('Loading bond_lengths')
  with open(bond_lengths_csv, 'r') as infile:
    df = pd.read_csv(infile, dtype={'length_str': str})
  self.bond_lengths = bond_length_distribution.AllAtomPairLengthDistributions(
  )
  self.bond_lengths.add_from_sparse_dataframe(
      df, self._BOND_LENGTHS_UNBONDED_RIGHT_TAIL_MASS,
      self._BOND_LENGTHS_SIG_DIGITS)
  logging.info('Done loading bond_lengths_csv')

  self._parse_bond_lengths_arg(bond_lengths_arg)

  logging.info('Loading bond topologies')
  self.smiles_id_dict = {}
  # The csv module expects files opened with newline='' so that it can do its
  # own newline handling (matters for quoted fields with embedded newlines).
  with open(bond_topology_csv, 'r', newline='') as infile:
    reader = csv.reader(infile)
    next(reader)  # skip the header line
    for bt_id, _, _, _, _, smiles in reader:
      self.smiles_id_dict[smiles] = int(bt_id)
  logging.info('Done loading bond topologies')
def TopologyFromGeometryMain(unused_argv):
  """Loads bond length distributions from flags and prints ReadMolecule's result.

  Args:
    unused_argv: ignored; every input comes from FLAGS.
  """
  del unused_argv

  distributions = bond_length_distribution.AllAtomPairLengthDistributions()
  distributions.add_from_files(FLAGS.bonds, 0.0, FLAGS.xnonbond)

  print(ReadMolecule(distributions, FLAGS.input, FLAGS.output))
def test_scores(self):
  """A C-C pair 1.4 A apart scores higher as a single than as a double bond."""
  carbon = dataset_pb2.BondTopology.AtomType.ATOM_C
  single_bond = dataset_pb2.BondTopology.BondType.BOND_SINGLE
  double_bond = dataset_pb2.BondTopology.BondType.BOND_DOUBLE

  # For testing, turn off the need for complete matching.
  smu_molecule.default_must_match_all_bonds = False

  all_distributions = bond_length_distribution.AllAtomPairLengthDistributions(
  )
  # The single bond distribution uses 1.4 as its middle argument and the
  # double bond one uses 1.5.
  for bond_type, middle in ((single_bond, 1.4), (double_bond, 1.5)):
    lengths, counts = triangular_distribution(1.0, middle, 2.0)
    frame = pd.DataFrame({"length": lengths, "count": counts})
    all_distributions.add(
        carbon, carbon, bond_type,
        bond_length_distribution.EmpiricalLengthDistribution(frame, 0.0))

  bond_topology = text_format.Parse(
      """ atoms: ATOM_C atoms: ATOM_C bonds: { atom_a: 0 atom_b: 1 bond_type: BOND_SINGLE } """,
      dataset_pb2.BondTopology())

  geometry = text_format.Parse(
      """ atom_positions { x: 0.0 y: 0.0 z: 0.0 }, atom_positions { x: 0.0 y: 0.0 z: 0.0 } """,
      dataset_pb2.Geometry())
  # Move the second atom 1.4 Angstroms away along x.
  geometry.atom_positions[1].x = 1.4 / smu_utils_lib.BOHR_TO_ANGSTROMS

  matching_parameters = smu_molecule.MatchingParameters()
  matching_parameters.must_match_all_bonds = False

  result = topology_from_geom.bond_topologies_from_geom(
      all_distributions, bond_topology, geometry, matching_parameters)

  self.assertIsNotNone(result)
  self.assertEqual(len(result.bond_topology), 2)
  for index in (0, 1):
    self.assertEqual(len(result.bond_topology[index].bonds), 1)

  best = result.bond_topology[0]
  runner_up = result.bond_topology[1]
  self.assertGreater(best.score, runner_up.score)
  self.assertEqual(best.bonds[0].bond_type, single_bond)
  self.assertEqual(runner_up.bonds[0].bond_type, double_bond)
def test_multi_topology_detection(self):
  """Tests that we can find multiple versions of the same topology."""
  nitrogen = dataset_pb2.BondTopology.ATOM_N
  single = dataset_pb2.BondTopology.BondType.BOND_SINGLE
  double = dataset_pb2.BondTopology.BondType.BOND_DOUBLE

  all_dist = bond_length_distribution.AllAtomPairLengthDistributions()
  all_dist.add(nitrogen, nitrogen, single,
               triangular_distribution(1.0, 1.5, 2.0))
  all_dist.add(nitrogen, nitrogen, double,
               triangular_distribution(1.0, 1.4, 2.0))

  # The molecule is a flat aromatic square of nitrogens. Rotating the single
  # and double bonds around the ring gives the same topology with every
  # individual bond switched between single and double.
  # The bond lengths are chosen to favor one of the two arrangements.
  molecule = dataset_pb2.Molecule(molecule_id=123)
  molecule.properties.errors.fate = dataset_pb2.Properties.FATE_SUCCESS
  molecule.bond_topologies.add(bond_topology_id=123, smiles='N1=NN=N1')
  starting_topology = molecule.bond_topologies[0]
  starting_topology.atoms.extend([nitrogen] * 4)

  ring = ((0, 1), (1, 2), (2, 3), (3, 0))
  starting_types = (single, double, single, double)
  starting_topology.bonds.extend([
      dataset_pb2.BondTopology.Bond(
          atom_a=atom_a, atom_b=atom_b, bond_type=bond_type)
      for (atom_a, atom_b), bond_type in zip(ring, starting_types)
  ])

  dist15a = 1.5 / smu_utils_lib.BOHR_TO_ANGSTROMS
  dist14a = 1.4 / smu_utils_lib.BOHR_TO_ANGSTROMS
  corners = ((0, 0), (0, dist15a), (dist14a, dist15a), (dist14a, 0))
  molecule.optimized_geometry.atom_positions.extend(
      [dataset_pb2.Geometry.AtomPos(x=x, y=y, z=0) for x, y in corners])

  matching_parameters = topology_molecule.MatchingParameters()
  result = topology_from_geom.bond_topologies_from_geom(
      molecule, all_dist, matching_parameters)

  self.assertLen(result.bond_topology, 2)

  # First result: the starting arrangement. Second result: every bond flipped.
  flipped_types = (double, single, double, single)
  for position, expected_types in enumerate((starting_types, flipped_types)):
    found = result.bond_topology[position]
    for (atom_a, atom_b), expected in zip(ring, expected_types):
      self.assertEqual(
          smu_utils_lib.get_bond_type(found, atom_a, atom_b), expected)
def test_probability_bond_types(self):
  """Two overlapping fixed windows split the probability 0.25 / 0.75."""
  all_dists = bond_length_distribution.AllAtomPairLengthDistributions()
  for bond_type, window_end in ((BOND_SINGLE, 4), (BOND_DOUBLE, 2)):
    all_dists.add(ATOM_N, ATOM_O, bond_type,
                  bond_length_distribution.FixedWindow(1, window_end, None))

  got = all_dists.probability_of_bond_types(ATOM_N, ATOM_O, 1.5)

  self.assertLen(got, 2)
  for bond_type, expected in ((BOND_SINGLE, 0.25), (BOND_DOUBLE, 0.75)):
    self.assertAlmostEqual(got[bond_type], expected)
def test_missing_types(self):
  """Both query methods raise KeyError for an atom pair that was never added."""
  all_dists = bond_length_distribution.AllAtomPairLengthDistributions()
  window = bond_length_distribution.FixedWindow(1, 2, None)
  all_dists.add(ATOM_N, ATOM_O, BOND_SINGLE, window)

  # Only N-O was registered, so C-C is unknown.
  self.assertRaises(KeyError, all_dists.probability_of_bond_types, ATOM_C,
                    ATOM_C, 1.0)
  self.assertRaises(KeyError, all_dists.pdf_length_given_type, ATOM_C, ATOM_C,
                    BOND_SINGLE, 1.0)
def __init__(self, bond_lengths_csv, bond_lengths_arg):
  """Builds self.bond_lengths from a CSV file plus a string spec.

  Args:
    bond_lengths_csv: path handed to add_from_sparse_dataframe_file together
      with the standard right tail mass and significant digits.
    bond_lengths_arg: spec handed to add_from_string_spec after the CSV has
      been loaded.

  Raises:
    ValueError: if bond_lengths_csv is None.
  """
  if bond_lengths_csv is None:
    raise ValueError('--bond_lengths_csv required')

  logging.info('Loading bond_lengths')
  distributions = bond_length_distribution.AllAtomPairLengthDistributions()
  self.bond_lengths = distributions
  distributions.add_from_sparse_dataframe_file(
      bond_lengths_csv,
      bond_length_distribution.STANDARD_UNBONDED_RIGHT_TAIL_MASS,
      bond_length_distribution.STANDARD_SIG_DIGITS)
  logging.info('Done loading bond_lengths_csv')

  distributions.add_from_string_spec(bond_lengths_arg)
def test_add_from_gaussians_file(self):
  """Checks the distributions loaded from the example gaussians CSV."""
  all_dists = bond_length_distribution.AllAtomPairLengthDistributions()
  example_path = os.path.join(TESTDATA_PATH, 'example_gaussian_input.csv')
  all_dists.add_from_gaussians_file(example_path, 1)
  pdf = all_dists.pdf_length_given_type

  # The example file holds a few lines copied from the real exported files.
  #
  # Two things in that file are easy to miss:
  # * a line with "n/a" as Bond, which is ignored
  # * a line with "N:N" but no values, which is ignored
  #
  # The lengths probed below come from looking at a few cases in the file.

  # Two entries for C-C, making sure they mix.
  self.assertGreater(pdf(ATOM_C, ATOM_C, BOND_SINGLE, 1.513), 0)
  self.assertGreater(pdf(ATOM_C, ATOM_C, BOND_SINGLE, 1.588 + 0.001), 0)
  self.assertEqual(pdf(ATOM_C, ATOM_C, BOND_SINGLE, 1.8), 0)

  # Double bond.
  self.assertGreater(pdf(ATOM_C, ATOM_C, BOND_DOUBLE, 1.299 + 0.001), 0)
  self.assertEqual(pdf(ATOM_C, ATOM_C, BOND_DOUBLE, 1.8), 0)

  # Triple bond.
  self.assertGreater(pdf(ATOM_C, ATOM_C, BOND_TRIPLE, 1.183 + 0.001), 0)
  self.assertEqual(pdf(ATOM_C, ATOM_C, BOND_TRIPLE, 1.0), 0)

  # Aromatic entries are converted to both single and double.
  for bond_type in (BOND_SINGLE, BOND_DOUBLE):
    self.assertGreater(pdf(ATOM_N, ATOM_N, bond_type, 1.304), 0)
  with self.assertRaises(KeyError):
    pdf(ATOM_N, ATOM_N, BOND_TRIPLE, 1.304)
def test_missing_types(self):
  """Both query methods raise KeyError for an atom pair that was never added."""
  topo = dataset_pb2.BondTopology

  all_dists = bond_length_distribution.AllAtomPairLengthDistributions()
  window = bond_length_distribution.FixedWindowLengthDistribution(1, 2, None)
  all_dists.add(topo.ATOM_N, topo.ATOM_O, topo.BOND_SINGLE, window)

  # Only N-O was registered, so C-C is unknown.
  self.assertRaises(KeyError, all_dists.probability_of_bond_types,
                    topo.ATOM_C, topo.ATOM_C, 1.0)
  self.assertRaises(KeyError, all_dists.pdf_length_given_type, topo.ATOM_C,
                    topo.ATOM_C, topo.BOND_SINGLE, 1.0)
def process(self, conformer, bond_length_records, smiles_id_dict):
  """Per conformer updates.

  Args:
    conformer: dataset_pb2.Conformer
    bond_length_records: tuples to go to
      bond_length_distribution.AllAtomPairLengthDistributions
    smiles_id_dict: dict from SMILES to bond topology id

  Yields:
    Everything produced by self._compare_smiles, followed by the updated copy
    of the conformer.

  Raises:
    ValueError: if loading the bond length records raises ValueError; the
      original error is chained as the cause.
  """
  # There is probably a better way to do this.
  # We get the side input with each call to process. We'll assume that it's
  # always the same input, so we set our cache value and never update it.
  # We only do this with bond_length_records because there is a reasonable
  # amount of processing in creating AllAtomPairLengthDistributions.
  # The smiles_id_dict is used directly.
  if not self._cached_bond_lengths:
    # Build into a local and publish it only once loading has succeeded;
    # otherwise a failed load would leave a half-built object in the cache
    # that later calls could pick up instead of reloading.
    bond_lengths = bond_length_distribution.AllAtomPairLengthDistributions()
    try:
      bond_lengths.add_from_sparse_dataframe(
          bond_length_distribution.sparse_dataframe_from_records(
              bond_length_records), _BOND_LENGTHS_UNBONDED_RIGHT_TAIL_MASS,
          _BOND_LENGTHS_SIG_DIGITS)
    except ValueError as err:
      # Chain the cause so the traceback of the underlying failure is kept.
      raise ValueError(
          'Invalid sparse dataframe for conformer {0} org. ValueError: {1}'
          .format(str(conformer.conformer_id), err)) from err
    self._cached_bond_lengths = bond_lengths

  # Work on a copy so the input element is never mutated.
  conformer = copy.deepcopy(conformer)

  conformer.fate = smu_utils_lib.determine_fate(conformer)

  yield from self._compare_smiles(conformer)

  if (conformer.duplicated_by == 0 and
      conformer.properties.errors.status < 512):
    # The duplicate records do not need topology extraction, and anything
    # with an error status this high is pretty messed up, so we won't bother
    # trying to match the topology.
    self._add_alternative_bond_topologies(conformer, smiles_id_dict)
  else:
    beam.metrics.Metrics.counter(_METRICS_NAMESPACE,
                                 'skipped_topology_matches').inc()

  yield conformer
def process(self, molecule, bond_length_records, smiles_id_dict):
  """Per molecule updates.

  Args:
    molecule: dataset_pb2.Molecule
    bond_length_records: tuples to go to
      bond_length_distribution.AllAtomPairLengthDistributions
    smiles_id_dict: dict from SMILES to bond topology id

  Yields:
    Everything produced by self._compare_smiles, followed by the updated copy
    of the molecule.

  Raises:
    ValueError: if loading the bond length records raises ValueError; the
      original error is chained as the cause.
  """
  # There is probably a better way to do this.
  # We get the side input with each call to process. We'll assume that it's
  # always the same input, so we set our cache value and never update it.
  # We only do this with bond_length_records because there is a reasonable
  # amount of processing in creating AllAtomPairLengthDistributions.
  # The smiles_id_dict is used directly.
  if not self._cached_bond_lengths:
    # Build into a local and publish it only once loading has succeeded;
    # otherwise a failed load would leave a half-built object in the cache
    # that later calls could pick up instead of reloading.
    bond_lengths = bond_length_distribution.AllAtomPairLengthDistributions()
    try:
      bond_lengths.add_from_sparse_dataframe(
          bond_length_distribution.sparse_dataframe_from_records(
              bond_length_records),
          bond_length_distribution.STANDARD_UNBONDED_RIGHT_TAIL_MASS,
          bond_length_distribution.STANDARD_SIG_DIGITS)
    except ValueError as err:
      raise ValueError(
          'Invalid sparse dataframe for molecule {0} org. ValueError: {1}'
          .format(str(molecule.molecule_id), err)) from err
    self._cached_bond_lengths = bond_lengths

  # Work on a copy so the input element is never mutated.
  molecule = copy.deepcopy(molecule)

  molecule.properties.errors.fate = smu_utils_lib.determine_fate(molecule)

  yield from self._compare_smiles(molecule)

  if smu_utils_lib.molecule_eligible_for_topology_detection(molecule):
    self._add_alternative_bond_topologies(molecule, smiles_id_dict)
  else:
    # No topology detection is run; the first topology is only tagged as the
    # starting one and the skip is counted.
    molecule.bond_topologies[
        0].source = dataset_pb2.BondTopology.SOURCE_STARTING
    beam.metrics.Metrics.counter(_METRICS_NAMESPACE,
                                 'skipped_topology_matches').inc()

  yield molecule
def get_smu_dists(self):
  """Returns bond length distributions for O-C and C-N bonds used by tests."""
  topo = dataset_pb2.BondTopology
  bond = topo.BondType

  dists = bond_length_distribution.AllAtomPairLengthDistributions()

  # This is set up to make the O=C length of 1.25 a much better fit than
  # the [O-]-C bond.
  dists.add(topo.ATOM_O, topo.ATOM_C, bond.BOND_SINGLE,
            triangular_distribution(1.2, 1.6, 1.8))
  dists.add(topo.ATOM_O, topo.ATOM_C, bond.BOND_DOUBLE,
            triangular_distribution(1.2, 1.25, 1.3))

  # C-N gets fixed windows for the double and triple bond.
  dists.add(topo.ATOM_C, topo.ATOM_N, bond.BOND_DOUBLE,
            bond_length_distribution.FixedWindow(1.1, 1.3, None))
  dists.add(topo.ATOM_C, topo.ATOM_N, bond.BOND_TRIPLE,
            bond_length_distribution.FixedWindow(1.2, 1.4, None))
  return dists
def test_add_from_sparse_dataframe(self):
  """Loads a small sparse dataframe and spot checks the resulting pdfs."""
  column_names = [
      'atom_char_0', 'atom_char_1', 'bond_type', 'length_str', 'count'
  ]
  records = [
      ('c', 'c', 1, '1.0', 10),
      ('c', 'c', 1, '1.2', 30),
      ('n', 'o', 2, '1.0', 50),
      ('n', 'o', 2, '1.5', 50),
      ('n', 'n', 0, '1.5', 100),
      ('n', 'n', 0, '1.8', 100),
  ]
  df = pd.DataFrame.from_records(records, columns=column_names)

  all_dists = bond_length_distribution.AllAtomPairLengthDistributions()
  all_dists.add_from_sparse_dataframe(df,
                                      sig_digits=1,
                                      unbonded_right_tail_mass=0.8)
  pdf = all_dists.pdf_length_given_type

  atom = dataset_pb2.BondTopology.AtomType
  bond = dataset_pb2.BondTopology.BondType
  carbon, nitrogen, oxygen = atom.ATOM_C, atom.ATOM_N, atom.ATOM_O
  unbonded = bond.BOND_UNDEFINED
  single = bond.BOND_SINGLE
  double = bond.BOND_DOUBLE

  self.assertAlmostEqual(pdf(carbon, carbon, single, 1.05), 2.5)
  self.assertAlmostEqual(pdf(carbon, carbon, single, 999), 0.0)
  self.assertAlmostEqual(pdf(nitrogen, oxygen, double, 1.55), 5.0)
  self.assertAlmostEqual(pdf(nitrogen, nitrogen, unbonded, 1.85), 1.0)

  # This makes sure the right tail mass was included.
  for beyond_data in (2.0, 3.0):
    self.assertGreater(pdf(nitrogen, nitrogen, unbonded, beyond_data), 0.0)
def test_add_from_sparse_dataframe(self):
  """Loads a small sparse dataframe and spot checks the resulting pdfs."""
  rows = [
      ('c', 'c', 1, '1.0', 10),
      ('c', 'c', 1, '1.1', 30),
      ('n', 'o', 2, '1.4', 50),
      ('n', 'o', 2, '1.5', 50),
      ('n', 'n', 0, '1.7', 100),
      ('n', 'n', 0, '1.8', 100),
  ]
  df = pd.DataFrame.from_records(
      rows,
      columns=[
          'atom_char_0', 'atom_char_1', 'bond_type', 'length_str', 'count'
      ])

  all_dists = bond_length_distribution.AllAtomPairLengthDistributions()
  all_dists.add_from_sparse_dataframe(df,
                                      sig_digits=1,
                                      unbonded_right_tail_mass=0.8)

  # (atom, atom, bond type, length, expected pdf)
  expected_pdfs = (
      (ATOM_C, ATOM_C, BOND_SINGLE, 1.05, 2.5),
      (ATOM_C, ATOM_C, BOND_SINGLE, 999, 0.0),
      (ATOM_N, ATOM_O, BOND_DOUBLE, 1.55, 5.0),
      (ATOM_N, ATOM_N, BOND_UNDEFINED, 1.85, 1.0),
  )
  for atom_a, atom_b, bond_type, length, expected in expected_pdfs:
    self.assertAlmostEqual(
        all_dists.pdf_length_given_type(atom_a, atom_b, bond_type, length),
        expected)

  # This makes sure the right tail mass was included.
  for length in (2.0, 3.0):
    self.assertGreater(
        all_dists.pdf_length_given_type(ATOM_N, ATOM_N, BOND_UNDEFINED,
                                        length), 0.0)
def main(argv):
  """Prints the min/max length of every available C/N/O/F bond distribution.

  Loads the gaussians file named by argv[1] and, for each unordered pair of
  C, N, O, F atoms and each of single, double and triple bond, prints one
  dict-entry style text block with the distribution's min() and max().
  Combinations missing from the loaded distributions are skipped.

  Args:
    argv: command line arguments; argv[1] is the gaussians file path.
  """
  topo = dataset_pb2.BondTopology

  # Shortcuts for below
  atom_str = {
      topo.ATOM_C: "ATOM_C",
      topo.ATOM_N: "ATOM_N",
      topo.ATOM_O: "ATOM_O",
      topo.ATOM_F: "ATOM_F"
  }
  bond_str = {
      topo.BOND_SINGLE: "BOND_SINGLE",
      topo.BOND_DOUBLE: "BOND_DOUBLE",
      topo.BOND_TRIPLE: "BOND_TRIPLE"
  }
  atoms = (topo.ATOM_C, topo.ATOM_N, topo.ATOM_O, topo.ATOM_F)
  bonds = (topo.BOND_SINGLE, topo.BOND_DOUBLE, topo.BOND_TRIPLE)

  allen_dists = bond_length_distribution.AllAtomPairLengthDistributions()
  allen_dists.add_from_gaussians_file(argv[1], 3)

  for atom_a, atom_b in itertools.combinations_with_replacement(atoms, 2):
    for bond in bonds:
      try:
        mn = allen_dists[(atom_a, atom_b)][bond].min()
        mx = allen_dists[(atom_a, atom_b)][bond].max()
      except KeyError:
        # No distribution for this atom pair / bond type.
        continue
      print(
          f" (dataset_pb2.BondTopology.{atom_str[atom_a]},\n"
          f" dataset_pb2.BondTopology.{atom_str[atom_b]},\n"
          f" dataset_pb2.BondTopology.{bond_str[bond]}): ({mn:0.3f}, {mx:.03f}),"
      )
def test_add_from_files(self):
  """Loads per-pair histogram files and checks bonded and unbonded pdfs.

  Files are named '<stem>.<atomic number>.<bond type>.<atomic number>'; bond
  type 0 holds the unbonded histogram and gets the right tail mass.
  """
  # One "length,count" record per line. The separators are written as
  # explicit '\n' escapes so the fixtures cannot silently collapse into a
  # single record if the source is ever re-wrapped.
  data = '1.0,1\n1.1,2\n1.2,3\n1.3,2\n'
  data_increasing = '1.0,1\n1.1,2\n1.2,3\n1.3,4\n1.4,5\n'

  tmpdir = self.create_tempdir()
  stem = os.path.join(tmpdir, 'BONDS')
  self.create_tempfile(f'{stem}.6.0.6', content=data_increasing)
  self.create_tempfile(f'{stem}.6.1.6', content=data)
  self.create_tempfile(f'{stem}.6.0.7', content=data_increasing)
  self.create_tempfile(f'{stem}.6.1.7', content=data)
  self.create_tempfile(f'{stem}.6.2.7', content=data)
  self.create_tempfile(f'{stem}.6.3.7', content=data)

  all_dists = bond_length_distribution.AllAtomPairLengthDistributions()
  all_dists.add_from_files(stem, unbonded_right_tail_mass=0.8)
  pdf = all_dists.pdf_length_given_type

  carbon = dataset_pb2.BondTopology.AtomType.ATOM_C
  nitrogen = dataset_pb2.BondTopology.AtomType.ATOM_N
  unbonded = dataset_pb2.BondTopology.BondType.BOND_UNDEFINED
  single = dataset_pb2.BondTopology.BondType.BOND_SINGLE
  double = dataset_pb2.BondTopology.BondType.BOND_DOUBLE
  triple = dataset_pb2.BondTopology.BondType.BOND_TRIPLE

  unbonded_pairs = ((carbon, carbon), (carbon, nitrogen))
  bonded_cases = (
      (carbon, carbon, single),
      (carbon, nitrogen, single),
      (carbon, nitrogen, double),
      (carbon, nitrogen, triple),
  )

  # Left of the data there is no unbonded density.
  for atom_a, atom_b in unbonded_pairs:
    self.assertAlmostEqual(pdf(atom_a, atom_b, unbonded, 0.99), 0.0)

  # The 3/15 is the counts in the data_increasing file.
  # * 10 is for the pdf because the bucket is 0.1 wide
  # * 0.2 is because of the right tail mass.
  for atom_a, atom_b in unbonded_pairs:
    self.assertAlmostEqual(
        pdf(atom_a, atom_b, unbonded, 1.25), 3.0 / 15.0 * 10 * 0.2)

  # Test the right tail mass for the unbonded
  for atom_a, atom_b in unbonded_pairs:
    self.assertAlmostEqual(pdf(atom_a, atom_b, unbonded, 1.5), 0.66666667)

  # Test the bonded inside the pdf.
  # 3/8 are the counts in the data file
  # * 10 is for the pdf because the bucket is 0.1 wide
  for atom_a, atom_b, bond_type in bonded_cases:
    self.assertAlmostEqual(
        pdf(atom_a, atom_b, bond_type, 1.25), 3.0 / 8.0 * 10)

  # Check for no right tail mass for the bonded
  for atom_a, atom_b, bond_type in bonded_cases:
    self.assertAlmostEqual(pdf(atom_a, atom_b, bond_type, 1.5), 0.0)
def test_scores(self):
  """A C-C pair 1.4 A apart ranks the single bond above the double bond."""
  carbon = dataset_pb2.BondTopology.ATOM_C
  single_bond = dataset_pb2.BondTopology.BondType.BOND_SINGLE
  double_bond = dataset_pb2.BondTopology.BondType.BOND_DOUBLE

  # For testing, turn off the need for complete matching.
  topology_molecule.default_must_match_all_bonds = False

  all_distributions = bond_length_distribution.AllAtomPairLengthDistributions(
  )
  bldc1c = triangular_distribution(1.0, 1.4, 2.0)
  all_distributions.add(carbon, carbon, single_bond, bldc1c)
  bldc2c = triangular_distribution(1.0, 1.5, 2.0)
  all_distributions.add(carbon, carbon, double_bond, bldc2c)

  starting_topology = text_format.Parse(
      """ atoms: ATOM_C atoms: ATOM_C bonds: { atom_a: 0 atom_b: 1 bond_type: BOND_SINGLE } """,
      dataset_pb2.BondTopology())
  geometry = text_format.Parse(
      """ atom_positions { x: 0.0 y: 0.0 z: 0.0 }, atom_positions { x: 0.0 y: 0.0 z: 0.0 } """,
      dataset_pb2.Geometry())

  molecule = dataset_pb2.Molecule()
  molecule.bond_topologies.append(starting_topology)
  molecule.optimized_geometry.MergeFrom(geometry)
  # Move the second atom 1.4 Angstroms away along x.
  molecule.optimized_geometry.atom_positions[1].x = (
      1.4 / smu_utils_lib.BOHR_TO_ANGSTROMS)

  matching_parameters = topology_molecule.MatchingParameters()
  matching_parameters.must_match_all_bonds = False

  molecule.properties.errors.fate = dataset_pb2.Properties.FATE_SUCCESS
  molecule.molecule_id = 1001

  result = topology_from_geom.bond_topologies_from_geom(
      molecule, all_distributions, matching_parameters)

  self.assertIsNotNone(result)
  self.assertLen(result.bond_topology, 2)
  best = result.bond_topology[0]
  runner_up = result.bond_topology[1]
  self.assertLen(best.bonds, 1)
  self.assertLen(runner_up.bonds, 1)
  self.assertEqual(best.bonds[0].bond_type, single_bond)
  self.assertEqual(runner_up.bonds[0].bond_type, double_bond)
  self.assertGreater(best.topology_score, runner_up.topology_score)

  # Exponentiated topology scores add up to one.
  self.assertAlmostEqual(
      np.sum(np.exp([bt.topology_score for bt in result.bond_topology])), 1.0)

  # Each geometry score is the log pdf of its own distribution at 1.4.
  self.assertAlmostEqual(best.geometry_score, np.log(bldc1c.pdf(1.4)))
  self.assertAlmostEqual(runner_up.geometry_score, np.log(bldc2c.pdf(1.4)))
# Report what the plain SMILES lookup found.
print('find_by_smiles on', smiles, 'finds these molecule ids')
print([c.molecule_id for c in original_molecules])

# Explain the topology query; empty strings print as blank lines.
for message in (
    '',
    'But you can modify the allowed distances for each type of bond',
    'and find all molecules which match a given topology with these modifications',
    'While this does not have the read the whole database, it is a much less efficient operation than querying by smiles, so only use it if you modify the allowed distances',
    '',
    'First you have to load the default bond lengths',
):
  print(message)

bond_lengths = bond_length_distribution.AllAtomPairLengthDistributions()
bond_lengths.add_from_sparse_dataframe_file(
    '20220128_bond_lengths.csv',
    bond_length_distribution.STANDARD_UNBONDED_RIGHT_TAIL_MASS,
    bond_length_distribution.STANDARD_SIG_DIGITS)

for message in (
    '',
    'You then provide the desired topology as a SMILES string',
    'The topology query without modifying bond lengths, finds the same result',
):
  print(message)

# Sort by molecule id so the printed list has a stable order.
unmodified_molecules = sorted(db.find_by_topology(smiles, bond_lengths),
                              key=lambda found: found.molecule_id)
print('Unmodified find_by_topology finds these molecule ids')
print([c.molecule_id for c in unmodified_molecules])
print()
def test_multi_topology_detection(self):
  """Tests that we can find multiple versions of the same topology."""
  nitrogen = dataset_pb2.BondTopology.ATOM_N
  single = dataset_pb2.BondTopology.BondType.BOND_SINGLE
  double = dataset_pb2.BondTopology.BondType.BOND_DOUBLE

  all_dist = bond_length_distribution.AllAtomPairLengthDistributions()
  # Both bond types get their own, identically configured, window.
  for bond_type in (single, double):
    window = bond_length_distribution.FixedWindowLengthDistribution(
        1.0, 2.0, None)
    all_dist.add(nitrogen, nitrogen, bond_type, window)

  # The conformer is a flat aromatic square of nitrogens. Rotating the single
  # and double bonds around the ring gives the same topology with every
  # individual bond switched between single and double.
  conformer = dataset_pb2.Conformer()
  conformer.bond_topologies.add(bond_topology_id=123, smiles="N1=NN=N1")
  starting_topology = conformer.bond_topologies[0]
  starting_topology.atoms.extend([nitrogen] * 4)

  ring = ((0, 1), (1, 2), (2, 3), (3, 0))
  starting_types = (single, double, single, double)
  flipped_types = (double, single, double, single)
  starting_topology.bonds.extend([
      dataset_pb2.BondTopology.Bond(
          atom_a=atom_a, atom_b=atom_b, bond_type=bond_type)
      for (atom_a, atom_b), bond_type in zip(ring, starting_types)
  ])

  dist15a = 1.5 / smu_utils_lib.BOHR_TO_ANGSTROMS
  corners = ((0, 0), (0, dist15a), (dist15a, dist15a), (dist15a, 0))
  conformer.optimized_geometry.atom_positions.extend(
      [dataset_pb2.Geometry.AtomPos(x=x, y=y, z=0) for x, y in corners])

  matching_parameters = smu_molecule.MatchingParameters()
  result = topology_from_geom.bond_topologies_from_geom(
      bond_lengths=all_dist,
      conformer_id=123,
      fate=dataset_pb2.Conformer.FATE_SUCCESS,
      bond_topology=starting_topology,
      geometry=conformer.optimized_geometry,
      matching_parameters=matching_parameters)

  self.assertLen(result.bond_topology, 2)

  # The returned order is arbitrary, so find out which result is marked as
  # the starting topology; the other index is then the remaining one of 0/1.
  starting_idx = min(
      idx for idx, candidate in enumerate(result.bond_topology)
      if candidate.is_starting_topology)
  other_idx = 1 - starting_idx

  starting = result.bond_topology[starting_idx]
  self.assertTrue(starting.is_starting_topology)
  for (atom_a, atom_b), expected in zip(ring, starting_types):
    self.assertEqual(
        smu_utils_lib.get_bond_type(starting, atom_a, atom_b), expected)

  other = result.bond_topology[other_idx]
  self.assertFalse(other.is_starting_topology)
  for (atom_a, atom_b), expected in zip(ring, flipped_types):
    self.assertEqual(
        smu_utils_lib.get_bond_type(other, atom_a, atom_b), expected)
def test_add_itc_h_lengths(self):
  """add_itc_h_lengths gives the H-C single bond a positive pdf at 1.0."""
  dists = bond_length_distribution.AllAtomPairLengthDistributions()
  bond_length_distribution.add_itc_h_lengths(dists)

  h_c_single = dists[ATOM_H, ATOM_C][BOND_SINGLE]
  self.assertGreater(h_c_single.pdf(1.0), 0)