def test_find_by_bond_topology_id_source_filtering(self):
  """Checks that bond topology id lookups honour the requested source."""
  bt_cls = dataset_pb2.BondTopology
  which_enum = smu_utils_lib.WhichTopologies
  db = smu_sqlite.SMUSQLite(self.db_filename, 'c')

  # Three molecules, each given as (molecule id, [(bt id, source bits), ...]):
  #   2001 with bt id 10 (ITC, STARTING) and bt id 11 (MLCR)
  #   4001 with bt id 10 (ITC), bt id 11 (ITC, STARTING), bt id 12 (CSD)
  #   6001 with bt id 12 (MLCR)
  layout = [
      (2001, [
          (10, bt_cls.SOURCE_STARTING | bt_cls.SOURCE_ITC),
          (11, bt_cls.SOURCE_MLCR),
      ]),
      (4001, [
          (10, bt_cls.SOURCE_ITC),
          (11, bt_cls.SOURCE_STARTING | bt_cls.SOURCE_ITC),
          (12, bt_cls.SOURCE_CSD),
      ]),
      (6001, [
          (12, bt_cls.SOURCE_MLCR),
      ]),
  ]
  molecules = []
  for mid, topologies in layout:
    mol = dataset_pb2.Molecule(molecule_id=mid)
    for bt_id, source in topologies:
      self.add_bond_topology_to_molecule(mol, bt_id, source)
    molecules.append(mol)
  db.bulk_insert(self.encode_molecules(molecules))

  def matching_ids(bt_id, selector):
    found = db.find_by_bond_topology_id_list([bt_id], selector)
    return [mol.molecule_id for mol in found]

  # (bt id queried, source selector, molecule ids expected back)
  expected_hits = [
      (10, which_enum.ALL, [2001, 4001]),
      (11, which_enum.ALL, [2001, 4001]),
      (12, which_enum.ALL, [4001, 6001]),
      (10, which_enum.STARTING, [2001]),
      (11, which_enum.MLCR, [2001]),
      (12, which_enum.CSD, [4001]),
  ]
  for bt_id, selector, want in expected_hits:
    self.assertEqual(matching_ids(bt_id, selector), want)

  # The id exists in the database, but never with the requested source.
  for bt_id, selector in [(12, which_enum.ITC), (11, which_enum.CSD)]:
    self.assertEmpty(matching_ids(bt_id, selector))
def find_by_bond_topology_id_list(self, btids, which_topologies):
  """Finds all the molecules associated with any of the bond topology ids.

  Args:
    btids: iterable of bond topology ids to look up. It is materialized once
      up front, so one-shot iterables (e.g. generators) are accepted.
    which_topologies: which topologies to match, see
      smu_utils_lib.WhichTopologies

  Yields:
    dataset_pb2.Molecule
  """
  # The ids are needed three times (placeholder count, query parameters and
  # the per-molecule membership test), so take a stable copy first.
  btids = list(btids)
  if not btids:
    # Nothing can match; skip the database round trip entirely.
    return
  wanted = set(btids)

  placeholders = ','.join('?' * len(btids))
  # DISTINCT is because the same mid can have the same btid multiple times.
  select = (f'SELECT DISTINCT cid, conformer '
            f'FROM {_MOLECULE_TABLE_NAME} '
            f'INNER JOIN {_BTID_TABLE_NAME} USING(cid) '
            f'WHERE {_BTID_TABLE_NAME}.btid IN ({placeholders})')
  cur = self._conn.cursor()
  cur.execute(select, btids)
  for result in cur:
    molecule = dataset_pb2.Molecule().FromString(
        snappy.uncompress(result[1]))
    # The SQL only matches on the id; which_topologies is applied here, on
    # the decoded molecule. Each molecule is yielded at most once.
    if any(bt.bond_topology_id in wanted
           for _, bt in smu_utils_lib.iterate_bond_topologies(
               molecule, which_topologies)):
      yield molecule
def find_by_molecule_id(self, mid):
  """Looks up the single molecule stored under a molecule id.

  Args:
    mid: molecule id to look up.

  Returns:
    dataset_pb2.Molecule

  Raises:
    KeyError: if mid is not found
  """
  query = f'SELECT conformer FROM {_MOLECULE_TABLE_NAME} WHERE cid = ?'
  cursor = self._conn.cursor()
  cursor.execute(query, (mid,))
  rows = cursor.fetchall()

  if not rows:
    raise KeyError(f'Molecule id {mid} not found')

  # cid is a unique index, so exactly one row is expected, and the query
  # selects a single column.
  assert len(rows) == 1
  only_row = rows[0]
  assert len(only_row) == 1

  compressed = only_row[0]
  return dataset_pb2.Molecule().FromString(snappy.uncompress(compressed))
def ReadMolecule(bond_lengths, input_string, output):
  """Runs a Beam pipeline that derives topologies from molecule geometries.

  Molecule protos are read from TFRecord input, passed through
  topology_from_geom.TopologyFromGeom and then SummaryData, and the results
  are written out as text.

  Args:
    bond_lengths: passed unchanged to topology_from_geom.TopologyFromGeom.
    input_string: input location handed to ReadFromTFRecord.
    output: output location handed to WriteToText.

  Returns:
    The value produced by applying the final WriteToText step.
  """
  # Local direct runner, six worker processes.
  pipeline_options = PipelineOptions(direct_num_workers=6,
                                     direct_running_mode="multi_processing")
  with beam.Pipeline(options=pipeline_options) as root:
    molecules = root | beam.io.tfrecordio.ReadFromTFRecord(
        input_string,
        coder=beam.coders.ProtoCoder(dataset_pb2.Molecule().__class__))
    topologies = molecules | beam.ParDo(
        topology_from_geom.TopologyFromGeom(bond_lengths))
    summaries = topologies | beam.ParDo(SummaryData())
    written = summaries | beam.io.textio.WriteToText(output)

    return written
def parse_equivalent_file(filename):
  """Parses the .dat file of equivalent structures.

  Every line holds a pair of long identifiers: the structure that was kept,
  followed by the structure that was discarded in its favour. One Molecule is
  produced per line, keyed by the discarded molecule id and pointing at the
  kept one through duplicated_by.

  See merge_duplicate_information for how information is transferred to the
  kept molecule.

  Args:
    filename: string

  Yields:
    dataset_pb2.Molecule
  """

  def full_molecule_id(long_identifier):
    # Our molecule ids include the btid: btid * 1000 + per-topology id.
    _, _, btid, local_id = smu_parser_lib.parse_long_identifier(
        long_identifier)
    return btid * 1000 + local_id

  with gfile.GFile(filename) as equivalent_file:
    for row in equivalent_file:
      kept_str, discard_str = row.split()
      kept_id = full_molecule_id(kept_str)
      discarded_id = full_molecule_id(discard_str)
      yield dataset_pb2.Molecule(molecule_id=discarded_id,
                                 duplicated_by=kept_id)
def test_extract_bond_lengths_max_unbonded(self):
  """Checks extract_bond_lengths output when a distance exceeds unbonded_max.

  The bonded C-O pair is expected in the output even though it is far longer
  than unbonded_max, while the unbonded N-O pair is expected to be absent.
  """
  bt_cls = dataset_pb2.BondTopology

  # This molecule does not obey valence rules, but it's fine for this test.
  mol = dataset_pb2.Molecule(molecule_id=123000)
  mol.properties.errors.status = 4

  topology = mol.bond_topologies.add()
  topology.atoms.extend([bt_cls.ATOM_C, bt_cls.ATOM_N, bt_cls.ATOM_O])
  # The carbon (index 0) is single bonded to both other atoms.
  for neighbor in (1, 2):
    topology.bonds.add(atom_a=0, atom_b=neighbor,
                       bond_type=bt_cls.BOND_SINGLE)

  positions = mol.optimized_geometry.atom_positions
  for x, y in ((0, 0), (1, 0), (100, 2)):
    positions.add(x=x, y=y, z=0)

  got = list(
      pipeline.extract_bond_lengths(mol, dist_sig_digits=2, unbonded_max=2.0))

  # Note that these are *not* rounded, but truncated to this many digits.
  expected = [
      # 1 bohr -> 0.529177249 angstroms
      ('c', 'n', bt_cls.BOND_SINGLE, '0.52'),
      # It seems like this should be 52.91 but it looks like some
      # numerical noise in np.linalg.norm.
      ('c', 'o', bt_cls.BOND_SINGLE, '52.92'),
  ]
  self.assertEqual(got, expected)
def get_bond_length_distribution_inner(input_fname, output_fname):
  """Generate bond length distributions.

  Reads Molecule protos, extracts bond length observations with
  bond_lengths.GetBondLengthDistribution, sums the values per key, formats
  them with BondDistToString and writes the result as text.

  Args:
    input_fname: An existing TFRecord file containing Molecule protos.
    output_fname: An output file that will be created that contains all bond
      length distributions - all bond types, all atom types. Requires
      post-processing to generate bond length distribution files.
  """
  # This must be an f-string: without the prefix the placeholders were
  # printed literally instead of the actual file names.
  print(f"Reading from {input_fname} output to {output_fname}")

  # Local direct runner, six worker processes.
  options = PipelineOptions(direct_num_workers=6,
                            direct_running_mode="multi_processing")
  with beam.Pipeline(options=options) as p:
    protos = (
        p
        | beam.io.tfrecordio.ReadFromTFRecord(
            input_fname,
            coder=beam.coders.ProtoCoder(dataset_pb2.Molecule().__class__))
        | beam.ParDo(bond_lengths.GetBondLengthDistribution())
        | beam.CombinePerKey(sum)
        | beam.ParDo(BondDistToString())
        | beam.io.WriteToText(output_fname))
    print(protos)
def test_multi_topology_detection(self):
  """Tests that we can find multiple versions of the same topology."""
  bt_cls = dataset_pb2.BondTopology
  nitrogen = bt_cls.ATOM_N
  single = bt_cls.BondType.BOND_SINGLE
  double = bt_cls.BondType.BOND_DOUBLE
  # The four edges of the ring, in the order they are checked below.
  ring_edges = [(0, 1), (1, 2), (2, 3), (3, 0)]

  all_dist = bond_length_distribution.AllAtomPairLengthDistributions()
  for bond_type, peak in ((single, 1.5), (double, 1.4)):
    all_dist.add(nitrogen, nitrogen, bond_type,
                 triangular_distribution(1.0, peak, 2.0))

  # This molecule is a flat aromatic square of nitrogens. The single and
  # double bonds can be rotated such that it's the same topology but
  # individual bonds have switched single/double.
  # We set it so the bond lengths favor one of the two arrangements.
  molecule = dataset_pb2.Molecule(molecule_id=123)
  molecule.properties.errors.fate = dataset_pb2.Properties.FATE_SUCCESS
  topology = molecule.bond_topologies.add(bond_topology_id=123,
                                          smiles='N1=NN=N1')
  topology.atoms.extend([nitrogen] * 4)
  starting_types = (single, double, single, double)
  topology.bonds.extend([
      bt_cls.Bond(atom_a=a, atom_b=b, bond_type=bond_type)
      for (a, b), bond_type in zip(ring_edges, starting_types)
  ])

  # Geometry is stored in bohr: a 1.4 x 1.5 angstrom rectangle.
  dist15a = 1.5 / smu_utils_lib.BOHR_TO_ANGSTROMS
  dist14a = 1.4 / smu_utils_lib.BOHR_TO_ANGSTROMS
  corners = [(0, 0), (0, dist15a), (dist14a, dist15a), (dist14a, 0)]
  molecule.optimized_geometry.atom_positions.extend(
      [dataset_pb2.Geometry.AtomPos(x=x, y=y, z=0) for x, y in corners])

  matching_parameters = topology_molecule.MatchingParameters()
  result = topology_from_geom.bond_topologies_from_geom(
      molecule, all_dist, matching_parameters)

  self.assertLen(result.bond_topology, 2)
  # First the arrangement matching the input, then the rotated one.
  expected_arrangements = [
      (single, double, single, double),
      (double, single, double, single),
  ]
  for found, arrangement in zip(result.bond_topology, expected_arrangements):
    for (a, b), want in zip(ring_edges, arrangement):
      self.assertEqual(smu_utils_lib.get_bond_type(found, a, b), want)
def test_merge_duplicate_information_diff_topology(self):
  """Merges a duplicate whose id differs in the thousands from the kept one.

  The duplicate is recorded in duplicate_of, but its initial geometry is
  currently not expected to be copied onto the kept molecule.
  """
  kept = dataset_pb2.Molecule(molecule_id=123000)
  kept.initial_geometries.add().atom_positions.add(x=1, y=2, z=3)

  duplicate = dataset_pb2.Molecule(molecule_id=456000, duplicated_by=123000)
  duplicate.initial_geometries.add().atom_positions.add(x=4, y=5, z=6)

  merged = pipeline.merge_duplicate_information(123000, [duplicate, kept])

  self.assertEqual(merged.molecule_id, 123000)
  self.assertEqual(merged.duplicated_by, 0)
  self.assertEqual(merged.duplicate_of, [456000])
  # TODO(pfr, ianwatson): implement correct copying of initial geometry
  self.assertLen(merged.initial_geometries, 1)
  self.assertEqual(merged.initial_geometries[0].atom_positions[0].x, 1)
def test_merge_duplicate_information_same_topology(self):
  """Merges a duplicate whose id shares the kept molecule's thousands prefix.

  The duplicate's initial geometry is expected to be appended after the kept
  molecule's own initial geometry.
  """
  kept = dataset_pb2.Molecule(molecule_id=123000)
  kept.initial_geometries.add().atom_positions.add(x=1, y=2, z=3)

  duplicate = dataset_pb2.Molecule(molecule_id=123456, duplicated_by=123000)
  duplicate.initial_geometries.add().atom_positions.add(x=4, y=5, z=6)

  merged = pipeline.merge_duplicate_information(123000, [duplicate, kept])

  self.assertEqual(merged.molecule_id, 123000)
  self.assertEqual(merged.duplicated_by, 0)
  self.assertEqual(merged.duplicate_of, [123456])
  self.assertLen(merged.initial_geometries, 2)
  # Kept geometry first, then the one taken from the duplicate.
  for index, want_x in enumerate((1, 4)):
    self.assertEqual(
        merged.initial_geometries[index].atom_positions[0].x, want_x)
def _create_dummy_molecule(self):
  """Builds a minimal molecule: two single-bonded carbons one unit apart.

  Returns:
    dataset_pb2.Molecule with id 123000, one bond topology and an optimized
    geometry with positions (0, 0, 0) and (1, 0, 0).
  """
  bt_cls = dataset_pb2.BondTopology
  molecule = dataset_pb2.Molecule(molecule_id=123000)

  topology = molecule.bond_topologies.add()
  topology.atoms.extend([bt_cls.ATOM_C] * 2)
  topology.bonds.add(atom_a=0, atom_b=1, bond_type=bt_cls.BOND_SINGLE)

  positions = molecule.optimized_geometry.atom_positions
  for x in (0, 1):
    positions.add(x=x, y=0, z=0)
  return molecule
def test_extract_bond_lengths(self):
  """Checks the tuples extract_bond_lengths produces for a small molecule.

  The expected output covers the bonded N-O and C-O pairs plus the unbonded
  C-N pair; no entry involving the hydrogen is expected.
  """
  bt_cls = dataset_pb2.BondTopology

  # This molecule does not obey valence rules, but it's fine for this test.
  mol = dataset_pb2.Molecule(molecule_id=123000)
  mol.properties.errors.status = 4

  topology = mol.bond_topologies.add()
  topology.atoms.extend([
      bt_cls.ATOM_ONEG,
      bt_cls.ATOM_NPOS,
      bt_cls.ATOM_C,
      bt_cls.ATOM_H,
  ])
  # Every bond starts at the oxygen (index 0).
  bonds_from_oxygen = [
      (1, bt_cls.BOND_SINGLE),
      (2, bt_cls.BOND_DOUBLE),
      (3, bt_cls.BOND_SINGLE),
  ]
  for neighbor, bond_type in bonds_from_oxygen:
    topology.bonds.add(atom_a=0, atom_b=neighbor, bond_type=bond_type)

  positions = mol.optimized_geometry.atom_positions
  for x, y, z in ((0, 0, 0), (1, 0, 0), (0, 2, 0), (111, 222, 333)):
    positions.add(x=x, y=y, z=z)

  got = list(
      pipeline.extract_bond_lengths(mol, dist_sig_digits=2, unbonded_max=2.0))

  # Note that these are *not* rounded, but truncated to this many digits.
  expected = [
      # 1 bohr -> 0.529177249 angstroms
      ('n', 'o', bt_cls.BOND_SINGLE, '0.52'),
      # 2 bohr -> 2 * 0.529177249 angstroms
      ('c', 'o', bt_cls.BOND_DOUBLE, '1.05'),
      # sqrt(1**2 + 2**2) bohr -> 2.23606 * 0.529177249 angstroms
      ('c', 'n', bt_cls.BOND_UNDEFINED, '1.18'),
  ]
  self.assertEqual(got, expected)
def get_molecule(self, oc_dist, cn_dist):
  """Builds a linear O=C=N-H molecule laid out along the z axis.

  Args:
    oc_dist: O-C distance; divided by smu_utils_lib.BOHR_TO_ANGSTROMS before
      being stored.
    cn_dist: C-N distance; converted the same way.

  Returns:
    dataset_pb2.Molecule with id 12345. The hydrogen is placed 1 unit (before
    conversion) beyond the nitrogen.
  """
  bt_cls = dataset_pb2.BondTopology
  to_angstroms = smu_utils_lib.BOHR_TO_ANGSTROMS

  molecule = dataset_pb2.Molecule(molecule_id=12345)
  molecule.bond_topologies.append(bt_cls(smiles='N=C=O'))
  topology = molecule.bond_topologies[0]
  topology.atoms.extend(
      [bt_cls.ATOM_O, bt_cls.ATOM_C, bt_cls.ATOM_N, bt_cls.ATOM_H])

  chain_bonds = [
      (0, 1, bt_cls.BondType.BOND_DOUBLE),
      (1, 2, bt_cls.BondType.BOND_DOUBLE),
      (2, 3, bt_cls.BondType.BOND_SINGLE),
  ]
  for atom_a, atom_b, bond_type in chain_bonds:
    topology.bonds.append(
        bt_cls.Bond(atom_a=atom_a, atom_b=atom_b, bond_type=bond_type))

  # All atoms sit on the z axis; only z varies.
  z_coordinates = [
      0,
      oc_dist / to_angstroms,
      (oc_dist + cn_dist) / to_angstroms,
      (oc_dist + cn_dist + 1) / to_angstroms,
  ]
  positions = molecule.optimized_geometry.atom_positions
  for z in z_coordinates:
    positions.append(dataset_pb2.Geometry.AtomPos(x=0, y=0, z=z))

  return molecule
def make_fake_molecule(self, mid):
  """Builds a molecule with one ITC-sourced bond topology.

  Args:
    mid: molecule id; the bond topology id used is mid // 1000.

  Returns:
    dataset_pb2.Molecule
  """
  fake = dataset_pb2.Molecule(molecule_id=mid)
  bt_id = mid // 1000
  self.add_bond_topology_to_molecule(fake, bt_id,
                                     dataset_pb2.BondTopology.SOURCE_ITC)
  return fake
def test_find_by_topology(self):
  """Checks find_by_topology for a stored and a non-stored topology."""
  bt_cls = dataset_pb2.BondTopology
  db = smu_sqlite.SMUSQLite(self.db_filename, 'c')

  # We'll make a pretty fake molecule. N2O2H2 with
  # the O at 0,0
  # the Ns at 1.1,0 and 0,1.1
  # The Hs right next to the Ns
  # We'll give it the ring topology to start and the symmetric ring-broken
  # topologies should be found.
  molecule = dataset_pb2.Molecule(molecule_id=9999)
  molecule.properties.errors.fate = dataset_pb2.Properties.FATE_SUCCESS
  topology = molecule.bond_topologies.add(smiles='N1NO1',
                                          bond_topology_id=100)
  positions = molecule.optimized_geometry.atom_positions

  # (atom type, x, y) in angstroms; z is always 0.
  atoms_with_xy = [
      (bt_cls.ATOM_O, 0, 0),
      (bt_cls.ATOM_N, 0, 1.1),
      (bt_cls.ATOM_N, 1.1, 0),
      (bt_cls.ATOM_H, 0, 1.2),
      (bt_cls.ATOM_H, 1.2, 0),
  ]
  for atom_type, x, y in atoms_with_xy:
    topology.atoms.append(atom_type)
    positions.append(dataset_pb2.Geometry.AtomPos(x=x, y=y, z=0))

  # The O-N-N ring followed by one N-H bond per nitrogen, all single.
  for atom_a, atom_b in [(0, 1), (0, 2), (1, 2), (1, 3), (2, 4)]:
    topology.bonds.append(
        bt_cls.Bond(atom_a=atom_a, atom_b=atom_b,
                    bond_type=bt_cls.BOND_SINGLE))

  # Convert the stored coordinates in place from angstroms to bohr.
  for pos in positions:
    pos.x /= smu_utils_lib.BOHR_TO_ANGSTROMS
    pos.y /= smu_utils_lib.BOHR_TO_ANGSTROMS
    pos.z /= smu_utils_lib.BOHR_TO_ANGSTROMS

  db.bulk_insert([molecule.SerializeToString()])
  db.bulk_insert_smiles([['N1NO1', 100], ['N=[NH+][O-]', 101]])

  bond_lengths = bond_length_distribution.make_fake_empiricals()

  # We'll query by the topology that was in the DB then the one that wasn't
  for query_smiles in ['N1NO1', 'N=[NH+][O-]']:
    got = list(db.find_by_topology(query_smiles, bond_lengths=bond_lengths))
    self.assertLen(got, 1)
    found_ids = [found.bond_topology_id for found in got[0].bond_topologies]
    self.assertCountEqual([100, 101, 101], found_ids)
def test_scores(self):
  """Checks topology and geometry scores for a two-carbon molecule.

  With a C-C distance of 1.4 angstroms, the single bond (distribution peak at
  1.4) must outscore the double bond (peak at 1.5).
  """
  bt_cls = dataset_pb2.BondTopology
  carbon = bt_cls.ATOM_C
  single_bond = bt_cls.BondType.BOND_SINGLE
  double_bond = bt_cls.BondType.BOND_DOUBLE

  # For testing, turn off the need for complete matching.
  topology_molecule.default_must_match_all_bonds = False

  all_distributions = (
      bond_length_distribution.AllAtomPairLengthDistributions())
  bldc1c = triangular_distribution(1.0, 1.4, 2.0)
  all_distributions.add(carbon, carbon, single_bond, bldc1c)
  bldc2c = triangular_distribution(1.0, 1.5, 2.0)
  all_distributions.add(carbon, carbon, double_bond, bldc2c)

  starting_topology = text_format.Parse(
      """ atoms: ATOM_C atoms: ATOM_C bonds: { atom_a: 0 atom_b: 1 bond_type: BOND_SINGLE } """,
      dataset_pb2.BondTopology())
  molecule = dataset_pb2.Molecule()
  molecule.bond_topologies.append(starting_topology)

  geometry = text_format.Parse(
      """ atom_positions { x: 0.0 y: 0.0 z: 0.0 }, atom_positions { x: 0.0 y: 0.0 z: 0.0 } """,
      dataset_pb2.Geometry())
  molecule.optimized_geometry.MergeFrom(geometry)
  # Move the second carbon 1.4 angstroms away (coordinates are in bohr).
  molecule.optimized_geometry.atom_positions[1].x = (
      1.4 / smu_utils_lib.BOHR_TO_ANGSTROMS)

  matching_parameters = topology_molecule.MatchingParameters()
  matching_parameters.must_match_all_bonds = False
  molecule.properties.errors.fate = dataset_pb2.Properties.FATE_SUCCESS
  molecule.molecule_id = 1001

  result = topology_from_geom.bond_topologies_from_geom(
      molecule, all_distributions, matching_parameters)

  self.assertIsNotNone(result)
  self.assertLen(result.bond_topology, 2)
  best = result.bond_topology[0]
  runner_up = result.bond_topology[1]

  self.assertLen(best.bonds, 1)
  self.assertLen(runner_up.bonds, 1)
  self.assertEqual(best.bonds[0].bond_type, single_bond)
  self.assertEqual(runner_up.bonds[0].bond_type, double_bond)
  self.assertGreater(best.topology_score, runner_up.topology_score)

  # The exponentiated topology scores must sum to one.
  total = np.sum(
      np.exp([found.topology_score for found in result.bond_topology]))
  self.assertAlmostEqual(total, 1.0)

  # The geometry score is the log density of the matching distribution at
  # the observed distance.
  self.assertAlmostEqual(best.geometry_score, np.log(bldc1c.pdf(1.4)))
  self.assertAlmostEqual(runner_up.geometry_score, np.log(bldc2c.pdf(1.4)))