예제 #1
0
def main(unused_argv):
  """Prints bond topology source counts for a sample of database molecules.

  For every sampled molecule, standard topology sensing is run and one
  comma separated row is printed: the molecule id, the total number of bond
  topologies, and how many of them carry the SOURCE_ITC, SOURCE_MLCR and
  SOURCE_CSD bits in their source field.

  Args:
    unused_argv: ignored.
  """
  database = smu_sqlite.SMUSQLite('20220128_standard_v2.sqlite')

  length_dists = bond_length_distribution.AllAtomPairLengthDistributions()
  length_dists.add_from_sparse_dataframe_file(
      '20220128_bond_lengths.csv',
      bond_length_distribution.STANDARD_UNBONDED_RIGHT_TAIL_MASS,
      bond_length_distribution.STANDARD_SIG_DIGITS)
  # Any SMILES looked up in this dict maps to -1.
  placeholder_smiles_ids = collections.defaultdict(lambda: -1)

  def count_source(topologies, source_bit):
    """Counts the topologies whose source field has source_bit set."""
    return sum(1 for bt in topologies if bt.source & source_bit != 0)

  print('molecule_id, count_all, count_smu, count_covalent, count_allen')
  for molecule in database:
    # Keep roughly one molecule in a thousand.
    # NOTE(review): str hashes are salted per interpreter run unless
    # PYTHONHASHSEED is fixed, so the sample presumably differs between runs.
    # Confirm that this is intended.
    bucket = abs(hash(str(molecule.molecule_id))) % 1000
    if bucket != 1:
      continue

    topology_from_geom.standard_topology_sensing(molecule, length_dists,
                                                 placeholder_smiles_ids)

    topologies = molecule.bond_topologies
    count_all = len(topologies)
    count_smu = count_source(topologies, dataset_pb2.BondTopology.SOURCE_ITC)
    count_covalent = count_source(topologies,
                                  dataset_pb2.BondTopology.SOURCE_MLCR)
    count_allen = count_source(topologies, dataset_pb2.BondTopology.SOURCE_CSD)

    print(
        f'{molecule.molecule_id}, {count_all}, {count_smu}, {count_covalent}, {count_allen}'
    )
예제 #2
0
    def test_atom_ordering(self):
        """Checks that lookups give the same answer for either atom order."""
        dists = bond_length_distribution.AllAtomPairLengthDistributions()
        single_window = bond_length_distribution.FixedWindow(1, 2, None)
        dists.add(ATOM_N, ATOM_O, BOND_SINGLE, single_window)
        atom_orders = ((ATOM_N, ATOM_O), (ATOM_O, ATOM_N))

        # Expect 1 for a length inside the window and 0 for one far outside,
        # whichever way round the atoms are given.
        for length, expected in ((1.5, 1), (999, 0)):
            for atom_a, atom_b in atom_orders:
                self.assertEqual(
                    dists.pdf_length_given_type(atom_a, atom_b, BOND_SINGLE,
                                                length), expected)

        # Make sure subsequent additions work as well
        double_window = bond_length_distribution.FixedWindow(2, 3, None)
        dists.add(ATOM_N, ATOM_O, BOND_DOUBLE, double_window)
        for atom_a, atom_b in atom_orders:
            self.assertEqual(
                dists.pdf_length_given_type(atom_a, atom_b, BOND_DOUBLE, 2.5),
                1)
예제 #3
0
    def __init__(self, bond_lengths_csv, bond_lengths_arg, bond_topology_csv):
        """Loads bond length distributions and the SMILES to topology id map.

        Args:
          bond_lengths_csv: path of the CSV read into a sparse dataframe of
            bond lengths.
          bond_lengths_arg: value handed to self._parse_bond_lengths_arg.
          bond_topology_csv: path of a CSV with a header line and six columns
            per row; the first column is the bond topology id and the last is
            the SMILES string.

        Raises:
          ValueError: if either CSV path is None.
        """
        if bond_lengths_csv is None:
            raise ValueError('--bond_lengths_csv required')
        logging.info('Loading bond_lengths')
        with open(bond_lengths_csv, 'r') as lengths_file:
            # length_str is kept as text rather than parsed as a number.
            lengths_df = pd.read_csv(lengths_file, dtype={'length_str': str})
        distributions = (
            bond_length_distribution.AllAtomPairLengthDistributions())
        self.bond_lengths = distributions
        distributions.add_from_sparse_dataframe(
            lengths_df, self._BOND_LENGTHS_UNBONDED_RIGHT_TAIL_MASS,
            self._BOND_LENGTHS_SIG_DIGITS)
        logging.info('Done loading bond_lengths_csv')

        self._parse_bond_lengths_arg(bond_lengths_arg)

        if bond_topology_csv is None:
            raise ValueError('--bond_topology_csv required')
        logging.info('Loading bond topologies')
        self.smiles_id_dict = {}
        with open(bond_topology_csv, 'r') as topology_file:
            rows = csv.reader(topology_file)
            next(rows)  # skip the header line
            for topology_id, _, _, _, _, smiles_string in rows:
                self.smiles_id_dict[smiles_string] = int(topology_id)
        logging.info('Done loading bond topologies')
def TopologyFromGeometryMain(unused_argv):
    """Loads bond length distributions from files and prints ReadMolecule's result.

    The distributions come from FLAGS.bonds and FLAGS.xnonbond; the molecule
    input and output locations come from FLAGS.input and FLAGS.output.

    Args:
      unused_argv: ignored.
    """
    del unused_argv

    distributions = bond_length_distribution.AllAtomPairLengthDistributions()
    distributions.add_from_files(FLAGS.bonds, 0.0, FLAGS.xnonbond)
    print(ReadMolecule(distributions, FLAGS.input, FLAGS.output))
    def test_scores(self):
        carbon = dataset_pb2.BondTopology.AtomType.ATOM_C
        single_bond = dataset_pb2.BondTopology.BondType.BOND_SINGLE
        double_bond = dataset_pb2.BondTopology.BondType.BOND_DOUBLE

        # For testing, turn off the need for complete matching.
        smu_molecule.default_must_match_all_bonds = False

        all_distributions = bond_length_distribution.AllAtomPairLengthDistributions(
        )
        x, y = triangular_distribution(1.0, 1.4, 2.0)
        df = pd.DataFrame({"length": x, "count": y})
        bldc1c = bond_length_distribution.EmpiricalLengthDistribution(df, 0.0)
        all_distributions.add(carbon, carbon, single_bond, bldc1c)

        x, y = triangular_distribution(1.0, 1.5, 2.0)
        df = pd.DataFrame({"length": x, "count": y})
        bldc2c = bond_length_distribution.EmpiricalLengthDistribution(df, 0.0)
        all_distributions.add(carbon, carbon, double_bond, bldc2c)

        bond_topology = text_format.Parse(
            """
atoms: ATOM_C
atoms: ATOM_C
bonds: {
  atom_a: 0
  atom_b: 1
  bond_type: BOND_SINGLE
}
""", dataset_pb2.BondTopology())

        geometry = text_format.Parse(
            """
atom_positions {
  x: 0.0
  y: 0.0
  z: 0.0
},
atom_positions {
  x: 0.0
  y: 0.0
  z: 0.0
}
""", dataset_pb2.Geometry())
        geometry.atom_positions[1].x = 1.4 / smu_utils_lib.BOHR_TO_ANGSTROMS

        matching_parameters = smu_molecule.MatchingParameters()
        matching_parameters.must_match_all_bonds = False
        result = topology_from_geom.bond_topologies_from_geom(
            all_distributions, bond_topology, geometry, matching_parameters)
        self.assertIsNotNone(result)
        self.assertEqual(len(result.bond_topology), 2)
        self.assertEqual(len(result.bond_topology[0].bonds), 1)
        self.assertEqual(len(result.bond_topology[1].bonds), 1)
        self.assertGreater(result.bond_topology[0].score,
                           result.bond_topology[1].score)
        self.assertEqual(result.bond_topology[0].bonds[0].bond_type,
                         single_bond)
        self.assertEqual(result.bond_topology[1].bonds[0].bond_type,
                         double_bond)
예제 #6
0
  def test_multi_topology_detection(self):
    """Tests that we can find multiple versions of the same topology."""
    nitrogen = dataset_pb2.BondTopology.ATOM_N
    single = dataset_pb2.BondTopology.BondType.BOND_SINGLE
    double = dataset_pb2.BondTopology.BondType.BOND_DOUBLE

    all_dist = bond_length_distribution.AllAtomPairLengthDistributions()
    # Single bonds peak at 1.5 and double bonds at 1.4.
    for bond_type, peak in ((single, 1.5), (double, 1.4)):
      all_dist.add(nitrogen, nitrogen, bond_type,
                   triangular_distribution(1.0, peak, 2.0))

    # This molecule is a flat aromatic square of nitrogens. The single and
    # double bonds can be rotated such that it's the same topology but
    # individual bonds have switched single/double.
    # We set it so the bond lengths favor one of the two arrangements
    molecule = dataset_pb2.Molecule(molecule_id=123)
    molecule.properties.errors.fate = dataset_pb2.Properties.FATE_SUCCESS

    molecule.bond_topologies.add(bond_topology_id=123, smiles='N1=NN=N1')
    starting_topology = molecule.bond_topologies[0]
    starting_topology.atoms.extend([nitrogen] * 4)

    ring = ((0, 1), (1, 2), (2, 3), (3, 0))
    starting_types = (single, double, single, double)
    starting_topology.bonds.extend([
        dataset_pb2.BondTopology.Bond(
            atom_a=atom_a, atom_b=atom_b, bond_type=bond_type)
        for (atom_a, atom_b), bond_type in zip(ring, starting_types)
    ])

    dist15a = 1.5 / smu_utils_lib.BOHR_TO_ANGSTROMS
    dist14a = 1.4 / smu_utils_lib.BOHR_TO_ANGSTROMS
    corners = ((0, 0), (0, dist15a), (dist14a, dist15a), (dist14a, 0))
    molecule.optimized_geometry.atom_positions.extend(
        [dataset_pb2.Geometry.AtomPos(x=x, y=y, z=0) for x, y in corners])

    matching_parameters = topology_molecule.MatchingParameters()
    result = topology_from_geom.bond_topologies_from_geom(
        molecule, all_dist, matching_parameters)

    self.assertLen(result.bond_topology, 2)

    # The first result keeps the starting arrangement of bond types.
    first = result.bond_topology[0]
    for (atom_a, atom_b), expected in zip(ring, starting_types):
      self.assertEqual(
          smu_utils_lib.get_bond_type(first, atom_a, atom_b), expected)

    # The second result has every single/double swapped.
    second = result.bond_topology[1]
    swapped_types = (double, single, double, single)
    for (atom_a, atom_b), expected in zip(ring, swapped_types):
      self.assertEqual(
          smu_utils_lib.get_bond_type(second, atom_a, atom_b), expected)
예제 #7
0
 def test_probability_bond_types(self):
     """Checks bond type probabilities for two overlapping fixed windows."""
     dists = bond_length_distribution.AllAtomPairLengthDistributions()
     # The single bond window is three times as wide as the double bond one.
     windows = ((BOND_SINGLE, (1, 4)), (BOND_DOUBLE, (1, 2)))
     for bond_type, (low, high) in windows:
         dists.add(ATOM_N, ATOM_O, bond_type,
                   bond_length_distribution.FixedWindow(low, high, None))

     probabilities = dists.probability_of_bond_types(ATOM_N, ATOM_O, 1.5)
     self.assertLen(probabilities, 2)
     self.assertAlmostEqual(probabilities[BOND_SINGLE], 0.25)
     self.assertAlmostEqual(probabilities[BOND_DOUBLE], 0.75)
예제 #8
0
    def test_missing_types(self):
        """Checks that querying an atom pair never added raises KeyError."""
        dists = bond_length_distribution.AllAtomPairLengthDistributions()
        window = bond_length_distribution.FixedWindow(1, 2, None)
        dists.add(ATOM_N, ATOM_O, BOND_SINGLE, window)

        # Only N-O was registered, so both C-C lookups must fail.
        self.assertRaises(KeyError, dists.probability_of_bond_types, ATOM_C,
                          ATOM_C, 1.0)
        self.assertRaises(KeyError, dists.pdf_length_given_type, ATOM_C,
                          ATOM_C, BOND_SINGLE, 1.0)
예제 #9
0
    def __init__(self, bond_lengths_csv, bond_lengths_arg):
        """Loads bond length distributions from a CSV plus a string spec.

        Args:
          bond_lengths_csv: path given to add_from_sparse_dataframe_file.
          bond_lengths_arg: value given to add_from_string_spec after the CSV
            has been loaded.

        Raises:
          ValueError: if bond_lengths_csv is None.
        """
        if bond_lengths_csv is None:
            raise ValueError('--bond_lengths_csv required')

        logging.info('Loading bond_lengths')
        distributions = (
            bond_length_distribution.AllAtomPairLengthDistributions())
        self.bond_lengths = distributions
        distributions.add_from_sparse_dataframe_file(
            bond_lengths_csv,
            bond_length_distribution.STANDARD_UNBONDED_RIGHT_TAIL_MASS,
            bond_length_distribution.STANDARD_SIG_DIGITS)
        logging.info('Done loading bond_lengths_csv')

        distributions.add_from_string_spec(bond_lengths_arg)
예제 #10
0
    def test_add_from_gaussians_file(self):
        """Loads the example Gaussians CSV and spot checks the resulting pdfs."""
        all_dists = bond_length_distribution.AllAtomPairLengthDistributions()

        gaussians_path = os.path.join(TESTDATA_PATH,
                                      'example_gaussian_input.csv')
        all_dists.add_from_gaussians_file(gaussians_path, 1)
        pdf = all_dists.pdf_length_given_type

        # The example file has a few lines copied from the real exported files.
        #
        # Note two things in the file that might not be obvious
        # * A line with "n/a" as Bond that is ignored
        # * A line with "N:N" but no values that is ignored
        #
        # The numbers in the test below come from looking at a few cases there.

        # Two entries for C-C, making sure they mix.
        self.assertGreater(pdf(ATOM_C, ATOM_C, BOND_SINGLE, 1.513), 0)
        self.assertGreater(pdf(ATOM_C, ATOM_C, BOND_SINGLE, 1.588 + .001), 0)
        self.assertEqual(pdf(ATOM_C, ATOM_C, BOND_SINGLE, 1.8), 0)

        # Testing double bond
        self.assertGreater(pdf(ATOM_C, ATOM_C, BOND_DOUBLE, 1.299 + .001), 0)
        self.assertEqual(pdf(ATOM_C, ATOM_C, BOND_DOUBLE, 1.8), 0)

        # Testing triple bond
        self.assertGreater(pdf(ATOM_C, ATOM_C, BOND_TRIPLE, 1.183 + .001), 0)
        self.assertEqual(pdf(ATOM_C, ATOM_C, BOND_TRIPLE, 1.0), 0)

        # Aromatic are converted to both single and double.
        for bond_type in (BOND_SINGLE, BOND_DOUBLE):
            self.assertGreater(pdf(ATOM_N, ATOM_N, bond_type, 1.304), 0)
        with self.assertRaises(KeyError):
            pdf(ATOM_N, ATOM_N, BOND_TRIPLE, 1.304)
예제 #11
0
  def test_missing_types(self):
    """Checks that querying an atom pair never added raises KeyError."""
    topology = dataset_pb2.BondTopology
    dists = bond_length_distribution.AllAtomPairLengthDistributions()
    window = bond_length_distribution.FixedWindowLengthDistribution(1, 2, None)
    dists.add(topology.ATOM_N, topology.ATOM_O, topology.BOND_SINGLE, window)

    # Only N-O was registered, so both C-C lookups must fail.
    self.assertRaises(KeyError, dists.probability_of_bond_types,
                      topology.ATOM_C, topology.ATOM_C, 1.0)
    self.assertRaises(KeyError, dists.pdf_length_given_type, topology.ATOM_C,
                      topology.ATOM_C, topology.BOND_SINGLE, 1.0)
예제 #12
0
  def process(self, conformer, bond_length_records, smiles_id_dict):
    """Per conformer updates.

    Works on a deep copy of the input: sets its fate, yields whatever
    self._compare_smiles produces for it, optionally adds alternative bond
    topologies, and finally yields the updated copy.

    Args:
      conformer: dataset_pb2.Conformer
      bond_length_records: tuples to go to
        bond_length_distribution.AllAtomPairLengthDistributions
      smiles_id_dict: dict from SMILES to bond topology id

    Yields:
      The outputs of self._compare_smiles, followed by the updated Conformer.

    Raises:
      ValueError: if the bond length records can not be turned into
        distributions. The original ValueError is chained as the cause.
    """
    # There is probably a better way to do this.
    # We get the side input with each call to process. We'll assume that it's
    # always the same input, so we set our cache value and never update it.
    # We only do this with bond_length_records because there is a reasonable
    # amount of processing in creating AllAtomPairLengthDistributions.
    # The smiles_id_dict is used directly.
    if not self._cached_bond_lengths:
      # Build into a local and publish it only after a successful load, so
      # that a failed load can not leave an unpopulated object in the cache
      # for later calls to pick up.
      bond_lengths = bond_length_distribution.AllAtomPairLengthDistributions()
      try:
        bond_lengths.add_from_sparse_dataframe(
            bond_length_distribution.sparse_dataframe_from_records(
                bond_length_records), _BOND_LENGTHS_UNBONDED_RIGHT_TAIL_MASS,
            _BOND_LENGTHS_SIG_DIGITS)
      except ValueError as err:
        # Chain the cause so the original traceback is not lost.
        raise ValueError(
            'Invalid sparse dataframe for conformer {0} org. ValueError: {1}'
            .format(str(conformer.conformer_id), err)) from err
      self._cached_bond_lengths = bond_lengths

    # Never mutate the input element; all updates go to a private copy.
    conformer = copy.deepcopy(conformer)

    conformer.fate = smu_utils_lib.determine_fate(conformer)

    yield from self._compare_smiles(conformer)

    if (conformer.duplicated_by == 0 and
        conformer.properties.errors.status < 512):
      # The duplicate records do not need topology extraction, and anything
      # with this high an error is pretty messed up, so we won't bother trying
      # to match the topology.
      self._add_alternative_bond_topologies(conformer, smiles_id_dict)
    else:
      beam.metrics.Metrics.counter(_METRICS_NAMESPACE,
                                   'skipped_topology_matches').inc()

    yield conformer
예제 #13
0
  def process(self, molecule, bond_length_records, smiles_id_dict):
    """Per molecule updates.

    Works on a deep copy of the input: sets its fate, yields whatever
    self._compare_smiles produces for it, either adds alternative bond
    topologies or marks the first topology as SOURCE_STARTING, and finally
    yields the updated copy.

    Args:
      molecule: dataset_pb2.Molecule
      bond_length_records: tuples to go to
        bond_length_distribution.AllAtomPairLengthDistributions
      smiles_id_dict: dict from SMILES to bond topology id

    Yields:
      The outputs of self._compare_smiles, followed by the updated Molecule.

    Raises:
      ValueError: if the bond length records can not be turned into
        distributions. The original ValueError is chained as the cause.
    """
    # There is probably a better way to do this.
    # We get the side input with each call to process. We'll assume that it's
    # always the same input, so we set our cache value and never update it.
    # We only do this with bond_length_records because there is a reasonable
    # amount of processing in creating AllAtomPairLengthDistributions.
    # The smiles_id_dict is used directly.
    if not self._cached_bond_lengths:
      # Build into a local and publish it only after a successful load, so
      # that a failed load can not leave an unpopulated object in the cache
      # for later calls to pick up.
      bond_lengths = bond_length_distribution.AllAtomPairLengthDistributions()
      try:
        bond_lengths.add_from_sparse_dataframe(
            bond_length_distribution.sparse_dataframe_from_records(
                bond_length_records),
            bond_length_distribution.STANDARD_UNBONDED_RIGHT_TAIL_MASS,
            bond_length_distribution.STANDARD_SIG_DIGITS)
      except ValueError as err:
        raise ValueError(
            'Invalid sparse dataframe for molecule {0} org. ValueError: {1}'
            .format(str(molecule.molecule_id), err)) from err
      self._cached_bond_lengths = bond_lengths

    # Never mutate the input element; all updates go to a private copy.
    molecule = copy.deepcopy(molecule)

    molecule.properties.errors.fate = smu_utils_lib.determine_fate(molecule)

    yield from self._compare_smiles(molecule)

    if smu_utils_lib.molecule_eligible_for_topology_detection(molecule):
      self._add_alternative_bond_topologies(molecule, smiles_id_dict)
    else:
      # No detection is run, so only the first topology gets a source tag.
      molecule.bond_topologies[
          0].source = dataset_pb2.BondTopology.SOURCE_STARTING
      beam.metrics.Metrics.counter(_METRICS_NAMESPACE,
                                   'skipped_topology_matches').inc()

    yield molecule
예제 #14
0
 def get_smu_dists(self):
   """Returns bond length distributions for O-C and C-N bonds used in tests."""
   atoms = dataset_pb2.BondTopology
   bonds = dataset_pb2.BondTopology.BondType
   dists = bond_length_distribution.AllAtomPairLengthDistributions()

   # This is set up to make the O=C length of 1.25 a much better fit than
   # the [O-]-C bond
   dists.add(atoms.ATOM_O, atoms.ATOM_C, bonds.BOND_SINGLE,
             triangular_distribution(1.2, 1.6, 1.8))
   dists.add(atoms.ATOM_O, atoms.ATOM_C, bonds.BOND_DOUBLE,
             triangular_distribution(1.2, 1.25, 1.3))

   # C-N double and triple bonds use overlapping fixed windows.
   dists.add(atoms.ATOM_C, atoms.ATOM_N, bonds.BOND_DOUBLE,
             bond_length_distribution.FixedWindow(1.1, 1.3, None))
   dists.add(atoms.ATOM_C, atoms.ATOM_N, bonds.BOND_TRIPLE,
             bond_length_distribution.FixedWindow(1.2, 1.4, None))
   return dists
    def test_add_from_sparse_dataframe(self):
        """Loads a small sparse dataframe and spot checks the resulting pdfs."""
        column_names = [
            'atom_char_0', 'atom_char_1', 'bond_type', 'length_str', 'count'
        ]
        records = [
            ('c', 'c', 1, '1.0', 10),
            ('c', 'c', 1, '1.2', 30),
            ('n', 'o', 2, '1.0', 50),
            ('n', 'o', 2, '1.5', 50),
            ('n', 'n', 0, '1.5', 100),
            ('n', 'n', 0, '1.8', 100),
        ]
        df = pd.DataFrame.from_records(records, columns=column_names)
        all_dists = bond_length_distribution.AllAtomPairLengthDistributions()
        all_dists.add_from_sparse_dataframe(df,
                                            sig_digits=1,
                                            unbonded_right_tail_mass=0.8)
        pdf = all_dists.pdf_length_given_type

        atom_types = dataset_pb2.BondTopology.AtomType
        bond_types = dataset_pb2.BondTopology.BondType
        carbon = atom_types.ATOM_C
        nitrogen = atom_types.ATOM_N
        oxygen = atom_types.ATOM_O
        unbonded = bond_types.BOND_UNDEFINED
        single = bond_types.BOND_SINGLE
        double = bond_types.BOND_DOUBLE

        self.assertAlmostEqual(pdf(carbon, carbon, single, 1.05), 2.5)
        self.assertAlmostEqual(pdf(carbon, carbon, single, 999), 0.0)
        self.assertAlmostEqual(pdf(nitrogen, oxygen, double, 1.55), 5.0)
        self.assertAlmostEqual(pdf(nitrogen, nitrogen, unbonded, 1.85), 1.0)
        # This makes sure the right tail mass was included
        for length in (2.0, 3.0):
            self.assertGreater(pdf(nitrogen, nitrogen, unbonded, length), 0.0)
예제 #16
0
    def test_add_from_sparse_dataframe(self):
        """Loads a small sparse dataframe and spot checks the resulting pdfs."""
        column_names = [
            'atom_char_0', 'atom_char_1', 'bond_type', 'length_str', 'count'
        ]
        records = [
            ('c', 'c', 1, '1.0', 10),
            ('c', 'c', 1, '1.1', 30),
            ('n', 'o', 2, '1.4', 50),
            ('n', 'o', 2, '1.5', 50),
            ('n', 'n', 0, '1.7', 100),
            ('n', 'n', 0, '1.8', 100),
        ]
        df = pd.DataFrame.from_records(records, columns=column_names)
        all_dists = bond_length_distribution.AllAtomPairLengthDistributions()
        all_dists.add_from_sparse_dataframe(df,
                                            sig_digits=1,
                                            unbonded_right_tail_mass=0.8)
        pdf = all_dists.pdf_length_given_type

        self.assertAlmostEqual(pdf(ATOM_C, ATOM_C, BOND_SINGLE, 1.05), 2.5)
        self.assertAlmostEqual(pdf(ATOM_C, ATOM_C, BOND_SINGLE, 999), 0.0)
        self.assertAlmostEqual(pdf(ATOM_N, ATOM_O, BOND_DOUBLE, 1.55), 5.0)
        self.assertAlmostEqual(pdf(ATOM_N, ATOM_N, BOND_UNDEFINED, 1.85), 1.0)
        # This makes sure the right tail mass was included
        for length in (2.0, 3.0):
            self.assertGreater(pdf(ATOM_N, ATOM_N, BOND_UNDEFINED, length),
                               0.0)
def main(argv):
  """Prints the min and max length of each bond found in a Gaussians file.

  One entry is printed per atom pair and bond type that is present in the
  loaded distributions, formatted like a Python dict item keyed by
  dataset_pb2 enum names. Combinations that are not present are skipped.

  Args:
    argv: command line arguments; argv[1] is the path handed to
      add_from_gaussians_file.
  """
  topology = dataset_pb2.BondTopology

  # Shortcuts for below
  atom_str = {
      topology.ATOM_C: "ATOM_C",
      topology.ATOM_N: "ATOM_N",
      topology.ATOM_O: "ATOM_O",
      topology.ATOM_F: "ATOM_F"
  }
  bond_str = {
      topology.BOND_SINGLE: "BOND_SINGLE",
      topology.BOND_DOUBLE: "BOND_DOUBLE",
      topology.BOND_TRIPLE: "BOND_TRIPLE"
  }

  allen_dists = bond_length_distribution.AllAtomPairLengthDistributions()
  allen_dists.add_from_gaussians_file(argv[1], 3)

  atoms = [topology.ATOM_C, topology.ATOM_N, topology.ATOM_O, topology.ATOM_F]
  bond_types = [
      topology.BOND_SINGLE, topology.BOND_DOUBLE, topology.BOND_TRIPLE
  ]
  # Unordered atom pairs, including an atom paired with itself.
  atom_pairs = itertools.combinations_with_replacement(atoms, 2)

  for (atom_a, atom_b), bond in itertools.product(atom_pairs, bond_types):
    try:
      mn = allen_dists[(atom_a, atom_b)][bond].min()
      mx = allen_dists[(atom_a, atom_b)][bond].max()
      print(
          f"  (dataset_pb2.BondTopology.{atom_str[atom_a]},\n"
          f"   dataset_pb2.BondTopology.{atom_str[atom_b]},\n"
          f"   dataset_pb2.BondTopology.{bond_str[bond]}): ({mn:0.3f}, {mx:.03f}),"
      )
    except KeyError:
      # Missing combinations are skipped on purpose.
      continue
    def test_add_from_files(self):
        data = """1.0,1
1.1,2
1.2,3
1.3,2
"""
        data_increasing = """1.0,1
1.1,2
1.2,3
1.3,4
1.4,5
"""

        tmpdir = self.create_tempdir()
        stem = os.path.join(tmpdir, 'BONDS')
        self.create_tempfile(f'{stem}.6.0.6', content=data_increasing)
        self.create_tempfile(f'{stem}.6.1.6', content=data)

        self.create_tempfile(f'{stem}.6.0.7', content=data_increasing)
        self.create_tempfile(f'{stem}.6.1.7', content=data)
        self.create_tempfile(f'{stem}.6.2.7', content=data)
        self.create_tempfile(f'{stem}.6.3.7', content=data)

        all_dists = bond_length_distribution.AllAtomPairLengthDistributions()
        all_dists.add_from_files(stem, unbonded_right_tail_mass=0.8)

        carbon = dataset_pb2.BondTopology.AtomType.ATOM_C
        nitrogen = dataset_pb2.BondTopology.AtomType.ATOM_N
        unbonded = dataset_pb2.BondTopology.BondType.BOND_UNDEFINED
        single = dataset_pb2.BondTopology.BondType.BOND_SINGLE
        double = dataset_pb2.BondTopology.BondType.BOND_DOUBLE
        triple = dataset_pb2.BondTopology.BondType.BOND_TRIPLE

        self.assertAlmostEqual(
            all_dists.pdf_length_given_type(carbon, carbon, unbonded, 0.99),
            0.0)
        self.assertAlmostEqual(
            all_dists.pdf_length_given_type(carbon, nitrogen, unbonded, 0.99),
            0.0)

        # The 3/15 is the counts in the data_increasing file.
        # * 10 is for the pdf because the bucket is 0.1 wide
        # * 0.2 is because of the right tail mass.
        self.assertAlmostEqual(
            all_dists.pdf_length_given_type(carbon, carbon, unbonded, 1.25),
            3.0 / 15.0 * 10 * 0.2)
        self.assertAlmostEqual(
            all_dists.pdf_length_given_type(carbon, nitrogen, unbonded, 1.25),
            3.0 / 15.0 * 10 * 0.2)

        # Test the right tail mass for the unbonded
        self.assertAlmostEqual(
            all_dists.pdf_length_given_type(carbon, carbon, unbonded, 1.5),
            0.66666667)
        self.assertAlmostEqual(
            all_dists.pdf_length_given_type(carbon, nitrogen, unbonded, 1.5),
            0.66666667)

        # Test the bonded inside the pdf.
        # 3/8 are the counts in the data file
        # * 10 is for the pdf because the bucket is 0.1 wide
        self.assertAlmostEqual(
            all_dists.pdf_length_given_type(carbon, carbon, single, 1.25),
            3.0 / 8.0 * 10)
        self.assertAlmostEqual(
            all_dists.pdf_length_given_type(carbon, nitrogen, single, 1.25),
            3.0 / 8.0 * 10)
        self.assertAlmostEqual(
            all_dists.pdf_length_given_type(carbon, nitrogen, double, 1.25),
            3.0 / 8.0 * 10)
        self.assertAlmostEqual(
            all_dists.pdf_length_given_type(carbon, nitrogen, triple, 1.25),
            3.0 / 8.0 * 10)

        # Check for no right tail mass for the bonded
        self.assertAlmostEqual(
            all_dists.pdf_length_given_type(carbon, carbon, single, 1.5), 0.0)
        self.assertAlmostEqual(
            all_dists.pdf_length_given_type(carbon, nitrogen, single, 1.5),
            0.0)
        self.assertAlmostEqual(
            all_dists.pdf_length_given_type(carbon, nitrogen, double, 1.5),
            0.0)
        self.assertAlmostEqual(
            all_dists.pdf_length_given_type(carbon, nitrogen, triple, 1.5),
            0.0)
예제 #19
0
  def test_scores(self):
    carbon = dataset_pb2.BondTopology.ATOM_C
    single_bond = dataset_pb2.BondTopology.BondType.BOND_SINGLE
    double_bond = dataset_pb2.BondTopology.BondType.BOND_DOUBLE

    # For testing, turn off the need for complete matching.
    topology_molecule.default_must_match_all_bonds = False

    all_distributions = bond_length_distribution.AllAtomPairLengthDistributions(
    )
    bldc1c = triangular_distribution(1.0, 1.4, 2.0)
    all_distributions.add(carbon, carbon, single_bond, bldc1c)
    bldc2c = triangular_distribution(1.0, 1.5, 2.0)
    all_distributions.add(carbon, carbon, double_bond, bldc2c)

    molecule = dataset_pb2.Molecule()

    molecule.bond_topologies.append(
        text_format.Parse(
            """
atoms: ATOM_C
atoms: ATOM_C
bonds: {
  atom_a: 0
  atom_b: 1
  bond_type: BOND_SINGLE
}
""", dataset_pb2.BondTopology()))

    molecule.optimized_geometry.MergeFrom(
        text_format.Parse(
            """
atom_positions {
  x: 0.0
  y: 0.0
  z: 0.0
},
atom_positions {
  x: 0.0
  y: 0.0
  z: 0.0
}
""", dataset_pb2.Geometry()))
    molecule.optimized_geometry.atom_positions[1].x = (
        1.4 / smu_utils_lib.BOHR_TO_ANGSTROMS)

    matching_parameters = topology_molecule.MatchingParameters()
    matching_parameters.must_match_all_bonds = False
    molecule.properties.errors.fate = dataset_pb2.Properties.FATE_SUCCESS
    molecule.molecule_id = 1001
    result = topology_from_geom.bond_topologies_from_geom(
        molecule, all_distributions, matching_parameters)
    self.assertIsNotNone(result)
    self.assertLen(result.bond_topology, 2)
    self.assertLen(result.bond_topology[0].bonds, 1)
    self.assertLen(result.bond_topology[1].bonds, 1)
    self.assertEqual(result.bond_topology[0].bonds[0].bond_type, single_bond)
    self.assertEqual(result.bond_topology[1].bonds[0].bond_type, double_bond)
    self.assertGreater(result.bond_topology[0].topology_score,
                       result.bond_topology[1].topology_score)
    self.assertAlmostEqual(
        np.sum(np.exp([bt.topology_score for bt in result.bond_topology])), 1.0)
    self.assertAlmostEqual(result.bond_topology[0].geometry_score,
                           np.log(bldc1c.pdf(1.4)))
    self.assertAlmostEqual(result.bond_topology[1].geometry_score,
                           np.log(bldc2c.pdf(1.4)))
예제 #20
0
# Report what the plain SMILES lookup (done earlier in the script) found.
print('find_by_smiles on', smiles, 'finds these molecule ids')
print([mol.molecule_id for mol in original_molecules])

print()
print('But you can modify the allowed distances for each type of bond')
print(
    'and find all molecules which match a given topology with these modifications'
)

print('While this does not have the read the whole database, it is a much less '
      'efficient operation than querying by smiles, so only use it if you modify '
      'the allowed distances')

print()
print('First you have to load the default bond lengths')
bond_lengths = bond_length_distribution.AllAtomPairLengthDistributions()
bond_lengths.add_from_sparse_dataframe_file(
    '20220128_bond_lengths.csv',
    bond_length_distribution.STANDARD_UNBONDED_RIGHT_TAIL_MASS,
    bond_length_distribution.STANDARD_SIG_DIGITS)

print()
print('You then provide the desired topology as a SMILES string')
print(
    'The topology query without modifying bond lengths, finds the same result')
# Sort by molecule id so the printed list has a stable order.
unmodified_molecules = sorted(db.find_by_topology(smiles, bond_lengths),
                              key=lambda mol: mol.molecule_id)
print('Unmodified find_by_topology finds these molecule ids')
print([mol.molecule_id for mol in unmodified_molecules])

print()
예제 #21
0
    def test_multi_topology_detection(self):
        """Tests that we can find multiple versions of the same topology."""
        nitrogen = dataset_pb2.BondTopology.ATOM_N
        single = dataset_pb2.BondTopology.BondType.BOND_SINGLE
        double = dataset_pb2.BondTopology.BondType.BOND_DOUBLE

        all_dist = bond_length_distribution.AllAtomPairLengthDistributions()
        # Both bond types get the same fixed window.
        for bond_type in [single, double]:
            window = bond_length_distribution.FixedWindowLengthDistribution(
                1.0, 2.0, None)
            all_dist.add(nitrogen, nitrogen, bond_type, window)

        # This conformer is a flat aromatic square of nitrogens. The single and
        # double bonds can be rotated such that it's the same topology but
        # individual bonds have switched single/double.
        conformer = dataset_pb2.Conformer()

        conformer.bond_topologies.add(bond_topology_id=123, smiles="N1=NN=N1")
        starting_topology = conformer.bond_topologies[0]
        starting_topology.atoms.extend([nitrogen] * 4)

        ring = ((0, 1), (1, 2), (2, 3), (3, 0))
        starting_types = (single, double, single, double)
        starting_topology.bonds.extend([
            dataset_pb2.BondTopology.Bond(atom_a=atom_a,
                                          atom_b=atom_b,
                                          bond_type=bond_type)
            for (atom_a, atom_b), bond_type in zip(ring, starting_types)
        ])

        dist15a = 1.5 / smu_utils_lib.BOHR_TO_ANGSTROMS
        corners = ((0, 0), (0, dist15a), (dist15a, dist15a), (dist15a, 0))
        conformer.optimized_geometry.atom_positions.extend(
            [dataset_pb2.Geometry.AtomPos(x=x, y=y, z=0) for x, y in corners])

        matching_parameters = smu_molecule.MatchingParameters()
        result = topology_from_geom.bond_topologies_from_geom(
            bond_lengths=all_dist,
            conformer_id=123,
            fate=dataset_pb2.Conformer.FATE_SUCCESS,
            bond_topology=conformer.bond_topologies[0],
            geometry=conformer.optimized_geometry,
            matching_parameters=matching_parameters)

        self.assertLen(result.bond_topology, 2)

        # The returned order is arbitrary so we figure out which one is marked
        # as the starting topology.
        starting_idx = min(
            idx for idx, candidate in enumerate(result.bond_topology)
            if candidate.is_starting_topology)
        other_idx = (starting_idx + 1) % 2

        starting = result.bond_topology[starting_idx]
        self.assertTrue(starting.is_starting_topology)
        for (atom_a, atom_b), expected in zip(ring, starting_types):
            self.assertEqual(
                smu_utils_lib.get_bond_type(starting, atom_a, atom_b),
                expected)

        # The other result has every single/double swapped.
        other = result.bond_topology[other_idx]
        self.assertFalse(other.is_starting_topology)
        swapped_types = (double, single, double, single)
        for (atom_a, atom_b), expected in zip(ring, swapped_types):
            self.assertEqual(
                smu_utils_lib.get_bond_type(other, atom_a, atom_b), expected)
예제 #22
0
 def test_add_itc_h_lengths(self):
     """Checks that add_itc_h_lengths registers an H-C single bond."""
     distributions = bond_length_distribution.AllAtomPairLengthDistributions()
     bond_length_distribution.add_itc_h_lengths(distributions)
     h_c_single = distributions[ATOM_H, ATOM_C][BOND_SINGLE]
     # A 1.0 length must have non-zero density for the H-C single bond.
     self.assertGreater(h_c_single.pdf(1.0), 0)