Exemplo n.º 1
0
 def __iter__(self):
     """Iterates over every dataset_pb2.Conformer in the DB, ordered by rowid.

     The SELECT is executed as soon as this method is called; each row is
     only decompressed and parsed when the returned generator reaches it.
     """
     cursor = self._conn.cursor()
     cursor.execute(
         f'SELECT conformer FROM {_CONFORMER_TABLE_NAME} ORDER BY rowid')
     return (
         dataset_pb2.Conformer().FromString(snappy.uncompress(row[0]))
         for row in cursor)
Exemplo n.º 2
0
  def test_merge_duplicate_information_diff_topology(self):
    """Merges a duplicate whose id is 456000 into the kept conformer 123000.

    Only the kept conformer's own initial geometry is expected in the result.
    """
    kept_id = 123000
    dup_id = 456000

    kept = dataset_pb2.Conformer(conformer_id=kept_id)
    kept.initial_geometries.add().atom_positions.add(x=1, y=2, z=3)

    duplicate = dataset_pb2.Conformer(
        conformer_id=dup_id, duplicated_by=kept_id)
    duplicate.initial_geometries.add().atom_positions.add(x=4, y=5, z=6)

    # The duplicate is deliberately listed before the kept conformer.
    merged = pipeline.merge_duplicate_information(kept_id, [duplicate, kept])

    self.assertEqual(merged.conformer_id, kept_id)
    self.assertEqual(merged.duplicated_by, 0)
    self.assertEqual(merged.duplicate_of, [dup_id])
    # TODO(pfr, ianwatson): implement correct copying of initial geometry
    self.assertLen(merged.initial_geometries, 1)
    self.assertEqual(merged.initial_geometries[0].atom_positions[0].x, 1)
Exemplo n.º 3
0
  def test_merge_duplicate_information_same_topology(self):
    """Merges a duplicate whose id is 123456 into the kept conformer 123000.

    Both initial geometries are expected in the result, with the kept
    conformer's geometry first.
    """
    kept_id = 123000
    dup_id = 123456

    kept = dataset_pb2.Conformer(conformer_id=kept_id)
    kept.initial_geometries.add().atom_positions.add(x=1, y=2, z=3)

    duplicate = dataset_pb2.Conformer(
        conformer_id=dup_id, duplicated_by=kept_id)
    duplicate.initial_geometries.add().atom_positions.add(x=4, y=5, z=6)

    # The duplicate is deliberately listed before the kept conformer.
    merged = pipeline.merge_duplicate_information(kept_id, [duplicate, kept])

    self.assertEqual(merged.conformer_id, kept_id)
    self.assertEqual(merged.duplicated_by, 0)
    self.assertEqual(merged.duplicate_of, [dup_id])
    self.assertLen(merged.initial_geometries, 2)
    first_geom, second_geom = merged.initial_geometries
    self.assertEqual(first_geom.atom_positions[0].x, 1)
    self.assertEqual(second_geom.atom_positions[0].x, 4)
Exemplo n.º 4
0
    def find_by_conformer_id(self, cid):
        """Looks up the conformer stored under a conformer id.

        Args:
          cid: conformer id to look up.

        Returns:
          dataset_pb2.Conformer

        Raises:
          KeyError: if cid is not found
        """
        query = f'SELECT conformer FROM {_CONFORMER_TABLE_NAME} WHERE cid = ?'
        cursor = self._conn.cursor()
        cursor.execute(query, (cid, ))
        rows = cursor.fetchall()

        if len(rows) == 0:
            raise KeyError(f'Conformer id {cid} not found')

        # cid is a unique index, so exactly one row is expected, and that row
        # is a one-element tuple holding the compressed proto.
        assert len(rows) == 1
        assert len(rows[0]) == 1
        compressed = rows[0][0]
        return dataset_pb2.Conformer().FromString(
            snappy.uncompress(compressed))
Exemplo n.º 5
0
def parse_equivalent_file(filename):
    """Parses the .dat file of equivalent structures.

    Each line holds a pair of long identifiers: the first is the conformer
    that was kept, the second the one that was discarded in its favor. One
    Conformer is yielded per line; its conformer_id is the discarded id and
    its duplicated_by is the kept id.
    See merge_duplicate_information for how information is transferred to the
    kept conformer.

    Args:
      filename: string

    Yields:
      dataset_pb2.Conformer
    """
    with gfile.GFile(filename) as infile:
        for line in infile:
            kept_str, discard_str = line.split()
            _, _, kept_btid, kept_local = (
                smu_parser_lib.parse_long_identifier(kept_str))
            _, _, discard_btid, discard_local = (
                smu_parser_lib.parse_long_identifier(discard_str))

            # Our conformer ids embed the bond topology id: btid * 1000 + cid.
            yield dataset_pb2.Conformer(
                conformer_id=discard_btid * 1000 + discard_local,
                duplicated_by=kept_btid * 1000 + kept_local)
Exemplo n.º 6
0
  def test_extract_bond_lengths(self):
    """Checks the tuples produced by extract_bond_lengths for a toy molecule.

    The expected output has the two bonded heavy-atom pairs plus the close
    unbonded c-n pair (as BOND_UNDEFINED); no pair involving the hydrogen
    appears in it.
    """
    topo = dataset_pb2.BondTopology

    # This conformer does not obey valence rules, but it's fine for this test.
    conf = dataset_pb2.Conformer(conformer_id=123000)
    conf.properties.errors.status = 4
    bt = conf.bond_topologies.add()
    bt.atoms.extend(
        [topo.ATOM_ONEG, topo.ATOM_NPOS, topo.ATOM_C, topo.ATOM_H])
    # Every bond starts at atom 0.
    for neighbor, bond_type in ((1, topo.BOND_SINGLE),
                                (2, topo.BOND_DOUBLE),
                                (3, topo.BOND_SINGLE)):
      bt.bonds.add(atom_a=0, atom_b=neighbor, bond_type=bond_type)
    for x, y, z in ((0, 0, 0), (1, 0, 0), (0, 2, 0), (111, 222, 333)):
      conf.optimized_geometry.atom_positions.add(x=x, y=y, z=z)

    # Note that these are *not* rounded, but truncated to this many digits.
    expected = [
        # 1 bohr -> 0.529177249 angstroms
        ('n', 'o', topo.BOND_SINGLE, '0.52'),
        # 2 bohr -> 2 * 0.529177249 angstroms
        ('c', 'o', topo.BOND_DOUBLE, '1.05'),
        # sqrt(1**2 + 2**2) bohr -> 2.23606 * 0.529177249 angstroms
        ('c', 'n', topo.BOND_UNDEFINED, '1.18'),
    ]

    got = list(
        pipeline.extract_bond_lengths(
            conf, dist_sig_digits=2, unbonded_max=2.0))
    self.assertEqual(got, expected)
Exemplo n.º 7
0
def get_bond_length_distribution_inner(input_fname, output_fname):
    """Generates bond length distributions with a Beam pipeline.

    Reads Conformer protos, runs bond_lengths.GetBondLengthDistribution over
    them, sums the emitted values per key, formats each result with
    BondDistToString and writes the lines as text.

    Args:
      input_fname: An existing TFRecord file containing Conformer protos.
      output_fname: An output file that will be created that contains all bond
        length distributions - all bond types, all atom types. Requires
        post-processing to generate bond length distribution files.
    """
    # This must be an f-string; a plain string literal would print the
    # placeholder names instead of the actual file names.
    print(f"Reading from {input_fname} output to {output_fname}")
    options = PipelineOptions(direct_num_workers=6,
                              direct_running_mode="multi_processing")
    # options = PipelineOptions()
    with beam.Pipeline(options=options) as p:
        protos = (
            p
            | beam.io.tfrecordio.ReadFromTFRecord(
                input_fname,
                coder=beam.coders.ProtoCoder(dataset_pb2.Conformer))
            | beam.ParDo(bond_lengths.GetBondLengthDistribution())
            | beam.CombinePerKey(sum)
            # Disabled stages, kept for reference:
            #     | beam.ParDo(GroupBondTypes())
            #     | beam.GroupByKey()
            | beam.ParDo(BondDistToString())
            | beam.io.WriteToText(output_fname))
        print(protos)
Exemplo n.º 8
0
def ReadConFormer(bond_lengths, input_string, output):
    """Runs a Beam pipeline from Conformer TFRecords to a text summary.

    Stages, in order: read Conformer protos, apply
    topology_from_geom.TopologyFromGeom, apply SummaryData, write text.

    Args:
      bond_lengths: forwarded to topology_from_geom.TopologyFromGeom.
      input_string: forwarded to ReadFromTFRecord as the input to read.
      output: forwarded to WriteToText as the output destination.

    Returns:
      The value produced by applying the final WriteToText stage.
    """
    pipeline_options = PipelineOptions(direct_num_workers=6,
                                       direct_running_mode="multi_processing")
    # pipeline_options = PipelineOptions()
    with beam.Pipeline(options=pipeline_options) as root:
        conformers = root | beam.io.tfrecordio.ReadFromTFRecord(
            input_string,
            coder=beam.coders.ProtoCoder(dataset_pb2.Conformer().__class__))
        topologies = conformers | beam.ParDo(
            topology_from_geom.TopologyFromGeom(bond_lengths))
        summaries = topologies | beam.ParDo(SummaryData())
        written = summaries | beam.io.textio.WriteToText(output)

        return written
Exemplo n.º 9
0
    def test_extract_bond_lengths_max_unbonded(self):
        """Checks extract_bond_lengths output when one atom is very far away.

        The expected output contains both bonded pairs (including the long
        c-o bond) and does not contain the unbonded n-o pair.
        """
        topo = dataset_pb2.BondTopology

        # This conformer does not obey valence rules, but it's fine for this
        # test.
        conf = dataset_pb2.Conformer(conformer_id=123000)
        conf.properties.errors.status = 4
        bt = conf.bond_topologies.add()
        bt.atoms.extend([topo.ATOM_C, topo.ATOM_N, topo.ATOM_O])
        # Both bonds start at atom 0 and are single bonds.
        for neighbor in (1, 2):
            bt.bonds.add(atom_a=0,
                         atom_b=neighbor,
                         bond_type=topo.BOND_SINGLE)
        for x, y in ((0, 0), (1, 0), (100, 2)):
            conf.optimized_geometry.atom_positions.add(x=x, y=y, z=0)

        # Note that these are *not* rounded, but truncated to this many digits.
        expected = [
            # 1 bohr -> 0.529177249 angstroms
            ('c', 'n', topo.BOND_SINGLE, '0.52'),
            # It seems like this should be 52.91 but it looks like some
            # numerical noise in np.linalg.norm.
            ('c', 'o', topo.BOND_SINGLE, '52.92'),
        ]

        got = list(
            pipeline.extract_bond_lengths(conf,
                                          dist_sig_digits=2,
                                          unbonded_max=2.0))
        self.assertEqual(got, expected)
Exemplo n.º 10
0
    def parse_stage2_to_proto(self):
        """Reads _raw_contents and parses the various sections.

        This parses the "stage2" files which are the complete ones from the end
        of the pipeline.

        This only reads one conformer from _raw_contents. To read multiple, you
        have to update _raw_contents between calls.

        Parse failures of the listed exception types are not raised: the
        exception is tagged with a conformer_id attribute, logged at INFO level
        with its traceback, and returned in place of the proto. Callers must
        therefore check the type of the returned value.

        Returns:
          dataset_pb2.Conformer with a single conformer, or an Exception
        """
        # NOTE: this call sits outside the try block, so anything it raises
        # propagates to the caller instead of being returned.
        self.parse(ParseModes.INITIALIZE)
        try:
            # Start from a fresh proto; the parse_* calls below run in a fixed
            # order (presumably the order of sections in the file - confirm
            # against the file format before reordering).
            self._conformer = dataset_pb2.Conformer()
            self.parse(ParseModes.SKIP, num_lines=1)  # Separator.
            num_atoms = self.parse_stage2_header()
            self.parse_database()
            self.parse_error_codes()
            self.parse_bond_topology()
            self.parse_identifier()
            self.parse_cluster_info(num_lines=8)
            self.parse_stage2_timings()  # Timings per step.
            self.parse_bonds()
            self.parse_gradient_norms()
            self.parse_coordinates('Initial Coords', num_atoms)
            self.parse_coordinates('Optimized Coords', num_atoms)
            self.parse_rotational_constants()
            self.parse_symmetry_used()
            # 'Frequencies and intensities'
            self.parse_frequencies_and_intensities(num_atoms, header=True)
            self.parse_gaussian_sanity_check()
            self.parse_normal_modes(num_atoms)
            self.parse_property_list(
            )  # Key-value pairs: Energies, frequencies,...
            self.parse_diagnostics()
            self.parse_atomic_block()
            self.parse_homo_lumo()
            self.parse_excitation_energies_and_oscillations()
            self.parse_nmr_isotropic_shieldings()
            self.parse_partial_charges()
            self.parse_polarizability()
            self.parse_multipole_moments()
            # Somewhere along the lines in the regeneration process (maybe just for
            # debugging), we add an extra blank line. We'll just skip it here and
            # ignore blank lines at the end.
            self.parse(ParseModes.SKIP_BLANK_LINES)

        except (SmuKnownError, ValueError, IndexError, KeyError,
                AssertionError) as exc:
            # Attach whatever conformer id the proto holds at the point of
            # failure so the caller can tell which record was bad.
            exc.conformer_id = self._conformer.conformer_id
            logging.info(
                'Got exception during conformer %d: %s\n'
                'traceback: %s', exc.conformer_id, str(exc),
                traceback.format_exc())
            # Returned, not raised.
            return exc

        return self._conformer
Exemplo n.º 11
0
 def setUp(self):
     """Creates self.conformer with three property fields populated.

     One field each is marked in the comments below as STANDARD, COMPLETE
     and INTERNAL_ONLY.
     """
     super().setUp()
     conformer = dataset_pb2.Conformer()
     # STANDARD
     conformer.properties.initial_geometry_energy.value = 1.23
     # COMPLETE
     conformer.properties.zpe_unscaled.value = 1.23
     # INTERNAL_ONLY
     conformer.properties.compute_cluster_info = 'not set'
     self.conformer = conformer
Exemplo n.º 12
0
 def setUp(self):
   """Creates self.conformer with three property fields populated.

   One field each is marked in the comments below as STANDARD, COMPLETE
   and INTERNAL_ONLY.
   """
   super().setUp()
   conformer = dataset_pb2.Conformer()
   # STANDARD
   conformer.properties.single_point_energy_pbe0d3_6_311gd.value = 1.23
   # COMPLETE
   conformer.properties.homo_pbe0_aug_pc_1.value = 1.23
   # INTERNAL_ONLY
   conformer.properties.nuclear_repulsion_energy.value = 1.23
   self.conformer = conformer
Exemplo n.º 13
0
 def _create_dummy_conformer(self):
   """Returns a Conformer (id 123000) with two single-bonded carbon atoms.

   The optimized geometry places the atoms at (0, 0, 0) and (1, 0, 0).
   """
   carbon = dataset_pb2.BondTopology.ATOM_C
   dummy = dataset_pb2.Conformer(conformer_id=123000)

   topology = dummy.bond_topologies.add()
   topology.atoms.extend([carbon, carbon])
   topology.bonds.add(
       atom_a=0, atom_b=1, bond_type=dataset_pb2.BondTopology.BOND_SINGLE)

   positions = dummy.optimized_geometry.atom_positions
   positions.add(x=0, y=0, z=0)
   positions.add(x=1, y=0, z=0)
   return dummy
Exemplo n.º 14
0
  def setUp(self):
    """Loads the stage1/stage2 conformers and builds a matching duplicate."""
    super().setUp()
    # We are relying on the fact that the first conformer in both x07_sample.dat
    # and x07_stage1.dat are the same.
    self.stage1_conformer = get_stage1_conformer()
    self.stage2_conformer = get_stage2_conformer()

    # A real duplicate conformer wouldn't have both duplicated_by and
    # duplicate_of filled in, but it's fine for the test to make sure
    # everything is copied.
    self.duplicate_conformer = dataset_pb2.Conformer(
        conformer_id=self.stage1_conformer.conformer_id,
        duplicated_by=123,
        duplicate_of=[111, 222])
Exemplo n.º 15
0
    def find_by_bond_topology_id(self, btid):
        """Finds all the conformers associated with a bond topology id.

        The query joins the conformer table to the bond topology id table on
        cid. It runs immediately; rows are decoded lazily as the returned
        generator is consumed.

        Args:
          btid: bond topology id to look up.

        Returns:
          iterable of dataset_pb2.Conformer
        """
        query = ' '.join([
            'SELECT cid, conformer',
            f'FROM {_CONFORMER_TABLE_NAME}',
            f'INNER JOIN {_BTID_TABLE_NAME} USING(cid)',
            f'WHERE {_BTID_TABLE_NAME}.btid = ?',
        ])
        cursor = self._conn.cursor()
        cursor.execute(query, (btid, ))
        # Each row is (cid, compressed conformer); only the blob is needed.
        return (
            dataset_pb2.Conformer().FromString(snappy.uncompress(row[1]))
            for row in cursor)
Exemplo n.º 16
0
    def find_by_expanded_stoichiometry(self, exp_stoich):
        """Finds all of the conformers with an expanded stoichiometry.

        The expanded stoichiometry includes hydrogens as part of the atom type.
        See smu_utils_lib.expanded_stoichiometry_from_topology for a
        description.

        The query runs immediately; rows are decoded lazily as the returned
        generator is consumed.

        Args:
          exp_stoich: string

        Returns:
          iterable of dataset_pb2.Conformer
        """
        query = (
            f'SELECT conformer FROM {_CONFORMER_TABLE_NAME} '
            'WHERE exp_stoich = ?')
        cursor = self._conn.cursor()
        cursor.execute(query, (exp_stoich, ))
        return (
            dataset_pb2.Conformer().FromString(snappy.uncompress(row[0]))
            for row in cursor)
Exemplo n.º 17
0
    def parse_stage1_to_proto(self):
        """Reads _raw_contents and parses the various sections.

        This parses the "stage1" files which are just the geometry optimization
        before dedupping.

        This only reads one conformer from _raw_contents. To read multiple, you
        have to update _raw_contents between calls.

        Parse failures of the listed exception types are not raised: the
        exception is tagged with a conformer_id attribute, logged at INFO level
        with its traceback, and returned in place of the proto. Callers must
        therefore check the type of the returned value.

        Returns:
          dataset_pb2.Conformer or an Exception
        """
        # NOTE: this call sits outside the try block, so anything it raises
        # propagates to the caller instead of being returned.
        self.parse(ParseModes.INITIALIZE)
        try:
            # Start from a fresh proto; the parse_* calls below run in a fixed
            # order (presumably the order of sections in the file - confirm
            # against the file format before reordering).
            self._conformer = dataset_pb2.Conformer()
            self.parse(ParseModes.SKIP, num_lines=1)  # Separator.
            num_atoms = self.parse_stage1_header()
            self.parse_bond_topology()
            self.parse_identifier()
            self.parse_cluster_info(num_lines=4)
            self.parse_stage1_timings()
            self.parse_gradient_norms()
            self.parse_coordinates('Initial Coords', num_atoms)
            self.parse_coordinates('Optimized Coords', num_atoms)
            self.parse_frequencies_and_intensities(num_atoms, header=False)

            # Somewhere along the lines in the regeneration process (maybe just for
            # debugging), we add an extra blank line. We'll just skip it here and
            # ignore blank lines at the end.
            self.parse(ParseModes.SKIP_BLANK_LINES)
        except (SmuKnownError, ValueError, IndexError, KeyError,
                AssertionError) as exc:
            # Attach whatever conformer id the proto holds at the point of
            # failure so the caller can tell which record was bad.
            exc.conformer_id = self._conformer.conformer_id
            logging.info(
                'Got exception during conformer %d: %s\n'
                'traceback: %s', exc.conformer_id, str(exc),
                traceback.format_exc())
            # Returned, not raised.
            return exc

        return self._conformer
Exemplo n.º 18
0
    def find_by_stoichiometry(self, stoich):
        """Finds all conformers with a given stoichiometry.

        The stoichiometry is like "C6H12".

        Internally, the stoichiometry is converted to a set of expanded
        stoichiometries and a single query finds all of those.
        Notably, this means only records with status <= 512 are returned.

        Args:
          stoich: stoichiometry string like "C6H12", case doesn't matter

        Returns:
          Iterable of type dataset_pb2.Conformer.
        """
        expanded = list(
            smu_utils_lib.expanded_stoichiometries_from_stoichiometry(stoich))
        # One '?' placeholder per expanded stoichiometry.
        placeholders = ','.join(['?'] * len(expanded))
        query = (f'SELECT conformer '
                 f'FROM {_CONFORMER_TABLE_NAME} '
                 f'WHERE exp_stoich IN ({placeholders})')
        cursor = self._conn.cursor()
        cursor.execute(query, expanded)
        return (
            dataset_pb2.Conformer().FromString(snappy.uncompress(row[0]))
            for row in cursor)
Exemplo n.º 19
0
    def test_multi_topology_detection(self):
        """Tests that we can find multiple versions of the same topology."""
        nitrogen = dataset_pb2.BondTopology.ATOM_N
        single = dataset_pb2.BondTopology.BondType.BOND_SINGLE
        double = dataset_pb2.BondTopology.BondType.BOND_DOUBLE
        # Atom index pairs around the four-membered ring, in bond order.
        ring = ((0, 1), (1, 2), (2, 3), (3, 0))

        all_dist = bond_length_distribution.AllAtomPairLengthDistributions()
        for bond_type in (single, double):
            all_dist.add(
                nitrogen, nitrogen, bond_type,
                bond_length_distribution.FixedWindowLengthDistribution(
                    1.0, 2.0, None))

        # This conformer is a flat aromatic square of nitrogens. The single and
        # double bonds can be rotated such that it's the same topology but
        # individual bonds have switched single/double.
        conformer = dataset_pb2.Conformer()
        topology = conformer.bond_topologies.add(bond_topology_id=123,
                                                 smiles="N1=NN=N1")
        topology.atoms.extend([nitrogen] * 4)
        topology.bonds.extend([
            dataset_pb2.BondTopology.Bond(atom_a=a, atom_b=b, bond_type=kind)
            for (a, b), kind in zip(ring, (single, double, single, double))
        ])

        # Square with 1.5 angstrom sides, expressed in the geometry's units.
        side = 1.5 / smu_utils_lib.BOHR_TO_ANGSTROMS
        conformer.optimized_geometry.atom_positions.extend([
            dataset_pb2.Geometry.AtomPos(x=x, y=y, z=0)
            for x, y in ((0, 0), (0, side), (side, side), (side, 0))
        ])

        matching_parameters = smu_molecule.MatchingParameters()
        result = topology_from_geom.bond_topologies_from_geom(
            bond_lengths=all_dist,
            conformer_id=123,
            fate=dataset_pb2.Conformer.FATE_SUCCESS,
            bond_topology=topology,
            geometry=conformer.optimized_geometry,
            matching_parameters=matching_parameters)

        self.assertLen(result.bond_topology, 2)

        # The returned order is arbitrary so we figure out which one is marked
        # as the starting topology.
        starting_idx = min(
            idx for idx, candidate in enumerate(result.bond_topology)
            if candidate.is_starting_topology)
        # With exactly two topologies, the other index is the complement.
        other_idx = 1 - starting_idx

        starting = result.bond_topology[starting_idx]
        self.assertTrue(starting.is_starting_topology)
        for (a, b), want in zip(ring, (single, double, single, double)):
            self.assertEqual(smu_utils_lib.get_bond_type(starting, a, b),
                             want)

        other = result.bond_topology[other_idx]
        self.assertFalse(other.is_starting_topology)
        for (a, b), want in zip(ring, (double, single, double, single)):
            self.assertEqual(smu_utils_lib.get_bond_type(other, a, b), want)
Exemplo n.º 20
0
 def make_fake_conformer(self, cid):
     """Builds a Conformer with id `cid` and attaches a bond topology to it.

     The bond topology id handed to add_bond_topology_to_conformer is
     cid // 1000.
     """
     fake = dataset_pb2.Conformer(conformer_id=cid)
     self.add_bond_topology_to_conformer(fake, cid // 1000)
     return fake