def get_mol_block(self, conformer):
    """Returns the MOL file block with atoms and bonds.

    The block is a blank line, the V2000 counts line, one atom line per
    (atom, position) pair and one bond line per bond, all taken from the
    first bond topology of `conformer`. Coordinates are passed through
    `smu_utils_lib.bohr_to_angstroms` before being written.

    The V2000 connection table is column-positional, so every line is
    assembled from explicit fixed-width pieces instead of relying on the
    spacing inside a single literal.

    Args:
      conformer: dataset_pb2.Conformer

    Returns:
      list of strings
    """
    topology = conformer.bond_topologies[0]
    unused_field = '  0'  # One 3-character field holding zero.

    # Counts line: aaabbb, eight unused 3-character fields, then the
    # "999 V2000" trailer.
    counts_line = ('{:3d}{:3d}'.format(len(topology.atoms),
                                       len(topology.bonds)) +
                   unused_field * 8 + '999 V2000\n')

    # Atom line: three 10.4 coordinates, a space, the symbol left-justified
    # in 3 characters, a 2-character mass-difference field, then eleven
    # unused 3-character fields.
    atom_format = ('{:10.4f}{:10.4f}{:10.4f} {:<3s}' + ' 0' +
                   unused_field * 11 + '\n')

    # Bond line: first atom, second atom, bond type, unused stereo field.
    bond_format = '{:3d}{:3d}{:3d}' + unused_field + '\n'

    contents = ['\n', counts_line]

    to_angstroms = smu_utils_lib.bohr_to_angstroms
    for atom_type, coords in zip(
            topology.atoms, conformer.optimized_geometry.atom_positions):
        contents.append(
            atom_format.format(
                to_angstroms(coords.x),
                to_angstroms(coords.y),
                to_angstroms(coords.z),
                smu_utils_lib.ATOM_TYPE_TO_RDKIT[atom_type][0]))

    for bond in topology.bonds:
        # Atom numbers in the bond block are 1-based; the topology indices
        # are shifted by one to match.
        contents.append(
            bond_format.format(bond.atom_a + 1, bond.atom_b + 1,
                               bond.bond_type))

    return contents
def geom_to_angstroms(geometry):
    """Convert all the coordinates in `geometry` to Angstroms.

    Each x, y and z of every entry in `geometry.atom_positions` is passed
    through `smu_utils_lib.bohr_to_angstroms`. The input is not modified.

    Args:
      geometry: starting Geometry

    Returns:
      New Geometry with adjusted coordinates.
    """
    to_angstroms = smu_utils_lib.bohr_to_angstroms
    converted = dataset_pb2.Geometry()
    for position in geometry.atom_positions:
        converted.atom_positions.append(
            dataset_pb2.Geometry.AtomPos(
                x=to_angstroms(position.x),
                y=to_angstroms(position.y),
                z=to_angstroms(position.z)))
    return converted
def extract_bond_lengths(conformer, dist_sig_digits, unbonded_max):
    """Yields quantized bond lengths.

    Every pair of non-hydrogen atoms in the first bond topology is
    considered. The distance between the two atoms in the optimized
    geometry is passed through `smu_utils_lib.bohr_to_angstroms` and
    rounded down to `dist_sig_digits` decimal places. Pairs without a bond
    are reported only when that rounded distance does not exceed
    `unbonded_max`.

    NOTE(review): another definition with this same name appears later in
    this source; if both live in one module, the later one replaces this
    one at import time -- confirm which is intended.

    Args:
      conformer: dataset_pb2.Conformer
      dist_sig_digits: number of digits after decimal point to keep
      unbonded_max: maximum distance to report for unbonded pairs

    Yields:
      (atom type 1, atom type 2, bond type, quantized dist)
      Atom types are single characters, sorted lexicographically.
      Bond type is dataset_pb2.BondTopology.BondType.
      Quantized dist is a string (to avoid vagaries of floating point
      compares).
    """
    bt = conformer.bond_topologies[0]
    geom = conformer.optimized_geometry
    format_str = '{:.%df}' % dist_sig_digits
    scale = 10**dist_sig_digits
    hydrogen = dataset_pb2.BondTopology.ATOM_H
    undefined = dataset_pb2.BondTopology.BOND_UNDEFINED

    # Index the bonds once instead of rescanning the bond list for every
    # atom pair. Keys are (smaller index, larger index); setdefault keeps
    # the first bond listed for a pair, as a linear scan with `break` would.
    bond_types = {}
    for bond in bt.bonds:
        key = (min(bond.atom_a, bond.atom_b), max(bond.atom_a, bond.atom_b))
        bond_types.setdefault(key, bond.bond_type)

    def position(atom_idx):
        # Coordinates of one atom as a double-precision vector.
        atom_pos = geom.atom_positions[atom_idx]
        return np.array([atom_pos.x, atom_pos.y, atom_pos.z],
                        dtype=np.double)

    # combinations() over an increasing range always yields
    # atom_idx0 < atom_idx1, matching the key order used in bond_types.
    for atom_idx0, atom_idx1 in itertools.combinations(
            range(len(bt.atoms)), r=2):
        if hydrogen in (bt.atoms[atom_idx0], bt.atoms[atom_idx1]):
            continue

        bond_type = bond_types.get((atom_idx0, atom_idx1), undefined)

        # The intention is the buckets are the left edge of an empirical
        # CDF, hence floor rather than round.
        dist = (np.floor(
            smu_utils_lib.bohr_to_angstroms(
                np.linalg.norm(position(atom_idx0) - position(atom_idx1))) *
            scale) / scale)
        if bond_type == undefined and dist > unbonded_max:
            continue

        atom_char0 = smu_utils_lib.ATOM_TYPE_TO_CHAR[bt.atoms[atom_idx0]]
        atom_char1 = smu_utils_lib.ATOM_TYPE_TO_CHAR[bt.atoms[atom_idx1]]
        if atom_char0 > atom_char1:
            atom_char0, atom_char1 = atom_char1, atom_char0

        yield atom_char0, atom_char1, bond_type, format_str.format(dist)
def distance_between_atoms(geom, a1, a2):
    """Return the distance between atoms `a1` and `a2` in `geom`.

    Args:
      geom: object whose `atom_positions` entries expose `x`, `y` and `z`
      a1: index into `geom.atom_positions` of the first atom
      a2: index into `geom.atom_positions` of the second atom

    Returns:
      Distance in Angstroms.
    """
    first = geom.atom_positions[a1]
    second = geom.atom_positions[a2]
    dx = first.x - second.x
    dy = first.y - second.y
    dz = first.z - second.z
    euclidean = math.sqrt(dx * dx + dy * dy + dz * dz)
    return smu_utils_lib.bohr_to_angstroms(euclidean)
def extract_bond_lengths(conformer, dist_sig_digits, unbonded_max):
    """Yields quantized bond lengths.

    Nothing is yielded for a conformer whose error status is 8 or higher or
    whose `duplicated_by` is positive. Otherwise every pair of non-hydrogen
    atoms in the first bond topology is visited, except F-F pairs. The
    distance is passed through `smu_utils_lib.bohr_to_angstroms` and
    rounded down to `dist_sig_digits` decimal places. Pairs whose bond type
    is BOND_UNDEFINED are dropped when that distance exceeds
    `unbonded_max`.

    Args:
      conformer: dataset_pb2.Conformer
      dist_sig_digits: number of digits after decimal point to keep
      unbonded_max: maximum distance to report for unbonded pairs

    Yields:
      (atom type 1, atom type 2, bond type, quantized dist)
      Atom types are single characters, sorted lexicographically.
      Bond type is dataset_pb2.BondTopology.BondType.
      Quantized dist is a string (to avoid vagaries of floating point
      compares).
    """
    # These are considered "major" or worse errors
    if conformer.properties.errors.status >= 8 or conformer.duplicated_by > 0:
        return

    topology = conformer.bond_topologies[0]
    dist_format = '{:.%df}' % dist_sig_digits
    positions = conformer.optimized_geometry.atom_positions
    scale = 10**dist_sig_digits
    topology_enums = dataset_pb2.BondTopology

    def as_vector(index):
        # Coordinates of one atom as a double-precision vector.
        pos = positions[index]
        return np.array([pos.x, pos.y, pos.z], dtype=np.double)

    # Dropping hydrogens up front visits the same pairs, in the same order,
    # as filtering them out pair by pair.
    heavy_indices = [
        index for index, atom in enumerate(topology.atoms)
        if atom != topology_enums.ATOM_H
    ]

    for first, second in itertools.combinations(heavy_indices, r=2):
        first_type = topology.atoms[first]
        second_type = topology.atoms[second]

        # Hello huge hack. F-F creates problems for us because there is
        # exactly one conformer that has an F-F bond. We can't create an
        # empirical distribution out of 1 value. So we'll just drop that
        # one and let the FF conformer have no detected geometries.
        if (first_type == topology_enums.ATOM_F and
                second_type == topology_enums.ATOM_F):
            continue

        bond_type = smu_utils_lib.get_bond_type(topology, first, second)

        # The intention is the buckets are the left edge of an empirical
        # CDF, hence floor rather than round.
        separation = np.linalg.norm(as_vector(first) - as_vector(second))
        dist = np.floor(
            smu_utils_lib.bohr_to_angstroms(separation) * scale) / scale

        if (bond_type == topology_enums.BOND_UNDEFINED and
                dist > unbonded_max):
            continue

        low_char, high_char = sorted(
            (smu_utils_lib.ATOM_TYPE_TO_CHAR[first_type],
             smu_utils_lib.ATOM_TYPE_TO_CHAR[second_type]))
        yield low_char, high_char, bond_type, dist_format.format(dist)