def _fingerprints_to_use(self, hparams):
        """Return the string name of the circular fingerprint chosen by hparams.

        Args:
          hparams: object exposing `use_counting_fp`, `fp_length` and `radius`.

        Returns:
          str() of the ms_constants.CircularFingerprintKey built from the
          selected fingerprint basename, hparams.fp_length and hparams.radius.
        """
        # Counting fingerprints are used only when explicitly requested.
        basename = (fmap_constants.COUNTING_CIRCULAR_FP_BASENAME
                    if hparams.use_counting_fp else
                    fmap_constants.CIRCULAR_FP_BASENAME)
        fp_key = ms_constants.CircularFingerprintKey(
            basename, hparams.fp_length, hparams.radius)
        return str(fp_key)
Пример #2
0
def fingerprints_to_use(hparams):
    """Build the ms_constants.CircularFingerprintKey selected by hparams.

    Args:
      hparams: object exposing `use_counting_fp`, `fp_length` and `radius`.

    Returns:
      A ms_constants.CircularFingerprintKey made from the chosen fingerprint
      basename, hparams.fp_length and hparams.radius.
    """
    # Counting fingerprints are used only when explicitly requested.
    basename = (fmap_constants.COUNTING_CIRCULAR_FP_BASENAME
                if hparams.use_counting_fp else
                fmap_constants.CIRCULAR_FP_BASENAME)
    return ms_constants.CircularFingerprintKey(
        basename, hparams.fp_length, hparams.radius)
Пример #3
0
    def test_all_circular_fingerprints_to_dict(self):
        """Check the bit/count sums of circular fingerprints for one molecule.

        For every combination of fingerprint length (1024, 2048, 4096),
        radius (2, 4, 6) and fingerprint type in fmap_constants.FP_TYPE_LIST,
        the fingerprint is computed and the sum of its entries is compared
        against a hard-coded reference value.
        """
        # Tubocurarine chloride; chosen because its fingerprint has many bit
        # collisions.
        smiles = ('Oc7ccc1cc7Oc5cc6[C@H](Cc4ccc(Oc2c3[C@@H](C1)[N+](C)(C)'
                  'CCc3cc(OC)c2O)cc4)[N+](C)(C)CCc6cc5OC')
        molecule = Chem.MolFromSmiles(smiles)

        binary_name = fmap_constants.CIRCULAR_FP_BASENAME
        counting_name = fmap_constants.COUNTING_CIRCULAR_FP_BASENAME

        # Each row: (fp length, radius, expected sum of the plain fingerprint,
        # expected sum of the counting fingerprint).
        reference_rows = (
            (1024, 2, 59., 130.),
            (1024, 4, 117., 194.),
            (1024, 6, 159., 238.),
            (2048, 2, 60., 130.),
            (2048, 4, 120., 194.),
            (2048, 6, 164., 238.),
            (4096, 2, 60., 130.),
            (4096, 4, 121., 194.),
            (4096, 6, 165., 238.),
        )
        expected_fp_sums = {}
        for length, radius, binary_sum, counting_sum in reference_rows:
            binary_key = ms_constants.CircularFingerprintKey(
                binary_name, length, radius)
            counting_key = ms_constants.CircularFingerprintKey(
                counting_name, length, radius)
            expected_fp_sums[binary_key] = binary_sum
            expected_fp_sums[counting_key] = counting_sum

        for length in (1024, 2048, 4096):
            for radius in (2, 4, 6):
                for fp_type in fmap_constants.FP_TYPE_LIST:
                    key = ms_constants.CircularFingerprintKey(
                        fp_type, length, radius)
                    fingerprint = feature_utils.make_circular_fingerprint(
                        molecule, key)
                    self.assertEqual(sum(fingerprint), expected_fp_sums[key])
def dict_to_tfexample(mol_dict):
    """Pack a dictionary of molecule information into a tf.train.Example.

    Args:
      mol_dict: mapping holding the molecule's data. It is indexed with the
        fmap_constants feature names written below and with one
        ms_constants.CircularFingerprintKey per fingerprint configuration.
        fmap_constants.INDEX_TO_GROUND_TRUTH_ARRAY is optional.

    Returns:
      A tf.train.Example whose feature map holds the values from mol_dict.
      Fingerprints are stored under str(fingerprint key).
    """
    # (feature name, typed list on the tf Feature, True if the value is a
    # sequence to extend with / False if it is a single item to append).
    # Order matches the order in which features are written.
    required_fields = (
        (fmap_constants.ATOM_WEIGHTS, 'float_list', True),
        (fmap_constants.ATOM_IDS, 'int64_list', True),
        (fmap_constants.ADJACENCY_MATRIX, 'int64_list', True),
        (fmap_constants.MOLECULE_WEIGHT, 'float_list', False),
        (fmap_constants.DENSE_MASS_SPEC, 'float_list', True),
        (fmap_constants.INCHIKEY, 'bytes_list', False),
        (fmap_constants.MOLECULAR_FORMULA, 'bytes_list', False),
        (fmap_constants.NAME, 'bytes_list', False),
        (fmap_constants.SMILES, 'bytes_list', False),
    )

    example = tf.train.Example()
    feature_map = example.features.feature

    for name, list_attr, is_sequence in required_fields:
        target = getattr(feature_map[name], list_attr).value
        if is_sequence:
            target.extend(mol_dict[name])
        else:
            target.append(mol_dict[name])

    # The ground-truth index is only written when the caller supplied it.
    ground_truth_name = fmap_constants.INDEX_TO_GROUND_TRUTH_ARRAY
    if ground_truth_name in mol_dict:
        feature_map[ground_truth_name].int64_list.value.append(
            mol_dict[ground_truth_name])

    # One float feature per (length, radius, type) fingerprint configuration.
    for fp_len in ms_constants.NUM_CIRCULAR_FP_BITS_LIST:
        for rad in ms_constants.CIRCULAR_FP_RADII_LIST:
            for fp_type in fmap_constants.FP_TYPE_LIST:
                key = ms_constants.CircularFingerprintKey(
                    fp_type, fp_len, rad)
                feature_map[str(key)].float_list.value.extend(mol_dict[key])

    return example
Пример #5
0
def all_circular_fingerprints_to_dict(mol):
    """Compute every configured circular fingerprint for a molecule.

    The configurations are the combinations of
    ms_constants.NUM_CIRCULAR_FP_BITS_LIST, ms_constants.CIRCULAR_FP_RADII_LIST
    and fmap_constants.FP_TYPE_LIST.

    Args:
      mol : rdkit.Mol
    Returns:
      a dict mapping each CircularFingerprintKey instance to the fingerprint
      produced by make_circular_fingerprint for that key.
    """
    fp_keys = [
        ms_constants.CircularFingerprintKey(fp_type, fp_len, rad)
        for fp_len in ms_constants.NUM_CIRCULAR_FP_BITS_LIST
        for rad in ms_constants.CIRCULAR_FP_RADII_LIST
        for fp_type in fmap_constants.FP_TYPE_LIST
    ]
    return {key: make_circular_fingerprint(mol, key) for key in fp_keys}
def _parse_example(example_protos, hparams, features_to_load=None):
    """Parse one serialized tf.Example into a dict of feature tensors.

    Intended as the parsing map when building a tf.Dataset from TFRecords.

    Args:
      example_protos: tf.Example proto read from TFRecords; passed straight to
        tf.parse_single_example.
      hparams: tf.HParams object, must contain
          max_atoms - Number of atoms in atom_weights array
          max_mass_spec_peak_loc - Number of bins in mass spectra
                                   Set to 2000 if unused.
        It is also forwarded to preprocess_spectrum when the dense mass
        spectrum is loaded.
      features_to_load: list of string keys of fields to load from the
        TFRecords. If None (default), all available fields are loaded.

    Returns:
      Dict mapping feature names to parsed tensors. When loaded, the adjacency
      matrix is reshaped to (max_atoms, max_atoms), the dense mass spectrum is
      passed through preprocess_spectrum, and the SMILES string is replaced by
      its token index array, with the token count stored under
      fmap_constants.SMILES_TOKEN_LIST_LENGTH.

    Raises:
      KeyError: if features_to_load names a feature that is not defined here.
    """
    features = {
        fmap_constants.MOLECULE_WEIGHT:
        tf.FixedLenFeature([1], tf.float32),
        fmap_constants.ATOM_WEIGHTS:
        tf.FixedLenFeature([hparams.max_atoms], tf.float32),
        fmap_constants.ATOM_IDS:
        tf.FixedLenFeature([hparams.max_atoms], tf.int64),
        # The adjacency matrix is stored flattened and reshaped after parsing.
        fmap_constants.ADJACENCY_MATRIX:
        tf.FixedLenFeature([hparams.max_atoms * hparams.max_atoms], tf.int64),
        fmap_constants.DENSE_MASS_SPEC:
        tf.FixedLenFeature([hparams.max_mass_spec_peak_loc], tf.float32),
        fmap_constants.INCHIKEY:
        tf.FixedLenFeature([1], tf.string, default_value=''),
        fmap_constants.MOLECULAR_FORMULA:
        tf.FixedLenFeature([1], tf.string, default_value=''),
        fmap_constants.NAME:
        tf.FixedLenFeature([1], tf.string, default_value=''),
        fmap_constants.SMILES:
        tf.FixedLenFeature([1], tf.string, default_value=''),
        fmap_constants.INDEX_TO_GROUND_TRUTH_ARRAY:
        tf.FixedLenFeature([1], tf.int64, default_value=0),
        fmap_constants.SMILES_TOKEN_LIST_LENGTH:
        tf.FixedLenFeature([1], tf.int64, default_value=0)
    }

    # One float feature per (length, radius, type) fingerprint configuration,
    # stored under the string form of its key.
    for fp_len in ms_constants.NUM_CIRCULAR_FP_BITS_LIST:
        for rad in ms_constants.CIRCULAR_FP_RADII_LIST:
            for fp_type in fmap_constants.FP_TYPE_LIST:
                fp_key = ms_constants.CircularFingerprintKey(
                    fp_type, fp_len, rad)
                features[str(fp_key)] = tf.FixedLenFeature([fp_key.fp_len],
                                                           tf.float32)

    load_all = features_to_load is None
    if not load_all:
        # Restrict parsing to the requested subset of fields.
        features = {key: features[key] for key in features_to_load}

    parsed_features = tf.parse_single_example(example_protos,
                                              features=features)

    if load_all or fmap_constants.ADJACENCY_MATRIX in features_to_load:
        parsed_features[fmap_constants.ADJACENCY_MATRIX] = tf.reshape(
            parsed_features[fmap_constants.ADJACENCY_MATRIX],
            shape=(hparams.max_atoms, hparams.max_atoms))

    if load_all or fmap_constants.DENSE_MASS_SPEC in features_to_load:
        parsed_features[fmap_constants.DENSE_MASS_SPEC] = preprocess_spectrum(
            parsed_features[fmap_constants.DENSE_MASS_SPEC], hparams)

    if load_all or fmap_constants.SMILES in features_to_load:
        smiles_string = parsed_features[fmap_constants.SMILES]
        # Tokenization runs in Python, so it is wrapped with tf.py_func.
        index_array = tf.py_func(feature_utils.tokenize_smiles,
                                 [smiles_string], [tf.int64])
        index_array = tf.reshape(index_array, (-1, ))
        parsed_features[fmap_constants.SMILES] = index_array
        # Overwrites any parsed length with the actual token count.
        parsed_features[fmap_constants.SMILES_TOKEN_LIST_LENGTH] = tf.shape(
            index_array)[0]
    return parsed_features
"""Evaluation metric for accuracy of library matching."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from collections import namedtuple
from os import path

import feature_map_constants as fmap_constants
import mass_spec_constants as ms_constants
import util
import numpy as np
import tensorflow as tf

# String name of the circular fingerprint key built from
# fmap_constants.CIRCULAR_FP_BASENAME with length 1024 and radius 2.
# Judging by the constant's name, this is the fingerprint used for Jaccard
# similarity -- confirm against the code that consumes it.
FP_NAME_FOR_JACCARD_SIMILARITY = str(
    ms_constants.CircularFingerprintKey(fmap_constants.CIRCULAR_FP_BASENAME,
                                        1024, 2))

# When filtering the library matching candidates on a per-query basis, we
# set the query-library element similarity to this value for the elements
# that were filtered.
_SIMILARITY_FOR_FILTERED_ELEMENTS = -100000

# Feature name under which library vectors are looked up: the dense mass
# spectrum feature.
_KEY_FOR_LIBRARY_VECTORS = fmap_constants.DENSE_MASS_SPEC


def _validate_data_dict(data_dict, name):
    if data_dict is None:
        return
    for key in [
            FP_NAME_FOR_JACCARD_SIMILARITY, fmap_constants.INCHIKEY,
            _KEY_FOR_LIBRARY_VECTORS, fmap_constants.MOLECULE_WEIGHT
Пример #8
0
  def test_record_contents(self):
    """Check that features read from the stored record match the source data.

    Molecule dictionaries are rebuilt from self.test_file_long and compared,
    feature by feature, against one batch read from 'test_14_record.gz'.
    The batch size equals the number of molecules, so a single batch is
    expected to consume the whole dataset.
    """
    molecules = parse_sdf_utils.get_sdf_to_mol(self.test_file_long)
    mol_dicts = [parse_sdf_utils.make_mol_dict(mol) for mol in molecules]

    # Tokenize each SMILES string, record the unpadded length, then pad every
    # token array on the right up to MAX_TOKEN_LIST_LENGTH.
    raw_tokens = [
        feature_utils.tokenize_smiles(
            np.array([entry[fmap_constants.SMILES]]))
        for entry in mol_dicts
    ]
    token_lengths = [np.shape(tokens)[0] for tokens in raw_tokens]
    padded_tokens = [
        np.pad(tokens,
               (0, ms_constants.MAX_TOKEN_LIST_LENGTH - length),
               'constant')
        for tokens, length in zip(raw_tokens, token_lengths)
    ]

    hparams_main = tf.contrib.training.HParams(
        max_atoms=ms_constants.MAX_ATOMS,
        max_mass_spec_peak_loc=ms_constants.MAX_PEAK_LOC,
        eval_batch_size=len(molecules),
        intensity_power=1.0)

    record_path = os.path.join(self.test_data_directory, 'test_14_record.gz')
    dataset = parse_sdf_utils.get_dataset_from_record(
        [record_path], hparams_main, mode=tf.estimator.ModeKeys.EVAL)

    # Every configured fingerprint key, in (length, radius, type) order.
    fp_keys = [
        ms_constants.CircularFingerprintKey(fp_type, fp_len, rad)
        for fp_len in ms_constants.NUM_CIRCULAR_FP_BITS_LIST
        for rad in ms_constants.CIRCULAR_FP_RADII_LIST
        for fp_type in fmap_constants.FP_TYPE_LIST
    ]

    feature_names = [
        fmap_constants.ATOM_WEIGHTS,
        fmap_constants.MOLECULE_WEIGHT,
        fmap_constants.DENSE_MASS_SPEC,
        fmap_constants.INCHIKEY,
        fmap_constants.NAME,
        fmap_constants.MOLECULAR_FORMULA,
        fmap_constants.ADJACENCY_MATRIX,
        fmap_constants.ATOM_IDS,
        fmap_constants.SMILES,
    ]
    feature_names.extend(str(key) for key in fp_keys)
    label_names = [fmap_constants.INCHIKEY]

    features, _ = parse_sdf_utils.make_features_and_labels(
        dataset, feature_names, label_names, mode=tf.estimator.ModeKeys.EVAL)

    with tf.Session() as sess:
      feature_values = sess.run(features)

      # A second read must hit the end of the dataset; anything else means
      # the first batch did not contain every record.
      try:
        sess.run(features)
        raise ValueError('Dataset parsing using batch size of length of the'
                         ' dataset resulted in more than one batch.')
      except tf.errors.OutOfRangeError:  # expected behavior
        pass

    tolerance = 0.0001
    string_features = (
        fmap_constants.NAME,
        fmap_constants.INCHIKEY,
        fmap_constants.MOLECULAR_FORMULA,
    )
    numeric_sequence_features = (
        fmap_constants.DENSE_MASS_SPEC,
        fmap_constants.ATOM_WEIGHTS,
        fmap_constants.ATOM_IDS,
    )

    for i, expected in enumerate(mol_dicts):
      self.assertAlmostEqual(
          feature_values[fmap_constants.MOLECULE_WEIGHT][i],
          expected[fmap_constants.MOLECULE_WEIGHT])

      # The parsed adjacency matrix is 2-D; the source stores it flat.
      flat_adjacency = feature_values[fmap_constants.ADJACENCY_MATRIX][
          i].flatten()
      self.assertSequenceAlmostEqual(
          flat_adjacency,
          expected[fmap_constants.ADJACENCY_MATRIX],
          delta=tolerance)

      for name in string_features:
        self.assertEqual(feature_values[name][i],
                         self.encode(expected[name]))

      for name in numeric_sequence_features:
        self.assertSequenceAlmostEqual(
            feature_values[name][i], expected[name], delta=tolerance)

      self.assertAllEqual(feature_values[fmap_constants.SMILES][i],
                          padded_tokens[i])
      self.assertAllEqual(
          feature_values[fmap_constants.SMILES_TOKEN_LIST_LENGTH][i],
          token_lengths[i])

      for key in fp_keys:
        self.assertSequenceAlmostEqual(
            feature_values[str(key)][i], expected[key], delta=tolerance)
Пример #9
0
 def make_fp_key(fp_type, fp_len, rad):
   """Build a ms_constants.CircularFingerprintKey from type, length, radius."""
   fp_key = ms_constants.CircularFingerprintKey(fp_type, fp_len, rad)
   return fp_key