def write_all_dataset_files(inchikey_dict,
                            inchikey_list,
                            base_name,
                            output_dir,
                            max_atoms,
                            max_mass_spec_peak_loc,
                            make_library_array=False):
    """Write every file that accompanies a single TFRecord dataset.

    Args:
      inchikey_dict: Full dictionary keyed by inchikey whose values are lists
        of rdkit.Mol objects.
      inchikey_list: Inchikeys selecting the entries that go into the dataset.
      base_name: Base name shared by the files of this dataset.
      output_dir: Directory in which all TFRecord files are saved.
      max_atoms: Maximum number of atoms to include for a given molecule.
      max_mass_spec_peak_loc: Largest m/z peak to include in a spectrum.
      make_library_array: Whether the library array is written as well.

    Returns:
      Nothing; the function is used for the files it saves:
        basename.tfrecord: the TFRecord file,
        basename.inchikey.txt: a text file with all inchikeys in the dataset,
        basename.tfrecord.info: a text file with one line describing the
          length of the TFRecord file,
      and, only if make_library_array is set:
        basename.npy: see parse_sdf_utils.write_dicts_to_example.
    """
    record_filename = base_name + TFRECORD_FILENAME_END
    molecules = train_test_split_utils.make_mol_list_from_inchikey_dict(
        inchikey_dict, inchikey_list)

    library_filename = None
    if make_library_array:
        library_filename = base_name + NP_LIBRARY_ARRAY_END

    record_path = os.path.join(output_dir, record_filename)
    writer_args = [molecules, record_path, max_atoms, max_mass_spec_peak_loc]
    if library_filename is not None:
        # The library array path is passed positionally as the optional
        # fifth argument; it is left out entirely when not requested.
        writer_args.append(os.path.join(output_dir, library_filename))
    parse_sdf_utils.write_dicts_to_example(*writer_args)

    write_list_of_inchikeys(inchikey_list, base_name, output_dir)
    parse_sdf_utils.write_info_file(molecules, record_path)
def test_save_true_spectra_array(self):
    """Verify the true spectra array saved by write_dicts_to_example.

    Writes the records for the short test file together with the true
    library array and the info file, then hands the content check over to
    parse_sdf_utils.validate_spectra_array_contents.
    """
    molecules = parse_sdf_utils.get_sdf_to_mol(self.test_file_short)

    out_dir = self.temp_dir
    record_path = os.path.join(out_dir, 'test_record.gz')
    spectra_array_path = os.path.join(out_dir, 'true_spectra_array.npy')

    parse_sdf_utils.write_dicts_to_example(
        molecules,
        record_path,
        self.hparams.max_atoms,
        self.hparams.max_mass_spec_peak_loc,
        true_library_array_path_name=spectra_array_path)
    parse_sdf_utils.write_info_file(molecules, record_path)

    parse_sdf_utils.validate_spectra_array_contents(
        record_path, self.hparams, spectra_array_path)
def test_dict_tfexample(self):
    """Check that the tf.Record contents match the input molecule info.

    Writes the molecules of the short test file as tf.Examples to a
    temporary record on disk, reads the record back as a dataset and
    compares every parsed feature against self.expected_mol_dicts.

    Raises:
      ValueError: if the dataset still yields data after the first batch
        has been fetched.
    """
    mol_list = parse_sdf_utils.get_sdf_to_mol(self.test_file_short)

    # mkstemp hands back an open descriptor; only the path is needed here,
    # so the descriptor is closed right away.
    fd, fpath = tempfile.mkstemp(dir=self.temp_dir)
    os.close(fd)

    parse_sdf_utils.write_dicts_to_example(
        mol_list, fpath, self.hparams.max_atoms,
        self.hparams.max_mass_spec_peak_loc)
    parse_sdf_utils.write_info_file(mol_list, fpath)
    self._validate_info_file(mol_list, fpath)

    dataset = parse_sdf_utils.get_dataset_from_record(
        [fpath], self.hparams, mode=tf.estimator.ModeKeys.EVAL)

    # NOTE(review): SMILES_TOKEN_LIST_LENGTH is read from feature_values
    # below although it is not requested here; presumably it is produced
    # together with SMILES -- confirm against make_features_and_labels.
    feature_names = [
        fmap_constants.ATOM_WEIGHTS,
        fmap_constants.MOLECULE_WEIGHT,
        fmap_constants.DENSE_MASS_SPEC,
        fmap_constants.INCHIKEY,
        fmap_constants.NAME,
        fmap_constants.MOLECULAR_FORMULA,
        fmap_constants.ADJACENCY_MATRIX,
        fmap_constants.ATOM_IDS,
        fmap_constants.SMILES,
    ]
    label_names = [fmap_constants.INCHIKEY]

    features, _ = parse_sdf_utils.make_features_and_labels(
        dataset, feature_names, label_names,
        mode=tf.estimator.ModeKeys.EVAL)

    with tf.Session() as sess:
        feature_values = sess.run(features)

        # Check that the dataset was consumed: a second fetch has to run
        # out of data instead of producing another batch.
        try:
            sess.run(features)
        except tf.errors.OutOfRangeError:
            pass  # expected behavior
        else:
            raise ValueError(
                'Dataset parsing using batch size of length of the '
                'dataset resulted in more than one batch.')

    for i, expected in enumerate(self.expected_mol_dicts):
        self.assertAlmostEqual(
            feature_values[fmap_constants.MOLECULE_WEIGHT][i],
            expected[fmap_constants.MOLECULE_WEIGHT])
        # The parsed adjacency matrix is flattened before it is compared
        # with the expected sequence.
        self.assertSequenceAlmostEqual(
            feature_values[fmap_constants.ADJACENCY_MATRIX][i].flatten(),
            expected[fmap_constants.ADJACENCY_MATRIX],
            delta=0.0001)
        self.assertSequenceAlmostEqual(
            feature_values[fmap_constants.DENSE_MASS_SPEC][i],
            expected[fmap_constants.DENSE_MASS_SPEC],
            delta=0.0001)
        self.assertSequenceAlmostEqual(
            feature_values[fmap_constants.ATOM_WEIGHTS][i],
            expected[fmap_constants.ATOM_WEIGHTS],
            delta=0.0001)
        self.assertSequenceAlmostEqual(
            feature_values[fmap_constants.ATOM_IDS][i],
            expected[fmap_constants.ATOM_IDS],
            delta=0.0001)
        # String features are compared against the encoded expected value.
        self.assertEqual(
            feature_values[fmap_constants.NAME][i],
            self.encode(expected[fmap_constants.NAME]))
        self.assertEqual(
            feature_values[fmap_constants.INCHIKEY][i],
            self.encode(expected[fmap_constants.INCHIKEY]))
        self.assertEqual(
            feature_values[fmap_constants.MOLECULAR_FORMULA][i],
            self.encode(expected[fmap_constants.MOLECULAR_FORMULA]))
        self.assertAllEqual(
            feature_values[fmap_constants.SMILES][i],
            expected['parsed_smiles'])
        self.assertAllEqual(
            feature_values[fmap_constants.SMILES_TOKEN_LIST_LENGTH][i],
            expected[fmap_constants.SMILES_TOKEN_LIST_LENGTH])