def main():
    """Print the get_similarities result for one hard-coded molecule.

    Parses the NIST17 replicates SDF file, groups the parsed molecules by
    inchikey, builds the spectra array for a single inchikey and prints
    whatever get_similarities returns for it.
    """
    sdf_path = '/mnt/storage/NIST_zipped/NIST17/replib_mend.sdf'
    target_inchikey = 'PDACHFOTOFNHBT-UHFFFAOYSA-N'

    mols_by_inchikey = train_test_split_utils.make_inchikey_dict(
        parse_sdf_utils.get_sdf_to_mol(sdf_path))
    spectra = make_spectra_array(mols_by_inchikey[target_inchikey])
    similarities = get_similarities(spectra)

    print('distance for spectra in PDACHFOTOFNHBT-UHFFFAOYSA-N',
          similarities)
    def setUp(self):
        """Load the two test SDF files and index their molecules by inchikey.

        Sets the following attributes:
          temp_dir : fresh temporary directory created under absltest's
              default test tmpdir.
          mol_list_large, mol_list_small : molecules parsed from
              'test_14_mend.sdf' and 'test_2_mend.sdf' respectively, parsed
              with max_atoms=ms_constants.MAX_ATOMS.
          inchikey_dict_large, inchikey_dict_small : result of
              make_inchikey_dict for each molecule list.
          inchikey_list_large, inchikey_list_small : list of the keys of the
              corresponding inchikey dict.
        """
        test_data_directory = test_utils.test_dir('testdata/')
        self.temp_dir = tempfile.mkdtemp(
            dir=absltest.get_default_test_tmpdir())
        test_sdf_file_large = os.path.join(test_data_directory,
                                           'test_14_mend.sdf')
        test_sdf_file_small = os.path.join(test_data_directory,
                                           'test_2_mend.sdf')

        max_atoms = ms_constants.MAX_ATOMS
        self.mol_list_large = parse_sdf_utils.get_sdf_to_mol(
            test_sdf_file_large, max_atoms=max_atoms)
        self.mol_list_small = parse_sdf_utils.get_sdf_to_mol(
            test_sdf_file_small, max_atoms=max_atoms)
        self.inchikey_dict_large = train_test_split_utils.make_inchikey_dict(
            self.mol_list_large)
        self.inchikey_dict_small = train_test_split_utils.make_inchikey_dict(
            self.mol_list_small)
        # Materialize the keys: under Python 3 dict.keys() is a view that
        # cannot be indexed, concatenated with '+', or shuffled, so store a
        # real list as the attribute names promise.
        self.inchikey_list_large = list(self.inchikey_dict_large.keys())
        self.inchikey_list_small = list(self.inchikey_dict_small.keys())
# Example #3 (0)
def make_mainlib_replicates_train_test_split(
        mainlib_mol_list,
        replicates_mol_list,
        splitting_type,
        mainlib_fractions,
        replicates_fractions,
        mainlib_maximum_num_molecules_to_use=None,
        replicates_maximum_num_molecules_to_use=None,
        rseed=42):
    """Makes train/validation/test inchikey lists from two lists of rdkit.Mol.

    The replicates inchikeys are passed as the holdout list when splitting the
    main library (presumably so that they are kept out of the mainlib training
    set -- confirm against make_train_val_test_split_inchikey_lists).

    Args:
      mainlib_mol_list : list of molecules from main library
      replicates_mol_list : list of molecules from replicates library
      splitting_type : type of splitting to use for validation splits.
      mainlib_fractions : TrainValTestFractions namedtuple
          holding desired fractions for train/val/test split of mainlib
      replicates_fractions : TrainValTestFractions namedtuple
          holding desired fractions for train/val/test split of replicates.
          For the replicates set, the train fraction should be set to 0.
      mainlib_maximum_num_molecules_to_use : Largest number of molecules to use
         when making datasets from mainlib. If None, all inchikeys are used.
      replicates_maximum_num_molecules_to_use : Largest number of molecules to
         use when making datasets from replicates. If None, all inchikeys are
         used.
      rseed : random seed for shuffling. Note that this seeds the module-level
         `random` generator.

    Returns:
      main_inchikey_dict : Dict that is keyed by inchikey, containing a list of
          rdkit.Mol objects corresponding to that inchikey from the mainlib
      replicates_inchikey_dict : Dict that is keyed by inchikey, containing a list
          of rdkit.Mol objects corresponding to that inchikey from the replicates
          library
      main_replicates_split_inchikey_lists_dict : dict keyed by the six
        ds_constants basenames (MAINLIB_TRAIN/VALIDATION/TEST_BASENAME and
        REPLICATES_TRAIN/VALIDATION/TEST_BASENAME).
        Values are lists of inchikeys corresponding to each dataset.

    Raises:
      ValueError : from random.sample, if a maximum number of molecules is
        larger than the number of available inchikeys.
      AssertionError : if the six component lists together do not cover exactly
        the inchikeys selected from the main and replicates libraries.
    """
    random.seed(rseed)

    main_inchikey_dict = train_test_split_utils.make_inchikey_dict(
        mainlib_mol_list)
    # Materialize the keys as a list: under Python 3 dict.keys() is a view,
    # which random.sample rejects (3.11+) and which cannot be concatenated.
    main_inchikey_list = list(main_inchikey_dict.keys())

    if mainlib_maximum_num_molecules_to_use is not None:
        main_inchikey_list = random.sample(
            main_inchikey_list, mainlib_maximum_num_molecules_to_use)

    replicates_inchikey_dict = train_test_split_utils.make_inchikey_dict(
        replicates_mol_list)
    replicates_inchikey_list = list(replicates_inchikey_dict.keys())

    if replicates_maximum_num_molecules_to_use is not None:
        replicates_inchikey_list = random.sample(
            replicates_inchikey_list, replicates_maximum_num_molecules_to_use)

    # Make train/val/test splits for main dataset.
    main_train_validation_test_inchikeys = (
        train_test_split_utils.make_train_val_test_split_inchikey_lists(
            main_inchikey_list,
            main_inchikey_dict,
            mainlib_fractions,
            holdout_inchikey_list=replicates_inchikey_list,
            splitting_type=splitting_type))

    # Make train/val/test splits for replicates dataset.
    replicates_validation_test_inchikeys = (
        train_test_split_utils.make_train_val_test_split_inchikey_lists(
            replicates_inchikey_list,
            replicates_inchikey_dict,
            replicates_fractions,
            splitting_type=splitting_type))

    component_inchikey_dict = {
        ds_constants.MAINLIB_TRAIN_BASENAME:
        main_train_validation_test_inchikeys.train,
        ds_constants.MAINLIB_VALIDATION_BASENAME:
        main_train_validation_test_inchikeys.validation,
        ds_constants.MAINLIB_TEST_BASENAME:
        main_train_validation_test_inchikeys.test,
        ds_constants.REPLICATES_TRAIN_BASENAME:
        replicates_validation_test_inchikeys.train,
        ds_constants.REPLICATES_VALIDATION_BASENAME:
        replicates_validation_test_inchikeys.validation,
        ds_constants.REPLICATES_TEST_BASENAME:
        replicates_validation_test_inchikeys.test,
    }

    # Hand the helper a concrete list rather than a dict view so that it can
    # index or iterate it repeatedly on any Python version.
    component_inchikey_lists = list(component_inchikey_dict.values())
    train_test_split_utils.assert_all_lists_mutally_exclusive(
        component_inchikey_lists)

    # Check that the union of the 6 component inchikey lists is equal to the
    #   set of inchikeys selected from the main and replicates libraries.
    all_inchikeys_in_components = {
        ikey for ikey_list in component_inchikey_lists for ikey in ikey_list
    }
    expected_inchikeys = set(main_inchikey_list) | set(
        replicates_inchikey_list)

    assert expected_inchikeys == all_inchikeys_in_components, (
        'The inchikeys in the original inchikey dictionary are not all included'
        ' in the train/val/test component libraries')

    return (main_inchikey_dict, replicates_inchikey_dict,
            component_inchikey_dict)