Example #1
    def setUp(self):
        self.temp_file = TempSMILESFile()
        self.fh = self.temp_file.open()

        # See `test_data.py` for data set test cases.
        self.dataset = SMILESDataset(self.fh.name)
        self.vocab = SMILESVocabulary(self.dataset, need_corpus=True)
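
This fragment opens a temporary file handle but omits the matching cleanup; mirroring the tearDown shown in Example #3 below, the fixture would close it after each test (imports omitted, matching the excerpts):

    def tearDown(self):
        # Close the temporary SMILES file handle opened in setUp.
        self.fh.close()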
Example #2
class SMILESConsecutiveSamplerTestCase(unittest.TestCase):
    def setUp(self):
        self.smiles_string = 'CCc1c[n+]2ccc3c4ccccc4[nH]c3c2cc1'
        with TempSMILESFile(smiles_strings=self.smiles_string) as temp_fh:
            dataset = SMILESDataset(temp_fh.file_handler.name)
        self.vocabulary = SMILESVocabulary(dataset, need_corpus=True)

    def test_sampling_with_padding(self):
        tokens = Token.tokenize(Token.augment(self.smiles_string))
        n_steps = 20
        sampler = SMILESConsecutiveSampler(self.vocabulary.corpus,
                                           n_steps=n_steps)

        step_i = 0
        for sample in sampler:
            input_s = ''.join(tokens[step_i:step_i + n_steps])
            output_s = ''.join(tokens[step_i + 1:step_i + n_steps + 1])

            if sample.valid_length < n_steps:
                input_s += Token.PAD * (n_steps - sample.valid_length - 1)
                output_s += Token.PAD * (n_steps - sample.valid_length)

            self.assertEqual(
                input_s,
                ''.join(self.vocabulary.get_tokens(sample.inputs)),
            )
            self.assertEqual(
                output_s,
                ''.join(self.vocabulary.get_tokens(sample.outputs)),
            )

            step_i += n_steps

    def test_sampling_without_padding(self):
        tokens = Token.tokenize(Token.augment(self.smiles_string))
        n_steps = len(tokens) - 1
        sampler = SMILESConsecutiveSampler(self.vocabulary.corpus,
                                           n_steps=n_steps)

        step_i = 0
        for n_samples, sample in enumerate(sampler, start=1):
            self.assertListEqual(
                tokens[step_i:step_i + n_steps],
                self.vocabulary.get_tokens(sample.inputs),
            )
            self.assertListEqual(
                tokens[step_i + 1:step_i + n_steps + 1],
                self.vocabulary.get_tokens(sample.outputs),
            )
            self.assertEqual(n_steps, sample.valid_length)

            step_i += n_steps

        self.assertEqual(n_samples, 1)
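
Outside the test harness, the same sampler API can be driven directly. A minimal sketch, assuming a vocabulary built as in the setUp above (imports omitted, matching the excerpts); it decodes each sample back into token strings and shows the one-token shift between inputs and outputs:

sampler = SMILESConsecutiveSampler(vocabulary.corpus, n_steps=20)
for sample in sampler:
    # Inputs and outputs are id sequences offset by one token;
    # valid_length counts the non-padding positions in the sample.
    input_s = ''.join(vocabulary.get_tokens(sample.inputs))
    output_s = ''.join(vocabulary.get_tokens(sample.outputs))
    print(sample.valid_length, input_s, '->', output_s)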
Example #3
class SMILESVocabularyTestCase(unittest.TestCase):
    def setUp(self):
        self.temp_file = TempSMILESFile()
        self.fh = self.temp_file.open()

        # See `test_data.py` for data set test cases.
        self.dataset = SMILESDataset(self.fh.name)
        self.vocab = SMILESVocabulary(self.dataset, need_corpus=True)

    def test_tokens_and_idx(self):
        self.assertSequenceEqual(
            # Tokenize the entire dataset to get a set of unique tokens.
            sorted(
                set(
                    Token.tokenize(
                        self.temp_file.smiles_strings.replace('\n', '')))),
            # The SMILES strings in the temporary file are not augmented
            # with special tokens.
            sorted(set(self.vocab.token_to_idx) - Token.SPECIAL),
        )
        self.assertSequenceEqual(
            sorted(
                set(self.vocab.token_to_idx)
                # The pad and unknown tokens do not appear in the original set.
                - {Token.PAD, Token.UNK}),
            sorted(set(self.vocab.token_freqs)),
        )

    def test_corpus(self):
        # Original SMILES list without padded special tokens.
        smiles_list = self.temp_file.smiles_strings.split('\n')

        self.assertEqual(len(self.vocab.corpus), len(smiles_list))

        for idx, tokens in zip(self.vocab.corpus, smiles_list):
            # Add special tokens so the string corresponds to the loaded
            # corpus used for data sampling and model fitting.
            tokens = Token.augment(tokens)
            # Test id-to-token mapping.
            self.assertEqual(
                ''.join(self.vocab.get_tokens(idx)),
                tokens,
            )
            # Test token-to-id mapping.
            self.assertListEqual(idx, self.vocab[Token.tokenize(tokens)])

    def test_contains(self):
        self.assertNotIn(Token.UNK, self.vocab)

        all_tokens = Token.get_all_tokens()

        for token in self.vocab:
            if len(token) == 1 and token.islower():
                token = token.upper()
            self.assertIn(token, all_tokens)

    def tearDown(self):
        self.fh.close()
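
The two mappings exercised in test_corpus reduce to a short round trip. A sketch assuming a vocab built as in the setUp above; 'CCO' is just an illustrative SMILES string, and the sketch assumes its tokens occur in the temporary file:

tokens = Token.tokenize(Token.augment('CCO'))
ids = vocab[tokens]                # token-to-id mapping
recovered = vocab.get_tokens(ids)  # id-to-token mapping
assert ''.join(recovered) == ''.join(tokens)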
Example #4
    def setUp(self):
        temp_file = TempSMILESFile(
            tempfile_kwargs={'prefix': 'softmax_sampler'})
        self.fh = temp_file.open()

        dataset = SMILESDataset(self.fh.name)
        self.vocabulary = SMILESVocabulary(dataset, need_corpus=True)

        self.model = SMILESRNN(len(self.vocabulary))

        self.predictor = SoftmaxSearch(self.model, self.vocabulary)
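
Wired together outside unittest, this fixture is a small end-to-end pipeline. A sketch using only the constructors shown in these excerpts; the final sampling call on predictor is hypothetical, since the snippets never show how SoftmaxSearch is invoked:

with TempSMILESFile() as temp_fh:
    dataset = SMILESDataset(temp_fh.file_handler.name)
vocabulary = SMILESVocabulary(dataset, need_corpus=True)
model = SMILESRNN(len(vocabulary))
predictor = SoftmaxSearch(model, vocabulary)
# Hypothetical call; the actual sampling API of SoftmaxSearch is not
# shown in these examples.
smiles = predictor()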
Example #5
    def setUp(self):
        self.temp_file = TempSMILESFile(
            tempfile_kwargs={'prefix': 'dataloader'})
        self.fh = self.temp_file.open()

        dataset = SMILESDataset(self.fh.name)
        vocabulary = SMILESVocabulary(dataset=dataset, need_corpus=True)
        self.dataloader = SMILESBatchColumnSampler(
            corpus=vocabulary.corpus,
            batch_size=2,
            n_steps=4,
            shuffle=True,
        )
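
A consumption sketch for the loader, assuming the setUp above has run. The inputs/outputs attribute names on each mini-batch are an assumption carried over from SMILESConsecutiveSampler's samples in Example #2, not something this snippet confirms:

for batch in self.dataloader:
    # Assumed attributes: id matrices of shape (batch_size, n_steps),
    # drawn in shuffled order because shuffle=True above.
    print(batch.inputs.shape, batch.outputs.shape)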
Example #6
    def setUp(self):
        temp_file = TempSMILESFile(tempfile_kwargs={'prefix': 'model'})
        self.fh = temp_file.open()

        dataset = SMILESDataset(self.fh.name)
        self.vocabulary = SMILESVocabulary(dataset, need_corpus=True)
        self.batch_sampler = SMILESBatchColumnSampler(
            corpus=self.vocabulary.corpus,
            batch_size=3,
            n_steps=8,
        )

        self.n_rnn_layers = 1  # Used in output/state shape testing.
        self.n_rnn_units = 32  # Used in output/state shape testing.

        self.model = SMILESRNN(len(self.vocabulary),
                               use_one_hot=False,
                               embedding_dim=4,
                               embedding_dropout=0.25,
                               embedding_dropout_axes=0,
                               embedding_init=mx.init.Uniform(),
                               embedding_prefix='embedding_',
                               rnn='lstm',
                               rnn_n_layers=self.n_rnn_layers,
                               rnn_n_units=self.n_rnn_units,
                               rnn_i2h_init='xavier_normal',
                               rnn_h2h_init='orthogonal_normal',
                               rnn_reinit_state=True,
                               rnn_detach_state=False,
                               rnn_state_init=mx.nd.random.uniform,
                               rnn_dropout=0.0,
                               rnn_prefix='encoder_',
                               dense_n_layers=2,
                               dense_n_units=32,
                               dense_activation='relu',
                               dense_dropout=0.5,
                               dense_init=mx.init.Xavier(),
                               dense_prefix='decoder_',
                               dtype='float32',
                               prefix='model_')
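
A hypothetical forward pass over one mini-batch; the (outputs, states) return convention and the batch attribute name are assumptions suggested by the "output/state shape testing" comments above, not confirmed by the snippet:

batch = next(iter(self.batch_sampler))
# Assumed call signature and return values:
outputs, states = self.model(batch.inputs)
# Under the fixture above, outputs should hold one score per vocabulary
# token at each step, and states should come from the n_rnn_layers LSTM
# with n_rnn_units units per layer.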
Example #7
    def setUp(self):
        self.smiles_string = 'CCc1c[n+]2ccc3c4ccccc4[nH]c3c2cc1'
        with TempSMILESFile(smiles_strings=self.smiles_string) as temp_fh:
            dataset = SMILESDataset(temp_fh.file_handler.name)
        self.vocabulary = SMILESVocabulary(dataset, need_corpus=True)