Exemplo n.º 1
0
 def load_featurizer(self, config_dict):
     """Build the SMILES index featurizer for this object.

     The sequence length is taken from ``self.max_seq_len`` when that is set;
     otherwise it is looked up in ``config_dict`` under ``'max_seq_length'``
     (``None`` if the key is missing).

     Args:
         config_dict: mapping consulted for ``'max_seq_length'`` only when
             ``self.max_seq_len`` is ``None``.

     Returns:
         Whatever ``SmilesIndexFeaturizer.bert_smiles_index_featurizer``
         returns for the resolved length and ``permute=self.permute``.
     """
     if self.max_seq_len is None:
         max_seq_len = config_dict.get('max_seq_length')
         # The message needs a %s placeholder: passing the value as a bare
         # extra argument makes stdlib logging fail to format the record
         # (assumes `logger` is a standard `logging.Logger` -- TODO confirm).
         logger.debug('getting smiles index featurizer of length: %s', max_seq_len)
     else:
         max_seq_len = self.max_seq_len
     return SmilesIndexFeaturizer.bert_smiles_index_featurizer(max_seq_len, permute=self.permute)
Exemplo n.º 2
0
    def load_datasets(self):
        """Create the train / valid / test ``BertSmilesDataset`` objects.

        All splits share one featurizer (built for the total sequence length
        returned by ``get_seq_lengths``) and the same dataset options read
        from ``self.hparams``; only the input path differs. The train split
        is always built; the valid and test splits are built only when
        ``hparams.valid_file`` / ``hparams.test_file`` are truthy and are
        ``None`` otherwise.

        Returns:
            dict with keys ``'train'``, ``'valid'`` and ``'test'``.

        Raises:
            AssertionError: if ``hparams.vocab_size`` differs from the
                vocabulary size of the train dataset's featurizer.
        """
        hp = self.hparams
        single_len, total_len = get_seq_lengths(hp.max_seq_length, hp.is_same_smiles)

        # Everything except the input path is identical across the splits.
        shared_kwargs = dict(
            featurizer=SmilesIndexFeaturizer.bert_smiles_index_featurizer(total_len),
            single_seq_len=single_len,
            total_seq_len=total_len,
            is_same=hp.is_same_smiles,
            num_physchem=hp.num_physchem_properties,
            permute=hp.permute,
            named_descriptor_set=hp.named_descriptor_set,
        )

        def build(path):
            return BertSmilesDataset(input_path=path, **shared_kwargs)

        datasets = {
            'train': build(hp.train_file),
            'valid': build(hp.valid_file) if hp.valid_file else None,
            'test': build(hp.test_file) if hp.test_file else None,
        }

        expected_vocab = hp.vocab_size
        actual_vocab = datasets['train'].featurizer.vocab_size
        assert expected_vocab == actual_vocab, f"{expected_vocab} should equal {actual_vocab}"

        return datasets
Exemplo n.º 3
0
    def load_datasets(self):
        """Create the train / valid / test ``BertFinetuneSmilesDataset`` objects.

        A single featurizer built for ``hparams.max_seq_length`` is shared by
        all three splits, and that same length is used for both
        ``single_seq_len`` and ``total_seq_len``. The splits differ only in
        their input file, and the test split is additionally created with
        ``inference_mode=True``.

        Returns:
            dict with keys ``'train'``, ``'valid'`` and ``'test'``.
        """
        hp = self.hparams
        seq_len = hp.max_seq_length

        # Options common to every split; the input path is supplied per call.
        shared_kwargs = dict(
            featurizer=SmilesIndexFeaturizer.bert_smiles_index_featurizer(seq_len),
            single_seq_len=seq_len,
            total_seq_len=seq_len,
            label_column=hp.label_column,
            is_same=False,
        )

        def build(path, **extra):
            return BertFinetuneSmilesDataset(input_path=path, **shared_kwargs, **extra)

        return {
            'train': build(hp.train_file),
            'valid': build(hp.valid_file),
            'test': build(hp.test_file, inference_mode=True),
        }
Exemplo n.º 4
0
 def __init__(self, config: MolbertConfig):
     """Initialise the parent class and attach a SMILES tokenizer.

     Args:
         config: passed unchanged to the parent ``__init__``; its
             ``max_size`` attribute is the argument used to build
             ``self.tokenizer``.
     """
     super().__init__(config)

     # Imported inside the method rather than at module level; the reason is
     # not visible here (possibly to avoid an import cycle -- TODO confirm).
     from molbert.utils.featurizer.molfeaturizer import SmilesIndexFeaturizer

     make_tokenizer = SmilesIndexFeaturizer.bert_smiles_index_featurizer
     self.tokenizer = make_tokenizer(config.max_size)
Exemplo n.º 5
0
def featurizer():
    """Return a BERT SMILES index featurizer built with the argument 64."""
    # Presumably the maximum sequence length -- confirm against the factory.
    size = 64
    return SmilesIndexFeaturizer.bert_smiles_index_featurizer(size)
Exemplo n.º 6
0
import random

import numpy as np

from molbert.utils.lm_utils import (
    InputExample,
    _truncate_seq_pair,
    convert_example_to_features,
    get_seq_lengths,
    random_word,
    unmask_lm_labels,
)
from molbert.utils.featurizer.molfeaturizer import SmilesIndexFeaturizer

# Module-level featurizer created once at import time. The argument 10 is
# presumably the maximum sequence length -- confirm against the factory.
TOKENIZER = SmilesIndexFeaturizer.bert_smiles_index_featurizer(10)


def test_get_seq_lenghts_with_issame():
    """With ``is_same=True`` expect ``(seqlen - 2, 2 * seqlen)``."""
    requested = 10
    expected_single, expected_total = requested - 2, 2 * requested

    single, total = get_seq_lengths(requested, is_same=True)

    assert single == expected_single
    assert total == expected_total


def test_get_seq_lenghts_without_issame():
    """With ``is_same=False`` expect ``(seqlen - 2, seqlen)``."""
    requested = 10
    expected_single, expected_total = requested - 2, requested

    single, total = get_seq_lengths(requested, is_same=False)

    assert single == expected_single
    assert total == expected_total