Example #1
import os
import time

import torch
from tape import ProteinBertModel, TAPETokenizer
# BERT_PRETRAINED_MODEL_ARCHIVE_MAP, url_to_filename, get_etag, and get_cache
# are assumed to come from tape's internal file/download utilities.


def test_forcedownload():
    model = ProteinBertModel.from_pretrained('bert-base')
    url = BERT_PRETRAINED_MODEL_ARCHIVE_MAP['bert-base']
    filename = url_to_filename(url, get_etag(url))
    wholepath = get_cache() / filename
    oldtime = time.ctime(os.path.getmtime(wholepath))
    model = ProteinBertModel.from_pretrained('bert-base', force_download=True)
    newtime = time.ctime(os.path.getmtime(wholepath))
    assert (newtime != oldtime)
    # Run the model on an example sequence
    # iupac is the vocab for TAPE models, use unirep for the UniRep model
    tokenizer = TAPETokenizer(vocab='iupac')
    # Pfam Family: Hexapep, Clan: CL0536
    sequence = 'GCTVEDRCLIGMGAILLNGCVIGSGSLVAAGALITQ'
    token_ids = torch.tensor([tokenizer.encode(sequence)])
    model(token_ids)
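
As a follow-up sketch (not part of the original test), the call at the end returns a tuple whose first two entries are the per-token and pooled representations; the shape values below assume the standard 'bert-base' TAPE checkpoint (hidden size 768) and the two special tokens added by the iupac vocabulary:

import torch
from tape import ProteinBertModel, TAPETokenizer

model = ProteinBertModel.from_pretrained('bert-base')
tokenizer = TAPETokenizer(vocab='iupac')
token_ids = torch.tensor([tokenizer.encode('GCTVEDRCLIGMGAILLNGCVIGSGSLVAAGALITQ')])
sequence_output, pooled_output = model(token_ids)[:2]
print(sequence_output.shape)  # torch.Size([1, 38, 768]): 36 residues + 2 special tokens
print(pooled_output.shape)    # torch.Size([1, 768])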
Example #2
    def __init__(self, dataset_sequences):

        self.dataset_sequences = dataset_sequences
        self.model = ProteinBertModel.from_pretrained('bert-base')
        self.tokenizer = TAPETokenizer(
            vocab='iupac'
        )  # iupac is the vocab for TAPE models, use unirep for the UniRep model
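
A minimal sketch of how a wrapper like this could produce embeddings for its stored sequences; the class name and the embed_all method are hypothetical and only illustrate the intended use of the model and tokenizer held above:

import torch
from tape import ProteinBertModel, TAPETokenizer


class SequenceEmbedder:
    """Hypothetical wrapper mirroring the __init__ above."""

    def __init__(self, dataset_sequences):
        self.dataset_sequences = dataset_sequences
        self.model = ProteinBertModel.from_pretrained('bert-base')
        self.tokenizer = TAPETokenizer(vocab='iupac')

    def embed_all(self):
        # Pooled, per-sequence embeddings of shape (768,) each.
        with torch.no_grad():
            return [self.model(torch.tensor([self.tokenizer.encode(seq)]))[1].squeeze(0)
                    for seq in self.dataset_sequences]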
Example #3
    def __init__(self, config):
        super().__init__(config)

        self.bert = ProteinBertModel(config)
        self.classify = MHCHead(
            config.hidden_size, config.num_labels)

        self.init_weights()
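
MHCHead is specific to the source project; a hypothetical stand-in with the same constructor signature, shown only to make the fragment above self-explanatory:

import torch.nn as nn


class MHCHead(nn.Module):
    """Hypothetical stand-in: a single linear classifier over the pooled output."""

    def __init__(self, hidden_size, num_labels):
        super().__init__()
        self.classifier = nn.Linear(hidden_size, num_labels)

    def forward(self, pooled_output):
        return self.classifier(pooled_output)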
Example #4
File: gnn.py Project: nyu-dl/dl4chem-mgm
 def __init__(self, params):
     from tape import ProteinBertModel
     super().__init__(params)
     del self.node_embedding_layers
     self.seq_output_dim = params.seq_output_dim
     self.seq_model = ProteinBertModel.from_pretrained(
         'bert-base', cache_dir='data/proteins/tape_pretrained/')
     self.seq2dim_h = nn.Linear(self.seq_output_dim, self.dim_h)
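
A hedged sketch of how the pretrained encoder and the seq2dim_h projection above might be combined at inference time; the standalone usage and the value 128 for params.dim_h are assumptions for illustration, not code from the project:

import torch
import torch.nn as nn
from tape import ProteinBertModel, TAPETokenizer

seq_model = ProteinBertModel.from_pretrained('bert-base')
seq2dim_h = nn.Linear(768, 128)  # 768 for params.seq_output_dim, 128 stands in for params.dim_h

tokenizer = TAPETokenizer(vocab='iupac')
token_ids = torch.tensor([tokenizer.encode('GCTVEDRCLIGMGAILLNGCVIGSGSLVAAGALITQ')])
with torch.no_grad():
    pooled = seq_model(token_ids)[1]  # (1, 768)
graph_h = seq2dim_h(pooled)           # (1, 128), fed into the graph model in the original project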
Example #5
    def __init__(self, use_gpu, embedding_size, pretraining='bert-base'):
        super(BaseModel, self).__init__()

        # initialize model variables
        self.use_gpu = use_gpu
        self.historical_rmsd_avg_values = list()
        self.historical_drmsd_avg_values = list()

        if pretraining == 'bert-base':
            self.emb = ProteinBertModel.from_pretrained(pretraining)
            self.embedding_size = 768
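
Only the 'bert-base' branch is shown above; a hedged sketch of how an analogous UniRep branch could look, using the babbler-1900 checkpoint referenced in Example #11 (this helper and its UniRep branch are assumptions, not part of the original class):

from tape import ProteinBertModel, UniRepModel


def build_embedder(pretraining='bert-base'):
    # Hypothetical helper mirroring the branch above; returns the model and its
    # embedding size (768 for the BERT encoder, 1900 for UniRep).
    if pretraining == 'bert-base':
        return ProteinBertModel.from_pretrained(pretraining), 768
    if pretraining == 'unirep':
        return UniRepModel.from_pretrained('babbler-1900'), 1900
    raise ValueError(f'unknown pretraining option: {pretraining}')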
Example #6
def test_basic():
    import torch
    from tape import ProteinBertModel, ProteinBertConfig, TAPETokenizer  # type: ignore

    config = ProteinBertConfig(hidden_size=12,
                               intermediate_size=12 * 4,
                               num_hidden_layers=2)
    model = ProteinBertModel(config)
    tokenizer = TAPETokenizer(vocab='iupac')

    sequence = 'GCTVEDRCLIGMGAILLNGCVIGSGSLVAAGALITQ'
    token_ids = torch.tensor([tokenizer.encode(sequence)])
    output = model(token_ids)
    sequence_output = output[0]  # noqa
    pooled_output = output[1]  # noqa
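    # Hedged addition (not in the original test): with hidden_size=12 and the
    # two special tokens the iupac tokenizer adds, the expected shapes are:
    assert sequence_output.shape == (1, len(sequence) + 2, 12)
    assert pooled_output.shape == (1, 12)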
Example #7
File: utgn.py Project: alifkurniawan/tesis
    def __init__(self,
                 dropout=0.5,
                 alphabet_size=60,
                 input_dim=20,
                 num_vocab=256,
                 n_hid=512,
                 embedding_size=21,
                 n_head=8,
                 n_layers=6,
                 use_gpu=False,
                 batch_size=32,
                 pretraining='bert-base',
                 use_aa=True,
                 use_pssm=True,
                 use_token=False):
        super().__init__(use_gpu, embedding_size, pretraining)
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        self.input_dim = input_dim
        self.embedding_size = embedding_size
        self.embedding_dim = embedding_size
        self.use_gpu = use_gpu
        self.num_vocab = num_vocab
        self.batch_size = batch_size

        self.src_mask = None
        if pretraining != -1:
            self.emb = ProteinBertModel.from_pretrained(pretraining)

        self.W = nn.Linear(self.embedding_dim, self.num_vocab)

        self.pos_encoder = PositionalEncoding(num_vocab)

        encoder_layers = TransformerEncoderLayer(num_vocab, n_head, n_hid,
                                                 dropout)

        # self.transformer_encoder = TransformerEncoder(encoder_layers, n_layers)
        encoders = TransformerEncoder(encoder_layers, n_layers)
        self.transformer_encoder = UniversalTransformer(encoders, n_layers)

        # initialize alphabet to random values between -pi and pi
        u = torch.distributions.Uniform(-3.14, 3.14)
        self.alphabet = nn.Parameter(u.rsample(torch.Size([alphabet_size, 3])))

        self._dehidrals = Dihedral(num_vocab, alphabet_size, self.batch_size)
        self.use_aa = use_aa
        self.use_pssm = use_pssm
        self.use_token = use_token
Example #8
    def __init__(self, cfg=None):
        if not cfg:
            cfg = DEFAULT_CONFIG

        super().__init__(cfg)
        self.cfg = cfg
        self.device = torch.device('cuda') if cfg.use_cuda else torch.device('cpu')

        # get encoder:
        if self.cfg.encoder_type == "patched_conv":
            self.enc = PatchedConvEncoder(self.cfg)
            self._enc_hidden_dim = cfg.enc_hidden_dim
        elif self.cfg.encoder_type == "patched_conv_large":
            self.enc = PatchedConvEncoder2(self.cfg)
            self._enc_hidden_dim = cfg.enc_hidden_dim
        elif self.cfg.encoder_type == "bert":
            # use pretrained weights
            self.enc = ProteinBertModel.from_pretrained("bert-base")
            self._enc_hidden_dim = 768
        elif self.cfg.encoder_type == "resnet":
            # kaiming initialized weights
            resnet_cfg = ProteinResNetConfig()          # use defaults
            self.enc = ProteinResNetModel(resnet_cfg)   # default: 512
            self._enc_hidden_dim = resnet_cfg.hidden_size
        else:
            self._enc_hidden_dim = None
            raise NotImplementedError
        self.enc.to(self.device)

        # Get autoregressor
        # for dot product critic, z and c has same hidden dimensions
        if self.cfg.autoregressor_type == "gru":
            self.autoregressor = GRUAutoregressor(cfg, self._enc_hidden_dim, self._enc_hidden_dim).to(self.device)
        elif self.cfg.autoregressor_type == "lstm":
            self.autoregressor = LSTMAutoregressor(cfg, self._enc_hidden_dim, self._enc_hidden_dim).to(self.device)
        else:
            raise NotImplementedError

        if cfg.critic_type == "bilinear":
            raise NotImplementedError
        if cfg.critic_type == "dot_product":
            # parameterless, but make a list for each k just for compatibility with using per-position critics
            # a la original CPC paper
            self.critics = [batch_dot_product] * cfg.K
        else:
            raise NotImplementedError
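
batch_dot_product is defined elsewhere in the source project; a hypothetical stand-in that matches how a parameterless dot-product critic is typically written for CPC, shown for illustration only:

import torch


def batch_dot_product(z, c):
    # Hypothetical: per-sample dot product between an encoded patch z and the
    # autoregressor context c, both of shape (batch, hidden).
    return torch.sum(z * c, dim=-1)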
Example #9
         model = BertModel.from_pretrained("Rostlab/prot_bert_bfd",
                                           output_attentions=True)
         tokenizer = AutoTokenizer.from_pretrained("Rostlab/prot_bert_bfd",
                                                   do_lower_case=False)
     elif model_version == 'prot_bert':
         model = BertModel.from_pretrained("Rostlab/prot_bert",
                                           output_attentions=True)
         tokenizer = AutoTokenizer.from_pretrained("Rostlab/prot_bert",
                                                   do_lower_case=False)
     elif model_version == 'prot_albert':
         model = AlbertModel.from_pretrained("Rostlab/prot_albert",
                                             output_attentions=True)
         tokenizer = AlbertTokenizer.from_pretrained("Rostlab/prot_albert",
                                                     do_lower_case=False)
     else:
         model = ProteinBertModel.from_pretrained(model_version,
                                                  output_attentions=True)
         tokenizer = TAPETokenizer()
     num_layers = model.config.num_hidden_layers
     num_heads = model.config.num_attention_heads
 elif args.model == 'xlnet':
     model_version = args.model_version
     if model_version == 'prot_xlnet':
         model = XLNetModel.from_pretrained("Rostlab/prot_xlnet",
                                            output_attentions=True)
         tokenizer = XLNetTokenizer.from_pretrained("Rostlab/prot_xlnet",
                                                    do_lower_case=False)
     else:
         raise ValueError('Invalid model version')
     num_layers = model.config.n_layer
     num_heads = model.config.n_head
 else:
Example #10
    f_savepath = f'{savepath}{rpr}/'
    os.makedirs(f_savepath, exist_ok=True)
    
    all_indices = {}
    
    if rpr=='protein':
        import torch
        from tape import ProteinBertModel, TAPETokenizer

        unique_protein = list(set(all_protein))
        print(f'n unique protein used to compute repr: {len(unique_protein)}')
        
        unique_prot_to_idx = get_data_to_idx_mapping(all_protein)
        
        # init protein pretrained model
        model = ProteinBertModel.from_pretrained('bert-base')
        tokenizer = TAPETokenizer(vocab='iupac') 
        
        results = Parallel(n_jobs=nworkers)(delayed(get_PROTrepr)(i,x,model,tokenizer,f_savepath) for i,x in enumerate(unique_protein))
        
        all_indices = {}
        for x in results:
            i = x[0]
            prot = x[1]
            _all_idx = unique_prot_to_idx[prot]
            for _idx in _all_idx:
                all_indices[_idx] = i
        
        hp.save_pkl(f'{savepath}all_indices_{rpr}.pkl', all_indices)
        
        z_norma = True
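
get_PROTrepr is a helper from the source project; a hypothetical stand-in consistent with how it is called above (one worker encodes one sequence with the pretrained model, saves the result under its index, and returns the index and sequence):

import numpy as np
import torch


def get_PROTrepr(i, sequence, model, tokenizer, savepath):
    # Hypothetical stand-in: pooled BERT representation for one sequence,
    # saved under its index; the real implementation lives in the project.
    with torch.no_grad():
        token_ids = torch.tensor([tokenizer.encode(sequence)])
        pooled = model(token_ids)[1].squeeze(0).numpy()
    np.save(f'{savepath}{i}.npy', pooled)
    return i, sequence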
Example #11
            pass

    cpc_args.__dict__ = default_cfg

    base_model = patched_cpc.PatchedCPCModel(cpc_args)
    state_dict = dict(torch.load(base_model_path / 'best.ckpt'))
    for i in list(state_dict.keys()):
        if i.startswith('module.'):
            state_dict[i[7:]] = state_dict[i]
            del state_dict[i]
    base_model.load_state_dict(state_dict)
    base_model = heads.CPCProtEmbedding(base_model.to(device).eval(),
                                        emb_type='patched_cpc')
    emb_func = getattr(base_model, funcs[args.task])
elif args.model_type == 'bert':
    base_model = ProteinBertModel.from_pretrained('bert-base').eval().to(
        device)
elif args.model_type == 'unirep':
    base_model = UniRepModel.from_pretrained('babbler-1900').eval().to(device)

if args.model_type in ['unirep', 'bert']:
    if args.task == 'secondary_structure':
        # per-token embeddings: n_samples x n_tokens x emb_length
        emb_func = lambda x: base_model(x['primary'])[0]
    else:
        # pooled embeddings: n_samples x emb_length
        emb_func = lambda x: base_model(x['primary'])[1]
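# Hedged usage note (not in the original script): emb_func consumes a batch
# dict whose 'primary' entry holds token ids, e.g.
#   batch = {'primary': torch.tensor([tokenizer.encode(seq)])}
#   embeddings = emb_func(batch)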

if args.task == 'fluorescence':
    dataset_cls = FluorescenceDataset
elif args.task == 'stability':
    dataset_cls = StabilityDataset