def test_forcedownload():
    model = ProteinBertModel.from_pretrained('bert-base')
    url = BERT_PRETRAINED_MODEL_ARCHIVE_MAP['bert-base']
    filename = url_to_filename(url, get_etag(url))
    wholepath = get_cache() / filename
    oldtime = time.ctime(os.path.getmtime(wholepath))

    model = ProteinBertModel.from_pretrained('bert-base', force_download=True)
    newtime = time.ctime(os.path.getmtime(wholepath))
    assert newtime != oldtime

    # Deploy model
    # iupac is the vocab for TAPE models, use unirep for the UniRep model
    tokenizer = TAPETokenizer(vocab='iupac')

    # Pfam Family: Hexapep, Clan: CL0536
    sequence = 'GCTVEDRCLIGMGAILLNGCVIGSGSLVAAGALITQ'
    token_ids = torch.tensor([tokenizer.encode(sequence)])
    model(token_ids)
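# A self-contained sketch (not from the original test; assumes the pretrained
# weights can be downloaded) of unpacking the outputs that the call above
# discards. TAPE's ProteinBertModel returns a tuple whose first two elements
# are the per-residue and pooled representations.
def example_unpack_outputs():
    import torch
    from tape import ProteinBertModel, TAPETokenizer

    model = ProteinBertModel.from_pretrained('bert-base')
    tokenizer = TAPETokenizer(vocab='iupac')
    sequence = 'GCTVEDRCLIGMGAILLNGCVIGSGSLVAAGALITQ'
    token_ids = torch.tensor([tokenizer.encode(sequence)])
    sequence_output, pooled_output = model(token_ids)[:2]
    # sequence_output: [1, len(sequence) + 2, 768] (start/stop tokens included)
    # pooled_output:   [1, 768]
    return sequence_output, pooled_output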
def __init__(self, dataset_sequences):
    self.dataset_sequences = dataset_sequences
    self.model = ProteinBertModel.from_pretrained('bert-base')
    # iupac is the vocab for TAPE models, use unirep for the UniRep model
    self.tokenizer = TAPETokenizer(vocab='iupac')
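# A hedged sketch (not part of the original class) of how the model and
# tokenizer initialised above could embed the stored sequences; the method
# name `embed_all` and the mean-pooling choice are assumptions.
def embed_all(self):
    import torch
    embeddings = []
    with torch.no_grad():
        for seq in self.dataset_sequences:
            token_ids = torch.tensor([self.tokenizer.encode(seq)])
            sequence_output = self.model(token_ids)[0]  # [1, L + 2, 768]
            embeddings.append(sequence_output.mean(dim=1).squeeze(0))
    return torch.stack(embeddings)  # [n_sequences, 768]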
def __init__(self, config):
    super().__init__(config)
    self.bert = ProteinBertModel(config)
    self.classify = MHCHead(config.hidden_size, config.num_labels)
    self.init_weights()
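# A minimal sketch (not from the original source) of a matching forward pass,
# following the usual TAPE downstream-head pattern; the exact MHCHead call
# signature is an assumption (here it is assumed to map the pooled output to
# per-class logits).
def forward(self, input_ids, input_mask=None):
    outputs = self.bert(input_ids, input_mask=input_mask)
    sequence_output, pooled_output = outputs[:2]
    return self.classify(pooled_output)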
def __init__(self, params):
    from tape import ProteinBertModel

    super().__init__(params)
    del self.node_embedding_layers
    self.seq_output_dim = params.seq_output_dim
    self.seq_model = ProteinBertModel.from_pretrained(
        'bert-base', cache_dir='data/proteins/tape_pretrained/')
    self.seq2dim_h = nn.Linear(self.seq_output_dim, self.dim_h)
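# A hedged sketch (not from the original source) of combining the pretrained
# sequence model with the projection above; the method name and the use of
# the pooled output are assumptions, and params.seq_output_dim is assumed to
# match the 768-dim 'bert-base' output.
def embed_sequence(self, token_ids):
    sequence_output, pooled_output = self.seq_model(token_ids)[:2]
    return self.seq2dim_h(pooled_output)  # project 768 -> dim_h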
def __init__(self, use_gpu, embedding_size, pretraining='bert-base'):
    super(BaseModel, self).__init__()

    # initialize model variables
    self.use_gpu = use_gpu
    self.historical_rmsd_avg_values = list()
    self.historical_drmsd_avg_values = list()
    if pretraining == 'bert-base':
        self.emb = ProteinBertModel.from_pretrained(pretraining)
        self.embedding_size = 768
def test_basic():
    import torch
    from tape import ProteinBertModel, ProteinBertConfig, TAPETokenizer  # type: ignore

    config = ProteinBertConfig(hidden_size=12,
                               intermediate_size=12 * 4,
                               num_hidden_layers=2)
    model = ProteinBertModel(config)
    tokenizer = TAPETokenizer(vocab='iupac')

    sequence = 'GCTVEDRCLIGMGAILLNGCVIGSGSLVAAGALITQ'
    token_ids = torch.tensor([tokenizer.encode(sequence)])
    output = model(token_ids)
    sequence_output = output[0]  # noqa
    pooled_output = output[1]  # noqa
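# A small sanity-check sketch (an addition, not part of the original test):
# with hidden_size=12 the outputs are expected to have the shapes below; the
# "+ 2" accounts for the start/stop tokens the IUPAC tokenizer adds.
def test_basic_shapes():
    import torch
    from tape import ProteinBertModel, ProteinBertConfig, TAPETokenizer

    config = ProteinBertConfig(hidden_size=12,
                               intermediate_size=12 * 4,
                               num_hidden_layers=2)
    model = ProteinBertModel(config)
    tokenizer = TAPETokenizer(vocab='iupac')

    sequence = 'GCTVEDRCLIGMGAILLNGCVIGSGSLVAAGALITQ'
    token_ids = torch.tensor([tokenizer.encode(sequence)])
    sequence_output, pooled_output = model(token_ids)[:2]
    assert sequence_output.shape == (1, len(sequence) + 2, 12)
    assert pooled_output.shape == (1, 12)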
def __init__(self, dropout=0.5, alphabet_size=60, input_dim=20,
             num_vocab=256, n_hid=512, embedding_size=21, n_head=8,
             n_layers=6, use_gpu=False, batch_size=32,
             pretraining='bert-base', use_aa=True, use_pssm=True,
             use_token=False):
    super().__init__(use_gpu, embedding_size, pretraining)
    self.device = torch.device(
        "cuda" if torch.cuda.is_available() else "cpu")
    self.input_dim = input_dim
    self.embedding_size = embedding_size
    self.embedding_dim = embedding_size
    self.use_gpu = use_gpu
    self.num_vocab = num_vocab
    self.batch_size = batch_size
    self.src_mask = None
    if pretraining != -1:  # `is not -1` compared identity, not value
        self.emb = ProteinBertModel.from_pretrained(pretraining)
    self.W = nn.Linear(self.embedding_dim, self.num_vocab)

    self.pos_encoder = PositionalEncoding(num_vocab)
    encoder_layers = TransformerEncoderLayer(num_vocab, n_head, n_hid, dropout)
    # self.transformer_encoder = TransformerEncoder(encoder_layers, n_layers)
    encoders = TransformerEncoder(encoder_layers, n_layers)
    self.transformer_encoder = UniversalTransformer(encoders, n_layers)

    # initialize alphabet to random values between -pi and pi
    u = torch.distributions.Uniform(-3.14, 3.14)
    self.alphabet = nn.Parameter(u.rsample(torch.Size([alphabet_size, 3])))
    self._dehidrals = Dihedral(num_vocab, alphabet_size, self.batch_size)

    self.use_aa = use_aa
    self.use_pssm = use_pssm
    self.use_token = use_token
def __init__(self, cfg=None):
    if not cfg:
        cfg = DEFAULT_CONFIG
    super().__init__(cfg)
    self.cfg = cfg
    self.device = torch.device('cuda') if cfg.use_cuda else torch.device('cpu')

    # get encoder:
    if self.cfg.encoder_type == "patched_conv":
        self.enc = PatchedConvEncoder(self.cfg)
        self._enc_hidden_dim = cfg.enc_hidden_dim
    elif self.cfg.encoder_type == "patched_conv_large":
        self.enc = PatchedConvEncoder2(self.cfg)
        self._enc_hidden_dim = cfg.enc_hidden_dim
    elif self.cfg.encoder_type == "bert":
        # use pretrained weights
        self.enc = ProteinBertModel.from_pretrained("bert-base")
        self._enc_hidden_dim = 768
    elif self.cfg.encoder_type == "resnet":
        # kaiming initialized weights
        resnet_cfg = ProteinResNetConfig()  # use defaults
        self.enc = ProteinResNetModel(resnet_cfg)  # default: 512
        self._enc_hidden_dim = resnet_cfg.hidden_size
    else:
        self._enc_hidden_dim = None
        raise NotImplementedError

    self.enc.to(self.device)

    # Get autoregressor
    # for the dot product critic, z and c have the same hidden dimensions
    if self.cfg.autoregressor_type == "gru":
        self.autoregressor = GRUAutoregressor(
            cfg, self._enc_hidden_dim, self._enc_hidden_dim).to(self.device)
    elif self.cfg.autoregressor_type == "lstm":
        self.autoregressor = LSTMAutoregressor(
            cfg, self._enc_hidden_dim, self._enc_hidden_dim).to(self.device)
    else:
        raise NotImplementedError

    if cfg.critic_type == "bilinear":
        raise NotImplementedError
    if cfg.critic_type == "dot_product":
        # parameterless, but make a list for each k just for compatibility
        # with using per-position critics a la the original CPC paper
        self.critics = [batch_dot_product] * cfg.K
    else:
        raise NotImplementedError
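# `batch_dot_product` is referenced above but not defined in this snippet; one
# plausible minimal sketch of a parameterless CPC critic (an assumption about
# the original helper): score each (z, c) pair by a dot product over the
# hidden dimension.
def batch_dot_product(z, c):
    # z, c: [batch, hidden] -> [batch] similarity scores
    return (z * c).sum(dim=-1)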
        model = BertModel.from_pretrained("Rostlab/prot_bert_bfd", output_attentions=True)
        tokenizer = AutoTokenizer.from_pretrained("Rostlab/prot_bert_bfd", do_lower_case=False)
    elif model_version == 'prot_bert':
        model = BertModel.from_pretrained("Rostlab/prot_bert", output_attentions=True)
        tokenizer = AutoTokenizer.from_pretrained("Rostlab/prot_bert", do_lower_case=False)
    elif model_version == 'prot_albert':
        model = AlbertModel.from_pretrained("Rostlab/prot_albert", output_attentions=True)
        tokenizer = AlbertTokenizer.from_pretrained("Rostlab/prot_albert", do_lower_case=False)
    else:
        model = ProteinBertModel.from_pretrained(model_version, output_attentions=True)
        tokenizer = TAPETokenizer()
    num_layers = model.config.num_hidden_layers
    num_heads = model.config.num_attention_heads
elif args.model == 'xlnet':
    model_version = args.model_version
    if model_version == 'prot_xlnet':
        model = XLNetModel.from_pretrained("Rostlab/prot_xlnet", output_attentions=True)
        tokenizer = XLNetTokenizer.from_pretrained("Rostlab/prot_xlnet", do_lower_case=False)
    else:
        raise ValueError('Invalid model version')
    num_layers = model.config.n_layer
    num_heads = model.config.n_head
else:
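# Usage note (hedged, based on the Rostlab model cards): the HuggingFace
# ProtBert/ProtAlbert/ProtXLNet tokenizers above expect uppercase amino acids
# separated by spaces, whereas TAPETokenizer.encode takes the raw sequence
# string. Illustrative only; the example sequence is an assumption:
#   hf_inputs = tokenizer("M K T A Y I A K Q R", return_tensors="pt")
#   tape_ids = torch.tensor([TAPETokenizer().encode("MKTAYIAKQR")])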
f_savepath = f'{savepath}{rpr}/'
os.makedirs(f_savepath, exist_ok=True)
all_indices = {}

if rpr == 'protein':
    import torch
    from tape import ProteinBertModel, TAPETokenizer

    unique_protein = list(set(all_protein))
    print(f'n unique protein used to compute repr: {len(unique_protein)}')
    unique_prot_to_idx = get_data_to_idx_mapping(all_protein)

    # init protein pretrained model
    model = ProteinBertModel.from_pretrained('bert-base')
    tokenizer = TAPETokenizer(vocab='iupac')

    results = Parallel(n_jobs=nworkers)(
        delayed(get_PROTrepr)(i, x, model, tokenizer, f_savepath)
        for i, x in enumerate(unique_protein))

    all_indices = {}
    for x in results:
        i = x[0]
        prot = x[1]
        _all_idx = unique_prot_to_idx[prot]
        for _idx in _all_idx:
            all_indices[_idx] = i

    hp.save_pkl(f'{savepath}all_indices_{rpr}.pkl', all_indices)

z_norma = True
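# `get_PROTrepr` is defined elsewhere; a hypothetical sketch of such a helper,
# constrained only by its call above (index, sequence, model, tokenizer, save
# path) and by the result unpacking (each result is (index, sequence)). The
# pooled-embedding choice and the .npy filename are assumptions.
def get_PROTrepr(i, seq, model, tokenizer, f_savepath):
    import numpy as np
    import torch
    with torch.no_grad():
        token_ids = torch.tensor([tokenizer.encode(seq)])
        pooled = model(token_ids)[1].squeeze(0).numpy()  # [768] per-sequence embedding
    np.save(f'{f_savepath}{i}.npy', pooled)
    return (i, seq)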
    pass
    cpc_args.__dict__ = default_cfg
    base_model = patched_cpc.PatchedCPCModel(cpc_args)
    state_dict = dict(torch.load(base_model_path / 'best.ckpt'))
    for i in list(state_dict.keys()):
        if i.startswith('module.'):
            state_dict[i[7:]] = state_dict[i]
            del state_dict[i]
    base_model.load_state_dict(state_dict)
    base_model = heads.CPCProtEmbedding(base_model.to(device).eval(),
                                        emb_type='patched_cpc')
    emb_func = getattr(base_model, funcs[args.task])
elif args.model_type == 'bert':
    base_model = ProteinBertModel.from_pretrained('bert-base').eval().to(device)
elif args.model_type == 'unirep':
    base_model = UniRepModel.from_pretrained('babbler-1900').eval().to(device)

if args.model_type in ['unirep', 'bert']:
    if args.task == 'secondary_structure':
        emb_func = lambda x: base_model(x['primary'])[0]  # n_samples x n_tokens x emb_length
    else:
        emb_func = lambda x: base_model(x['primary'])[1]  # n_samples x emb_length

if args.task == 'fluorescence':
    dataset_cls = FluorescenceDataset
elif args.task == 'stability':
    dataset_cls = StabilityDataset
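# A hedged usage sketch for the 'bert'/'unirep' branches above: the lambdas
# expect a batch dict with already-tokenised ids under 'primary' (the TAPE
# dataset convention). TAPETokenizer and the example sequence here are
# illustrative additions, not from the original script.
# from tape import TAPETokenizer
# example_batch = {'primary': torch.tensor([TAPETokenizer(vocab='iupac').encode('MKTAYIAKQR')]).to(device)}
# example_emb = emb_func(example_batch)
# secondary_structure: [1, n_tokens, emb_length]; other tasks: [1, emb_length]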