def test_basic():
    import torch
    from tape import ProteinBertModel, ProteinBertConfig, TAPETokenizer  # type: ignore

    # Build a small ProteinBert from a custom config (no pretrained weights)
    config = ProteinBertConfig(hidden_size=12,
                               intermediate_size=12 * 4,
                               num_hidden_layers=2)
    model = ProteinBertModel(config)
    tokenizer = TAPETokenizer(vocab='iupac')

    sequence = 'GCTVEDRCLIGMGAILLNGCVIGSGSLVAAGALITQ'
    token_ids = torch.tensor([tokenizer.encode(sequence)])
    output = model(token_ids)
    sequence_output = output[0]  # noqa
    pooled_output = output[1]  # noqa
import os
import time

import torch
from tape import ProteinBertModel, TAPETokenizer
# NOTE: the helpers below are internal to tape and their module paths may
# differ between versions; these follow tape's file_utils / modeling_bert layout.
from tape.file_utils import get_cache, get_etag, url_to_filename
from tape.models.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP


def test_forcedownload():
    model = ProteinBertModel.from_pretrained('bert-base')
    url = BERT_PRETRAINED_MODEL_ARCHIVE_MAP['bert-base']
    filename = url_to_filename(url, get_etag(url))
    wholepath = get_cache() / filename
    oldtime = time.ctime(os.path.getmtime(wholepath))
    model = ProteinBertModel.from_pretrained('bert-base', force_download=True)
    newtime = time.ctime(os.path.getmtime(wholepath))
    # force_download=True must re-fetch the weights, so the mtime changes
    assert newtime != oldtime

    # Deploy model
    # iupac is the vocab for TAPE models, use unirep for the UniRep model
    tokenizer = TAPETokenizer(vocab='iupac')
    # Pfam Family: Hexapep, Clan: CL0536
    sequence = 'GCTVEDRCLIGMGAILLNGCVIGSGSLVAAGALITQ'
    token_ids = torch.tensor([tokenizer.encode(sequence)])
    model(token_ids)
import numpy as np
import pandas as pd
import torch
from tape import ProteinBertModel, TAPETokenizer


class encoding_tape(object):

    def __init__(self, dataset_sequences):
        self.dataset_sequences = dataset_sequences
        self.model = ProteinBertModel.from_pretrained('bert-base')
        # iupac is the vocab for TAPE models, use unirep for the UniRep model
        self.tokenizer = TAPETokenizer(vocab='iupac')

    def apply_encoding(self):
        matrix_encoding = []
        for i in range(len(self.dataset_sequences)):
            try:
                token_ids = torch.tensor(
                    [self.tokenizer.encode(self.dataset_sequences['sequence'][i])])
                output = self.model(token_ids)
                sequence_output = output[0]
                # Mean-pool the per-residue embeddings into one fixed-length vector
                matrix_data = sequence_output[0].cpu().detach().numpy()
                encoding_avg = np.mean(matrix_data, axis=0).tolist()
                matrix_encoding.append(encoding_avg)
            except Exception:
                # Skip sequences that fail to encode (e.g. invalid characters)
                pass

        # One column per embedding dimension (768 for bert-base)
        header = ["P_" + str(i + 1) for i in range(len(matrix_encoding[0]))]
        self.dataset_encoding = pd.DataFrame(matrix_encoding, columns=header)
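# A minimal usage sketch for encoding_tape (hypothetical data): apply_encoding
# indexes self.dataset_sequences['sequence'][i], so it assumes a pandas
# DataFrame with a 'sequence' column.
import pandas as pd

df = pd.DataFrame({'sequence': ['GCTVEDRCLIGMGAILLNGCVIGSGSLVAAGALITQ']})
encoder = encoding_tape(df)
encoder.apply_encoding()
print(encoder.dataset_encoding.shape)  # (n_sequences, 768) for bert-base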
import sys
import time

import pandas as pd
import torch
from tape import TAPETokenizer, UniRepModel

DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


def UniRep_Embed(input_seq):
    """Embed a {id: sequence} dict with UniRep; return a DataFrame of
    1900-dimensional mean-pooled features indexed by sequence ID."""
    T0 = time.time()
    UNIREPEB_ = []
    PID = []
    print("UniRep Embedding...")
    model = UniRepModel.from_pretrained('babbler-1900')
    model = model.to(DEVICE)
    tokenizer = TAPETokenizer(vocab='unirep')

    for key, value in input_seq.items():
        sequence = value
        if len(sequence) == 0:
            print('# WARNING: sequence', key, 'has length=0. Skipping.',
                  file=sys.stderr)
            continue
        # Record the ID only for sequences that are actually embedded,
        # so the index stays aligned with the feature rows
        PID.append(key)
        with torch.no_grad():
            token_ids = torch.tensor([tokenizer.encode(sequence)])
            token_ids = token_ids.to(DEVICE)
            output = model(token_ids)
            unirep_output = output[0]
            unirep_output = torch.squeeze(unirep_output)
            # Average over residues to get one 1900-d vector per sequence
            unirep_output = unirep_output.mean(0)
            unirep_output = unirep_output.cpu().numpy()
            UNIREPEB_.append(unirep_output.tolist())

    unirep_feature = pd.DataFrame(UNIREPEB_)
    unirep_feature.columns = ["UniRep_F" + str(i + 1) for i in range(1900)]
    unirep_feature.index = PID
    unirep_feature.to_csv("./dataset/unirep_feature.csv")
    print("Getting Deep Representation Learning Features with UniRep is done.")
    print("it took %0.3f mins.\n" % ((time.time() - T0) / 60))
    return unirep_feature
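# A minimal usage sketch for UniRep_Embed (hypothetical IDs and sequence): it
# expects a dict mapping a sequence ID to an amino-acid string, and assumes
# a ./dataset/ directory exists for the CSV the function writes.
seqs = {'P1': 'GCTVEDRCLIGMGAILLNGCVIGSGSLVAAGALITQ'}
features = UniRep_Embed(seqs)
print(features.shape)  # (1, 1900)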
import numpy as np
import torch
from tape import ProteinBertModel, TAPETokenizer

# Assumed setup: train_seqs/train_labels and test_seqs/test_labels come from
# upstream code; num_of_features = 768 matches the bert-base hidden size.
model = ProteinBertModel.from_pretrained('bert-base')
tokenizer = TAPETokenizer(vocab='iupac')
num_of_features = 768

X = np.zeros((len(train_seqs), num_of_features))
y = np.zeros(len(train_seqs))
ind_X = np.zeros((len(test_seqs), num_of_features))
ind_y = np.zeros(len(test_seqs))

# Populate X: one mean-pooled embedding per training sequence
i = 0
for s in train_seqs:
    token_ids = torch.tensor([tokenizer.encode(s)])
    output = model(token_ids)
    sequence_output = output[0]
    pooled_output = output[1]
    X[i] = np.mean(sequence_output.detach().numpy(), axis=1)
    i = i + 1
y = np.array(train_labels)

# Populate the independent test matrix the same way
i = 0
for s in test_seqs:
    token_ids = torch.tensor([tokenizer.encode(s)])
    output = model(token_ids)
    sequence_output = output[0]
    ind_X[i] = np.mean(sequence_output.detach().numpy(), axis=1)
    i = i + 1
ind_y = np.array(test_labels)
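# A short follow-on sketch (not from the original script): with X/y and
# ind_X/ind_y built above, the embeddings can feed any standard classifier;
# scikit-learn's LogisticRegression is used purely as an illustration.
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(max_iter=1000)
clf.fit(X, y)
print("independent test accuracy:", clf.score(ind_X, ind_y))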
import sys

import pandas as pd
import torch
from tape import TAPETokenizer, UniRepModel

# NOTE: `fasta` (fasta2csv / parse_stream), `load_model`, `embed_sequence`,
# and `DEVICE` are assumed to be provided by the surrounding project; they
# are not part of tape.


def DRLF_Embed(fastaFile, outFile, device=-2):
    path = fastaFile
    count = 0
    SSAEMB_ = []
    UNIREPEB_ = []

    # Read the FASTA file
    inData = fasta.fasta2csv(path)
    Seqs = inData["Seq"]
    PID_ = []

    # SSA embedding
    print("SSA Embedding...")
    lm_embed, lstm_stack, proj = load_model(
        "./src/PretrainedModel/SSA_embed.model", use_cuda=True)
    with open(path, 'rb') as f:
        for name, sequence in fasta.parse_stream(f):
            pid = str(name.decode('utf-8'))
            if len(sequence) == 0:
                print('# WARNING: sequence', pid, 'has length=0. Skipping.',
                      file=sys.stderr)
                continue
            PID_.append(pid)
            z = embed_sequence(sequence, lm_embed, lstm_stack, proj,
                               final_only=True, pool='avg', use_cuda=True)
            SSAEMB_.append(z)
            count += 1
            print(sequence, '# {} sequences processed...'.format(count),
                  file=sys.stderr, end='\r')
    print("SSA embedding finished.")
    ssa_feature = pd.DataFrame(SSAEMB_)
    ssa_feature.columns = ["SSA_F" + str(i + 1) for i in range(121)]

    # UniRep embedding
    print("UniRep Embedding...")
    print("Loading UniRep Model...", file=sys.stderr, end='\r')
    model = UniRepModel.from_pretrained('babbler-1900')
    model = model.to(DEVICE)
    tokenizer = TAPETokenizer(vocab='unirep')
    count = 0
    PID_ = inData["PID"]
    for pid, sequence in zip(PID_, Seqs):
        if len(sequence) == 0:
            print('# WARNING: sequence', pid, 'has length=0. Skipping.',
                  file=sys.stderr)
            continue
        with torch.no_grad():
            token_ids = torch.tensor([tokenizer.encode(sequence)])
            token_ids = token_ids.to(DEVICE)
            output = model(token_ids)
            unirep_output = output[0]
            unirep_output = torch.squeeze(unirep_output)
            # Average over residues to get one 1900-d vector per sequence
            unirep_output = unirep_output.mean(0)
            unirep_output = unirep_output.cpu().numpy()
            UNIREPEB_.append(unirep_output.tolist())
        count += 1
        print(sequence, '# {} sequences processed...'.format(count),
              file=sys.stderr, end='\r')
    unirep_feature = pd.DataFrame(UNIREPEB_)
    unirep_feature.columns = ["UniRep_avg_F" + str(i + 1) for i in range(1900)]
    print("UniRep Embedding finished.")

    # Concatenate both feature sets, one row per input sequence
    Features = pd.concat([ssa_feature, unirep_feature], axis=1)
    Features.index = PID_
    Features.to_csv(outFile)
    print("Getting Deep Representation Learning Features is done.")
    return Features, inData
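# A minimal invocation sketch for DRLF_Embed (hypothetical file paths): it
# reads a FASTA file and writes a CSV of concatenated SSA + UniRep features.
# It assumes the pretrained SSA model exists at
# ./src/PretrainedModel/SSA_embed.model and that CUDA is available
# (the SSA loader is called with use_cuda=True).
features, data = DRLF_Embed('./dataset/sequences.fasta',
                            './dataset/drlf_features.csv')
print(features.shape)  # (n_sequences, 121 + 1900)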
import torch
from tape import ProteinBertModel, TAPETokenizer

model = ProteinBertModel.from_pretrained('bert-base')
# iupac is the vocab for TAPE models, use unirep for the UniRep model
tokenizer = TAPETokenizer(vocab='iupac')

# Pfam Family: Hexapep, Clan: CL0536
sequence = 'GCTVEDRCLIGMGAILLNGCVIGSGSLVAAGALITQ'
token_ids = torch.tensor([tokenizer.encode(sequence)])
output = model(token_ids)
sequence_output = output[0]
pooled_output = output[1]
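# A short follow-on sketch: the per-residue sequence_output can be mean-pooled
# into a single fixed-length vector, which is the pattern the snippets above
# use to build downstream feature matrices.
import numpy as np

embedding = np.mean(sequence_output.detach().numpy(), axis=1)[0]
print(embedding.shape)  # (768,) for bert-base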