def open_model(transformer_name):
    """Load a sentence-embedding model selected by name.

    Args:
        transformer_name: Identifier that must contain either
            "bert-base-nli-mean" (SBERT) or "s_infersent" (InferSent).

    Returns:
        A ready-to-use SentenceTransformer or InferSent model.

    Raises:
        ValueError: If transformer_name matches no known model family.
    """
    print(f"Opening {transformer_name} sentence embeddings")
    if "bert-base-nli-mean" in transformer_name:
        model = SentenceTransformer('bert-base-nli-mean-tokens')
    elif "s_infersent" in transformer_name:
        version = 2
        model_path = 'encoder/infersent%s.pkl' % version
        params_model = {'bsize': 64, 'word_emb_dim': 300,
                        'enc_lstm_dim': 2048, 'pool_type': 'max',
                        'dpout_model': 0.0, 'version': version}
        model = InferSent(params_model)
        model.load_state_dict(torch.load(model_path, encoding="utf-8"))
        # Word-vector path for InferSent; adapt the path to the local setup.
        w2v_path = 'data/embeddings/crawl-300d-2M.vec'
        model.set_w2v_path(w2v_path)
    else:
        # Bug fix: previously an unmatched name fell through to `return model`
        # with `model` unbound, raising an opaque NameError.
        raise ValueError(f"Unknown transformer name: {transformer_name!r}")
    return model
class SBERTembedder(Embedder):
    """Sentence embedder backed by a pretrained Sentence-BERT model."""

    def __init__(self):
        self.sbert = SentenceTransformer('paraphrase-distilroberta-base-v1')

    def embedding(self, texts):
        """Preprocess and encode an iterable of texts into one embedding batch.

        Note: the original annotated *texts* as ``str``, but the body
        iterates it and preprocesses each element, so it is an iterable
        of texts.
        """
        return self.sbert.encode([preproc(text) for text in texts])

    def transform(self, texts: list):
        """Embed each element of *texts*.

        NOTE(review): each element is itself fed to ``embedding``, which
        iterates it — confirm callers pass a list of text batches, not a
        flat list of strings (a bare string would be split into chars).
        """
        return [self.embedding(x) for x in texts]

    def fit(self, texts):
        """No-op: the underlying SBERT model is already pretrained."""
        pass

    def save(self, output_path: str):
        """Persist the model weights to *output_path*."""
        torch.save(self.sbert.state_dict(), output_path)

    def load(self, input_path: str):
        """Restore weights from *input_path* and switch to eval mode."""
        # Bug fix: load_state_dict mutates the model in place and returns an
        # IncompatibleKeys named tuple; the old code rebound self.sbert to
        # that tuple, breaking every subsequent call on the embedder.
        self.sbert.load_state_dict(torch.load(input_path))
        self.sbert.eval()
from pprint import pprint

# Bug fix: `cosine` was used below but never imported (NameError at runtime).
from scipy.spatial.distance import cosine
import torch

from models import InferSent

V = 2
MODEL_PATH = 'encoder/infersent%s.pkl' % V
params_model = {
    'bsize': 64,
    'word_emb_dim': 300,
    'enc_lstm_dim': 2048,
    'pool_type': 'max',
    'dpout_model': 0.0,
    'version': V,
}

model = InferSent(params_model)
model.load_state_dict(torch.load(MODEL_PATH))

# Pretrained GloVe vectors InferSent looks words up in.
W2V_PATH = 'GloVe/glove.840B.300d.txt'
model.set_w2v_path(W2V_PATH)

# NOTE(review): `sentences` must be defined elsewhere in this module.
model.build_vocab(sentences, tokenize=True)

query = "I had pizza and pasta"
# Bug fix: InferSent.encode expects a list of sentences; passing a bare
# string would make it treat each character as a sentence.
query_vec = model.encode([query])[0]
pprint(query_vec)

for sent in sentences:
    # NOTE(review): scipy's cosine() is a *distance* (1 - cosine similarity),
    # so smaller values mean more similar despite the printed label.
    sim = cosine(query_vec, model.encode([sent])[0])
    print("Sentence = ", sent, "; similarity = ", sim)