def open_model(transformer_name):
    """Load the sentence-embedding model selected by ``transformer_name``.

    The name is matched by substring, in this order:

    * ``"bert-base-nli-mean"`` -> ``SentenceTransformer('bert-base-nli-mean-tokens')``
    * ``"s_infersent"`` -> an ``InferSent`` version-2 encoder whose weights are
      loaded from ``encoder/infersent2.pkl`` and whose word-vector path is set
      to ``data/embeddings/crawl-300d-2M.vec`` (both relative paths).

    Args:
        transformer_name: Identifier containing one of the substrings above.

    Returns:
        The constructed model object.

    Raises:
        ValueError: If ``transformer_name`` matches none of the known models.
    """
    print(f"Opening {transformer_name} sentence embeddings")

    if "bert-base-nli-mean" in transformer_name:
        return SentenceTransformer('bert-base-nli-mean-tokens')

    if "s_infersent" in transformer_name:
        V = 2
        MODEL_PATH = 'encoder/infersent%s.pkl' % V
        params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                        'pool_type': 'max', 'dpout_model': 0.0, 'version': V}
        model = InferSent(params_model)
        model.load_state_dict(torch.load(MODEL_PATH, encoding="utf-8"))

        # Point the encoder at its word vectors; adapt the path to your setup.
        W2V_PATH = 'data/embeddings/crawl-300d-2M.vec'
        model.set_w2v_path(W2V_PATH)
        return model

    # Previously an unknown name fell through to `return model` and failed
    # with an opaque UnboundLocalError; fail with an actionable message.
    raise ValueError(
        f"Unknown sentence embedding model {transformer_name!r}; expected a "
        "name containing 'bert-base-nli-mean' or 's_infersent'"
    )
Exemplo n.º 2
0
class SBERTembedder(Embedder):
    """Embedder backed by the ``paraphrase-distilroberta-base-v1`` SentenceTransformer."""

    def __init__(self):
        self.sbert = SentenceTransformer('paraphrase-distilroberta-base-v1')

    def embedding(self, texts: str):
        """Run ``preproc`` on every element of ``texts`` and encode the batch.

        NOTE(review): the parameter is annotated ``str`` but it is iterated
        element by element, so a plain string would be preprocessed one
        character at a time -- confirm what callers actually pass.
        """
        return self.sbert.encode([preproc(text) for text in texts])

    def transform(self, texts: list):
        """Return a list holding ``self.embedding(x)`` for each item of ``texts``."""
        return [self.embedding(x) for x in texts]

    def fit(self, texts):
        """Do nothing; this embedder has no fitting step."""

    def save(self, output_path: str):
        """Write the model's ``state_dict`` to ``output_path`` with ``torch.save``."""
        torch.save(self.sbert.state_dict(), output_path)

    def load(self, input_path: str):
        """Restore weights saved by :meth:`save` and put the model in eval mode."""
        # load_state_dict updates the module in place; its return value is a
        # record of missing/unexpected keys, not the model. Assigning that
        # result to self.sbert (as before) replaced the model and made the
        # following .eval() call fail.
        self.sbert.load_state_dict(torch.load(input_path))
        self.sbert.eval()
Exemplo n.º 3
0
from models import InferSent
import torch

# Demo script: load an InferSent encoder, build its vocabulary from
# `sentences`, then score every sentence against a fixed query.
# NOTE(review): `sentences`, `pprint` and `cosine` are not defined in the
# visible part of this file -- confirm they are provided before this point.

V = 2
MODEL_PATH = 'encoder/infersent%s.pkl' % V
params_model = {
    'bsize': 64,
    'word_emb_dim': 300,
    'enc_lstm_dim': 2048,
    'pool_type': 'max',
    'dpout_model': 0.0,
    'version': V
}
model = InferSent(params_model)
model.load_state_dict(torch.load(MODEL_PATH))

W2V_PATH = 'GloVe/glove.840B.300d.txt'
model.set_w2v_path(W2V_PATH)

model.build_vocab(sentences, tokenize=True)

query = "I had pizza and pasta"
# Pass the query as a one-element list, matching the per-sentence call in the
# loop below; the bare string was handed to encode() as if it were the batch.
query_vec = model.encode([query])[0]
pprint(query_vec)

# One score per entry of `sentences`, in the same order.
similarity = []
for sent in sentences:
    sim = cosine(query_vec, model.encode([sent])[0])
    similarity.append(sim)  # the accumulator was declared but never filled
    print("Sentence = ", sent, "; similarity = ", sim)