class UniversalSentenceEncoder:
    """Sentence-pair similarity scorer backed by a pretrained InferSent model.

    Despite the class name, the encoder loaded here is InferSent (version 1
    with GloVe vectors), not Google's Universal Sentence Encoder.
    """

    def __init__(self):
        """Load the InferSent weights and the 100k most frequent word vectors.

        The model is moved to the GPU only when CUDA is actually available,
        so construction also works on CPU-only machines.
        """
        super().__init__()
        model_version = 1
        MODEL_PATH = "InferSent/encoder/infersent%s.pkl" % model_version
        W2V_PATH = (
            'InferSent/GloVe/glove.840B.300d.txt'
            if model_version == 1
            else 'fastText/crawl-300d-2M.vec'
        )
        params_model = {
            'bsize': 64,
            'word_emb_dim': 300,
            'enc_lstm_dim': 2048,
            'pool_type': 'max',
            'dpout_model': 0.0,
            'version': model_version,
        }

        self.model = InferSent(params_model)
        # Deserialize onto the CPU first so a checkpoint saved from a GPU
        # process still loads on a machine without CUDA.
        state_dict = torch.load(MODEL_PATH, map_location='cpu')
        self.model.load_state_dict(state_dict)
        self.model.eval()

        # Previously hard-coded to True, which raised on CPU-only hosts.
        use_cuda = torch.cuda.is_available()
        if use_cuda:
            self.model = self.model.cuda()

        self.model.set_w2v_path(W2V_PATH)
        self.model.build_vocab_k_words(K=100000)

    def semantic_sim(self, sents1, sents2):
        """Score the similarity of sentence pairs taken row by row.

        Args:
            sents1: Sentences passed to ``self.model.encode`` (no tokenizing).
            sents2: Sentences paired position-wise with ``sents1``.

        Returns:
            A NumPy array with one score per pair, computed as
            ``1 - arccos(cosine_similarity)``; identical directions give 1.0
            and the score decreases as the angle between embeddings grows.
        """
        embed1 = torch.tensor(self.model.encode(sents1, tokenize=False))
        embed2 = torch.tensor(self.model.encode(sents2, tokenize=False))

        # L2-normalize every row so the row-wise dot product is the cosine.
        sts_encode1 = embed1 / torch.norm(embed1, p=2, dim=1, keepdim=True)
        sts_encode2 = embed2 / torch.norm(embed2, p=2, dim=1, keepdim=True)
        cosine_similarities = torch.sum(sts_encode1 * sts_encode2, dim=1)

        # Rounding can push the cosine slightly outside [-1, 1], where acos
        # would return NaN, so clamp before taking the angle.
        clip_cosine_similarities = torch.clamp(cosine_similarities, -1.0, 1.0)
        scores = 1.0 - torch.acos(clip_cosine_similarities)
        return scores.cpu().numpy()
def compute_intent_vectors(self, sentences):
    """Return one averaged InferSent embedding per intent.

    ``sentences`` is handed to ``self.get_utterances_dict``, whose result is
    iterated as ``intent -> utterances``. Each intent's utterances are
    encoded and their embeddings averaged along axis 0.

    Returns:
        dict mapping each intent to its mean embedding.
    """
    # TODO IMPLEMENT CACHING!
    from InferSent.models import InferSent

    infersent_folder = Path('./InferSent')
    weights_file = infersent_folder / 'encoder' / 'infersent1.pkl'
    glove_file = infersent_folder / 'GloVe' / 'glove.840B.300d.txt'

    encoder = InferSent({
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': 1,
    })
    encoder.load_state_dict(torch.load(weights_file))
    if torch.cuda.is_available():
        encoder.cuda()
    encoder.set_w2v_path(glove_file)
    encoder.build_vocab_k_words(K=100000)

    utterances_by_intent = self.get_utterances_dict(sentences)
    intent_count = len(utterances_by_intent.items())

    vectors = {}
    for position, (intent, utterances) in enumerate(
            utterances_by_intent.items(), start=1):
        # Progress is logged before the intent is actually encoded.
        LOGGER.info('{}/{} done'.format(position, intent_count))
        vectors[intent] = np.mean(encoder.encode(utterances), axis=0)
    return vectors
def _load_pretrained_model(verbose=True):
    """Build an InferSent encoder from the module-level configuration.

    Args:
        verbose: When true, print the checkpoint path before loading.

    Returns:
        The InferSent instance with weights loaded, word-vector path set and
        a vocabulary of the ``_K_WORDS_VOCAB`` most frequent words built.
    """
    if verbose:
        print(f">>> Loading pretrained model from {_MODEL_PATH}")

    encoder = InferSent(_PARAMS_MODEL)
    weights = torch.load(_MODEL_PATH)
    encoder.load_state_dict(weights)

    encoder.set_w2v_path(_W2V_PATH)
    encoder.build_vocab_k_words(K=_K_WORDS_VOCAB)
    return encoder
def __init__(self):
    """Load the InferSent encoder (version 1) and store it on ``self.model``.

    Paths to the checkpoint and the word vectors are resolved relative to
    ``get_project_root()``.
    """
    # print("Initializing Infersent..")
    model_version = 1
    MODEL_PATH = get_project_root() / Path(
        "encoder/infersent%s.pkl" % model_version)

    encoder = InferSent({
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': model_version,
    })
    encoder.load_state_dict(torch.load(MODEL_PATH))

    # Version 1 pairs with GloVe vectors; any other version uses fastText.
    if model_version == 1:
        relative_w2v = 'GloVe/glove.840B.300d.txt'
    else:
        relative_w2v = '../fastText/crawl-300d-2M.vec'
    W2V_PATH = get_project_root() / Path(relative_w2v)
    encoder.set_w2v_path(W2V_PATH)

    # Restrict the vocabulary to the 100k most frequent word vectors.
    encoder.build_vocab_k_words(K=100000)
    self.model = encoder
def Start_chatbot():
    """Load the encoder and the question/answer data for the chatbot.

    Returns:
        A tuple ``(model, answers_by_question, embeddings)``: the InferSent
        model, a dict mapping each question line to the answer line at the
        same position, and a dict mapping each question to its embedding.
    """
    model_version = 1
    MODEL_PATH = "../InferSent/encoder/infersent%s.pkl" % model_version
    params_model = {
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': model_version,
    }
    model = InferSent(params_model)
    model.load_state_dict(torch.load(MODEL_PATH))

    # The model stays on the CPU unless this switch is flipped.
    use_cuda = False
    if use_cuda:
        model = model.cuda()

    if model_version == 1:
        W2V_PATH = '../data/glove.6B.300d.txt'
    else:
        W2V_PATH = '../dataset/fastText/crawl-300d-2M.vec'
    model.set_w2v_path(W2V_PATH)
    model.build_vocab_k_words(K=570000)

    with open('../data/questions.txt') as question_file:
        questions = [line.strip() for line in question_file]
    with open('../data/answers.txt') as answer_file:
        answers = [line.strip() for line in answer_file]

    answers_by_question = {}
    embeddings = {}
    for position, question in enumerate(questions):
        # Questions and answers are paired purely by line position.
        answers_by_question[question] = answers[position]
        embeddings[question] = model.encode([question])[0]

    return model, answers_by_question, embeddings
def _collect_sentences(res, tokenizer):
    """Gather the mid-length sentences of every usable entry in ``res``.

    Returns two parallel lists: the selected sentences and, for each one,
    the key of the entry it came from.
    """
    all_text = []
    all_keys = []
    prev_text = ""
    for key in res.keys():
        meta_data = res[key][0]  # First in the list
        if len(meta_data['text']) < 10:
            continue

        # Drop duplicates: an entry whose first 10 characters equal those of
        # the previously kept entry is treated as a repeat.
        current_text = meta_data['text'][:10]
        if current_text == prev_text:
            continue
        prev_text = current_text

        text = tokenizer.tokenize(meta_data['text'])
        if len(text) <= 2:
            continue

        # Drop the first sentence, then keep only sentences with more than
        # 30 and fewer than 50 whitespace-separated words.
        senteces = [s for s in text[1:] if 30 < len(s.split()) < 50]
        all_text.extend(senteces)
        all_keys.extend([key] * len(senteces))
    return all_text, all_keys


def _sum_predictions_by_key(all_keys, all_predictions):
    """Add up the prediction rows belonging to each key.

    Returns a dict, ordered by first appearance of each key, mapping the key
    to the element-wise sum of its rows in ``all_predictions``.
    """
    totals = {}
    for key, row in zip(all_keys, all_predictions):
        row = np.asarray(row, dtype=float)
        totals[key] = totals[key] + row if key in totals else row
    return totals


def process(channel):
    """Classify the transcripts of one channel and pickle the top topics.

    For every not-yet-visited ``../files/CableNews/<channel>/*.p`` file the
    selected sentences are embedded with InferSent, classified, and the two
    highest-scoring topics per entry are written to
    ``processed_data/<channel>/<counter>.p``.

    Args:
        channel: Channel name used to build the input, visit-log and output
            paths.
    """
    # Load the Classifier
    tf.reset_default_graph()
    NN = classifer()
    NN.load('nn-classifier-v2')

    # Load the sentence embedder
    model_version = 1
    MODEL_PATH = "InferSent/encoder/infersent%s.pkl" % model_version
    params_model = {
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': model_version,
    }
    model = InferSent(params_model)
    model.load_state_dict(torch.load(MODEL_PATH))

    # Keep it on CPU or put it on GPU
    use_cuda = False
    model = model.cuda() if use_cuda else model

    # If infersent1 -> use GloVe embeddings. If infersent2 -> use fastText.
    W2V_PATH = ('InferSent/GloVe/glove.840B.300d.txt' if model_version == 1
                else 'InferSent/fastText/crawl-300d-2M.vec')
    model.set_w2v_path(W2V_PATH)

    # Load embeddings of K most frequent words
    model.build_vocab_k_words(K=100000)

    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    all_files = glob.glob('../files/CableNews/%s/*.p' % channel)

    # NOTE(review): pickle can execute arbitrary code while loading; these
    # files are assumed to be produced by this project only.
    visit_log = '%s_visit.p' % (channel)
    with open(visit_log, 'rb') as handle:
        read_files = pickle.load(handle)
    already_read = set(read_files)  # O(1) membership; the list is what is saved
    counter = len(read_files)

    for file in tqdm(all_files):
        if file in already_read:
            continue
        read_files.append(file)
        already_read.add(file)

        # The visit log is only rewritten for roughly 30% of the files.
        if np.random.rand() < 0.3:
            with open(visit_log, 'wb') as handle:
                pickle.dump(read_files, handle)

        with open(file, 'rb') as handle:
            res = pickle.load(handle)

        all_text, all_keys = _collect_sentences(res, tokenizer)
        if len(all_text) == 0:
            continue

        # Calculate the embeddings and classify every sentence.
        all_embed = model.encode(all_text, bsize=128, tokenize=True,
                                 verbose=False)
        all_predictions = NN.predict(all_embed)[0]

        # Merge the probabilities per entry and take the top 2. Each entry's
        # topics are derived from that entry's own summed predictions, and
        # the final entry of the file is included as well.
        results = {}
        totals = _sum_predictions_by_key(all_keys, all_predictions)
        for key, total_prob in totals.items():
            top_two = np.flip(np.argsort(total_prob)[-2:])
            results[key] = {
                'Topics': list(Ticks[top_two]),
                'Probs': list(total_prob[top_two] * 100),
                'gender': res[key][0]['gender'],
                'persons': res[key][0]['persons'],
                'locations': res[key][0]['locations'],
            }

        out_path = 'processed_data/%s/%d.p' % (channel, counter)
        with open(out_path, 'wb') as handle:
            pickle.dump(results, handle)
        counter += 1
# Checkpoint and hyper-parameters of the InferSent (version 1) encoder.
model_pkl = '../InferSent/encoder/infersent1.pkl'
params_model = dict(
    bsize=64,
    word_emb_dim=300,
    enc_lstm_dim=2048,
    pool_type='max',
    dpout_model=0.0,
    version=1,
)
infer_sent_model = InferSent(params_model)
infer_sent_model.load_state_dict(torch.load(model_pkl))

# In[111]:

infer_sent_model.set_w2v_path(glove_w2v_loc)
infer_sent_model.build_vocab_k_words(K=100000)
# infer_sent_model.to(torch.device("cuda:0"))

# In[112]:

# Smoke-test call; its result is not stored.
infer_sent_model.encode(["This man is playing computer games"], tokenize=True)

# In[113]:


def get_embedding_for_context(ctx):
    """Encode ``ctx`` with the module-level InferSent model.

    A value that is not a list is wrapped in a one-element list first, so
    the encoder always receives a list.
    """
    contexts = ctx if isinstance(ctx, list) else [ctx]
    return infer_sent_model.encode(contexts, tokenize=True)
'version': 1 } model = InferSent(params_model) model.load_state_dict(torch.load(MODEL_PATH)) # CUDA use_cuda = True model = model.cuda() if use_cuda else model # If infersent1 -> use GloVe embeddings. If infersent2 -> use InferSent embeddings. W2V_PATH = 'GloVe/glove.840B.300d.txt' model.set_w2v_path(W2V_PATH) # Load embeddings of K most frequent words print('Load glove') model.build_vocab_k_words(K=2000000) # load sentence dis = '/home/shl183/nlp4note/classified_txt/discharge-sep' res = '/home/shl183/nlp4note/infersent' patients = os.listdir(dis) # exist = os.listdir(res) # with open('./tmp.pkl','wb') as f: # pickle.dump(exist,f) for patient in patients: # if patient in exist: # continue notes = os.listdir('{}/{}'.format(dis, patient)) for note in notes: tps = os.listdir('{}/{}/{}'.format(dis, patient, note)) if os.path.exists('{}/{}/{}'.format(res, patient, note)):
'pool_type': 'max', 'dpout_model': 0.0, 'version': VERSION } if VERSION == 1: W2V = 'C:/users/georg/Desktop/GloVe/glove.840B.300d.txt' else: W2V = 'C:/Users/georg/Desktop/fastText/crawl-300d-2M.vec' VOCAB_SIZE = 100000 NUM_STEPS = 300 # set up model model = InferSent(PARAMS).to(DEVICE) model.load_state_dict(torch.load(WEIGHTS)) model.set_w2v_path(W2V) word2vec = model.build_vocab_k_words(K=VOCAB_SIZE) # setup the NN-classifer vec2word = KNeighborsClassifier(n_neighbors=1) vecs = [] words = [] for key, val in word2vec.items(): if val.shape == (300, ): vecs.append(val) words.append(key) X = np.vstack(vecs) y = np.array(words) vec2word.fit(X, y) # NOTE: stacked word vectors are len x 1 x vec dim
params_model = { 'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048, 'pool_type': 'max', 'dpout_model': 0.0, 'version': V } infersent = InferSent(params_model) infersent.load_state_dict(torch.load(MODEL_PATH)) W2V_PATH = 'fastText/crawl-300d-2M.vec' infersent.set_w2v_path(W2V_PATH) # infersent.build_vocab(sentences, tokenize=True) infersent.build_vocab_k_words(K=100000) embeddings = infersent.encode(sentences, bsize=128, tokenize=False, verbose=True) print('nb sentences encoded : {0}'.format(len(embeddings))) #### End Paste parsedQs = [] with open(questions, "r+") as f: for q in f.readlines(): parsedQs.append(preprocessQs(q)) # print(parsedQs)s qEmbeddings = infersent.encode(parsedQs,