def compute_intent_vectors(self, sentences):
    # TODO: implement caching (see the sketch below this method).
    from InferSent.models import InferSent

    infersent_folder = Path('./InferSent')
    infersent_path = infersent_folder / 'encoder' / 'infersent1.pkl'
    MODEL_PARAMETERS = {
        'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
        'pool_type': 'max', 'dpout_model': 0.0, 'version': 1
    }
    W2V_PATH = infersent_folder / 'GloVe' / 'glove.840B.300d.txt'

    model = InferSent(MODEL_PARAMETERS)
    model.load_state_dict(torch.load(infersent_path))
    if torch.cuda.is_available():
        model.cuda()
    model.set_w2v_path(W2V_PATH)
    model.build_vocab_k_words(K=100000)

    utterances_dict = self.get_utterances_dict(sentences)
    vectors = {}
    # Average the sentence embeddings of each intent's utterances into a
    # single intent vector. (Loop variable renamed from `sentences`, which
    # shadowed the function argument.)
    for i, (intent, utterances) in enumerate(utterances_dict.items()):
        LOGGER.info('%d/%d done', i + 1, len(utterances_dict))
        embeddings = model.encode(utterances)
        vectors[intent] = np.mean(embeddings, axis=0)
    return vectors
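# A minimal caching sketch for the TODO above: persist the per-intent vectors
# keyed by a hash of the input utterances, so repeated runs skip reloading the
# model and re-encoding. This method would live on the same class as
# compute_intent_vectors; the cache location and hashing scheme are assumptions,
# not part of the original code.
import hashlib
import pickle
from pathlib import Path

def compute_intent_vectors_cached(self, sentences, cache_dir=Path('.cache')):
    cache_dir.mkdir(exist_ok=True)
    # Deterministic key: same input sentences -> same cache file.
    key = hashlib.sha1(repr(sorted(sentences)).encode('utf-8')).hexdigest()
    cache_file = cache_dir / ('intent_vectors_%s.pkl' % key)
    if cache_file.exists():
        with open(cache_file, 'rb') as f:
            return pickle.load(f)  # cache hit: skip model loading and encoding
    vectors = self.compute_intent_vectors(sentences)
    with open(cache_file, 'wb') as f:
        pickle.dump(vectors, f)
    return vectors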
class UniversalSentenceEncoder:
    def __init__(self):
        super().__init__()
        model_version = 1
        MODEL_PATH = "InferSent/encoder/infersent%s.pkl" % model_version
        W2V_PATH = ('InferSent/GloVe/glove.840B.300d.txt' if model_version == 1
                    else 'fastText/crawl-300d-2M.vec')
        params_model = {
            'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
            'pool_type': 'max', 'dpout_model': 0.0, 'version': model_version
        }
        self.model = InferSent(params_model)
        self.model.load_state_dict(torch.load(MODEL_PATH))
        self.model.eval()
        use_cuda = True
        self.model = self.model.cuda() if use_cuda else self.model
        self.model.set_w2v_path(W2V_PATH)
        self.model.build_vocab_k_words(K=100000)

    def semantic_sim(self, sents1, sents2):
        # Encode both batches, L2-normalize, then convert the (clipped) cosine
        # similarity to an angular similarity score: 1 - arccos(cos_sim).
        embed1 = torch.tensor(self.model.encode(sents1, tokenize=False))
        embed2 = torch.tensor(self.model.encode(sents2, tokenize=False))
        sts_encode1 = embed1 / torch.norm(embed1, p=2, dim=1, keepdim=True)
        sts_encode2 = embed2 / torch.norm(embed2, p=2, dim=1, keepdim=True)
        cosine_similarities = torch.sum(sts_encode1 * sts_encode2, dim=1)
        clip_cosine_similarities = torch.clamp(cosine_similarities, -1.0, 1.0)
        scores = 1.0 - torch.acos(clip_cosine_similarities)
        return scores.cpu().numpy()
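# Usage sketch for the wrapper above; the sentence pair is illustrative.
# scores[i] is the pairwise angular similarity of sents1[i] and sents2[i],
# so both input lists must have the same length.
encoder = UniversalSentenceEncoder()  # loads the model once, up front
scores = encoder.semantic_sim(
    ["a man is playing a guitar"],
    ["someone plays an instrument"],
)
print(scores)  # one float32 score per sentence pair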
def __init__(self, bsize=64, word_emb_dim=300, enc_lstm_dim=2048,
             pool_type='max', dpout_model=0.0, version=2,
             model_path='../infersent/infersent2.pkl',
             path_to_w2v='../fasttext/crawl-300d-2M.vec', use_cuda=True):
    self.version = version
    self.dpout_model = dpout_model
    self.pool_type = pool_type
    self.enc_lstm_dim = enc_lstm_dim
    self.word_emb_dim = word_emb_dim
    self.bsize = bsize

    model = InferSent({
        'bsize': bsize, 'word_emb_dim': word_emb_dim,
        'enc_lstm_dim': enc_lstm_dim, 'pool_type': pool_type,
        'dpout_model': dpout_model, 'version': version
    })
    model.load_state_dict(torch.load(model_path))
    model.set_w2v_path(path_to_w2v)
    self.model = model.cuda() if use_cuda else model
    # No vocab is built here; `first_call` presumably lets the encode path
    # build it lazily from the first batch of sentences.
    self.first_call = True
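# A hedged sketch of how the `first_call` flag above is typically used: build
# the InferSent vocabulary from the first batch, then extend it on later calls
# via update_vocab. The method name `encode` and this exact strategy are
# assumptions, not confirmed by the original snippet.
def encode(self, sentences):
    if self.first_call:
        self.model.build_vocab(sentences, tokenize=True)
        self.first_call = False
    else:
        self.model.update_vocab(sentences, tokenize=True)
    return self.model.encode(sentences, bsize=self.bsize, tokenize=True)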
def Start_chatbot():
    model_version = 1
    MODEL_PATH = "../InferSent/encoder/infersent%s.pkl" % model_version
    params_model = {
        'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
        'pool_type': 'max', 'dpout_model': 0.0, 'version': model_version
    }
    model = InferSent(params_model)
    model.load_state_dict(torch.load(MODEL_PATH))

    use_cuda = False
    model = model.cuda() if use_cuda else model

    W2V_PATH = ('../data/glove.6B.300d.txt' if model_version == 1
                else '../dataset/fastText/crawl-300d-2M.vec')
    model.set_w2v_path(W2V_PATH)
    model.build_vocab_k_words(K=570000)

    # Load parallel question/answer files (line i of answers.txt answers
    # line i of questions.txt) and precompute one embedding per question.
    with open('../data/questions.txt') as f:
        questions = [x.strip() for x in f.readlines()]
    with open('../data/answers.txt') as f:
        answers = [x.strip() for x in f.readlines()]

    qa_pairs = {}   # maps each question to its answer (avoids shadowing `dict`)
    embeddings = {}
    for question, answer in zip(questions, answers):
        qa_pairs[question] = answer
        embeddings[question] = model.encode([question])[0]
    return model, qa_pairs, embeddings
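# A minimal retrieval sketch built on Start_chatbot's return values: embed the
# user's query, find the nearest stored question by cosine similarity, and
# return its answer. `answer_query` is a hypothetical helper for illustration,
# not part of the original chatbot code.
import numpy as np

def answer_query(model, qa_pairs, embeddings, query):
    query_vec = model.encode([query])[0]
    best_question, best_score = None, -1.0
    for question, q_vec in embeddings.items():
        # Cosine similarity between the query and a stored question.
        score = np.dot(query_vec, q_vec) / (
            np.linalg.norm(query_vec) * np.linalg.norm(q_vec))
        if score > best_score:
            best_question, best_score = question, score
    return qa_pairs[best_question], best_score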
def process(channel):
    # Load the classifier
    tf.reset_default_graph()
    NN = classifer()
    NN.load('nn-classifier-v2')

    # Load the sentence embedder
    model_version = 1
    MODEL_PATH = "InferSent/encoder/infersent%s.pkl" % model_version
    params_model = {
        'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
        'pool_type': 'max', 'dpout_model': 0.0, 'version': model_version
    }
    model = InferSent(params_model)
    model.load_state_dict(torch.load(MODEL_PATH))

    # Keep it on CPU or put it on GPU
    use_cuda = False
    model = model.cuda() if use_cuda else model

    # If infersent1 -> use GloVe embeddings. If infersent2 -> use fastText embeddings.
    W2V_PATH = ('InferSent/GloVe/glove.840B.300d.txt' if model_version == 1
                else 'InferSent/fastText/crawl-300d-2M.vec')
    model.set_w2v_path(W2V_PATH)

    # Load embeddings of the K most frequent words
    model.build_vocab_k_words(K=100000)

    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    all_files = glob.glob('../files/CableNews/%s/*.p' % channel)
    read_files = pickle.load(open('%s_visit.p' % channel, 'rb'))
    counter = len(read_files)

    for file in tqdm(all_files):
        if file in read_files:
            continue
        read_files.append(file)
        # Periodically checkpoint the list of visited files (~30% of iterations).
        if np.random.rand() < 0.3:
            pickle.dump(read_files, open('%s_visit.p' % channel, 'wb'))

        res = pickle.load(open(file, 'rb'))
        results = {}
        prev_text = ""
        all_text = []
        all_keys = []
        for key in res.keys():
            meta_data = res[key][0]  # first entry in the list
            if len(meta_data['text']) < 10:
                continue
            # Drop duplicates: consecutive texts must differ in their first 10 chars.
            current_text = meta_data['text'][:10]
            if current_text == prev_text:
                continue
            prev_text = current_text

            text = tokenizer.tokenize(meta_data['text'])
            if len(text) <= 2:
                continue
            # Drop the first sentence
            text = text[1:]
            # Keep only mid-length sentences (drop very short and very long ones)
            sentences = [s for s in text if 30 < len(s.split()) < 50]
            if len(sentences) == 0:
                continue
            all_text.extend(sentences)
            all_keys.extend([key] * len(sentences))

        if len(all_text) == 0:
            continue

        # Embed and classify all kept sentences in one batch
        all_embed = model.encode(all_text, bsize=128, tokenize=True, verbose=False)
        all_predictions = NN.predict(all_embed)[0]

        # Merge the per-sentence probabilities for each key and keep the top 2 topics.
        def flush(key):
            Topics = Ticks[np.flip(np.argsort(total_prob[:, 0])[-2:])]
            Probs = np.flip(np.sort(total_prob[:, 0])[-2:]) * 100
            results[key] = {
                'Topics': list(Topics),
                'Probs': list(Probs),
                'gender': res[key][0]['gender'],
                'persons': res[key][0]['persons'],
                'locations': res[key][0]['locations'],
            }

        prev_key = None
        total_prob = np.zeros((13, 1))
        for key_counter, current_key in enumerate(all_keys):
            if current_key != prev_key:
                # Flush the probabilities accumulated for the previous key
                # before starting a new one.
                if prev_key is not None:
                    flush(prev_key)
                prev_key = current_key
                total_prob = np.zeros((13, 1))
            total_prob[:, 0] += all_predictions[key_counter, :]
        # Flush the final key, whose totals are still pending.
        if prev_key is not None:
            flush(prev_key)

        pickle.dump(results, open('processed_data/%s/%d.p' % (channel, counter), 'wb'))
        counter += 1
MODEL_PATH = "encoder/infersent1.pkl" params_model = { 'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048, 'pool_type': 'max', 'dpout_model': 0.0, 'version': 1 } model = InferSent(params_model) model.load_state_dict(torch.load(MODEL_PATH)) # CUDA use_cuda = True model = model.cuda() if use_cuda else model # If infersent1 -> use GloVe embeddings. If infersent2 -> use InferSent embeddings. W2V_PATH = 'GloVe/glove.840B.300d.txt' model.set_w2v_path(W2V_PATH) # Load embeddings of K most frequent words print('Load glove') model.build_vocab_k_words(K=2000000) # load sentence dis = '/home/shl183/nlp4note/classified_txt/discharge-sep' res = '/home/shl183/nlp4note/infersent' patients = os.listdir(dis) # exist = os.listdir(res) # with open('./tmp.pkl','wb') as f:
if __name__ == "__main__":
    # Load InferSent model
    params_model = {
        'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
        'pool_type': 'max', 'dpout_model': 0.0, 'version': V
    }
    model = InferSent(params_model)
    model.load_state_dict(torch.load(MODEL_PATH))
    model.set_w2v_path(PATH_TO_W2V)
    params_senteval['infersent'] = model.cuda()

    se = senteval.engine.SE(params_senteval, batcher, prepare)
    # transfer_tasks = ['STS12', 'STS13', 'STS14', 'STS15', 'STS16',
    #                   'MR', 'CR', 'MPQA', 'SUBJ', 'SST2', 'SST5', 'TREC', 'MRPC',
    #                   'SICKEntailment', 'SICKRelatedness', 'STSBenchmark']
    transfer_tasks = ['AmenitySimilarEvents']
    results = se.eval(transfer_tasks)
    print(results)

    if not os.path.exists(PATH_TO_RESULTS):
        os.mkdir(PATH_TO_RESULTS)
    with open(os.path.join(PATH_TO_RESULTS, 'infersent.json'), 'w') as out_file:
        json.dump(results, out_file, cls=NumpyEncoder)
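# The script assumes `prepare` and `batcher` are defined elsewhere. A minimal
# sketch following SentEval's callback contract, modeled on the official
# InferSent SentEval example; the exact vocab and batch-size handling here are
# assumptions, not taken from the original script.
def prepare(params, samples):
    # Build the InferSent vocabulary from all task sentences up front.
    params.infersent.build_vocab([' '.join(s) for s in samples], tokenize=False)

def batcher(params, batch):
    # SentEval hands over tokenized sentences; re-join and encode them,
    # returning one embedding row per sentence.
    sentences = [' '.join(s) for s in batch]
    return params.infersent.encode(sentences, bsize=params.batch_size,
                                   tokenize=False)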