import nltk
import torch

from models import InferSent

# Tokenizer data; build_vocab/encode below are called with tokenize=True.
nltk.download('punkt')

# Encoder hyper-parameters handed to the InferSent constructor.
ENCODER_PARAMS = {
    'bsize': 64,
    'word_emb_dim': 300,
    'enc_lstm_dim': 2048,
    'pool_type': 'max',
    'dpout_model': 0.0,
    'version': 1,
}

# Locations of the saved encoder weights and the word-vector file.
MODEL_PATH = '/Users/petermyers/Desktop/Other/data/InferSent/encoder/infersent1.pkl'
W2V_PATH = '/Users/petermyers/Desktop/Other/data/GloVe/glove.840B.300d.txt'

# Build the encoder, restore its weights, and point it at the word vectors.
infersent = InferSent(ENCODER_PARAMS)
state_dict = torch.load(MODEL_PATH)
infersent.load_state_dict(state_dict)
infersent.set_w2v_path(W2V_PATH)

# Sentences to embed.
sentences = ["Hi I'm Peter", "Hi I'm Danny", "Hi I'm Ryan"]

# The vocabulary is built from the sentences before they are encoded.
infersent.build_vocab(sentences, tokenize=True)
embeddings = infersent.encode(sentences, tokenize=True)
print(embeddings)

# Visualize the first sentence.
infersent.visualize(sentences[0], tokenize=True)
# In[11]:


def cosine(u, v):
    """Return the cosine similarity between vectors ``u`` and ``v``.

    The value is ``dot(u, v)`` divided by the product of the two Euclidean
    norms.  A zero-norm argument makes the denominator zero; that case is
    not special-cased here.
    """
    return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))


# In[12]:

cosine(
    model.encode(['the cat eats.'])[0],
    model.encode(['the cat drinks.'])[0],
)

# In[13]:

# NOTE(review): assumes `randint` is `random.randint`, whose upper bound is
# inclusive -- confirm against the import.  With an inclusive bound the
# original `randint(0, len(sentences))` could yield `len(sentences)` and
# index one past the end of the list.
idx = randint(0, len(sentences) - 1)
_, _ = model.visualize(sentences[idx])

# In[14]:

my_sent = 'The cat is drinking milk.'
_, _ = model.visualize(my_sent)

# In[15]:

model.build_vocab_k_words(500000)  # getting 500K words vocab
my_sent = 'barack-obama is the former president of the United-States.'
_, _ = model.visualize(my_sent)

# In[ ]:
# For every sentence index, append the best match returned by max_similar()
# to three output files: the matched sentence text, its 1-based index, and
# its similarity score.
# NOTE(review): the loop variable shadows the builtin `input`; the name is
# kept because the statement after the loop still reads it.
for input in range(len(sentences)):
    top = max_similar(input, embeddings, tembeddings)
    best_index = top[0][0]
    best_score = top[0][1]
    # Context managers close all three handles even if a lookup or a write
    # raises; the explicit close() calls used before leaked them on error.
    with open("./output/inference_outputs", "a+") as fo, \
            open("./output/output_nos", "a+") as fa, \
            open("./output/sim_scores", "a+") as fb:
        fo.write(esentences[best_index] + '\n')
        fa.write("%d" % (best_index + 1) + '\n')
        fb.write("%f" % (best_score) + '\n')

# NOTE(review): the flattened source does not show whether this print sat
# inside the loop above; it is placed after it here -- confirm.
nearest = max_similar(input, embeddings)
print(sentences[nearest], cosine(embeddings[input], embeddings[nearest]))

# Visualize the first nine sentences.
for i in range(9):
    _, _ = model.visualize(sentences[i], i)
# NOTE(review): this statement belongs to a loop over the reference file
# whose header lies outside this view; restore its indentation to match.
# rstrip('\n') removes only a trailing newline, so a final line without one
# keeps its last character (the previous `line[:-1]` dropped it).
refs.append(line.rstrip('\n'))

hyps = []
with open(args.generated, 'r') as f:
    for line in f:
        hyps.append(line.rstrip('\n'))

# build vocab
infersent.build_vocab(refs + hyps, tokenize=True)

# get embeddings
refs_embeds = infersent.encode(refs, tokenize=True)
hyps_embeds = infersent.encode(hyps, tokenize=True)

# compute cosine similarity: row-wise dot product divided by both row norms
refs_norm = np.linalg.norm(refs_embeds, ord=2, axis=1)
hyps_norm = np.linalg.norm(hyps_embeds, ord=2, axis=1)
cosine = np.sum((refs_embeds * hyps_embeds), axis=1) / refs_norm / hyps_norm

mean_cosine = np.mean(cosine)
if args.output_file is not None:
    # Append one JSON record holding the mean similarity.
    with open(args.output_file, 'a') as f:
        print(json.dumps({'embedding_cosin': float(mean_cosine)}), file=f)
else:
    # Label the score with the second-to-last '/'-separated component of
    # the first command-line argument.
    print('%s,%f' % (sys.argv[1].split('/')[-2], mean_cosine))

'''
# visualize importance
infersent.visualize('A man plays an instrument.', tokenize=True)
'''
# In[20]:

# Build the encoder vocabulary from the training documents.
infersent.build_vocab(train_doc, tokenize=True)

# In[21]:

# Encode every training document.
embeddings = infersent.encode(train_doc, tokenize=True)

# In[22]:

infersent.visualize('A man plays an instrument.', tokenize=True)

# In[31]:

embeddings.shape

# This outputs a numpy array with n vectors of dimension 4096.
# Every sentence is now represented as a vector. We can try to match each
# one to some of our topics, or apply clustering to see the distribution.
#
# k nn clustering applied

# In[34]: