def calcule_eucl(text, question):
    blob = TextBlob("".join(text))
    sentences = [item.raw for item in blob.sentences]
    V = 2
    MODEL_PATH = 'InferSent/encoder/infersent%s.pkl' % V
    params_model = {
        'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
        'pool_type': 'max', 'dpout_model': 0.0, 'version': V
    }
    infersent = InferSent(params_model)
    infersent.load_state_dict(torch.load(MODEL_PATH))
    W2V_PATH = 'InferSent/crawl-300d-2M.vec'
    infersent.set_w2v_path(W2V_PATH)
    infersent.build_vocab(sentences, tokenize=True)
    dict_embeddings = {}
    for i in range(len(sentences)):
        dict_embeddings[sentences[i]] = infersent.encode([sentences[i]], tokenize=True)
    encode_question = infersent.encode([question], tokenize=True)
    eucl = eucl_sim(dict_embeddings, encode_question)
    return sentences, eucl
def embed_sent(datafile):
    sentences = []
    with open(datafile, 'r') as f:
        i = 0
        for line in f:
            line = line.replace('\n', '')
            sentences.append(line)
            i += 1
            if i == 455820:
                break
    V = 1
    MODEL_PATH = 'infersent%s.pkl' % V
    params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                    'pool_type': 'max', 'dpout_model': 0.0, 'version': V}
    infersent = InferSent(params_model)
    infersent.load_state_dict(torch.load(MODEL_PATH))
    W2V_PATH = 'GloVe/glove.840B.300d.txt'
    infersent.set_w2v_path(W2V_PATH)
    infersent.build_vocab(sentences, tokenize=True)
    embeddings = infersent.encode(sentences, tokenize=True)
    np.savetxt("../../wiki-split/Data/Infersent_vectors/complex_sent", embeddings)
class InferSentFeatures:
    def __init__(self, lang_enc_dir, sentences):
        sys.path.insert(0, os.path.join(lang_enc_dir, 'InferSent/'))
        from models import InferSent
        version = 1
        MODEL_PATH = os.path.join(
            lang_enc_dir, 'InferSent/encoder/infersent%s.pkl' % version)
        params_model = {
            'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
            'pool_type': 'max', 'dpout_model': 0.0, 'version': version
        }
        self.model = InferSent(params_model)
        self.model.load_state_dict(torch.load(MODEL_PATH))
        W2V_PATH = os.path.join(lang_enc_dir, 'glove/glove.6B.300d.txt')
        self.model.set_w2v_path(W2V_PATH)
        self.model.build_vocab(sentences, tokenize=True)

    def generate_embeddings(self, sentences):
        embeddings = self.model.encode(sentences, tokenize=True)
        return embeddings
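# A minimal usage sketch for InferSentFeatures above (not from the original
# source). The 'lang_enc' directory is a hypothetical path assumed to contain
# the InferSent repo and GloVe vectors laid out as the constructor expects;
# sys, os, and torch are assumed to be imported at module level.
example_sentences = ["A man is playing a guitar.", "A woman is slicing an onion."]
features = InferSentFeatures('lang_enc', example_sentences)
example_embeddings = features.generate_embeddings(example_sentences)
print(example_embeddings.shape)  # (2, 4096): one 4096-dim vector per sentence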
def create_embeddings(infer_path, data_path, em_type):
    yt_titles = yt.get_yt_titles()
    with open("data/whtitles", "r") as f:
        wh_titles = [line.rstrip('\n') for line in f]
    if em_type == "yt":  # Youtube
        save_f = os.path.join(data_path, "yt_embed")
        titles = yt_titles
    elif em_type == "wh":  # Wikihow
        save_f = os.path.join(data_path, "wh_embed")
        titles = wh_titles
    else:
        # Raising a bare string is invalid in Python 3; raise an exception.
        raise ValueError("Unknown embedding type: {}".format(em_type))
    nltk.download('punkt')
    V = 1
    MODEL_PATH = os.path.join(infer_path, 'encoder/infersent%s.pkl' % V)
    params_model = {
        'bsize': 256, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
        'pool_type': 'max', 'dpout_model': 0.0, 'version': V
    }
    infersent = InferSent(params_model)
    infersent.load_state_dict(torch.load(MODEL_PATH))
    infersent = infersent.cuda()
    W2V_PATH = os.path.join(infer_path, 'GloVe/glove.840B.300d.txt')
    infersent.set_w2v_path(W2V_PATH)
    infersent.build_vocab(yt_titles + wh_titles, tokenize=True)
    embed = infersent.encode(titles, tokenize=True)
    np.save(save_f, embed)
class Encoder2:
    '''Encoder based on InferSent'''
    WORD_VECTORS_FILE = 'crawl-300d-2M.vec'
    MODEL_FILE = 'infersent2.pkl'

    def __init__(self, word_vectors_dir, models_dir):
        word_vectors = os.path.join(word_vectors_dir, self.WORD_VECTORS_FILE)
        model_file = os.path.join(models_dir, self.MODEL_FILE)
        params_model = {
            'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
            'pool_type': 'max', 'dpout_model': 0.0, 'version': 2
        }
        self.model = InferSent(params_model)
        self.model.load_state_dict(torch.load(model_file))
        self.model.set_w2v_path(word_vectors)

    def start(self, texts):
        texts_list = texts.values.tolist()
        self.model.build_vocab(texts_list, tokenize=True)

    def close(self):
        pass

    def encode(self, texts_batch):
        texts_batch_list = texts_batch.values.tolist()
        texts_batch_vec = self.model.encode(texts_batch_list, tokenize=True)
        return texts_batch_vec
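# A minimal usage sketch for Encoder2 (not from the original source), assuming
# the fastText vectors and infersent2.pkl checkpoint sit in the hypothetical
# 'vectors/' and 'models/' directories. Both start() and encode() expect a
# pandas Series, since they call .values.tolist() internally.
import pandas as pd

texts = pd.Series(["The cat sits on the mat.", "Dogs chase cats."])
encoder = Encoder2('vectors', 'models')
encoder.start(texts)             # builds the vocabulary once up front
vectors = encoder.encode(texts)  # ndarray of shape (2, 4096)
encoder.close()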
def infersent_embed_posts(posts, max_sent_cnt, embed_dim, data_fold_path):
    model_path = data_fold_path + 'word_sent_embed/infersent2.pickle'
    word_emb_path = data_fold_path + 'word_sent_embed/fasttext.vec'
    posts_arr = np.zeros((len(posts), max_sent_cnt, embed_dim))
    params_model = {
        'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
        'pool_type': 'max', 'dpout_model': 0.0, 'version': 2
    }
    model = InferSent(params_model)
    model.load_state_dict(torch.load(model_path))
    model.set_w2v_path(word_emb_path)
    # Build the vocabulary from every sentence of every post at once
    all_sents = []
    for sens in posts:
        all_sents.extend(sens)
    model.build_vocab(all_sents, tokenize=False)
    # Encode each post, truncating/zero-padding to max_sent_cnt sentences
    for ind, sentences in enumerate(posts):
        embeddings = model.encode(sentences, tokenize=False, verbose=False)
        l = min(max_sent_cnt, len(sentences))
        posts_arr[ind, :l, :] = embeddings[:l]
    return posts_arr
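# A minimal usage sketch for infersent_embed_posts (not from the original
# source), assuming a hypothetical 'data/' folder containing
# word_sent_embed/infersent2.pickle and word_sent_embed/fasttext.vec.
# Each post is a list of pre-split sentences.
posts = [["I feel great today.", "The sun is out."],
         ["Nothing to report."]]
arr = infersent_embed_posts(posts, max_sent_cnt=5, embed_dim=4096,
                            data_fold_path='data/')
print(arr.shape)  # (2, 5, 4096); unused sentence slots stay zero-padded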
def getSentenceVector(doc,
                      model_params: dict = {},
                      encoder="distilbert",
                      model_name='distilbert-base-nli-mean-tokens'):
    sp = spacy.load('en_core_web_sm')
    tokenized = sp(doc)
    sentences = [token.text for token in tokenized.sents]
    if encoder in ['bert', 'xlnet', 'longformer', 'reformer', 'distilbert',
                   'roberta', 'bart']:
        # Use encoder for mapping tokens to embeddings
        word_embedding_model = models.Transformer(
            model_name,
            tokenizer_args=model_params.get('tokenizer_args', {}))
        # Apply mean pooling to get one fixed-size sentence vector
        pooling_model = models.Pooling(
            word_embedding_model.get_word_embedding_dimension(),
            pooling_mode_mean_tokens=True,
            pooling_mode_cls_token=False,
            pooling_mode_max_tokens=False)
        model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
        sentence_embeddings = model.encode(sentences)
    elif encoder == 'use':
        # !pip install embedding-as-service
        from embedding_as_service.text.encode import Encoder
        en = Encoder(embedding='use', model='use_dan', max_seq_length=256)
        sentence_embeddings = en.encode(texts=sentences)
    elif encoder == 'infersent':
        import nltk
        nltk.download('punkt')
        from models import InferSent
        params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                        'pool_type': 'max', 'dpout_model': 0.0, 'version': 2}
        infersent = InferSent(params_model)
        # NOTE: no load_state_dict call appears here, so the encoder runs
        # with randomly initialized weights unless a checkpoint is loaded
        # elsewhere before this function is called.
        W2V_PATH = 'drive/My Drive/wiki-news-300d-1M.vec'
        infersent.set_w2v_path(W2V_PATH)
        infersent.build_vocab(sentences, tokenize=True)
        sentence_embeddings = infersent.encode(sentences, tokenize=True)
    elif encoder == 'sent2vec':
        import sent2vec
        model = sent2vec.Sent2vecModel()
        model.load_model('drive/My Drive/torontobooks_unigram.bin')
        sentence_embeddings = model.embed_sentences(sentences)
    elif encoder == 'laser':
        from laserembeddings import Laser
        laser = Laser()  # also usable for multilingual sentence embeddings
        sentence_embeddings = laser.embed_sentences(sentences, lang='en')
    else:
        raise ValueError('Invalid encoder {} or encoder unavailable.'.format(encoder))
    return list(zip(sentences, sentence_embeddings))
def infersent_embeddings():
    train_data_list = []
    test_data_list = []
    sys.path.append('/opt/notebooks/OCSVM_ISF_LOF_USE_Baselines/InferSent-master')
    # Load model
    from models import InferSent
    model_version = 1
    MODEL_PATH = "/opt/notebooks/OCSVM_ISF_LOF_USE_Baselines/InferSent-master/encoder/infersent%s.pkl" % model_version
    params_model = {
        'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
        'pool_type': 'max', 'dpout_model': 0.0, 'version': model_version
    }
    model = InferSent(params_model)
    model.load_state_dict(torch.load(MODEL_PATH))
    # Keep it on CPU or put it on GPU
    use_cuda = False
    model = model.cuda() if use_cuda else model
    # If infersent1 -> use GloVe vectors. If infersent2 -> use fastText vectors.
    W2V_PATH = ('/opt/notebooks/OCSVM_ISF_LOF_USE_Baselines/InferSent-master/glove.840B.300d-003.txt'
                if model_version == 1 else
                '/opt/notebooks/OCSVM_ISF_LOF_USE_Baselines/InferSent-master/fastText/crawl-300d-2M.vec')
    model.set_w2v_path(W2V_PATH)
    # Load embeddings of the K most frequent words
    model.build_vocab_k_words(K=100000)
    # final_train and final_test are module-level DataFrames in this script
    train_data_list = model.encode(final_train['text'].tolist(), bsize=128,
                                   tokenize=False, verbose=True)
    print('nb sentences encoded : {0}'.format(len(train_data_list)))
    test_data_list = model.encode(final_test['text'].tolist(), bsize=128,
                                  tokenize=False, verbose=True)
    print('nb sentences encoded : {0}'.format(len(test_data_list)))
    return train_data_list, test_data_list
def infersent_flat_embed_posts(posts, embed_dim, data_fold_path):
    model_path = data_fold_path + 'word_sent_embed/infersent2.pickle'
    word_emb_path = data_fold_path + 'word_sent_embed/fasttext.vec'
    params_model = {
        'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
        'pool_type': 'max', 'dpout_model': 0.0, 'version': 2
    }
    model = InferSent(params_model)
    model.load_state_dict(torch.load(model_path))
    model.set_w2v_path(word_emb_path)
    model.build_vocab(posts, tokenize=False)
    return model.encode(posts, tokenize=False, verbose=False)
class Infersent:
    def __init__(self):
        V = 2
        MODEL_PATH = 'encoder/infersent%s.pkl' % V
        params_model = {
            'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
            'pool_type': 'max', 'dpout_model': 0.0, 'version': V
        }
        self.infersent = InferSent(params_model)
        self.infersent.load_state_dict(torch.load(MODEL_PATH))
        self.infersent.set_w2v_path('fastText/crawl-300d-2M.vec')

    def get(self, sentences):
        self.infersent.build_vocab(sentences, tokenize=True)
        return self.infersent.encode(sentences, tokenize=True)
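# A minimal usage sketch for the Infersent wrapper above (not from the
# original source); paths are relative to the InferSent repo root, as in the
# class itself. Note that get() rebuilds the vocabulary on every call, which
# is simple but slow when encoding many batches.
wrapper = Infersent()
vecs = wrapper.get(["The cat eats.", "A dog barks."])
print(vecs.shape)  # (2, 4096)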
def embed_dataset(dataset_path, infersent_path, force_cpu=False):
    """To make this work, first run ./get_infersent.sh"""
    MODEL_PATH = infersent_path / "encoder/infersent1.pkl"
    params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                    'pool_type': 'max', 'dpout_model': 0.0, 'version': 1}
    model = InferSent(params_model)
    if force_cpu:
        model.load_state_dict(torch.load(MODEL_PATH, map_location='cpu'))
    else:
        model.load_state_dict(torch.load(MODEL_PATH))
        model.cuda()
    W2V_PATH = infersent_path / 'GloVe/glove.840B.300d.txt'
    model.set_w2v_path(W2V_PATH)
    model.build_vocab_k_words(K=100000)

    csv_data = read_csv(dataset_path / 'train.csv')
    csv_data = csv_data[1:]  # skip header
    data = defaultdict(list)
    for irow, row in enumerate(csv_data):
        if 'snips' in str(dataset_path):
            utterance, labels, delexicalised, intent = row
        else:
            raise TypeError(
                "Unknown dataset type. Implement your own first. See the "
                "README")
        data[intent].append(utterance)

    # Average the sentence embeddings of each intent into one vector
    vectors = {}
    for i, (intent, sentences) in enumerate(data.items()):
        print('{}/{} done'.format(i, len(data.items())))
        embeddings = model.encode(sentences)
        avg_embedding = np.mean(embeddings, axis=0)
        vectors[intent] = avg_embedding
    return vectors
def no_stopwords():
    infersent2 = InferSent(params_model)
    infersent2.load_state_dict(torch.load(MODEL_PATH))
    infersent2.set_w2v_path(W2V_PATH)
    use_cuda = True
    # (fixed: the original referenced an undefined `infersent` here)
    infersent2 = infersent2.cuda() if use_cuda else infersent2
    pdss = pd.DataFrame(columns=['embds', 'set', 'catg'])
    start = time.time()
    global current_idx
    for x in range(3):
        crix = current_idx
        abss, catg, sets, crix = get_batch_from_dataframe(crix)
        # Strip stop words from each abstract before encoding
        for index in range(len(abss)):
            doc = nlp(abss[index])
            strs_after_stop_arr = []
            for token in doc:
                if not token.is_stop:
                    strs_after_stop_arr.append(token.text)
            abss[index] = ' '.join(strs_after_stop_arr)
        if x == 0:
            infersent2.build_vocab(abss, tokenize=True)
        else:
            infersent2.update_vocab(abss, tokenize=True)
        embed = infersent2.encode(abss, tokenize=True)
        df2 = pd.DataFrame({'embds': embed.tolist(), 'set': sets, 'catg': catg})
        # DataFrame.append is removed in pandas 2.x; concat is equivalent here
        pdss = pd.concat([pdss, df2], ignore_index=True)
        current_idx = crix
    end = time.time() - start
    print("Time without stopwords", end)
    pdss.to_csv("/home/psrivastava/Intern_Summer/data/embeds_no_stopwords.csv")
params_model = {
    'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
    'pool_type': 'max', 'dpout_model': 0.0, 'version': V
}
model = InferSent(params_model)
model.load_state_dict(torch.load(MODEL_PATH))
W2V_PATH = 'final_text_vectors.txt'
model.set_w2v_path(W2V_PATH)
model.build_vocab(sentences, tokenize=True)  # or: build_vocab_k_words(K=100000)
embeddings = model.encode(sentences, tokenize=True)
# alternative: model.encode(sentences, bsize=168, tokenize=False, verbose=True)
print('nb sentences encoded : {0}'.format(len(embeddings)))
sen_vec = preprocessing.normalize(embeddings)
sen_vec = Variable(torch.from_numpy(sen_vec))
# sen_vec = nn.Linear(4096, 300)
model = net()  # note: rebinds `model` from the InferSent encoder to the projection net
n = (1, 300)
nparray = np.zeros(n)
for i in sen_vec:
    out = model(i)
    out = out.data.numpy()
    nparray = np.append(nparray, [out], axis=0)
nparray = np.delete(nparray, 0, axis=0)
hyperparameters = {
    'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
    'pool_type': 'max', 'dpout_model': 0.0, 'version': model_version
}
model = InferSent(hyperparameters)
model.load_state_dict(torch.load(MODEL_PATH))
use_cuda = False
model = model.cuda() if use_cuda else model
W2V_PATH = 'GloVe/glove.840B.300d.txt' if model_version == 1 else 'fastText/crawl-300d-2M.vec'
model.set_w2v_path(W2V_PATH)
model.build_vocab_k_words(K=10000)

train_df = pandas.read_csv('/home/stefan/projects/disaster_tweets/train.csv')
tweets = train_df.text.to_list()
embeddings = model.encode(tweets)
# train_df['embedding'] = [np.zeros(4096) for i in range(train_df.shape[0])]
embeddings_list = [embeddings[x] for x in range(embeddings.shape[0])]
# train_df['embedding'] = embeddings_list
# print(train_df.head())
# for i in range(train_df.shape[0]):
#     tweet_text = train_df['text'][i]
#     tweet_embedding = model.encode(tweet_text)
# train_df.to_csv('train_w_embeddings.csv')
train_dict = {}
for i in range(train_df.shape[0]):
model.set_w2v_path(args.w2v_path)

# Ensure the output directory exists
if not os.path.exists(args.out_dir):
    os.makedirs(args.out_dir)

# Read files and extract features
for fpath in args.files:
    print('Reading file {}'.format(fpath))
    sents = []
    with open(fpath) as f:
        for line in f:
            line = line.strip()
            assert line, 'Empty line in {}'.format(fpath)
            sents.append(line)
    # Set output file name
    out_name = os.path.join(args.out_dir,
                            "{}.embs.npy".format(os.path.basename(fpath)))
    # Build vocab
    print('Building vocabulary')
    model.build_vocab(sents, args.tokenize)
    # Get embeddings
    embs = model.encode(sents, tokenize=args.tokenize, verbose=True,
                        bsize=args.batch_size)
    print('Saving to {}'.format(out_name))
    np.save(out_name, embs)
                         header=None)
relational.columns = ["vocab"]
culture.vocab = culture.vocab.apply(lambda x: re.sub(',', '_', x))
demographic.vocab = demographic.vocab.apply(lambda x: re.sub(',', '_', x))
relational.vocab = relational.vocab.apply(lambda x: re.sub(',', '_', x))

##################################################
# Generate semantic embeddings for the inq terms
d = {'terms': culture.vocab}
culture_df = pd.DataFrame(d)
culture_embeddings = model.encode(culture_df['terms'], verbose=True)

d = {'terms': demographic.vocab}
demographic_df = pd.DataFrame(d)
demographic_embeddings = model.encode(demographic_df['terms'], verbose=True)

d = {'terms': relational.vocab}
relational_df = pd.DataFrame(d)
relational_embeddings = model.encode(relational_df['terms'], verbose=True)
print("Dictionary embeddings generated!")

# Generate document embeddings
embeddings = model.encode(df.text, verbose=True)
print('documents encoded : {0}'.format(len(embeddings)))
try:
    np.savez_compressed(
class LCPR_I:
    def __init__(self):
        self.filename = "LCP/lcpr_i.sav"
        self.cmudict = cmudict.dict()
        self.wnlp = WonderlicNLP()
        self.embeddings_index = {}
        self.wiki_top10 = [
            word[0].split()[0]
            for word in pd.read_csv("LCP/wiki_top10.csv").values
        ][:10001]
        self.infersent_model_path = 'LCP/infersent%s.pkl' % 1
        self.infersent_model_params = {
            'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
            'pool_type': 'max', 'dpout_model': 0.0, 'version': 1
        }
        self.infersent = InferSent(self.infersent_model_params)
        self.model = RandomForestRegressor(n_estimators=100)

    # InferSent setup (boilerplate code from InferSent's repository):
    def initialize_infersent(self, sentences):
        print("INITIALIZING INFERSENT...", datetime.now().strftime("%H:%M:%S"))
        self.infersent.load_state_dict(torch.load(self.infersent_model_path))
        w2v_path = 'LCP/glove.42B.300d.txt'
        self.infersent.set_w2v_path(w2v_path)
        self.infersent.build_vocab(sentences, tokenize=True)
        print("INFERSENT READY!", datetime.now().strftime("%H:%M:%S"))

    def infersent_embedding(self, sentence):
        return self.infersent.encode(sentence, tokenize=True)

    # GloVe setup:
    def initialize_glove(self):
        print("INITIALIZING GLOVE...", datetime.now().strftime("%H:%M:%S"))
        f = open('LCP/glove.42B.300d.txt', encoding="utf8")
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            self.embeddings_index[word] = coefs
        f.close()
        print("GLOVE READY!", datetime.now().strftime("%H:%M:%S"))

    def glove_embedding(self, word):
        if str(word).lower() in self.embeddings_index.keys():
            embedding = [emb for emb in self.embeddings_index[str(word).lower()]]
        else:
            embedding = [-1 for i in range(300)]
        return embedding

    # Used to find the index of the word in the sentence
    def find_word_pos(self, word, tokens):
        lemmatizer = WordNetLemmatizer()
        search_tokens = [lemmatizer.lemmatize(word) for word in tokens]
        if word in tokens:
            return tokens.index(word)
        elif word in search_tokens:
            return search_tokens.index(word)
        else:
            return None

    def extract_features(self, data):
        features = defaultdict(list)
        for id in tqdm(data.index, desc="PROCESSING DATA"):
            raw_token = "null" if str(data.loc[id]["token"]) == "nan" else str(
                data.loc[id]["token"])
            token = raw_token.lower()
            sent = data.loc[id]["sentence"]
            mrc_features = self.wnlp.get_mrc_features(token)
            glove = self.glove_embedding(token)
            infersent = self.infersent_embedding([sent])[0]
            # Sentence InferSent embedding:
            for i in range(1, 4097):
                features[f"infersent{i}"].append(infersent[i - 1])
            # Word GloVe embedding:
            for i in range(1, 301):
                features[f"glove{i}"].append(glove[i - 1])
            # MRC features:
            features["word_length"].append(mrc_features["Nlet"])
            features["syl_count"].append(mrc_features["Nsyl"])
            features["brown_freq"].append(mrc_features["Brown-freq"])
            features["familiarity"].append(mrc_features["Fam"])
            features["concreteness"].append(mrc_features["Conc"])
            features["imagability"].append(mrc_features["Imag"])
            features["meaningfulness_c"].append(mrc_features["Meanc"])
            features["meaningfulness_p"].append(mrc_features["Meanp"])
            features["age_of_acquisition"].append(mrc_features["AOA"])
            features["wiki_freq"].append(int(token in self.wiki_top10))
        return features

    def fit(self, train_data, train_labels):
        print("TRAINING...", datetime.now().strftime("%H:%M:%S"))
        self.initialize_glove()
        self.initialize_infersent(train_data["sentence"])
        features = self.extract_features(train_data)
        self.model.fit(pd.DataFrame(features), train_labels)
        print("TRAINING DONE!", datetime.now().strftime("%H:%M:%S"))

    def to_likert(self, prediction):
        if 0 <= prediction < 0.2:
            return 1
        elif 0.2 <= prediction < 0.4:
            return 2
        elif 0.4 <= prediction < 0.6:
            return 3
        elif 0.6 <= prediction < 0.8:
            return 4
        else:
            return 5

    def predict(self, test_data, development=False):
        print("LOOKING INTO THE ORB...", datetime.now().strftime("%H:%M:%S"))
        # update_vocab expects sentences, not the whole DataFrame
        self.infersent.update_vocab(test_data["sentence"])
        tokens = test_data["token"]
        predictions = self.model.predict(
            pd.DataFrame(self.extract_features(test_data)))
        if not development:
            for i in range(len(predictions)):
                print(f"{tokens[i]} is a {self.to_likert(predictions[i])} "
                      f"on the Likert scale.")
        return predictions

    def score(self, train_data, train_labels):
        print("SCORING MODEL...", datetime.now().strftime("%H:%M:%S"))
        return self.model.score(
            pd.DataFrame(self.extract_features(train_data)), train_labels)

    def metrics(self, test_data, test_labels):
        labels_pred = self.predict(test_data, True)
        mae = mean_absolute_error(test_labels, labels_pred)
        rmse = math.sqrt(mean_squared_error(test_labels, labels_pred))
        print("MAE:", mae)
        print("RMSE:", rmse)

    def save(self):
        pickle.dump([self.model, self.embeddings_index, self.infersent],
                    open(self.filename, "wb"))

    def load(self):
        data = pickle.load(open(self.filename, "rb"))
        self.model = data[0]
        self.embeddings_index = data[1]
        self.infersent = data[2]
print(len(sentences))

# In[7]:

sentences[:5]

# ## Encode sentences

# In[8]:

# gpu mode : >> 1000 sentences/s
# cpu mode : ~100 sentences/s

# In[9]:

embeddings = model.encode(sentences, bsize=128, tokenize=False, verbose=True)
print('nb sentences encoded : {0}'.format(len(embeddings)))

# ## Visualization

# In[10]:

np.linalg.norm(model.encode(['the cat eats.']))

# In[11]:

def cosine(u, v):
    return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))
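# In[12]:

# A small follow-on example (not in the original notebook): use the cosine
# helper above to compare two encoded sentences. The sentence pair is
# illustrative; any pair works.
emb = model.encode(['the cat eats.', 'a dog is barking.'])
print(cosine(emb[0], emb[1]))  # in [-1, 1]; values near 1 mean similar meaning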
temp = clean_debian(temp)
if temp == '':
    cnt += 1
    print('NULL')
    continue
temp = obj.replace_tokens(temp)
if flag == 0:
    t = temp
    flag = 1
    continue
t = t.strip()
print(str(t))
print('---------------------------------------')
# Calculate the sentence embedding for the body and average it into one
# 4096-dim vector. InferSent.encode expects a list of sentences, so the
# text is wrapped in a list (the original passed a bare string).
if t != '':
    embedding = infermodel.encode([str(t)], bsize=1, tokenize=False,
                                  verbose=True)
    sent_vec = []
    numw = 0
    for w in embedding:
        try:
            if numw == 0:
                sent_vec = w
            else:
                sent_vec = np.add(sent_vec, w)
            numw += 1
        except:
            pass
    v = np.asarray(sent_vec) / numw
    print(v.shape)
    print(v)
    v = np.transpose(v)
class InferSentEmbeddings(EmbeddingBaseClass, FlairDocumentEmbeddings):
    """
    Class to add InferSent embeddings to flair sentences.
    cf. `here <https://github.com/facebookresearch/InferSent>`_
    """

    def __init__(self, version=1):
        super().__init__()
        self.version = version
        if version == 1:
            self.PATH_TO_W2V = os.path.join(NLP_MODELS_PATH, 'pretrained',
                                            'word_embeddings', 'glove.840B.300d',
                                            'glove.840B.300d.txt')
        if version == 2:
            self.PATH_TO_W2V = os.path.join(NLP_MODELS_PATH, 'pretrained',
                                            'word_embeddings', 'crawl-300d-2M',
                                            'crawl-300d-2M.vec')
        self.MODEL_PATH = os.path.join(NLP_MODELS_PATH, 'pretrained',
                                       'word_embeddings',
                                       'infersent%s' % version,
                                       'infersent%s.pkl' % version)

        # Set up logger
        logging.basicConfig(format='%(asctime)s : %(message)s',
                            level=logging.DEBUG)

        # Load InferSent model
        params_model = {
            'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
            'pool_type': 'max', 'dpout_model': 0.0, 'version': version
        }
        self.model = InferSent(params_model)
        self.model.load_state_dict(torch.load(self.MODEL_PATH))
        self.model.set_w2v_path(self.PATH_TO_W2V)

        # Bi-directional LSTM with max pooling -> the sentence embedding
        # dimension is 2 * enc_lstm_dim = 4096
        self._embedding_length: int = 2 * params_model['enc_lstm_dim']
        self.name = f"{self.__class__.__name__}_v{self.version}"
        self.static_embeddings = True

    @property
    def embedding_length(self) -> int:
        return self._embedding_length

    def _add_embeddings_internal(self, sentences: List[Sentence]):
        everything_embedded: bool = True
        infersent_sentences = []
        for sentence in sentences:
            if self.name not in sentence._embeddings.keys():
                everything_embedded = False
        if not everything_embedded:
            for sentence in sentences:
                infersent_sentences.append(sentence.to_tokenized_string())
            self.model.build_vocab(infersent_sentences, tokenize=False)
            self.model.update_vocab(infersent_sentences, tokenize=False)
            embeddings = self.model.encode(infersent_sentences, tokenize=False)
            for sentence, sentence_embedding in zip(sentences, embeddings):
                sentence.set_embedding(self.name,
                                       torch.tensor(sentence_embedding))
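# A minimal usage sketch for InferSentEmbeddings (not from the original
# source), assuming flair is installed and NLP_MODELS_PATH points at the
# pretrained-model layout the class expects. embed() is inherited from
# flair's embeddings base class and dispatches to _add_embeddings_internal.
from flair.data import Sentence

embedder = InferSentEmbeddings(version=1)
flair_sentence = Sentence("The grass is green.")
embedder.embed([flair_sentence])
print(flair_sentence.get_embedding().shape)  # torch.Size([4096])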
params_model = {
    'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
    'pool_type': 'max', 'dpout_model': 0.0, 'version': V
}
infersent = InferSent(params_model)
infersent.load_state_dict(torch.load(MODEL_PATH))
print('Load our pre-trained model (in encoder/)')

# Set word vector path for the model
W2V_PATH = 'fastText/crawl-300d-2M.vec'
infersent.set_w2v_path(W2V_PATH)
print('Set word vector path for the model')

# Build the vocabulary of word vectors (i.e. keep only those needed)
infersent.build_vocab(all_captions, tokenize=True)
print('Build the vocabulary of word vectors')

# Start encoding captions
caption2id = {}
f = open('pascal-sentences-dataset/text_features.txt', 'w+')
for caption in all_captions:
    current_feature = list(infersent.encode([caption], tokenize=True).squeeze())
    if caption not in caption2id:
        caption2id[caption] = 'caption_' + str(len(caption2id))
    current_feature = [str(feature) for feature in current_feature]
    current_feature_str = ' '.join(current_feature)
    f.write('%s %s\n' % (caption2id[caption], current_feature_str))
f.close()

with open('pascal-sentences-dataset/caption2id.json', 'w') as outfile:
    json.dump(caption2id, outfile)
import pandas as pd
import spacy
import nltk
import numpy as np
import torch
from models import InferSent

df = pd.read_csv("/home/psrivastava/Intern_Summer/data/new_output.csv")
# .ix was removed from pandas; .loc does the same label-based slice here
abs_arr = df.loc[:4, 'clean_text']
nlp = spacy.load("en_core_web_sm")
MODEL_PATH = "/home/psrivastava/Intern_Summer/infersent/encoder/infersent2.pkl"
W2V_PATH = "/home/psrivastava/Intern_Summer/infersent/fastText/crawl-300d-2M.vec"
params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                'pool_type': 'max', 'dpout_model': 0.0, 'version': 2}
infersent = InferSent(params_model)
infersent.load_state_dict(torch.load(MODEL_PATH))
infersent.set_w2v_path(W2V_PATH)

# Strip stop words from each text before encoding
for index in range(len(abs_arr)):
    doc = nlp(abs_arr[index])
    strs_after_stop_arr = []
    for token in doc:
        if not token.is_stop:
            strs_after_stop_arr.append(token.text)
    abs_arr[index] = ' '.join(strs_after_stop_arr)

infersent.build_vocab(abs_arr)  # these are actually abstracts of different papers
print(infersent.encode(abs_arr)[0][:])
if params.encoder_path and params.encoder_type == 'InferSent':
    params_model = {'bsize': params.batch_size,
                    'word_emb_dim': params.word_emb_dim,
                    'enc_lstm_dim': params.enc_lstm_dim,
                    'pool_type': params.pool_type,
                    'dpout_model': params.dpout_model,
                    'version': params.model_version}
    encoder = InferSent(params_model)
    encoder.load_state_dict(torch.load(params.encoder_path))
    encoder.set_w2v_path(params.vector_rep)
    if params.vocab_samples.isdigit():
        print("Build vocab from K samples")
        encoder.build_vocab_k_words(K=int(params.vocab_samples))
    else:
        print("Build vocab from full file")
        # NOTE: build_vocab expects a list of sentences, not a K keyword;
        # the original `encoder.build_vocab(K=params.vocab_samples)` would
        # raise a TypeError. Assuming vocab_samples names a sentence file:
        with open(params.vocab_samples) as f:
            encoder.build_vocab([line.strip() for line in f], tokenize=True)
    print("========TEST encoder=======")
    print(encoder.encode(['the cat eats.']))
    encoder.to(device)

# model config
config_nli_model = {
    'n_words': len(word_vec),
    'word_emb_dim': params.word_emb_dim,
    'enc_lstm_dim': params.enc_lstm_dim,
    'n_enc_layers': params.n_enc_layers,
    'dpout_model': params.dpout_model,
    'dpout_fc': params.dpout_fc,
    'fc_dim': params.fc_dim,
    'Question': [],
    'Answer': [],
    'Question_Emb': [],
    'Answer_Emb': [],
    'Label': [],
    'Cosine_Dist': [],
    'Euclidean_Dist': [],
    'Predicted_label_Cos': [],
    'Predicted_label_Euc': []
}
pred_labels_cos = []
pred_labels_euc = []
for i_q, this_q in enumerate(quetsions):
    embeddings_q = infersent.encode([this_q], tokenize=True, verbose=False)
    dist_cos_group = []
    dist_euc_group = []
    for i_a, this_a in enumerate(answers[i_q]):
        print(f'Question {i_q: <10} Answer {i_a: <10} is done!')
        embeddings_a = infersent.encode([this_a], tokenize=True, verbose=False)
        # calculate the distances (scipy's distance functions expect 1-D
        # vectors, so take the single row of each (1, 4096) encoding)
        this_dist_cos = distance.cosine(embeddings_q[0], embeddings_a[0])
        this_dist_euc = distance.euclidean(embeddings_q[0], embeddings_a[0])
        dist_cos_group.append(this_dist_cos)
        dist_euc_group.append(this_dist_euc)
from models import InferSent

V = 1
MODEL_PATH = 'encoder/infersent%s.pkl' % V
params_model = {
    'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
    'pool_type': 'max', 'dpout_model': 0.0, 'version': V
}
infersent = InferSent(params_model)
infersent.load_state_dict(torch.load(MODEL_PATH))
# V == 1 pairs with GloVe, V == 2 with fastText. The original then
# unconditionally overwrote this with the fastText path, which contradicts
# V = 1 above, so that dead override is dropped here.
W2V_PATH = 'dataset/GloVe/glove.840B.300d.txt' \
    if V == 1 else 'dataset/fastText/crawl-300d-2M-subword.vec'
infersent.set_w2v_path(W2V_PATH)
infersent.build_vocab(sentences, tokenize=True)
embeddings = infersent.encode(sentences, tokenize=True)
# infersent.visualize('A man plays an instrument.', tokenize=True)
print(embeddings.shape)
joblib.dump(embeddings, os.path.join(args.file_path, 'data/embeddings.pkl'))
with open(ORI_PATH) as f:
    ori = f.read()
ori = ori.replace('[[[[Premise]]]]: ', '').replace('>>>>[[[[Hypothesis]]]]:', '')
ori = ori.replace('[[', '').replace(']]', '')
ori = ori.splitlines()

params_model = {
    'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
    'pool_type': 'max', 'dpout_model': 0.0, 'version': V
}
infersent = InferSent(params_model)
infersent.load_state_dict(torch.load(MODEL_PATH))
infersent.set_w2v_path(W2V_PATH)
infersent.build_vocab_k_words(K)

adv_emb = infersent.encode(adv, tokenize=True)
ori_emb = infersent.encode(ori, tokenize=True)

result = [cos_sim(i, j) for i, j in zip(adv_emb, ori_emb)]
with open('../results/InferSent.txt', 'w') as f:
    f.write('\n'.join([str(i) for i in result]))

result = [distance(i, j) for i, j in zip(adv_emb, ori_emb)]
with open('../results/InferSent_distance.txt', 'w') as f:
    f.write('\n'.join([str(i) for i in result]))
model.load_state_dict(torch.load(MODEL_PATH))

# Keep it on CPU or put it on GPU
use_cuda = True
model = model.cuda() if use_cuda else model

W2V_PATH = '/home1/InferSent/oov_train_model.vec'
model.set_w2v_path(W2V_PATH)

# Load embeddings of the K most frequent words
# model.build_vocab_k_words(K=100000)
model.build_vocab_k_words(K=2051129)

# Extract word embeddings: load the test sentences
train_test = pd.read_csv('/home1/InferSent/testset.csv', header=None,
                         delimiter=",", encoding='UTF-8')
source_s = train_test[0][1:]
target_s = train_test[1][1:]

embeddings_source = model.encode(source_s, bsize=128, tokenize=False, verbose=True)
print('nb source_s encoded : {0}'.format(len(embeddings_source)))
embeddings_target = model.encode(target_s, bsize=128, tokenize=False, verbose=True)
print('nb target_s encoded : {0}'.format(len(embeddings_target)))
np.save('embeddings_source.npy', embeddings_source)
np.save('embeddings_target.npy', embeddings_target)

if args.cosine:
    source_np = np.load('embeddings_source.npy')
    target_np = np.load('embeddings_target.npy')
    print('Successfully loaded vectors')
    # Reload the ground-truth dataset to check the vector names
    train_test = pd.read_csv('/home1/InferSent/testset.csv', header=None,
                             delimiter=",", encoding='UTF-8')
    source_s = train_test[0][1:]
    target_s = train_test[1][1:]
def infer(inputs):
    radius = 0.09
    nlp = spacy.load("en_core_web_sm")
    sentences = []
    locations = []

    import json
    pass_in = json.loads(inputs)
    for call in pass_in:
        sentences.append(call['transcript'])
        locations.append((call['latitude'], call['longitude']))

    from models import InferSent
    V = 2
    MODEL_PATH = 'encoder/infersent%s.pkl' % V
    params_model = {
        'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
        'pool_type': 'max', 'dpout_model': 0.0, 'version': V
    }
    infersent = InferSent(params_model)
    infersent.load_state_dict(torch.load(MODEL_PATH))
    W2V_PATH = 'fastText/crawl-300d-2M.vec'
    infersent.set_w2v_path(W2V_PATH)

    # The old, bag-of-filtered-words implementation follows:
    # for i, sentence in enumerate(sentences):
    #     sentences[i] = nlp(' '.join([str(t) for t in nlp(sentence)
    #                                  if t.pos_ in ['NOUN', 'PROPN', 'ADJ']]))
    # sentences_matrix = np.vstack([x.vector / norm(x.vector) for x in sentences])
    # ling_compatibility = np.matmul(sentences_matrix, np.transpose(sentences_matrix))
    # print(ling_compatibility)

    infersent.build_vocab(sentences, tokenize=True)
    embeddings = infersent.encode(sentences, tokenize=True)
    embeddings = embeddings / np.linalg.norm(embeddings, ord=2, axis=1,
                                             keepdims=True)
    ling_compatibility = np.matmul(embeddings, np.transpose(embeddings))
    # print(ling_compatibility)

    def intersection_area(d, r):
        if d == 0:
            # the circles are the same
            return np.pi * r**2
        if d >= 2 * r:
            # the circles don't overlap at all
            return 0
        r2, d2 = r**2, d**2
        alpha = np.arccos(d2 / (2 * d * r))
        return 2 * r2 * alpha - r2 * np.sin(2 * alpha)

    geo_compatibility = np.zeros((len(locations), len(locations)))
    for i in range(len(locations)):
        for k in range(i, len(locations)):
            geo_compatibility[i][k] = intersection_area(
                math.sqrt((locations[i][0] - locations[k][0])**2 +
                          (locations[i][1] - locations[k][1])**2),
                radius) / (math.pi * (2**2))

    from sklearn.cluster import KMeans
    total = np.multiply(ling_compatibility, geo_compatibility)
    # print(total.shape)
    # for i in range(len(locations)):
    #     for k in range(len(locations)):
    #         if i != k and total[i][k] > 0.65:
    #             print(str(i) + " and " + str(k) + " are the same incident")
    kmeany = KMeans(init='k-means++').fit(total)
    labels = kmeany.labels_.tolist()
    mapper = {}
    for call, label in enumerate(labels):
        mapper[call] = label

    class Analysis:
        def __init__(self, sentence):
            self.sentence = sentence
            self.nlpped = nlp(sentence)
            self.nouns = [
                str(t.lemma_) for t in self.nlpped
                if (t.pos_ in ['PROPN', 'NOUN'] and t.lemma_ not in ['I', 'help'])
            ]
            self.verbs = [
                str(t.lemma_) for t in self.nlpped
                if (t.pos_ in ['VERB', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']
                    and t.lemma_ not in [
                        'be', 'have', 'do', 'say', 'go', 'get', 'make', 'know',
                        'think', 'take', 'help', 'may', 'fear', 'see', 'stop',
                        'reach', 'seem', 'hope', 'want', 'would', 'cause',
                        'let', 'like', 'will'
                    ])
            ]

    analyses = []
    for sentence in sentences:
        analyses.append(Analysis(sentence))

    d = []
    for n in set(mapper.values()):
        nouns = []
        for k in mapper.keys():
            if mapper[k] == n:
                nouns += analyses[k].nouns
        noun_counter = Counter(nouns)
        verbs = []
        for k in mapper.keys():
            if mapper[k] == n:
                verbs += analyses[k].verbs
        verb_counter = Counter(verbs)
        calls = []
        for k in mapper.keys():
            if mapper[k] == n:
                call = {
                    'transcript': sentences[k],
                    'file': pass_in[k]['file'],
                    'lat': locations[k][0],
                    'lon': locations[k][1],
                    'id': pass_in[k]['id']
                }
                calls.append(call)
        blah = ([x[0] for x in verb_counter.most_common(3) if x[1] > 1] +
                [x[0] for x in noun_counter.most_common(3) if x[1] > 1])
        if len(blah) == 0:
            blah = ([x[0] for x in verb_counter.most_common(1)] +
                    [x[0] for x in noun_counter.most_common(1)])
        d.append({'name': ' '.join(blah), 'calls': calls})
    return json.dumps(d)
ARGS = PARSER.parse_args()
question = ARGS.question
sentences = [question]

#### Load Facebook's InferSent (download the files from the internet)
infersent = InferSent({'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                       'pool_type': 'max', 'dpout_model': 0.0, 'version': 1})
infersent.load_state_dict(torch.load(
    '/Users/petermyers/Desktop/Other/data/InferSent/encoder/infersent1.pkl'))
infersent.set_w2v_path('/Users/petermyers/Desktop/Other/data/GloVe/glove.840B.300d.txt')

# Extract the most relevant Wikipedia page
#### Wikipedia recommends 10 pages
wikipedia_pages = wikipedia.search(question)
sentences = sentences + wikipedia_pages

#### Convert sentences to vectors
infersent.build_vocab(sentences, tokenize=True)
embeddings = infersent.encode(sentences, tokenize=True, verbose=False)

#### Choose the most relevant pages
distances = pdist(np.array(embeddings), metric='euclidean')
sentence_similarity_matrix = squareform(distances)
most_relevant_pages = np.argsort(sentence_similarity_matrix[0][1:])

#### Extract the content of the most relevant page (tries multiple pages in case of failure)
for page in most_relevant_pages:
    try:
        content_on_the_page = wikipedia.page(wikipedia_pages[page]).content
        break
    except:
        pass

# Find and print the most relevant sentences
#### Split the content into sentences
sents = nltk.sent_tokenize(content_on_the_page)
def extract_answer_IFST(story_data, question_and_ans_data, story_ids,
                        model_version, Vocab_Size):
    """(1) Get the answer, then modify question_and_ans_data by adding the
    answer to it. (2) For each story id, extract its question, then look it
    up in story_data and find the best-matching sentence."""
    import re
    import pandas as pd
    import torch
    import numpy as np
    from models import InferSent

    # sentence_list = build_vocabulary(story_data)
    W2V_PATH = ('dataset/GloVe/glove.840B.300d.txt' if model_version == 1
                else 'dataset/fastText/crawl-300d-2M.vec')
    MODEL_PATH = 'encoder/infersent%s.pkl' % model_version
    params_model = {
        'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
        'pool_type': 'max', 'dpout_model': 0.0, 'version': model_version
    }
    model = InferSent(params_model)
    model.load_state_dict(torch.load(MODEL_PATH))
    model.set_w2v_path(W2V_PATH)
    if model_version == 3:
        sentence_list = build_vocabulary(story_data)
        model.build_vocab(sentence_list)
    else:
        model.build_vocab_k_words(K=Vocab_Size)

    for story_id in story_ids:
        story = story_data.loc[lambda df: df.story_id == story_id,
                               'story'].values[0]
        question_ids = question_and_ans_data.loc[
            lambda df: df.story_id == story_id, 'question_id']
        for question_id in question_ids:
            # get the question and answer
            question = question_and_ans_data.loc[
                lambda df: df.question_id == question_id, 'question'].values[0]
            if 'answer' in question_and_ans_data:
                answer = question_and_ans_data.loc[
                    lambda df: df.question_id == question_id, 'answer'].values[0]
            # encode() expects a list of sentences, so wrap the strings
            question_encoded = model.encode([str(
                question_and_ans_data.loc[question_and_ans_data.index[
                    question_and_ans_data['question_id'] == question_id][0],
                    'question'])])[0]
            ans = []
            for sent in story.sents:
                # sim = sent.similarity(question)
                sim = cosine(question_encoded, model.encode([str(sent)])[0])
                ans.append({'question_id': question_id,
                            'answer_pred': sent,
                            'similarity': sim})
            ans = pd.DataFrame(ans).reindex(
                ['question_id', 'answer_pred', 'similarity'], axis=1)
            ans.sort_values(by=['similarity'], ascending=False, inplace=True)
            question_and_ans_data.loc[
                lambda df: df.question_id == question_id,
                'answer_pred'] = str(ans.iloc[0]['answer_pred']).replace('\n', ' ')
    # question_and_ans_data['answer_pred'] = question_and_ans_data['answer_pred'].apply(TextBlob)
    return question_and_ans_data