def create_embeddings(infer_path, data_path, em_type):
    yt_titles = yt.get_yt_titles()
    with open("data/whtitles", "r") as f:
        wh_titles = [line.rstrip('\n') for line in f]

    if em_type == "yt":  # YouTube
        save_f = os.path.join(data_path, "yt_embed")
        titles = yt_titles
    elif em_type == "wh":  # WikiHow
        save_f = os.path.join(data_path, "wh_embed")
        titles = wh_titles
    else:
        raise ValueError("Unknown embedding type: {}".format(em_type))

    nltk.download('punkt')
    V = 1
    MODEL_PATH = os.path.join(infer_path, 'encoder/infersent%s.pkl' % V)
    params_model = {
        'bsize': 256,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': V
    }
    infersent = InferSent(params_model)
    infersent.load_state_dict(torch.load(MODEL_PATH))
    infersent = infersent.cuda()

    W2V_PATH = os.path.join(infer_path, 'GloVe/glove.840B.300d.txt')
    infersent.set_w2v_path(W2V_PATH)
    # Build the vocabulary over both title sets so either can be encoded.
    infersent.build_vocab(yt_titles + wh_titles, tokenize=True)

    embed = infersent.encode(titles, tokenize=True)
    np.save(save_f, embed)
def load_inferSent(sentences):
    logger.info('load InferSent')
    V = 2
    MODEL_PATH = 'Infersent/encoder/infersent%s.pkl' % V
    params_model = {
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': V
    }
    infersent = InferSent(params_model)
    infersent.load_state_dict(torch.load(MODEL_PATH))
    if torch.cuda.is_available():
        infersent.cuda()

    # set word vectors
    if V == 1:
        W2V_PATH = 'Infersent/Glove/glove.840B.300d.txt'
        logger.warning('Use GloVe Embedding')
    elif V == 2:
        W2V_PATH = 'Infersent/fastText/crawl-300d-2M.vec'
        logger.warning('Use fastText Embedding')
    else:
        raise NotImplementedError
    infersent.set_w2v_path(W2V_PATH)

    # build vocab
    infersent.build_vocab(sentences, tokenize=True)
    return infersent
class InferSentFeatures:
    def __init__(self, lang_enc_dir, sentences):
        sys.path.insert(0, os.path.join(lang_enc_dir, 'InferSent/'))
        from models import InferSent
        version = 1
        MODEL_PATH = os.path.join(
            lang_enc_dir, 'InferSent/encoder/infersent%s.pkl' % version)
        params_model = {
            'bsize': 64,
            'word_emb_dim': 300,
            'enc_lstm_dim': 2048,
            'pool_type': 'max',
            'dpout_model': 0.0,
            'version': version
        }
        self.model = InferSent(params_model)
        self.model.load_state_dict(torch.load(MODEL_PATH))
        W2V_PATH = os.path.join(lang_enc_dir, 'glove/glove.6B.300d.txt')
        self.model.set_w2v_path(W2V_PATH)
        self.model.build_vocab(sentences, tokenize=True)

    def generate_embeddings(self, sentences):
        embeddings = self.model.encode(sentences, tokenize=True)
        return embeddings
def embed_sent(datafile):
    sentences = []
    with open(datafile, 'r') as f:
        i = 0
        for line in f:
            line = line.replace('\n', '')
            sentences.append(line)
            i += 1
            if i == 455820:
                break

    V = 1
    MODEL_PATH = 'infersent%s.pkl' % V
    params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                    'pool_type': 'max', 'dpout_model': 0.0, 'version': V}
    infersent = InferSent(params_model)
    infersent.load_state_dict(torch.load(MODEL_PATH))
    W2V_PATH = 'GloVe/glove.840B.300d.txt'
    infersent.set_w2v_path(W2V_PATH)
    infersent.build_vocab(sentences, tokenize=True)
    embeddings = infersent.encode(sentences, tokenize=True)
    np.savetxt("../../wiki-split/Data/Infersent_vectors/complex_sent", embeddings)
class Encoder2:
    '''Encoder based on InferSent'''
    WORD_VECTORS_FILE = 'crawl-300d-2M.vec'
    MODEL_FILE = 'infersent2.pkl'

    def __init__(self, word_vectors_dir, models_dir):
        word_vectors = os.path.join(word_vectors_dir, self.WORD_VECTORS_FILE)
        model_file = os.path.join(models_dir, self.MODEL_FILE)
        params_model = {
            'bsize': 64,
            'word_emb_dim': 300,
            'enc_lstm_dim': 2048,
            'pool_type': 'max',
            'dpout_model': 0.0,
            'version': 2
        }
        self.model = InferSent(params_model)
        self.model.load_state_dict(torch.load(model_file))
        self.model.set_w2v_path(word_vectors)

    def start(self, texts):
        texts_list = texts.values.tolist()
        self.model.build_vocab(texts_list, tokenize=True)

    def close(self):
        pass

    def encode(self, texts_batch):
        texts_batch_list = texts_batch.values.tolist()
        texts_batch_vec = self.model.encode(texts_batch_list, tokenize=True)
        return texts_batch_vec
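# A minimal usage sketch for Encoder2 above, not part of the original source.
# The directory layout is an assumption; the class expects an object with a
# .values attribute (e.g. a pandas Series) holding raw strings.
import pandas as pd

encoder2 = Encoder2(word_vectors_dir='fastText', models_dir='encoder')
texts = pd.Series(["A man plays an instrument.", "A dog runs in the park."])
encoder2.start(texts)             # builds the vocabulary once over the corpus
vectors = encoder2.encode(texts)  # numpy array of shape (2, 4096)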
def infersent_embed_posts(posts, max_sent_cnt, embed_dim, data_fold_path):
    model_path = data_fold_path + 'word_sent_embed/infersent2.pickle'
    word_emb_path = data_fold_path + 'word_sent_embed/fasttext.vec'

    posts_arr = np.zeros((len(posts), max_sent_cnt, embed_dim))
    params_model = {
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': 2
    }
    model = InferSent(params_model)
    model.load_state_dict(torch.load(model_path))
    model.set_w2v_path(word_emb_path)

    # Build the vocabulary over every sentence of every post.
    all_sents = []
    for sens in posts:
        all_sents.extend(sens)
    model.build_vocab(all_sents, tokenize=False)

    # Encode each post's sentences, truncated to max_sent_cnt.
    for ind, sentences in enumerate(posts):
        embeddings = model.encode(sentences, tokenize=False, verbose=False)
        l = min(max_sent_cnt, len(sentences))
        posts_arr[ind, :l, :] = embeddings[:l]
    return posts_arr
def prepare(model_path: str, word_vecs: str, out_path: str,
            sentences: Union[str, List[str]] = None, max_vocab: int = 0):
    """
    Adapt the model's vocabulary and store the resulting state.

    :param model_path: unadapted model state
    :param word_vecs: word vectors
    :param out_path: where to store the state
    :param sentences: training sentences for scanning the vocabulary
    :param max_vocab: maximum vocabulary size (optional)
    :return:
    """
    assert bool(sentences) != bool(max_vocab), \
        'Either sentences or max_vocab should be given'
    model = InferSent(config=MODEL_CONF)
    log.info(f"Loading state from {model_path}")
    model.load_state_dict(torch.load(model_path))
    log.info(f"Loading word vecs from {word_vecs}")
    model.set_w2v_path(word_vecs)
    if sentences:
        if type(sentences) is not list:
            sentences = list(read_lines(sentences))
        log.info("Building vocabulary from sentences")
        model.build_vocab(sentences, tokenize=True)
    if max_vocab:
        log.info(f"Pruning vocabulary to top {max_vocab} types")
        model.build_vocab_k_words(K=max_vocab)
    log.info(f"Saving at {out_path}")
    state = SentenceEncoder._get_state(model)
    torch.save(state, out_path)
def calcule_eucl(text, question):
    blob = TextBlob("".join(text))
    sentences = [item.raw for item in blob.sentences]

    V = 2
    MODEL_PATH = 'InferSent/encoder/infersent%s.pkl' % V
    params_model = {
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': V
    }
    infersent = InferSent(params_model)
    infersent.load_state_dict(torch.load(MODEL_PATH))
    W2V_PATH = 'InferSent/crawl-300d-2M.vec'
    infersent.set_w2v_path(W2V_PATH)
    infersent.build_vocab(sentences, tokenize=True)

    # Embed each sentence individually, then the question.
    dict_embeddings = {}
    for i in range(len(sentences)):
        dict_embeddings[sentences[i]] = infersent.encode([sentences[i]], tokenize=True)
    encode_question = infersent.encode([question], tokenize=True)
    eucl = eucl_sim(dict_embeddings, encode_question)
    return sentences, eucl
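# eucl_sim is not defined in the snippet above. A minimal sketch, assuming it
# maps each sentence to the Euclidean distance between that sentence's
# embedding and the question embedding (name reuse; the return type is an
# assumption, not the original API):
import numpy as np

def eucl_sim(dict_embeddings, encode_question):
    question_vec = encode_question[0]  # encode() returns an array of shape (1, 4096)
    return {sent: float(np.linalg.norm(emb[0] - question_vec))
            for sent, emb in dict_embeddings.items()}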
def getSentenceVector(doc,
                      model_params: dict = {},
                      encoder="distilbert",
                      model_name='distilbert-base-nli-mean-tokens'):
    sp = spacy.load('en_core_web_sm')
    tokenized = sp(doc)
    sentences = []
    for token in tokenized.sents:
        sentences.append(token.text)

    if encoder in ['bert', 'xlnet', 'longformer', 'reformer', 'distilbert',
                   'roberta', 'bart']:
        # Use a transformer encoder for mapping tokens to embeddings
        word_embedding_model = models.Transformer(
            model_name,
            tokenizer_args=model_params['tokenizer_args'] if 'tokenizer_args' in model_params else {})
        # Apply mean pooling to get one fixed-sized sentence vector
        pooling_model = models.Pooling(
            word_embedding_model.get_word_embedding_dimension(),
            pooling_mode_mean_tokens=True,
            pooling_mode_cls_token=False,
            pooling_mode_max_tokens=False)
        model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
        sentence_embeddings = model.encode(sentences)
    elif encoder == 'use':
        # pip install embedding-as-service
        from embedding_as_service.text.encode import Encoder
        en = Encoder(embedding='use', model='use_dan', max_seq_length=256)
        sentence_embeddings = en.encode(texts=sentences)
    elif encoder == 'infersent':
        import nltk
        nltk.download('punkt')
        from models import InferSent
        params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                        'pool_type': 'max', 'dpout_model': 0.0, 'version': 2}
        infersent = InferSent(params_model)
        W2V_PATH = 'drive/My Drive/wiki-news-300d-1M.vec'
        infersent.set_w2v_path(W2V_PATH)
        infersent.build_vocab(sentences, tokenize=True)
        sentence_embeddings = infersent.encode(sentences, tokenize=True)
    elif encoder == 'sent2vec':
        import sent2vec
        model = sent2vec.Sent2vecModel()
        model.load_model('drive/My Drive/torontobooks_unigram.bin')
        sentence_embeddings = model.embed_sentences(sentences)
    elif encoder == 'laser':
        from laserembeddings import Laser
        laser = Laser()  # also supports multilingual sentence embeddings
        sentence_embeddings = laser.embed_sentences(sentences, lang='en')
    else:
        raise ValueError('Invalid or unavailable encoder: {}'.format(encoder))

    return list(zip(sentences, sentence_embeddings))
def infersent_flat_embed_posts(posts, embed_dim, data_fold_path):
    model_path = data_fold_path + 'word_sent_embed/infersent2.pickle'
    word_emb_path = data_fold_path + 'word_sent_embed/fasttext.vec'

    params_model = {
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': 2
    }
    model = InferSent(params_model)
    model.load_state_dict(torch.load(model_path))
    model.set_w2v_path(word_emb_path)
    model.build_vocab(posts, tokenize=False)
    return model.encode(posts, tokenize=False, verbose=False)
class Infersent:
    def __init__(self):
        V = 2
        MODEL_PATH = 'encoder/infersent%s.pkl' % V
        params_model = {
            'bsize': 64,
            'word_emb_dim': 300,
            'enc_lstm_dim': 2048,
            'pool_type': 'max',
            'dpout_model': 0.0,
            'version': V
        }
        self.infersent = InferSent(params_model)
        self.infersent.load_state_dict(torch.load(MODEL_PATH))
        self.infersent.set_w2v_path('fastText/crawl-300d-2M.vec')

    def get(self, sentences):
        self.infersent.build_vocab(sentences, tokenize=True)
        return self.infersent.encode(sentences, tokenize=True)
def no_stopwords():
    infersent2 = InferSent(params_model)
    infersent2.load_state_dict(torch.load(MODEL_PATH))
    infersent2.set_w2v_path(W2V_PATH)
    use_cuda = True
    infersent2 = infersent2.cuda() if use_cuda else infersent2

    pdss = pd.DataFrame(columns=['embds', 'set', 'catg'])
    start = time.time()
    global current_idx
    for x in range(3):
        crix = current_idx
        abss, catg, sets, crix = get_batch_from_dataframe(crix)
        # Strip stopwords from each abstract before encoding.
        for index in range(len(abss)):
            doc = nlp(abss[index])
            strs_after_stop_arr = []
            for token in doc:
                if not token.is_stop:
                    strs_after_stop_arr.append(token.text)
            abss[index] = ' '.join(strs_after_stop_arr)
        if x == 0:
            infersent2.build_vocab(abss, tokenize=True)
        else:
            infersent2.update_vocab(abss, tokenize=True)
        embed = infersent2.encode(abss, tokenize=True)
        df2 = pd.DataFrame({
            'embds': embed.tolist(),
            'set': sets,
            'catg': catg
        })
        pdss = pdss.append(df2, ignore_index=True)
        current_idx = crix
    end = time.time() - start
    print("Time without stopwords", end)
    pdss.to_csv("/home/psrivastava/Intern_Summer/data/embeds_no_stopwords.csv")
# Load encoder
encoder = None
if params.encoder_path and params.encoder_type == 'InferSent':
    params_model = {'bsize': params.batch_size,
                    'word_emb_dim': params.word_emb_dim,
                    'enc_lstm_dim': params.enc_lstm_dim,
                    'pool_type': params.pool_type,
                    'dpout_model': params.dpout_model,
                    'version': params.model_version}
    encoder = InferSent(params_model)
    encoder.load_state_dict(torch.load(params.encoder_path))
    encoder.set_w2v_path(params.vector_rep)
    if params.vocab_samples.isdigit():
        print("Build vocab from K samples")
        encoder.build_vocab_k_words(K=int(params.vocab_samples))
    else:
        print("Build vocab from full file")
        # build_vocab() expects the sentences themselves; this assumes
        # vocab_samples holds a path to a file of sentences when it is not a count.
        encoder.build_vocab([line.strip() for line in open(params.vocab_samples)],
                            tokenize=True)
    print("======== TEST encoder ========")
    print(encoder.encode(['the cat eats.']))
    encoder.to(device)

# model config
config_nli_model = {
    'n_words': len(word_vec),
    'word_emb_dim': params.word_emb_dim,
    'enc_lstm_dim': params.enc_lstm_dim,
    'n_enc_layers': params.n_enc_layers,
model.set_w2v_path(args.w2v_path)

# Ensure the output directory exists
if not os.path.exists(args.out_dir):
    os.makedirs(args.out_dir)

# Read files and extract features
for fpath in args.files:
    print('Reading file {}'.format(fpath))
    sents = []
    with open(fpath) as f:
        for line in f:
            line = line.strip()
            assert line, 'Empty line in {}'.format(fpath)
            sents.append(line)
    # Set output file name
    out_name = os.path.join(
        args.out_dir, "{}.embs.npy".format(os.path.basename(fpath)))
    # Build vocab
    print('Building vocabulary')
    model.build_vocab(sents, args.tokenize)
    # Get embeddings
    embs = model.encode(sents, tokenize=args.tokenize, verbose=True,
                        bsize=args.batch_size)
    print('Saving to {}'.format(out_name))
    np.save(out_name, embs)
    'pool_type': 'max',
    'dpout_model': 0.0,
    'version': V}
infersent = InferSent(params_model)
infersent.load_state_dict(torch.load(MODEL_PATH))

W2V_PATH = 'Documents/FastText/crawl-300d-2M.vec/crawl-300d-2M.vec'
infersent.set_w2v_path(W2V_PATH)

infersent.build_vocab(train_doc, tokenize=True)

embeddings = infersent.encode(train_doc, tokenize=True)

infersent.visualize('A man plays an instrument.', tokenize=True)
for word in f:
    english_long.add(word.strip())

df = pd.read_csv(
    "../../../models_storage/word_embeddings_data/ocr_text_with_tags_10000.csv")
df = df[df.text.isna() == False]  # filter out rows with NAs for text
# df = df[:50]  # take this line out if it works
# df.text = df.text.apply(lambda x: x[:10000] if len(x) > 10000 else x)

# Create useful lists using above functions:
stop_words_list = stopwords_make()
punctstr = punctstr_make()
unicode_list = unicode_make()

model.build_vocab(df.text)
print("Vocabulary loading complete!")

# Common cosine similarity between a document embedding and a topic's average word vector
def doc_words_cosine(i, t):
    emb = embeddings[i]
    if t == 'culture':
        word_vec_avg = np.sum(culture_embeddings, axis=0) / len(culture)
    elif t == 'demographic':
        word_vec_avg = np.sum(demographic_embeddings, axis=0) / len(demographic)
    elif t == 'relational':
        word_vec_avg = np.sum(relational_embeddings, axis=0) / len(relational)
    return absolute(dot(emb, word_vec_avg) / (norm(emb) * norm(word_vec_avg)))
def answer_the_question():
    print("***********************************************")
    # print(request.form)
    print("***********************************************")
    input_info = request.form['cont']
    # print(request.args['data'])
    question = request.form['question']
    # question = 'where are you.'
    # input_info = [['I am here.'], ['f**k ooff']]
    print("___________________________________________________________")
    print(question)
    print(len(input_info))
    print("_________________________________________________________________")

    MODEL_PATH = 'encoder/infersent1.pkl'
    params_model = {
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': 1
    }
    infersent = InferSent(params_model)
    infersent.load_state_dict(torch.load(MODEL_PATH))
    W2V_PATH = 'Glove/glove/glove.42B.300d.txt'
    infersent.set_w2v_path(W2V_PATH)

    # The question goes first; candidate sentences follow.
    sentences = []
    sentences.append(convert(question))
    li = input_info.split('.')
    for k in li:
        if len(k) > 4:
            k = convert(k)
            sentences.append(k)
    print("_____________________________________________________________________________")
    print(len(sentences))
    print('__________________________________________________________________________')

    infersent.build_vocab(sentences, tokenize=True)
    dict_embeddings = {}
    for i in range(len(sentences)):
        try:
            dict_embeddings[sentences[i]] = infersent.encode([sentences[i]],
                                                             tokenize=True)
        except:
            continue

    # Cosine distance of every sentence to the question (index 0).
    li_of_dis = []
    for a2 in sentences:
        try:
            li_of_dis.append(
                spatial.distance.cosine(dict_embeddings[sentences[0]],
                                        dict_embeddings[a2]))
        except:
            li_of_dis.append(1.00)

    mini_d = 1
    x = 0
    print(li_of_dis)
    for i in range(1, len(li_of_dis)):
        if li_of_dis[i] < mini_d and li_of_dis[i] > 0.05:
            mini_d = li_of_dis[i]
            x = i
    ans_s = sentences[x]
    print("oooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo")
    if x + 3 < len(sentences):
        ans_s = ' ' + sentences[x + 1] + ' ' + sentences[x + 2] + ' ' + sentences[x + 3]
    return jsonify(ans=ans_s)
    raise NotImplementedError
infersent.set_w2v_path(W2V_PATH)

# read data
refs = []
with open(args.golden, 'r') as f:
    for line in f:
        refs.append(line[:-1])
hyps = []
with open(args.generated, 'r') as f:
    for line in f:
        hyps.append(line[:-1])

# build vocab
infersent.build_vocab(refs + hyps, tokenize=True)

# get embeddings
refs_embeds = infersent.encode(refs, tokenize=True)
hyps_embeds = infersent.encode(hyps, tokenize=True)

# compute cosine similarity
refs_norm = np.linalg.norm(refs_embeds, ord=2, axis=1)
hyps_norm = np.linalg.norm(hyps_embeds, ord=2, axis=1)
cosine = np.sum((refs_embeds * hyps_embeds), axis=1) / refs_norm / hyps_norm

if args.output_file is not None:
    with open(args.output_file, 'a') as f:
        print(json.dumps({'embedding_cosin': float(np.mean(cosine))}), file=f)
else:
class LCPR_I:
    def __init__(self):
        self.filename = "LCP/lcpr_i.sav"
        self.cmudict = cmudict.dict()
        self.wnlp = WonderlicNLP()
        self.embeddings_index = {}
        self.wiki_top10 = [
            word[0].split()[0]
            for word in pd.read_csv("LCP/wiki_top10.csv").values
        ][:10001]
        self.infersent_model_path = 'LCP/infersent%s.pkl' % 1
        self.infersent_model_params = {
            'bsize': 64,
            'word_emb_dim': 300,
            'enc_lstm_dim': 2048,
            'pool_type': 'max',
            'dpout_model': 0.0,
            'version': 1
        }
        self.infersent = InferSent(self.infersent_model_params)
        self.model = RandomForestRegressor(n_estimators=100)

    # InferSent setup (boilerplate code from InferSent's repository):
    def initialize_infersent(self, sentences):
        print("INITIALIZING INFERSENT...", datetime.now().strftime("%H:%M:%S"))
        self.infersent.load_state_dict(torch.load(self.infersent_model_path))
        w2v_path = 'LCP/glove.42B.300d.txt'
        self.infersent.set_w2v_path(w2v_path)
        self.infersent.build_vocab(sentences, tokenize=True)
        print("INFERSENT READY!", datetime.now().strftime("%H:%M:%S"))

    def infersent_embedding(self, sentence):
        return self.infersent.encode(sentence, tokenize=True)

    # GloVe setup:
    def initialize_glove(self):
        print("INITIALIZING GLOVE...", datetime.now().strftime("%H:%M:%S"))
        f = open('LCP/glove.42B.300d.txt', encoding="utf8")
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            self.embeddings_index[word] = coefs
        f.close()
        print("GLOVE READY!", datetime.now().strftime("%H:%M:%S"))

    def glove_embedding(self, word):
        embedding = [
            emb for emb in self.embeddings_index[str(word).lower()]
        ] if str(word).lower() in self.embeddings_index.keys() else [
            -1 for i in range(300)
        ]
        return embedding

    def find_word_pos(self, word, tokens):
        # Used to find the index of the word in the sentence
        lemmatizer = WordNetLemmatizer()
        search_tokens = [lemmatizer.lemmatize(word) for word in tokens]
        if word in tokens:
            return tokens.index(word)
        elif word in search_tokens:
            return search_tokens.index(word)
        else:
            return None

    def extract_features(self, data):
        features = defaultdict(list)
        for id in tqdm(data.index, desc="PROCESSING DATA"):
            raw_token = "null" if str(data.loc[id]["token"]) == "nan" else str(
                data.loc[id]["token"])
            token = raw_token.lower()
            sent = data.loc[id]["sentence"]
            mrc_features = self.wnlp.get_mrc_features(token)
            glove = self.glove_embedding(token)
            infersent = self.infersent_embedding([sent])[0]
            # Sentence InferSent embedding:
            for i in range(1, 4097):
                features[f"infersent{i}"].append(infersent[i - 1])
            # Word GloVe embedding:
            for i in range(1, 301):
                features[f"glove{i}"].append(glove[i - 1])
            # MRC features:
            features["word_length"].append(mrc_features["Nlet"])
            features["syl_count"].append(mrc_features["Nsyl"])
            features["brown_freq"].append(mrc_features["Brown-freq"])
            features["familiarity"].append(mrc_features["Fam"])
            features["concreteness"].append(mrc_features["Conc"])
            features["imagability"].append(mrc_features["Imag"])
            features["meaningfulness_c"].append(mrc_features["Meanc"])
            features["meaningfulness_p"].append(mrc_features["Meanp"])
            features["age_of_aquisition"].append(mrc_features["AOA"])
            features["wiki_freq"].append(int(token in self.wiki_top10))
        return features

    def fit(self, train_data, train_labels):
        print("TRAINING...", datetime.now().strftime("%H:%M:%S"))
        self.initialize_glove()
        self.initialize_infersent(train_data["sentence"])
        features = self.extract_features(train_data)
        self.model.fit(pd.DataFrame(features), train_labels)
        print("TRAINING DONE!", datetime.now().strftime("%H:%M:%S"))

    def to_likert(self, prediction):
        if prediction >= 0 and prediction < 0.2:
            return 1
        elif prediction >= 0.2 and prediction < 0.4:
            return 2
        elif prediction >= 0.4 and prediction < 0.6:
            return 3
        elif prediction >= 0.6 and prediction < 0.8:
            return 4
        else:
            return 5

    def predict(self, test_data, development=False):
        print("LOOKING INTO THE ORB...", datetime.now().strftime("%H:%M:%S"))
        self.infersent.update_vocab(test_data)
        tokens = test_data["token"]
        predictions = self.model.predict(
            pd.DataFrame(self.extract_features(test_data)))
        if not development:
            for i in range(len(predictions)):
                print(f"{tokens[i]} is a {self.to_likert(predictions[i])} on the Likert scale.")
        return predictions

    def score(self, train_data, train_labels):
        print("SCORING MODEL...", datetime.now().strftime("%H:%M:%S"))
        return self.model.score(
            pd.DataFrame(self.extract_features(train_data)), train_labels)

    def metrics(self, test_data, test_labels):
        labels_pred = self.predict(test_data, True)
        mae = mean_absolute_error(test_labels, labels_pred)
        rmse = math.sqrt(mean_squared_error(test_labels, labels_pred))
        print("MAE:", mae)
        print("RMSE:", rmse)

    def save(self):
        pickle.dump([self.model, self.embeddings_index, self.infersent],
                    open(self.filename, "wb"))

    def load(self):
        data = pickle.load(open(self.filename, "rb"))
        self.model = data[0]
        self.embeddings_index = data[1]
        self.infersent = data[2]
def main():
    # Dictionary for final rankings.
    ranking = dict()

    print("\n CSI 4107 - Microblog information retrieval system \n")
    print("\n Importing Query Files and Documents... \n")

    # Load the tweet list.
    # {'34952194402811904': 'Save BBC World Service from Savage Cuts http://www.petitionbuzz.com/petitions/savews', ...}
    tweets_dict = importTweets()
    # Load the list of queries.
    # {1: ['bbc', 'world', 'servic', 'staff', 'cut'], ...}
    queries_dict = importQuery()

    print("\n Importing Done! \n")
    print("\n Initializing InferSent Model... \n")

    # Initialize the InferSent model.
    infersent = InferSent(params_model)
    # Load the InferSent v1 model encoder.
    infersent.load_state_dict(torch.load(MODEL_PATH))
    # Use GPU mode if available.
    infersent = infersent.cuda() if USE_CUDA else infersent
    # Load the pre-trained GloVe model.
    infersent.set_w2v_path(W2V_PATH)

    print("\n InferSent Initialization Done! \n")
    print("\n Building Vocabulary from Tweets... \n")

    # Deconstruct the dictionary of documents into document IDs and contents.
    tweets = list(tweets_dict.values())
    tweet_ids = list(tweets_dict.keys())
    # Deconstruct the dictionary of queries to query contents, since we can replicate the query ID.
    queries = list(queries_dict.values())

    # Build the InferSent vocabulary from all the documents' contents.
    infersent.build_vocab(tweets, tokenize=False)

    print("\n Vocabulary Completed! \n")
    print("\n Building Document & Query Vectors... \n")

    doc_embeddings = infersent.encode(tweets, bsize=128, tokenize=False, verbose=True)
    query_embeddings = infersent.encode(queries, bsize=128, tokenize=False, verbose=True)

    print("\n Building Document & Query Vectors Done! \n")
    print("\n Retrieval and Ranking... \n")

    dranking = dict()
    for query_id in range(len(queries)):
        # The encoded array starts at 0 for the first chronological document.
        current_document = 0
        # Compute the cosine similarity between the current query and every document.
        for tweet_id in tweet_ids:
            dranking[tweet_id] = cosine(doc_embeddings[current_document],
                                        query_embeddings[query_id])
            current_document += 1
        # Keep the top 1000 documents in descending order of similarity.
        ranking[query_id + 1] = {
            k: v
            for k, v in sorted(dranking.items(),
                               key=lambda item: item[1],
                               reverse=True)[:1000]
        }
        print("Query " + str(query_id) + " Done.")
        dranking.clear()

    # Create the resulting file.
    resultFileCreation(ranking)
    print("\n Retrieval and Ranking Done! \n")
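# The cosine() helper used in main() is not shown in this snippet. Because the
# rankings are sorted in descending order, it is presumably cosine *similarity*
# rather than scipy's cosine distance. A minimal sketch under that assumption:
import numpy as np

def cosine(u, v):
    # Cosine similarity between two 1-D embedding vectors.
    return float(np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v)))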
def extract_answer_IFST(story_data, question_and_ans_data, story_ids,
                        model_version, Vocab_Size):
    """
    (1) Get the answer, then modify question_and_ans_data by adding the answer to it.
    (2) For each story id, extract its question, then look it up in story_data
        and find the best-matching sentence.
    """
    import re
    import pandas as pd
    import torch
    import numpy as np
    from models import InferSent

    # sentence_list = build_vocabulary(story_data)
    W2V_PATH = ('dataset/GloVe/glove.840B.300d.txt' if model_version == 1
                else 'dataset/fastText/crawl-300d-2M.vec')
    MODEL_PATH = 'encoder/infersent%s.pkl' % model_version
    params_model = {
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': model_version
    }
    model = InferSent(params_model)
    model.load_state_dict(torch.load(MODEL_PATH))
    model.set_w2v_path(W2V_PATH)

    if model_version == 3:
        sentence_list = build_vocabulary(story_data)
        model.build_vocab(sentence_list)
    else:
        model.build_vocab_k_words(K=Vocab_Size)

    for story_id in story_ids:
        story = story_data.loc[lambda df: df.story_id == story_id, 'story'].values[0]
        question_ids = question_and_ans_data.loc[
            lambda df: df.story_id == story_id, 'question_id']
        for question_id in question_ids:
            # Get the question (and the gold answer, if present).
            question = question_and_ans_data.loc[
                lambda df: df.question_id == question_id, 'question'].values[0]
            if 'answer' in question_and_ans_data:
                answer = question_and_ans_data.loc[
                    lambda df: df.question_id == question_id, 'answer'].values[0]
            # encode() expects a list of sentences, not a bare string.
            question_encoded = model.encode(
                [str(question_and_ans_data.loc[question_and_ans_data.index[
                    question_and_ans_data['question_id'] == question_id][0],
                    'question'])])[0]
            ans = []
            for sent in story.sents:
                # sim = sent.similarity(question)
                sim = cosine(question_encoded, model.encode([str(sent)])[0])
                ans.append({
                    'question_id': question_id,
                    'answer_pred': sent,
                    'similarity': sim
                })
            ans = pd.DataFrame(ans).reindex(
                ['question_id', 'answer_pred', 'similarity'], axis=1)
            ans.sort_values(by=['similarity'], ascending=False, inplace=True)
            question_and_ans_data.loc[
                lambda df: df.question_id == question_id,
                'answer_pred'] = str(ans.iloc[0]['answer_pred']).replace('\n', ' ')
    # question_and_ans_data['answer_pred'] = question_and_ans_data['answer_pred'].apply(TextBlob)
    return question_and_ans_data
class InferSentEmbeddings(EmbeddingBaseClass, FlairDocumentEmbeddings):
    """
    Class to add InferSent embeddings to flair sentences.
    cf. `here <https://github.com/facebookresearch/InferSent>`_
    """

    def __init__(self, version=1):
        super().__init__()
        self.version = version
        if version == 1:
            self.PATH_TO_W2V = os.path.join(NLP_MODELS_PATH, 'pretrained',
                                            'word_embeddings', 'glove.840B.300d',
                                            'glove.840B.300d.txt')
        if version == 2:
            self.PATH_TO_W2V = os.path.join(NLP_MODELS_PATH, 'pretrained',
                                            'word_embeddings', 'crawl-300d-2M',
                                            'crawl-300d-2M.vec')
        self.MODEL_PATH = os.path.join(NLP_MODELS_PATH, 'pretrained',
                                       'word_embeddings', 'infersent%s' % version,
                                       'infersent%s.pkl' % version)

        # Set up logger
        logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.DEBUG)

        # Load InferSent model
        params_model = {
            'bsize': 64,
            'word_emb_dim': 300,
            'enc_lstm_dim': 2048,
            'pool_type': 'max',
            'dpout_model': 0.0,
            'version': version
        }
        self.model = InferSent(params_model)
        self.model.load_state_dict(torch.load(self.MODEL_PATH))
        self.model.set_w2v_path(self.PATH_TO_W2V)

        # InferSent is a bidirectional LSTM with max pooling, so the sentence
        # embedding size is 2 * enc_lstm_dim (4096 here).
        self._embedding_length: int = 2 * params_model['enc_lstm_dim']
        self.name = f"{self.__class__.__name__}_v{self.version}"
        self.static_embeddings = True

    @property
    def embedding_length(self) -> int:
        return self._embedding_length

    def _add_embeddings_internal(self, sentences: List[Sentence]):
        everything_embedded: bool = True
        infersent_sentences = []
        for sentence in sentences:
            if self.name not in sentence._embeddings.keys():
                everything_embedded = False
        if not everything_embedded:
            for sentence in sentences:
                infersent_sentences.append(sentence.to_tokenized_string())
            self.model.build_vocab(infersent_sentences, tokenize=False)
            self.model.update_vocab(infersent_sentences, tokenize=False)
            embeddings = self.model.encode(infersent_sentences, tokenize=False)
            for sentence, sentence_embedding in zip(sentences, embeddings):
                sentence.set_embedding(self.name, torch.tensor(sentence_embedding))
# text_unpacked_short = [t[:2000] for t in text_unpacked]
# df.text_unpacked = [' '.join(t) for t in text_unpacked_short]

def shorten_text(x):
    t = ast.literal_eval(x)
    if len(t) > 2000:
        return ' '.join(t[:2000])
    else:
        return ' '.join(t)

df['text_unpacked'] = df.text.apply(shorten_text)
# df.text_unpacked = df.text.apply(lambda x: ' '.join(ast.literal_eval(x)))

model.build_vocab(df.text_unpacked)
print("Vocabulary loading complete!")

# Common cosine similarity between a document embedding and a topic's average word vector
def doc_words_cosine(i, t):
    emb = embeddings[i]
    if t == 'culture':
        word_vec_avg = np.sum(culture_embeddings, axis=0) / len(culture)
    elif t == 'demographic':
        word_vec_avg = np.sum(demographic_embeddings, axis=0) / len(demographic)
    elif t == 'relational':
        word_vec_avg = np.sum(relational_embeddings, axis=0) / len(relational)
    return absolute(dot(emb, word_vec_avg) / (norm(emb) * norm(word_vec_avg)))
V = 1
MODEL_PATH = 'encoder/infersent%s.pkl' % V
params_model = {
    'bsize': 64,
    'word_emb_dim': 300,
    'enc_lstm_dim': 2048,
    'pool_type': 'max',
    'dpout_model': 0.0,
    'version': V
}
model = InferSent(params_model)
model.load_state_dict(torch.load(MODEL_PATH))

W2V_PATH = 'final_text_vectors.txt'
model.set_w2v_path(W2V_PATH)
model.build_vocab(sentences, tokenize=True)  # or: build_vocab_k_words(K=100000)

embeddings = model.encode(sentences, tokenize=True)
# (sentences, bsize=168, tokenize=False, verbose=True)
print('nb sentences encoded : {0}'.format(len(embeddings)))

sen_vec = preprocessing.normalize(embeddings)
sen_vec = Variable(torch.from_numpy(sen_vec))
# sen_vec = nn.Linear(4096, 300)
model = net()
n = (1, 300)
nparray = np.zeros(n)
for i in sen_vec:
    out = model(i)
    out = out.data.numpy()
def getDocumentEmbedding(doc, model_params: dict = {},
                         encoder='xlnet', model_name='xlnet-base-uncased'):
    # model = SentenceTransformer(model_name, model_params)
    # sentence_embedding = model.encode(doc)

    # Word tokenizer
    from spacy.lang.en import English
    nlp = English()
    # Create a tokenizer with the default settings for English, including punctuation rules and exceptions
    tokenizer = nlp.Defaults.create_tokenizer(nlp)
    # Tokenize the input document to check its length.
    tokens = tokenizer(doc)
    if len(tokens) > getMaxLength(encoder):
        warnings.warn("The input sequence length exceeds the maximum limit.", Warning)

    if encoder in ['bert', 'xlnet', 'longformer', 'reformer', 'distilbert',
                   'roberta', 'bart', 'finbert']:
        # Use a transformer for mapping tokens to embeddings
        word_embedding_model = models.Transformer(model_name)
        # Apply mean pooling to get one fixed-sized sentence vector
        pooling_model = models.Pooling(
            word_embedding_model.get_word_embedding_dimension(),
            pooling_mode_mean_tokens=True,
            pooling_mode_cls_token=False,
            pooling_mode_max_tokens=False)
        model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
        sentence_embeddings = model.encode(doc)
    elif encoder == 'use':
        # pip install embedding-as-service
        from embedding_as_service.text.encode import Encoder
        en = Encoder(embedding='use', model='use_dan', max_seq_length=256)
        sentence_embeddings = en.encode(texts=doc)
    elif encoder == 'infersent':
        import nltk
        nltk.download('punkt')
        from models import InferSent
        params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                        'pool_type': 'max', 'dpout_model': 0.0, 'version': 2}
        infersent = InferSent(params_model)
        W2V_PATH = 'drive/My Drive/wiki-news-300d-1M.vec'
        infersent.set_w2v_path(W2V_PATH)
        # The original referenced an undefined `sentences` here; assume the
        # single input document is the unit to embed.
        infersent.build_vocab([doc], tokenize=True)
        sentence_embeddings = infersent.encode([doc], tokenize=True)
    elif encoder == 'sent2vec':
        import sent2vec
        model = sent2vec.Sent2vecModel()
        model.load_model('drive/My Drive/torontobooks_unigram.bin')
        sentence_embeddings = model.embed_sentences(doc)
    elif encoder == 'laser':
        from laserembeddings import Laser
        laser = Laser()  # also supports multilingual sentence embeddings
        sentence_embeddings = laser.embed_sentences([doc], lang='en')
    return sentence_embeddings
def infer(inputs):
    radius = 0.09
    nlp = spacy.load("en_core_web_sm")

    sentences = []
    locations = []
    import json
    pass_in = json.loads(inputs)
    for call in pass_in:
        sentences.append(call['transcript'])
        locations.append((call['latitude'], call['longitude']))

    from models import InferSent
    V = 2
    MODEL_PATH = 'encoder/infersent%s.pkl' % V
    params_model = {
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': V
    }
    infersent = InferSent(params_model)
    infersent.load_state_dict(torch.load(MODEL_PATH))
    W2V_PATH = 'fastText/crawl-300d-2M.vec'
    infersent.set_w2v_path(W2V_PATH)

    # The old bag-of-filtered-words implementation follows:
    # for i, sentence in enumerate(sentences):
    #     sentences[i] = nlp(' '.join([str(t) for t in nlp(sentence)
    #                                  if t.pos_ in ['NOUN', 'PROPN', 'ADJ']]))
    # sentences_matrix = np.vstack([x.vector / norm(x.vector) for x in sentences])
    # ling_compatibility = np.matmul(sentences_matrix, np.transpose(sentences_matrix))
    # print(ling_compatibility)

    infersent.build_vocab(sentences, tokenize=True)
    embeddings = infersent.encode(sentences, tokenize=True)
    # L2-normalize, so the dot products below are cosine similarities.
    embeddings = embeddings / np.linalg.norm(embeddings, ord=2, axis=1, keepdims=True)
    ling_compatibility = np.matmul(embeddings, np.transpose(embeddings))
    # print(ling_compatibility)

    def intersection_area(d, r):
        if d == 0:
            # The circles are the same.
            return np.pi * r**2
        if d >= 2 * r:
            # The circles don't overlap at all.
            return 0
        r2, d2 = r**2, d**2
        alpha = np.arccos(d2 / (2 * d * r))
        return 2 * r2 * alpha - r2 * np.sin(2 * alpha)

    geo_compatibility = np.zeros((len(locations), len(locations)))
    for i in range(len(locations)):
        for k in range(i, len(locations)):
            geo_compatibility[i][k] = intersection_area(
                math.sqrt((locations[i][0] - locations[k][0])**2 +
                          (locations[i][1] - locations[k][1])**2),
                radius) / (math.pi * (2**2))

    from sklearn.cluster import KMeans
    total = np.multiply(ling_compatibility, geo_compatibility)
    # print(total.shape)
    # for i in range(len(locations)):
    #     for k in range(len(locations)):
    #         if i != k and total[i][k] > 0.65:
    #             print(str(i) + " and " + str(k) + " are the same incident")
    kmeany = KMeans(init='k-means++').fit(total)
    labels = kmeany.labels_.tolist()
    mapper = {}
    for call, label in enumerate(labels):
        mapper[call] = label

    class Analysis:
        def __init__(self, sentence):
            self.sentence = sentence
            self.nlpped = nlp(sentence)
            self.nouns = [
                str(t.lemma_) for t in self.nlpped
                if (t.pos_ in ['PROPN', 'NOUN'] and t.lemma_ not in ['I', 'help'])
            ]
            self.verbs = [
                str(t.lemma_) for t in self.nlpped
                if (t.pos_ in ['VERB', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']
                    and t.lemma_ not in [
                        'be', 'have', 'do', 'say', 'go', 'get', 'make', 'know',
                        'think', 'take', 'help', 'may', 'fear', 'see', 'stop',
                        'reach', 'seem', 'hope', 'want', 'would', 'cause',
                        'let', 'like', 'will'
                    ])
            ]

    analyses = []
    for sentence in sentences:
        analyses.append(Analysis(sentence))

    # Name each cluster from its most common verbs and nouns.
    d = []
    for n in set(mapper.values()):
        nouns = []
        for k in mapper.keys():
            if mapper[k] == n:
                nouns += analyses[k].nouns
        noun_counter = Counter(nouns)
        verbs = []
        for k in mapper.keys():
            if mapper[k] == n:
                verbs += analyses[k].verbs
        verb_counter = Counter(verbs)
        calls = []
        for k in mapper.keys():
            if mapper[k] == n:
                call = {
                    'transcript': sentences[k],
                    'file': pass_in[k]['file'],
                    'lat': locations[k][0],
                    'lon': locations[k][1],
                    'id': pass_in[k]['id']
                }
                calls.append(call)
        blah = [x[0] for x in verb_counter.most_common(3) if x[1] > 1] + \
               [x[0] for x in noun_counter.most_common(3) if x[1] > 1]
        if len(blah) == 0:
            blah = [x[0] for x in verb_counter.most_common(1)] + \
                   [x[0] for x in noun_counter.most_common(1)]
        d.append({'name': ' '.join(blah), 'calls': calls})

    return json.dumps(d)
import pandas as pd
import spacy
import nltk
import numpy as np
import torch
from models import InferSent

df = pd.read_csv("/home/psrivastava/Intern_Summer/data/new_output.csv")
abs_arr = df.loc[:4, 'clean_text']
nlp = spacy.load("en_core_web_sm")

MODEL_PATH = "/home/psrivastava/Intern_Summer/infersent/encoder/infersent2.pkl"
W2V_PATH = "/home/psrivastava/Intern_Summer/infersent/fastText/crawl-300d-2M.vec"
params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                'pool_type': 'max', 'dpout_model': 0.0, 'version': 2}
infersent = InferSent(params_model)
infersent.load_state_dict(torch.load(MODEL_PATH))
infersent.set_w2v_path(W2V_PATH)

# Strip stopwords from each text before encoding.
for index in range(len(abs_arr)):
    doc = nlp(abs_arr[index])
    strs_after_stop_arr = []
    for token in doc:
        if not token.is_stop:
            strs_after_stop_arr.append(token.text)
    abs_arr[index] = ' '.join(strs_after_stop_arr)

infersent.build_vocab(abs_arr)  # these are actually abstracts of different papers
print(infersent.encode(abs_arr)[0][:])
    'enc_lstm_dim': 2048,
    'pool_type': 'max',
    'dpout_model': 0.0,
    'version': V
}
infersent = InferSent(params_model)
infersent.load_state_dict(torch.load(MODEL_PATH))
print('Load our pre-trained model (in encoder/)')

# Set the word vector path for the model
W2V_PATH = 'fastText/crawl-300d-2M.vec'
infersent.set_w2v_path(W2V_PATH)
print('Set word vector path for the model')

# Build the vocabulary of word vectors (i.e. keep only those needed)
infersent.build_vocab(all_captions, tokenize=True)
print('Build the vocabulary of word vectors')

# Start encoding captions
caption2id = {}
f = open('pascal-sentences-dataset/text_features.txt', 'w+')
for caption in all_captions:
    current_feature = list(infersent.encode([caption], tokenize=True).squeeze())
    if caption not in caption2id:
        caption2id[caption] = 'caption_' + str(len(caption2id))
    current_feature = [str(feature) for feature in current_feature]
    current_feature_str = ' '.join(current_feature)
    f.write('%s %s\n' % (caption2id[caption], current_feature_str))
f.close()
PARSER.add_argument('--question', metavar='string', required=True,
                    help="The question you want answered")
ARGS = PARSER.parse_args()
question = ARGS.question
sentences = [question]

#### Load Facebook's InferSent (download the files from the internet)
infersent = InferSent({'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                       'pool_type': 'max', 'dpout_model': 0.0, 'version': 1})
infersent.load_state_dict(torch.load('/Users/petermyers/Desktop/Other/data/InferSent/encoder/infersent1.pkl'))
infersent.set_w2v_path('/Users/petermyers/Desktop/Other/data/GloVe/glove.840B.300d.txt')

# Extract the most relevant Wikipedia page
#### Wikipedia recommends 10 pages
wikipedia_pages = wikipedia.search(question)
sentences = sentences + wikipedia_pages

#### Convert sentences to numbers
infersent.build_vocab(sentences, tokenize=True)
embeddings = infersent.encode(sentences, tokenize=True, verbose=False)

#### Choose the most relevant pages
distances = pdist(np.array(embeddings), metric='euclidean')
sentence_similarity_matrix = squareform(distances)
most_relevant_pages = np.argsort(sentence_similarity_matrix[0][1:])

#### Extract the content of the most relevant page (tries multiple pages in case of failure)
for page in most_relevant_pages:
    try:
        content_on_the_page = wikipedia.page(wikipedia_pages[page]).content
        break
    except:
        pass

# Find and print the most relevant sentences
#### Split the content into sentences
from models import InferSent
import torch
import pandas as pd
from textblob import TextBlob

df = pd.read_csv('data/train.csv')
# Drop all duplicate contexts from the dataframe.
blob = TextBlob(" ".join(df['context'].drop_duplicates().reset_index(drop=True)))
sentences = [item.raw for item in blob.sentences]

MODEL_PATH = 'models/infersent_untrained.pkl'
GLOVE_PATH = 'data/glove.840B.300d.txt'
params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                'pool_type': 'max', 'dpout_model': 0.0, 'version': 1}

model = InferSent(params_model)
model.load_state_dict(torch.load(MODEL_PATH))
model.set_w2v_path(GLOVE_PATH)
model.build_vocab(sentences, tokenize=True)
torch.save(model, 'models/infersent_trained.pt')