def doc_word_embed0(path, no_add_set):
    with open(path, 'r') as file:
        lines = file.readlines()
    vocab, embeds = data.process_glove_data(dim=100)
    embed_map = dict(zip(vocab, embeds))
    tk = tokenizer.WordTokenizer()
    # list of lists of tokens, one list per input line
    tokens_l = tk.batch_tokenize(lines)
    stop_word_filter = StopwordFilter()
    word_embeds = []
    words_ar = []
    added_set = set(no_add_set)
    for sentence in tokens_l:
        sentence = stop_word_filter.filter_words(sentence)
        for w in sentence:
            w = w.text.lower()
            # keep each in-vocabulary word once, skipping words already in no_add_set
            if w in embed_map and w not in added_set:
                added_set.add(w)
                words_ar.append(w)
                word_embeds.append(embed_map[w])
    word_embeds = torch.stack(word_embeds, dim=0).to(utils.device)
    if False:  # sanity check: collapse all embeddings onto the first word's embedding
        word_embeds[:] = word_embeds[0]
    #word_embeds = word_embeds / (word_embeds**2).sum(dim=1, keepdim=True).sqrt()
    return words_ar, word_embeds
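# Hedged usage sketch (not part of the original module; the corpus path is a placeholder):
# doc_word_embed0 returns one 100-dim GloVe vector per distinct in-vocabulary word.
def _example_doc_word_embed0(corpus_path='data/content.txt'):
    words, embeds = doc_word_embed0(corpus_path, no_add_set=set())
    assert embeds.shape == (len(words), 100)
    return words, embeds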
def get_elmo_fea(data, op, wg):
    '''
    Adapted from the public kernel https://www.kaggle.com/wochidadonggua/elmo-baseline,
    modified to concatenate all 3 ELMo layers.
    Assumes these imports are available in this module:
        from allennlp.commands.elmo import ElmoEmbedder
        import allennlp.data.tokenizers.word_tokenizer as word_tokenizer
    '''
    def get_nearest(slot, target):
        # back off to the closest token start offset at or before target
        for i in range(target, -1, -1):
            if i in slot:
                return i

    # add parameter cuda_device=0 to use the GPU
    elmo = ElmoEmbedder(options_file=op, weight_file=wg)
    tk = word_tokenizer.WordTokenizer()
    tokens = tk.batch_tokenize(data.Text)
    idx = []
    for i in range(len(tokens)):
        idx.append([x.idx for x in tokens[i]])
        tokens[i] = [x.text for x in tokens[i]]
    vectors = elmo.embed_sentences(tokens)
    ans = []
    for i, vector in enumerate(vectors):
        A_l = data.iloc[i].A.split()
        B_l = data.iloc[i].B.split()
        P_offset = data.iloc[i]['Pronoun-offset']
        A_offset = data.iloc[i]['A-offset']
        B_offset = data.iloc[i]['B-offset']
        # snap offsets that do not fall on a token boundary to the nearest preceding token
        if P_offset not in idx[i]:
            P_offset = get_nearest(idx[i], P_offset)
        if A_offset not in idx[i]:
            A_offset = get_nearest(idx[i], A_offset)
        if B_offset not in idx[i]:
            B_offset = get_nearest(idx[i], B_offset)
        # P is a single token. For A and B, average over the tokens in the span.
        emb_P = vector[:, idx[i].index(P_offset), :]
        emb_A = np.mean(vector[:, idx[i].index(A_offset):idx[i].index(A_offset) + len(A_l), :], axis=1)
        emb_B = np.mean(vector[:, idx[i].index(B_offset):idx[i].index(B_offset) + len(B_l), :], axis=1)
        # concatenate the 3 ELMo layers for A, B and the pronoun: 9 x 1024 features per example
        ans.append(np.concatenate([emb_A[0], emb_A[1], emb_A[2],
                                   emb_B[0], emb_B[1], emb_B[2],
                                   emb_P[0], emb_P[1], emb_P[2]], axis=0).reshape(1, -1))
    emb = np.concatenate(ans, axis=0)
    return emb
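# Hedged usage sketch (not part of the original kernel; the GAP tsv path and the ELMo
# option/weight paths below are placeholders): one row of 3 layers x 3 spans x 1024 dims
# is produced per example.
def _example_get_elmo_fea(options_path='elmo_options.json', weights_path='elmo_weights.hdf5'):
    import pandas as pd
    df = pd.read_csv('input/gap-test.tsv', sep='\t')  # placeholder GAP split
    feats = get_elmo_fea(df, options_path, weights_path)
    assert feats.shape == (len(df), 9 * 1024)
    return feats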
def doc_word_embed(path, no_add_set, content_lines=None):
    if content_lines is not None:
        lines = content_lines
    else:
        with open(path, 'r') as file:
            lines = file.readlines()
    vocab, embeds = data.process_glove_data(dim=100)
    embed_map = dict(zip(vocab, embeds))
    tk = tokenizer.WordTokenizer()
    # list of lists of tokens
    tokens_l = tk.batch_tokenize(lines)
    stop_word_filter = StopwordFilter()
    # flatten all lines into a single token sequence
    tokens_l1 = []
    for sentence_l in tokens_l:
        tokens_l1.extend(sentence_l)
    tokens_l = [tokens_l1]
    n_avg = 5  # number of consecutive in-vocabulary words averaged into one embedding
    word_embeds = []
    words_ar = []
    added_set = set(no_add_set)
    for sentence in tokens_l:
        sentence = stop_word_filter.filter_words(sentence)
        cur_embed = torch.zeros_like(embed_map['a'])
        cur_counter = 0
        for j, w in enumerate(sentence):
            w = w.text.lower()
            if w in embed_map:  # and w not in added_set:
                if cur_counter == n_avg:  # or j == len(sentence)-1:
                    # flush the chunk: emit the average of the last n_avg word embeddings,
                    # labeled by the word that triggered the flush
                    added_set.add(w)
                    words_ar.append(w)
                    word_embeds.append(cur_embed / n_avg)
                    cur_embed = torch.zeros_like(embed_map['a'])
                    cur_counter = 0
                else:
                    cur_counter += 1
                    cur_embed += embed_map[w]
    word_embeds = torch.stack(word_embeds, dim=0).to(utils.device)
    if False:  # sanity check: collapse all embeddings onto their mean
        word_embeds[:] = word_embeds.mean(0)
    return words_ar, word_embeds
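# Hedged sketch (hypothetical helper, not used by the module): mirrors the n_avg chunking in
# doc_word_embed on a plain list of (word, vector) pairs, to make the flush behaviour explicit.
# A chunk is emitted once the counter reaches n_avg, it is labeled by the word that triggered
# the flush, and that word's own vector is not added to any chunk.
def _chunk_average_sketch(word_vecs, n_avg=5):
    labels, chunks = [], []
    cur, count = None, 0
    for word, vec in word_vecs:
        if count == n_avg:
            labels.append(word)
            chunks.append(cur / n_avg)
            cur, count = None, 0
        else:
            cur = vec.clone() if cur is None else cur + vec
            count += 1
    return labels, chunks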
def doc_sentence_embed(path):
    with open(path, 'r') as file:
        lines = file.readlines()
    # split each line into sentences on '.'
    lines1 = []
    for line in lines:
        lines1.extend(line.lower().split('.'))
    lines = lines1
    vocab, embeds = data.process_glove_data(dim=100)
    embed_map = dict(zip(vocab, embeds))
    tk = tokenizer.WordTokenizer()
    tokens_l = tk.batch_tokenize(lines)
    word_embeds = []
    words_ar = []
    for sentence in tokens_l:
        if len(sentence) < 3:
            continue
        sentence_embed = 0
        has_known_word = False
        for w in sentence:
            w = w.text.lower()
            if w in embed_map:
                sentence_embed += embed_map[w]
                has_known_word = True
        # skip sentences with no in-vocabulary words
        if not has_known_word:
            continue
        words_ar.append(sentence)
        # average over all tokens in the sentence, not just the in-vocabulary ones
        word_embeds.append(sentence_embed / len(sentence))
    word_embeds = torch.stack(word_embeds, dim=0).to(utils.device)
    #word_embeds = word_embeds / (word_embeds**2).sum(dim=1, keepdim=True).sqrt()
    return words_ar, word_embeds
import torch
import sklearn.decomposition as decom
import data
import utils
import numpy as np
import numpy.linalg as linalg
import re
import pdb

# Guard allennlp behind a flag, as some users reported installation issues.
USE_ALLENNLP = False
if USE_ALLENNLP:
    import allennlp.data.tokenizers.word_tokenizer as tokenizer
    from allennlp.data.tokenizers.word_filter import StopwordFilter
    tk = tokenizer.WordTokenizer()
    stop_word_filter = StopwordFilter()
else:
    print('Note: using rudimentary tokenizer; for better results enable allennlp.')
    stop_word_filter = utils.stop_word_filter()
    tk = utils.tokenizer()


def doc_word_embed_content_noise(content_path, noise_path, whiten_path=None,
                                 content_lines=None, noise_lines=None, opt=None):
    '''
    Combines content and noise word embeddings.
    '''
    no_add_set = set()
    doc_word_embed_f = doc_word_embed_sen
    content_words_ar, content_word_embeds = doc_word_embed_f(content_path, no_add_set,
                                                             content_lines=content_lines)
    words_set = set(content_words_ar)
    noise_words_ar, noise_word_embeds = doc_word_embed_f(noise_path, set(content_words_ar),
                                                         content_lines=noise_lines)
def get_elmo_emb(data_name, op, wg):
    '''
    Builds a DataFrame of averaged ELMo embeddings for the GAP split input/{data_name}.tsv.
    Assumes the same allennlp imports as get_elmo_fea plus pandas (import pandas as pd),
    and that the get_nearest offset back-off helper is available at module scope.
    '''
    elmo = ElmoEmbedder(options_file=op, weight_file=wg, cuda_device=0)
    # data = pd.read_csv("input/gap-validation.tsv", sep='\t')
    data = pd.read_csv(f'input/{data_name}.tsv', sep='\t')
    index = data.index
    columns = ['emb_A', 'emb_B', 'emb_P', 'label']
    emb = pd.DataFrame(index=index, columns=columns)
    emb.index.name = 'ID'
    tk = word_tokenizer.WordTokenizer()
    tokens = tk.batch_tokenize(data.Text)
    idx = []
    for i in range(len(tokens)):
        idx.append([x.idx for x in tokens[i]])
        tokens[i] = [x.text for x in tokens[i]]
    vectors = elmo.embed_sentences(tokens)
    for i, vector in enumerate(vectors):
        A_l = data.iloc[i].A.split()
        B_l = data.iloc[i].B.split()
        P_offset = data.iloc[i]['Pronoun-offset']
        A_offset = data.iloc[i]['A-offset']
        B_offset = data.iloc[i]['B-offset']
        # snap offsets that do not fall on a token boundary to the nearest preceding token
        if P_offset not in idx[i]:
            P_offset = get_nearest(idx[i], P_offset)
        if A_offset not in idx[i]:
            A_offset = get_nearest(idx[i], A_offset)
        if B_offset not in idx[i]:
            B_offset = get_nearest(idx[i], B_offset)
        # average ELMo layers 1 and 2 (the contextual LSTM layers);
        # for A and B also average over the tokens in the span
        emb_P = np.mean(vector[1:3, idx[i].index(P_offset), :], axis=0, keepdims=True)
        emb_A = np.mean(vector[1:3, idx[i].index(A_offset):idx[i].index(A_offset) + len(A_l), :],
                        axis=(1, 0), keepdims=True)
        emb_A = np.squeeze(emb_A, axis=0)
        emb_B = np.mean(vector[1:3, idx[i].index(B_offset):idx[i].index(B_offset) + len(B_l), :],
                        axis=(1, 0), keepdims=True)
        emb_B = np.squeeze(emb_B, axis=0)
        emb_A = emb_A.reshape((1024,))
        emb_B = emb_B.reshape((1024,))
        emb_P = emb_P.reshape((1024,))
        label = 'Neither'
        if data.loc[i, 'A-coref']:
            label = 'A'
        if data.loc[i, 'B-coref']:
            label = 'B'
        emb.iloc[i] = [emb_A, emb_B, emb_P, label]
    return emb
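# Hedged usage sketch (placeholder paths; the split name mirrors the GAP file naming used above):
def _example_get_elmo_emb(options_path='elmo_options.json', weights_path='elmo_weights.hdf5'):
    emb_df = get_elmo_emb('gap-test', options_path, weights_path)  # reads input/gap-test.tsv
    # each of emb_A / emb_B / emb_P is a 1024-dim vector; label is 'A', 'B' or 'Neither'
    return emb_df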