import json

import h5py
import numpy as np


def get_gensen_synset_definitions(entity_file, vocab_file, gensen_file):
    from gensen import GenSen, GenSenSingle
    gensen_1 = GenSenSingle(
        model_folder='./data/models',
        filename_prefix='nli_large_bothskip',
        pretrained_emb='./data/embedding/glove.840B.300d.h5')
    gensen_1.eval()

    # entity_file is JSON-lines; keep only synset nodes and their glosses
    definitions = {}
    with open(entity_file, 'r') as fin:
        for line in fin:
            node = json.loads(line)
            if node['type'] == 'synset':
                definitions[node['id']] = node['definition']

    with open(vocab_file, 'r') as fin:
        vocab_list = fin.read().strip().split('\n')

    # get the descriptions; NUM_EMBEDDINGS is a module-level constant
    # (at least len(vocab_list) + 1, since index 0 is reserved)
    sentences = [''] * NUM_EMBEDDINGS
    for k, entity in enumerate(vocab_list):
        definition = definitions.get(entity)
        if definition is None:
            # only the special tokens are allowed to lack a definition
            assert entity in ('@@UNKNOWN@@', '@@MASK@@', '@@NULL@@')
        else:
            sentences[k + 1] = definition

    # encode definitions in batches of 32; the last-pooled hidden state
    # gives one 2048-d vector per sentence
    embeddings = np.zeros((NUM_EMBEDDINGS, 2048), dtype=np.float32)
    for k in range(0, NUM_EMBEDDINGS, 32):
        sents = sentences[k:(k + 32)]
        reps_h, reps_h_t = gensen_1.get_representation(
            sents, pool='last', return_numpy=True, tokenize=True)
        embeddings[k:(k + 32), :] = reps_h_t
        print(k)

    with h5py.File(gensen_file, 'w') as fout:
        fout.create_dataset('gensen', data=embeddings)
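# A minimal read-back sketch, not part of the original script: the dataset
# written above holds one 2048-d GenSen vector per vocab entry (rows are
# offset by one relative to vocab_file, per the `sentences[k + 1]`
# assignment). The file name here is a hypothetical example.
import h5py

with h5py.File('gensen_synsets.h5', 'r') as fin:
    embeddings = fin['gensen'][:]
    print(embeddings.shape)  # (NUM_EMBEDDINGS, 2048)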
feat_h5 = h5py.File(f'{DATA_PATH}/questions_{split}_clevr.h5', 'w')
with open(f'{DATA_PATH}/questions/CLEVR_{split}_questions.json') as fin:
    ques = json.load(fin)['questions']
questions = [q['question'] for q in ques]
qids = np.int64([q['question_index'] for q in ques])

# variable-length string dtype for storing the raw question text
dt = h5py.special_dtype(vlen=str)
feat_h5.create_dataset('feats', (len(qids), 2048), dtype=np.float32)
feat_h5.create_dataset('qids', (len(qids), ), dtype=np.int64)
feat_h5.create_dataset('questions', (len(qids), ), dtype=dt)
feat_h5['qids'][:] = qids
feat_h5['questions'][:] = questions

# encode the questions in chunks to bound memory use
chunksize = 5000
question_chunks = [
    questions[x:x + chunksize] for x in range(0, len(questions), chunksize)
]
done = 0
for qchunk in question_chunks:
    print(done)
    _, reps_h_t = gensen_1.get_representation(
        qchunk, pool='last', return_numpy=True, tokenize=True)
    feat_h5['feats'][done:done + len(qchunk)] = reps_h_t
    done += len(qchunk)
feat_h5.close()
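# A hedged usage sketch for the file written above: look up the GenSen
# feature vector for a single CLEVR question id. The qid 42 is an arbitrary
# example; DATA_PATH and split are the same variables as in the script.
import h5py
import numpy as np

with h5py.File(f'{DATA_PATH}/questions_{split}_clevr.h5', 'r') as fin:
    qids = fin['qids'][:]
    row = int(np.where(qids == 42)[0][0])
    print(fin['questions'][row], fin['feats'][row].shape)  # text, (2048,)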
# `trump`, `shakespeare`, `wutang` and `lukecombs` are speaker quote lists
# defined earlier in the file, analogous to `obama` below.
obama = [
    'Condolences to the family of John Singleton. His seminal work, Boyz n the Hood, remains one of the most searing, loving portrayals of the challenges facing inner-city youth. He opened doors for filmmakers of color to tell powerful stories that have been too often ignored.',
    "This generation of climate activists is tired of inaction, and they've caught the attention of leaders all over the world. So while this challenge is only getting more urgent, they show us the kind of action it'll take to meet this moment.",
    'That we are in the midst of crisis is now well understood. Our nation is at war, against a far-reaching network of violence and hatred. Our economy is badly weakened, a consequence of greed and irresponsibility on the part of some, but also our collective failure to make hard choices and prepare the nation for a new age. Homes have been lost; jobs shed; businesses shuttered. Our health care is too costly; our schools fail too many; and each day brings further evidence that the ways we use energy strengthen our adversaries and threaten our planet.'
]

# one label per sentence, in the same order as `sentences` below
idx2speaker = [
    'trump1', 'trump2', 'trumpinaguration', 'obama1', 'obama2',
    'obamainaguration', 'shakespeare1', 'shakespeare2', 'wutang1', 'wutang2',
    'lukecombs', 'lukecombs'
]
sentences = trump + obama + shakespeare + wutang + lukecombs

gensen_1 = GenSenSingle(
    model_folder='./data/models',
    filename_prefix='nli_large_bothskip',
    pretrained_emb='./data/embedding/glove.840B.300d.h5')
reps_h, reps_h_t = gensen_1.get_representation(
    sentences, pool='last', return_numpy=True, tokenize=True)

# mean-pool the per-token hidden states into one vector per sentence
x = []
for i in range(len(reps_h)):
    x.append(reps_h[i].mean(axis=0))

model = TSNE(n_components=2, perplexity=20, init='pca', method='exact',
             n_iter=5000)
x = model.fit_transform(np.array(x))

pyplot.figure(figsize=(20, 20))
pyplot.xlim((np.min(x[:, 0]) - 10, np.max(x[:, 0]) + 10))
pyplot.ylim((np.min(x[:, 1]) - 10, np.max(x[:, 1]) + 10))
for i in range(len(x)):
    # assumed continuation: annotate each 2-D point with its speaker label
    pyplot.annotate(idx2speaker[i], xy=(x[i, 0], x[i, 1]))
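# A small companion sketch, assuming `pooled` holds the mean-pooled GenSen
# vectors collected above (i.e. the list `x` before t-SNE is applied):
# pairwise cosine similarity gives a quantitative counterpart to the plot.
import numpy as np

def cosine_matrix(vectors):
    """Pairwise cosine similarities between row vectors."""
    v = np.asarray(vectors)
    v = v / np.linalg.norm(v, axis=1, keepdims=True)
    return v @ v.T

# sim = cosine_matrix(pooled)
# print(idx2speaker[np.argsort(-sim[0])[1]])  # nearest speaker to 'trump1'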
import os
import pickle
from threading import Lock

import numpy as np
import torch

from gensen import GenSenSingle  # assumes the GenSen package is importable


class SemanticAnalyser(object):
    """Class for comparing sentences for entailment"""

    def __init__(self):
        """Initializes object"""
        self.__encoder = GenSenSingle(
            model_folder=os.path.join(os.path.dirname(__file__), 'GenSen',
                                      'data', 'models'),
            filename_prefix='nli_large',
            pretrained_emb=os.path.join(os.path.dirname(__file__), 'GenSen',
                                        'data', 'embedding',
                                        'glove.840B.300d.h5'))
        with open(os.path.join(os.path.dirname(__file__), 'GenSen', 'data',
                               'models', 'senteval.pickle'), 'rb') as file:
            self.__evaluator = pickle.load(file)
        self.__mutex = Lock()

    def get_entailments_with_levels(self, sentence, sentences):
        """Analyzes the relation between a sentence and each sentence in a
        collection.

        Args:
            sentence: a sentence
            sentences: a non-empty list of sentences

        Returns:
            entailment: 0 if entailed, 1 if neutral, 2 if contradicting,
                for each element in sentences
            level: a non-negative value of how strongly this sentence is
                entailed by each element in sentences
        """
        self.__mutex.acquire()
        _, encoded = self.__encoder.get_representation(
            [sentence] + sentences, pool='last', return_numpy=True,
            tokenize=True)
        # SNLI-style features [u, v, u * v] for every premise/hypothesis pair
        input = np.concatenate(
            (np.repeat([encoded[0]], len(sentences), axis=0),
             encoded[1:],
             np.repeat([encoded[0]], len(sentences), axis=0) * encoded[1:]),
            axis=1)
        output = self.__model_predict(input)
        self.__mutex.release()
        entailment = np.argmax(output, axis=1)
        # confidence margin of the predicted class over the neutral class
        level = np.max(output, axis=1) - np.transpose(output)[1]
        # identical sentences trivially entail each other
        for i, sent in enumerate(sentences):
            if sentence == sent:
                entailment[i] = 0
                level[i] = 1e10
        return entailment, level

    def get_entailment(self, sentence1, sentence2):
        """Analyzes the relation between two sentences.

        Args:
            sentence1: first sentence as a string
            sentence2: second sentence as a string

        Returns:
            0 if entailed, 1 if neutral, 2 if contradicting
        """
        if sentence1 == sentence2:
            return 0
        self.__mutex.acquire()
        _, encoded = self.__encoder.get_representation(
            [sentence1, sentence2], pool='last', return_numpy=True,
            tokenize=True)
        input = np.concatenate((encoded[0], encoded[1],
                                encoded[0] * encoded[1]))
        output = self.__model_predict(np.array([input]))
        self.__mutex.release()
        return np.argmax(output)

    def __model_predict(self, input):
        # evaluate every pair in both argument orders and average the two
        # predictions, making the classifier order-insensitive
        sentence_size = input.shape[1] // 3
        batch_size = input.shape[0]
        switched_input = np.hstack(
            (input[:, sentence_size:2 * sentence_size],
             input[:, 0:sentence_size],
             input[:, 2 * sentence_size:3 * sentence_size]))
        input = np.vstack((input, switched_input))
        self.__evaluator.model.eval()
        input = torch.FloatTensor(input).cuda()
        yhat = []
        with torch.no_grad():
            for i in range(0, len(input), self.__evaluator.batch_size):
                x = input[i:i + self.__evaluator.batch_size]
                output = self.__evaluator.model(x)
                yhat.append(output.data.cpu().numpy())
        yhat = np.vstack(yhat)
        yhat = (yhat[0:batch_size, :] + yhat[batch_size:2 * batch_size, :]) / 2
        return yhat
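# A hedged usage sketch for SemanticAnalyser (assumes the GenSen model files
# and senteval.pickle are in place and a CUDA device is available; label
# semantics follow the docstrings: 0 entailed, 1 neutral, 2 contradicting).
if __name__ == '__main__':
    analyser = SemanticAnalyser()
    print(analyser.get_entailment('A man is playing a guitar.',
                                  'A person is playing an instrument.'))
    entailments, levels = analyser.get_entailments_with_levels(
        'A man is playing a guitar.',
        ['A person is playing an instrument.', 'Nobody is making music.'])
    print(entailments, levels)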
    max_length=max_length)
model = GenSenSingle(model_folder=args.folder_path,
                     filename_prefix=args.prefix,
                     pretrained_emb=args.pretrain,
                     cuda=True)
iterator.word2id = model.word2id
iterator.id2word = model.id2word
model.vocab_expansion(model.id2word.values())

sentences = (iterator.lines if batch_size == 'all'
             else iterator.lines[0:batch_size])
sentences = [' '.join(s[:max_length]) for s in sentences]

# encode in mini-batches of 200 and stack the last-pooled representations
repr_last_h = np.empty((0, hidden_size))
for mbatch_idx, mbatch in enumerate(range(0, len(sentences), 200)):
    less_sentences = sentences[mbatch:mbatch + 200]
    _, last_h = model.get_representation(less_sentences,
                                         pool='last',
                                         return_numpy=True,
                                         tokenize=False)
    repr_last_h = np.append(repr_last_h, last_h, axis=0)
print(repr_last_h.shape)

# fit a PCA-reduced kernel density estimate over the representations,
# sample from it, and round-trip the fitted estimator through disk
iterator.build_kde(repr_last_h=repr_last_h, num_dim_pca=40, grid_search_num=7)
data_gen = iterator.sample_kde(batch_size=10, cuda=True)
print(data_gen['input'].shape)
iterator.save_kde(file_name_kde="kde.sav", file_name_pca="pca.sav")
iterator.load_kde(file_name_kde="kde.sav", file_name_pca="pca.sav")
data_gen = iterator.sample_kde(batch_size=10, cuda=False)
print(data_gen['input'].shape)

total_time = time.time() - start
print(total_time)
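# The iterator's build_kde / sample_kde / save_kde are project-specific; a
# minimal standalone equivalent, sketched here with scikit-learn as an
# assumption rather than the project's actual implementation, reduces the
# representations with PCA and fits a Gaussian KDE in the reduced space.
import numpy as np
from sklearn.decomposition import PCA
from sklearn.neighbors import KernelDensity

def fit_and_sample(reprs, num_dim_pca=40, bandwidth=0.5, n_samples=10):
    """Fit PCA + KDE on sentence representations and draw new samples."""
    pca = PCA(n_components=num_dim_pca).fit(reprs)
    kde = KernelDensity(bandwidth=bandwidth).fit(pca.transform(reprs))
    # KDE samples live in PCA space; project them back to hidden_size dims
    return pca.inverse_transform(kde.sample(n_samples))

# samples = fit_and_sample(repr_last_h)  # shape: (10, hidden_size)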