def arora_mean_pairwise_sim(sample):
    """
    Returns the mean cosine similarity (w.r.t. Arora embeddings) of each
    (prompt sentence, story sentence) pair.
    """
    # Init sentence embedder if necessary
    global arora_sentence_embedder
    if arora_sentence_embedder is None:
        print("\nInitializing arora sent embedder for the "
              "pairwise_arora_cosine_similarity metric...")
        arora_sentence_embedder = load_arora_sentence_embedder()

    # Get sentences
    prompt_sentences = util.get_sentences(sample, 'prompt')
    story_sentences = util.get_sentences(sample, 'story')

    # Get embeddings.
    # prompt_embeddings should be a np array shape (num_prompt_sents, emb_len);
    # similarly for story_embeddings
    prompt_embeddings = [
        arora_sentence_embedder.embed_sent(sent.split())
        for sent in prompt_sentences
    ]
    prompt_embeddings = np.array(
        [np.array(emb) for emb in prompt_embeddings if emb is not None])
    story_embeddings = [
        arora_sentence_embedder.embed_sent(sent.split())
        for sent in story_sentences
    ]
    story_embeddings = np.array(
        [np.array(emb) for emb in story_embeddings if emb is not None])

    # Get prompt/story similarities. Might both be None.
    prompt_story_table, mean_pairwise_sim = util.get_sims(
        prompt_embeddings, story_embeddings)

    # Compute story sent / prompt sim table.
    # Is np array shape (num_story_sents,), or None, representing the
    # similarity of each story sentence to the prompt
    storysent_prompt_table = np.mean(
        prompt_story_table, axis=0) if prompt_story_table is not None else None

    # Save the tables to cache
    sample.cache['arora_stats'] = {
        "prompt_story_table": prompt_story_table,
        "storysent_prompt_table": storysent_prompt_table,
    }

    return mean_pairwise_sim
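# NOTE: util.get_sims is defined elsewhere in the repo and not shown here.
# Below is a minimal sketch of what it plausibly computes, inferred from the
# call above: a pairwise cosine-similarity table of shape
# (num_prompt_sents, num_story_sents) plus its scalar mean. The body is an
# assumption for illustration, not the project's actual implementation.
import numpy as np

def get_sims(prompt_embeddings, story_embeddings):
    """Sketch: pairwise cosine similarities and their mean.

    Returns (None, None) if either side has no embeddings, matching the
    "Might both be None" comment in the caller.
    """
    if len(prompt_embeddings) == 0 or len(story_embeddings) == 0:
        return None, None
    # Normalize rows to unit length so the dot product equals cosine similarity
    p = prompt_embeddings / np.linalg.norm(prompt_embeddings, axis=1, keepdims=True)
    s = story_embeddings / np.linalg.norm(story_embeddings, axis=1, keepdims=True)
    table = p @ s.T  # shape (num_prompt_sents, num_story_sents)
    return table, float(np.mean(table))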
from operator import itemgetter  # stdlib; used to sort keywords by count


def get_keywords(text):
    # get_sentences, tokenize, chunk, get_terms, and _keywords are helper
    # functions defined elsewhere in this module (not shown here)
    data = {}
    for i, sentence in enumerate(get_sentences(text), start=1):
        tagged_tokens = pos_tag(tokenize(sentence))
        for term in get_terms(chunk(tagged_tokens)):
            _keywords(term, i, data)
    keywords = [{
        'keyword': v['term_forms'][0],
        'count': v['count'],
        'locations': v['locations'],
    } for k, v in data.items()]
    return sorted(keywords, key=itemgetter('count'), reverse=True)
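# Usage sketch (illustrative only; the exact keywords extracted depend on the
# tokenizer, POS tagger, and chunker wired into the helpers above):
text = ("The quick brown fox jumps over the lazy dog. "
        "The fox was not amused.")
for kw in get_keywords(text):
    # Each entry carries the keyword's first surface form, its total count,
    # and the 1-based sentence numbers where it appeared.
    print('{}: count={}, sentences={}'.format(
        kw['keyword'], kw['count'], kw['locations']))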
    train_set_y.append(type_list.index(item.trigger_label))
    lines = all_text.split('\n')
    si = 0
    for line in lines:
        words = line.split(' ')
        for word in words:
            if word in embd:
                # Concatenate the word embedding with the autoencoder
                # sentence feature for sentence si
                train_set_x.append(np.hstack((embd[word], ae_sf[si][0])))
                # train_set_x.append(embd[word])
                train_set_y.append(19)
        si += 1  # move to the next sentence's autoencoder feature
    return train_set_x, train_set_y


if __name__ == "__main__":
    data, filepath = get_sentences()
    ae = AE(data, filepath)
    # ae.build()
    ae.load('LSTM_AE')

    train_set_x, train_set_y = load_dataset('./mlee/train', ae)
    # Open pickle files in binary mode; 'with' closes them automatically,
    # so explicit flush()/close() calls are unnecessary
    with open('data/train_set_x', 'wb') as f:
        pickle.dump(train_set_x, f)
    with open('data/train_set_y', 'wb') as f:
        pickle.dump(train_set_y, f)

    valid_set_x, valid_set_y = load_dataset('./mlee/valid', ae)
    with open('data/valid_set_x', 'wb') as f:
        pickle.dump(valid_set_x, f)
    train_set_y.append(type_list.index(item.trigger_label))
    lines = all_text.split('\n')
    si = 0
    for line in lines:
        words = line.split(' ')
        for word in words:
            if word in embd:
                # train_set_x.append(np.hstack((embd[word], ae_sf[si][0])))
                # This variant uses the raw word embedding only, without the
                # autoencoder sentence feature
                train_set_x.append(embd[word])
                train_set_y.append(19)
    return train_set_x, train_set_y


if __name__ == "__main__":
    data, filepath = get_sentences()

    train_set_x, train_set_y = load_dataset('./mlee/train')
    # Binary mode for pickle files; 'with' closes them automatically
    with open('data/train_set_x', 'wb') as f:
        pickle.dump(train_set_x, f)
    with open('data/train_set_y', 'wb') as f:
        pickle.dump(train_set_y, f)

    print '**********loading valid set*****************'
    valid_set_x, valid_set_y = load_dataset('./mlee/valid')
    with open('data/valid_set_x', 'wb') as f:
        pickle.dump(valid_set_x, f)
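# The dumped arrays can be read back with the mirror-image calls; a minimal
# loading sketch (file names taken from the dumps above, binary mode matching
# the writer):
import pickle

with open('data/train_set_x', 'rb') as f:
    train_set_x = pickle.load(f)
with open('data/train_set_y', 'rb') as f:
    train_set_y = pickle.load(f)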
def mean_sent_len(sample):
    """Returns average story sentence length (measured in words)"""
    sents = util.get_sentences(sample, 'story')
    lengths = [_num_words(s) for s in sents]
    return util.mean(lengths)
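# _num_words and util.mean are simple helpers defined elsewhere; a plausible
# sketch, assuming whitespace tokenization (the real helpers may differ):
def _num_words(sentence):
    # Assumed behavior: count whitespace-separated tokens
    return len(sentence.split())

def mean(values):
    # Assumed behavior: arithmetic mean, guarding against an empty list
    return sum(values) / len(values) if values else 0.0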