def extract_story_elements():
    """Extract story elements from parsed articles, build filtered role
    vocabularies, and write tuple/vocabulary index files to dirs.lda_dir."""
    min_head_vocab = 5
    min_role_vocab = 4
    min_tuples = 3

    # role constants used to tag each (role, word) tuple
    ATTRIBUTE = 0
    AGENT_ROLE = 1
    PATIENT_ROLE = 2
    SURFACE_FORM = 3

    parsed_dir = os.path.join(dirs.data_stanford_dir, 'parsed')
    parsed_files = glob.glob(os.path.join(parsed_dir, '*.json'))

    dependencies_file = os.path.join(dirs.data_stanford_dir, 'dependency_tuple_ids.json')
    dependencies = fh.read_json(dependencies_file)

    coref_file = os.path.join(dirs.data_stanford_dir, 'coref_heads.json')
    coref_heads = fh.read_json(coref_file)

    supersense_tags = fh.read_json(os.path.join(dirs.data_amalgram_dir, 'all_tags.json'))

    heads = defaultdict(int)
    tokens = defaultdict(int)
    attributes = defaultdict(int)
    agent_roles = defaultdict(int)
    patient_roles = defaultdict(int)

    story_elements = {}
    print "Extracting story elements"
    for f_i, f in enumerate(parsed_files):
        sentences = fh.read_json(f)
        basename = fh.get_basename_wo_ext(f)
        element_list = extract_story_elements_from_article(sentences, dependencies[basename], coref_heads[basename], supersense_tags[basename], basename)
        story_elements[basename] = element_list
        # count occurrences of heads and role words to build vocabularies
        for element in element_list:
            for h in element.head_words:
                heads[h] += 1
            for t in element.attributes:
                attributes[t] += 1
            for t in element.agent_roles:
                agent_roles[t] += 1
            for t in element.patient_roles:
                patient_roles[t] += 1

    print "Finding most common tokens"
    common_heads = [(v, k) for k, v in heads.items()]
    common_heads.sort()
    common_heads.reverse()
    output_filename = os.path.join(dirs.lda_dir, 'common_heads.json')
    fh.write_to_json(common_heads, output_filename, sort_keys=False)

    """
    common_tokens = [(v, k) for k, v in tokens.items()]
    common_tokens.sort()
    common_tokens.reverse()
    output_filename = os.path.join(dirs.lda_dir, 'common_tokens.json')
    fh.write_to_json(common_tokens, output_filename, sort_keys=False)
    """

    common_attributes = [(v, k) for k, v in attributes.items()]
    common_attributes.sort()
    common_attributes.reverse()
    output_filename = os.path.join(dirs.lda_dir, 'common_attributes.json')
    fh.write_to_json(common_attributes, output_filename, sort_keys=False)

    common_agent_roles = [(v, k) for k, v in agent_roles.items()]
    common_agent_roles.sort()
    common_agent_roles.reverse()
    output_filename = os.path.join(dirs.lda_dir, 'common_agent_roles.json')
    fh.write_to_json(common_agent_roles, output_filename, sort_keys=False)

    common_patient_roles = [(v, k) for k, v in patient_roles.items()]
    common_patient_roles.sort()
    common_patient_roles.reverse()
    output_filename = os.path.join(dirs.lda_dir, 'common_patient_roles.json')
    fh.write_to_json(common_patient_roles, output_filename, sort_keys=False)

    print pronoun_list
    # filter vocabularies based on frequency, pronouns and stopwords
    #most_common_heads = {k: v for v, k in common_heads if v >= min_head_vocab and k not in pronoun_list}
    most_common_attributes = {k: v for v, k in common_attributes if (v >= min_role_vocab and k not in pronoun_list)}
    most_common_agent_roles = {k: v for v, k in common_agent_roles if (v >= min_role_vocab and k not in pronoun_list and k not in stopwords)}
    most_common_patient_roles = {k: v for v, k in common_patient_roles if (v >= min_role_vocab and k not in pronoun_list and k not in stopwords)}

    output_filename = os.path.join(dirs.lda_dir, 'most_common_attributes.json')
    fh.write_to_json(most_common_attributes, output_filename, sort_keys=False)
    output_filename = os.path.join(dirs.lda_dir, 'most_common_agent_roles.json')
    fh.write_to_json(most_common_agent_roles, output_filename, sort_keys=False)
    output_filename = os.path.join(dirs.lda_dir, 'most_common_patient_roles.json')
    fh.write_to_json(most_common_patient_roles, output_filename, sort_keys=False)
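    # At this point each most_common_* dict maps a role word to its corpus
    # count, e.g. most_common_agent_roles might look like {'say': 52, 'vote': 17}
    # (hypothetical words and counts): words seen at least min_role_vocab times
    # that survive the pronoun/stopword filters above.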
    print len(most_common_attributes)
    print len(most_common_agent_roles)
    print len(most_common_patient_roles)

    print "Filtering tuples"
    valid_elements = defaultdict(list)
    for basename, element_list in story_elements.items():
        for se in element_list:
            se.valid_heads = [h for h in se.head_words if h not in pronoun_list]
            se.valid_phrases = [h for h in se.phrases if h not in pronoun_list]
            if len(se.valid_heads) > 0:
                se.valid_attributes = [t for t in se.attributes if t in most_common_attributes]
                se.valid_agent_roles = [t for t in se.agent_roles if t in most_common_agent_roles]
                se.valid_patient_roles = [t for t in se.patient_roles if t in most_common_patient_roles]
                se.tuples = [(ATTRIBUTE, t) for t in se.valid_attributes] + \
                            [(AGENT_ROLE, t) for t in se.valid_agent_roles] + \
                            [(PATIENT_ROLE, t) for t in se.valid_patient_roles]
                            #[(SURFACE_FORM, t) for t in se.valid_heads]
                if len(se.tuples) >= min_tuples:
                    valid_elements[basename].append(se)

    print "Constructing vocabulary"
    n_tuples = 0
    vocab = VocabWithCounts('', add_oov=False)
    n_entities = 0
    for basename, element_list in valid_elements.items():
        for se in element_list:
            tokens = [token for role, token in se.tuples]
            vocab.add_tokens(tokens)
            n_tuples += len(tokens)
            n_entities += 1

    head_word_vocab = VocabWithCounts('', add_oov=False)
    for basename, element_list in valid_elements.items():
        for se in element_list:
            tokens = [token for token in se.valid_heads]
            head_word_vocab.add_tokens(tokens)

    head_phrase_vocab = VocabWithCounts('', add_oov=False)
    for basename, element_list in valid_elements.items():
        for se in element_list:
            tokens = [token for token in se.valid_phrases]
            head_phrase_vocab.add_tokens(tokens)

    print "Building indices"
    tuple_vocab = np.zeros(n_tuples, dtype=int)   # vocab index of the ith tuple's word
    tuple_entity = np.zeros(n_tuples, dtype=int)  # entity index of the ith tuple
    tuple_role = []                               # role constant of the ith tuple
    entity_doc = np.zeros(n_entities, dtype=int)  # document index of the ith entity
    docs = valid_elements.keys()
    docs.sort()
    vocab_counts = np.zeros(len(vocab), dtype=int)
    article_mapping = []
    entity_index = 0
    head_word_vocab_list = []
    head_word_entity_list = []
    head_phrase_vocab_list = []
    head_phrase_entity_list = []
    t_i = 0
    for d_i, d in enumerate(docs):
        element_list = valid_elements[d]
        for se in element_list:
            entity_doc[entity_index] = d_i
            for role, token in se.tuples:
                tuple_entity[t_i] = entity_index
                tuple_role.append(role)
                vocab_index = vocab.get_index(token)
                tuple_vocab[t_i] = vocab_index
                vocab_counts[vocab_index] += 1
                t_i += 1
            for token in se.valid_heads:
                head_word_vocab_index = head_word_vocab.get_index(token)
                head_word_vocab_list.append(head_word_vocab_index)
                head_word_entity_list.append(entity_index)
            for token in se.valid_phrases:
                head_phrase_vocab_index = head_phrase_vocab.get_index(token)
                head_phrase_vocab_list.append(head_phrase_vocab_index)
                head_phrase_entity_list.append(entity_index)
            article_mapping.append(str(entity_index) + ':' + d + ':' + ','.join(se.head_words) + ':' + ','.join(se.valid_attributes) + ':' + ','.join(se.valid_agent_roles) + ':' + ','.join(se.valid_patient_roles))
            entity_index += 1

    print len(docs), "valid documents"
    print entity_index, "entities"
    print t_i, "tuples"
    print len(vocab), "word types"
    print np.min(vocab_counts), np.max(vocab_counts), np.sum(vocab_counts)

    output_filename = os.path.join(dirs.lda_dir, 'tuple_vocab.json')
    fh.write_to_json(list(tuple_vocab), output_filename, sort_keys=False)
    output_filename = os.path.join(dirs.lda_dir, 'tuple_role.json')
    fh.write_to_json(list(tuple_role), output_filename, sort_keys=False)
    output_filename = os.path.join(dirs.lda_dir, 'tuple_entity.json')
    fh.write_to_json(list(tuple_entity), output_filename, sort_keys=False)
    output_filename = os.path.join(dirs.lda_dir, 'entity_doc.json')
    fh.write_to_json(list(entity_doc), output_filename, sort_keys=False)
    output_filename = os.path.join(dirs.lda_dir, 'vocab.json')
    fh.write_to_json(vocab.index2token, output_filename, sort_keys=False)
    output_filename = os.path.join(dirs.lda_dir, 'docs.json')
    fh.write_to_json(list(docs), output_filename, sort_keys=False)
    output_filename = os.path.join(dirs.lda_dir, 'article_map.json')
    fh.write_to_json(list(article_mapping), output_filename, sort_keys=False)
    output_filename = os.path.join(dirs.lda_dir, 'head_word_vocab.json')
    fh.write_to_json(head_word_vocab.index2token, output_filename, sort_keys=False)
    output_filename = os.path.join(dirs.lda_dir, 'head_phrase_vocab.json')
    fh.write_to_json(head_phrase_vocab.index2token, output_filename, sort_keys=False)
    output_filename = os.path.join(dirs.lda_dir, 'head_word_vocab_list.json')
    fh.write_to_json(head_word_vocab_list, output_filename, sort_keys=False)
    output_filename = os.path.join(dirs.lda_dir, 'head_word_entity_list.json')
    fh.write_to_json(head_word_entity_list, output_filename, sort_keys=False)
    output_filename = os.path.join(dirs.lda_dir, 'head_phrase_vocab_list.json')
    fh.write_to_json(head_phrase_vocab_list, output_filename, sort_keys=False)
    output_filename = os.path.join(dirs.lda_dir, 'head_phrase_entity_list.json')
    fh.write_to_json(head_phrase_entity_list, output_filename, sort_keys=False)
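# ---------------------------------------------------------------------------
# Illustrative sketch, not part of the original pipeline: shows how the
# parallel index arrays written by extract_story_elements() fit together,
# assuming the JSON files above exist and that fh.read_json / dirs.lda_dir
# behave as they do elsewhere in this module. The function name and the
# role_names mapping below are hypothetical.
def _sketch_inspect_lda_tuples(max_tuples=10):
    vocab = fh.read_json(os.path.join(dirs.lda_dir, 'vocab.json'))                # index -> token
    docs = fh.read_json(os.path.join(dirs.lda_dir, 'docs.json'))                  # index -> document name
    tuple_vocab = fh.read_json(os.path.join(dirs.lda_dir, 'tuple_vocab.json'))    # tuple -> vocab index
    tuple_role = fh.read_json(os.path.join(dirs.lda_dir, 'tuple_role.json'))      # tuple -> role constant
    tuple_entity = fh.read_json(os.path.join(dirs.lda_dir, 'tuple_entity.json'))  # tuple -> entity index
    entity_doc = fh.read_json(os.path.join(dirs.lda_dir, 'entity_doc.json'))      # entity -> doc index
    role_names = {0: 'attribute', 1: 'agent', 2: 'patient'}  # mirrors ATTRIBUTE / AGENT_ROLE / PATIENT_ROLE
    # each tuple i carries a word, a role, and an owning entity; each entity
    # maps to the document it was extracted from
    for i in range(min(max_tuples, len(tuple_vocab))):
        e_i = tuple_entity[i]
        print docs[entity_doc[e_i]], 'entity', e_i, role_names[tuple_role[i]], vocab[tuple_vocab[i]]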
def get_bamman_entities(all_trees, clustered_entity_indices, word2vec_file=None, min_role_vocab=4, min_tuples=3):
    """Build BammanEntity objects from dependency trees and coreference
    clusters, filter them, and write index files to dirs.persona_dir.

    all_trees: {basename: list of dependency trees}
    clustered_entity_indices: {basename: list of clusters, each a list of
        (tree_index, node_index) pairs locating one entity's mentions}
    word2vec_file is only used by the commented-out vector export below.
    """
    # role constants used to tag each tuple
    ATTRIBUTE = 0
    AGENT_ROLE = 1
    PATIENT_ROLE = 2
    SURFACE_FORM = 3

    tokens = defaultdict(int)
    heads = defaultdict(int)
    attributes = defaultdict(int)
    agent_roles = defaultdict(int)
    patient_roles = defaultdict(int)

    story_elements = {}
    for basename, trees in all_trees.items():
        story_elements[basename] = []
        article_clusters = clustered_entity_indices[basename]
        # go through each entity, represented by a list of tree/node locations
        for c_i, cluster_indices in enumerate(article_clusters):
            # create an entity for each cluster in this document
            entity = BammanEntity(basename)
            # for each appearance, create an appearance object for this entity
            for t_i, n_i in cluster_indices:
                word = trees[t_i].node_dict[n_i].word
                compound_word = get_compound_noun(trees[t_i], n_i)
                mention_attributes = get_attributes(trees[t_i], n_i)
                mention_agent_roles = get_agent_roles(trees[t_i], n_i)
                mention_patient_roles = get_patient_roles(trees[t_i], n_i)
                appearance = BammanEntityAppearance(t_i, n_i, word, mention_attributes, mention_agent_roles, mention_patient_roles, compound_word)
                entity.add_appearance(appearance)
                # count the total mentions of these words to build vocabularies
                heads[word] += 1
                for t in mention_attributes:
                    attributes[t[0]] += 1
                for t in mention_agent_roles:
                    agent_roles[t[0]] += 1
                for t in mention_patient_roles:
                    patient_roles[t[0]] += 1
            # add the newly created entity to a dict
            story_elements[basename].append(entity)

    print "Finding most common tokens"
    common_heads = [(v, k) for k, v in heads.items()]
    common_heads.sort()
    common_heads.reverse()
    output_filename = os.path.join(dirs.persona_dir, 'common_heads.json')
    fh.write_to_json(common_heads, output_filename, sort_keys=False)

    common_attributes = [(v, k) for k, v in attributes.items()]
    common_attributes.sort()
    common_attributes.reverse()
    output_filename = os.path.join(dirs.persona_dir, 'common_attributes.json')
    fh.write_to_json(common_attributes, output_filename, sort_keys=False)

    common_agent_roles = [(v, k) for k, v in agent_roles.items()]
    common_agent_roles.sort()
    common_agent_roles.reverse()
    output_filename = os.path.join(dirs.persona_dir, 'common_agent_roles.json')
    fh.write_to_json(common_agent_roles, output_filename, sort_keys=False)

    common_patient_roles = [(v, k) for k, v in patient_roles.items()]
    common_patient_roles.sort()
    common_patient_roles.reverse()
    output_filename = os.path.join(dirs.persona_dir, 'common_patient_roles.json')
    fh.write_to_json(common_patient_roles, output_filename, sort_keys=False)

    # filter vocabularies based on frequency and stopwords
    most_common_attributes = {k: v for v, k in common_attributes if (v >= min_role_vocab and k not in pronoun_list and k not in stopwords)}
    most_common_agent_roles = {k: v for v, k in common_agent_roles if (v >= min_role_vocab and k not in pronoun_list and k not in stopwords)}
    most_common_patient_roles = {k: v for v, k in common_patient_roles if (v >= min_role_vocab and k not in pronoun_list and k not in stopwords)}

    # save these vocabularies
    output_filename = os.path.join(dirs.persona_dir, 'most_common_attributes.json')
    fh.write_to_json(most_common_attributes, output_filename, sort_keys=False)
    output_filename = os.path.join(dirs.persona_dir, 'most_common_agent_roles.json')
    fh.write_to_json(most_common_agent_roles, output_filename, sort_keys=False)
    output_filename = os.path.join(dirs.persona_dir, 'most_common_patient_roles.json')
    fh.write_to_json(most_common_patient_roles, output_filename, sort_keys=False)
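    # Each raw mention tuple t counted above appears to be of the form
    # (word, relation, POS tag, token index), judging from how appearance.tuples
    # is later unpacked as (role, token, relation, pos, tuple_token_index);
    # only the word t[0] takes part in the frequency filtering above.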
    print len(most_common_attributes)
    print len(most_common_agent_roles)
    print len(most_common_patient_roles)

    print "Filtering tuples"
    filtered_indices = {}
    valid_elements = defaultdict(list)
    for basename, entity_list in story_elements.items():
        #filtered_indices[basename] = []
        for e_index, entity in enumerate(entity_list):
            appearances = entity.get_appearances()
            valid_heads = []
            for ap in appearances:
                if ap.head_word not in pronoun_list:
                    valid_heads.append(ap.head_word)
                    ap.valid_heads = [ap.head_word]
                    ap.valid_compound_heads = [ap.compound_word]
                else:
                    ap.valid_heads = []
                    ap.valid_compound_heads = []
            if len(valid_heads) > 0:
                for ap in appearances:
                    ap.valid_attributes = [t for t in ap.attributes if t[0] in most_common_attributes]
                    ap.valid_agent_roles = [t for t in ap.agent_roles if t[0] in most_common_agent_roles]
                    ap.valid_patient_roles = [t for t in ap.patient_roles if t[0] in most_common_patient_roles]
                    ap.tuples = [(ATTRIBUTE, t[0], t[1], t[2], t[3]) for t in ap.valid_attributes] + \
                                [(AGENT_ROLE, t[0], t[1], t[2], t[3]) for t in ap.valid_agent_roles] + \
                                [(PATIENT_ROLE, t[0], t[1], t[2], t[3]) for t in ap.valid_patient_roles]
                if entity.get_n_tuples() >= min_tuples:
                    valid_elements[basename].append(entity)
                    #filtered_indices[basename].append(clustered_entity_indices[basename][se_index])

    print "Constructing vocabulary"
    n_tuples = 0
    vocab = VocabWithCounts('', add_oov=False)
    n_entities = 0
    n_mentions = 0
    for basename, element_list in valid_elements.items():
        for se in element_list:
            for appearance in se.appearances:
                tokens = [token for role, token, relation, pos, tuple_token_index in appearance.tuples]
                vocab.add_tokens(tokens)
                n_tuples += len(tokens)
                if len(appearance.tuples) > 0:
                    n_mentions += 1
            n_entities += 1

    head_word_vocab = VocabWithCounts('', add_oov=False)
    for basename, element_list in valid_elements.items():
        for se in element_list:
            for appearance in se.appearances:
                tokens = [token for token in appearance.valid_heads]
                head_word_vocab.add_tokens(tokens)

    head_phrase_vocab = VocabWithCounts('', add_oov=False)
    for basename, element_list in valid_elements.items():
        for se in element_list:
            for appearance in se.appearances:
                tokens = [token for token in appearance.valid_compound_heads]
                head_phrase_vocab.add_tokens(tokens)

    print "Building indices"
    tuple_vocab = np.zeros(n_tuples, dtype=int)       # vocab index of the ith tuple's word
    tuple_entity = np.zeros(n_tuples, dtype=int)      # entity index of the ith tuple
    tuple_role = []                                   # role constant of the ith tuple
    mention_entity = np.zeros(n_mentions, dtype=int)  # entity index of the ith mention
    tuple_mention = np.zeros(n_tuples, dtype=int)     # mention index of the ith tuple
    entity_doc = np.zeros(n_entities, dtype=int)      # document index of the ith entity
    docs = valid_elements.keys()
    docs.sort()

    """
    vocab_vectors = None
    if word2vec_file is not None:
        import gensim
        dx = 300
        vocab_vectors = np.zeros((len(vocab), dx))
        # load pre-trained word vectors
        print "Loading pre-trained word vectors"
        all_vectors = gensim.models.Word2Vec.load_word2vec_format(word2vec_file, binary=True)
        word2vec_vocab = set()
        for v in vocab.get_all_tokens():
            v_i = vocab.get_index(v)
            if v in all_vectors:
                vocab_vectors[v_i, :] = all_vectors[v]
                word2vec_vocab.add(v)
            else:
                vocab_vectors[v_i, :] = 0.05 * np.random.uniform(-1.0, 1.0, (1, dx))
        print len(list(set(vocab.get_all_tokens()) - word2vec_vocab)), "words in training vocabulary with no word2vec vector"
    """

    vocab_counts = np.zeros(len(vocab), dtype=int)
    entity_appearances = {}
    entity_index = 0
    mention_index = 0
    head_word_vocab_list = []
    head_word_entity_list = []
    head_phrase_vocab_list = []
    head_phrase_entity_list = []
    entity_text_mentions = {}
    t_i = 0
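    # The loop below fills the parallel arrays declared above: t_i,
    # mention_index and entity_index are running counters across all
    # documents, so tuple_vocab[i] / tuple_role[i] / tuple_entity[i] /
    # tuple_mention[i] jointly describe the ith tuple, while mention_entity
    # and entity_doc link mentions to entities and entities to documents.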
    for d_i, d in enumerate(docs):
        print d
        basename = os.path.basename(d)
        entity_appearances[basename] = {}
        element_list = valid_elements[d]
        entity_text_mentions[d] = {}
        for se in element_list:
            entity_text_mentions[d][entity_index] = {'sent_indices': [], 'token_indices': [], 'roles': []}
            entity_doc[entity_index] = d_i
            for appearance in se.appearances:
                entity_text_mentions[d][entity_index]['sent_indices'].append(appearance.tree_index)
                entity_text_mentions[d][entity_index]['token_indices'].append(appearance.token_index)
                for role, token, relation, pos, tuple_token_index in appearance.tuples:
                    tuple_entity[t_i] = entity_index
                    tuple_mention[t_i] = mention_index
                    tuple_role.append(role)
                    vocab_index = vocab.get_index(token)
                    tuple_vocab[t_i] = vocab_index
                    vocab_counts[vocab_index] += 1
                    t_i += 1
                    entity_text_mentions[d][entity_index]['roles'].append((role, token, appearance.tree_index, tuple_token_index))
                for token in appearance.valid_heads:
                    head_word_vocab_index = head_word_vocab.get_index(token)
                    head_word_vocab_list.append(head_word_vocab_index)
                    head_word_entity_list.append(entity_index)
                for token in appearance.valid_compound_heads:
                    head_phrase_vocab_index = head_phrase_vocab.get_index(token)
                    head_phrase_vocab_list.append(head_phrase_vocab_index)
                    head_phrase_entity_list.append(entity_index)
                # keep track of which document / sentences this entity appears in
                s_i = appearance.tree_index
                if s_i in entity_appearances[basename]:
                    entity_appearances[basename][s_i].append(entity_index)
                else:
                    entity_appearances[basename][s_i] = [entity_index]
                if len(appearance.tuples):
                    mention_entity[mention_index] = entity_index
                    mention_index += 1
            entity_index += 1

    # as initial testing for Gaussian LDA, export a small vector for each tuple
    """
    tuple_vectors = None
    if word2vec_file is not None:
        vec_size = 10
        tuple_vectors = np.zeros([n_tuples, vec_size])
        for v_i, v in enumerate(tuple_vocab):
            tuple_vectors[v_i, :] = vocab_vectors[v, :vec_size]
    """

    # export network data: each rnn_data entry is [doc index, entity index,
    # appearance list], where each appearance is a list of
    # (tuple index, role, token, relation, head word, POS, head phrase) tuples
    rnn_data = []
    t_i = 0
    entity_index = 0
    mention_index = 0
    for d_i, d in enumerate(docs):
        element_list = valid_elements[d]
        for entity in element_list:
            appearance_list = []
            for appearance in entity.appearances:
                tuple_list = []
                head_word = appearance.head_word
                head_phrase = appearance.compound_word
                for role, token, relation, pos, tuple_token_index in appearance.tuples:
                    tuple_list.append((t_i, role, token, relation, head_word, pos, head_phrase))
                    t_i += 1
                if len(tuple_list) > 0:
                    appearance_list.append(tuple_list)
            rnn_data.append([d_i, entity_index, appearance_list])
            entity_index += 1
    output_filename = os.path.join(dirs.persona_dir, 'rnn_data.json')
    fh.write_to_json(rnn_data, output_filename, sort_keys=False)

    print len(docs), "valid documents"
    print entity_index, "entities"
    print t_i, "tuples"
    print len(vocab), "word types"
    print np.min(vocab_counts), np.max(vocab_counts), np.sum(vocab_counts)

    output_filename = os.path.join(dirs.persona_dir, 'tuple_vocab.json')
    fh.write_to_json(list(tuple_vocab), output_filename, sort_keys=False)
    output_filename = os.path.join(dirs.persona_dir, 'tuple_role.json')
    fh.write_to_json(list(tuple_role), output_filename, sort_keys=False)
    output_filename = os.path.join(dirs.persona_dir, 'tuple_entity.json')
    fh.write_to_json(list(tuple_entity), output_filename, sort_keys=False)
    output_filename = os.path.join(dirs.persona_dir, 'tuple_mention.json')
    fh.write_to_json(list(tuple_mention), output_filename, sort_keys=False)
    output_filename = os.path.join(dirs.persona_dir, 'mention_entity.json')
    fh.write_to_json(list(mention_entity), output_filename, sort_keys=False)
    output_filename = os.path.join(dirs.persona_dir, 'entity_doc.json')
    fh.write_to_json(list(entity_doc), output_filename, sort_keys=False)
    output_filename = os.path.join(dirs.persona_dir, 'vocab.json')
    fh.write_to_json(vocab.index2token, output_filename, sort_keys=False)
    output_filename = os.path.join(dirs.persona_dir, 'docs.json')
    fh.write_to_json(list(docs), output_filename, sort_keys=False)
    #output_filename = os.path.join(dirs.persona_dir, 'article_map.json')
    #fh.write_to_json(list(article_mapping), output_filename, sort_keys=False)
    output_filename = os.path.join(dirs.persona_dir, 'head_word_vocab.json')
    fh.write_to_json(head_word_vocab.index2token, output_filename, sort_keys=False)
    output_filename = os.path.join(dirs.persona_dir, 'head_word_vocab_list.json')
    fh.write_to_json(head_word_vocab_list, output_filename, sort_keys=False)
    output_filename = os.path.join(dirs.persona_dir, 'head_word_entity_list.json')
    fh.write_to_json(head_word_entity_list, output_filename, sort_keys=False)
    output_filename = os.path.join(dirs.persona_dir, 'entity_appearances.json')
    fh.write_to_json(entity_appearances, output_filename, sort_keys=False)
    #if tuple_vectors is not None:
    #    output_filename = os.path.join(dirs.persona_dir, 'tuple_vectors.json')
    #    fh.write_to_json(tuple_vectors.tolist(), output_filename, sort_keys=False)
    output_filename = os.path.join(dirs.persona_dir, 'head_phrase_vocab.json')
    fh.write_to_json(head_phrase_vocab.index2token, output_filename, sort_keys=False)
    output_filename = os.path.join(dirs.persona_dir, 'head_phrase_vocab_list.json')
    fh.write_to_json(head_phrase_vocab_list, output_filename, sort_keys=False)
    output_filename = os.path.join(dirs.persona_dir, 'head_phrase_entity_list.json')
    fh.write_to_json(head_phrase_entity_list, output_filename, sort_keys=False)
    output_filename = os.path.join(dirs.persona_dir, 'entity_text_mentions.json')
    fh.write_to_json(entity_text_mentions, output_filename, sort_keys=False)

    return filtered_indices, valid_elements
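# ---------------------------------------------------------------------------
# Illustrative usage sketch (an assumption, not code from the original
# module): get_bamman_entities() expects, per document, the parsed dependency
# trees (each exposing node_dict[node_index].word) and coreference clusters
# given as lists of (tree_index, node_index) pairs, e.g.:
#
#   all_trees = {'doc1': trees}                        # list of parsed trees
#   clusters = {'doc1': [[(0, 3), (2, 7)], [(1, 2)]]}  # one index list per entity
#   filtered, entities = get_bamman_entities(all_trees, clusters,
#                                            min_role_vocab=4, min_tuples=3)
#
# Note that filtered_indices is returned empty as written (the code that
# populated it is commented out above), so callers should rely on the
# returned valid_elements and the JSON files written to dirs.persona_dir.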