import os
import glob
import json
import codecs
from collections import defaultdict
from optparse import OptionParser

import numpy as np
import pandas as pd

# Project-local dependencies are assumed importable from this repository:
# fh (file helpers), dirs, defines, label_reader, gather_results, build_tree,
# find_entities_in_article, parse_xml_output, find_jk_grams,
# extract_story_elements_from_article, VocabWithCounts, pronoun_list, stopwords.


def main():
    usage = "%prog exp_dir_test_fold_dir"
    parser = OptionParser(usage=usage)
    parser.add_option('-t', dest='test_fold', default=0, type='int',
                      help='Test fold; default=%default')
    (options, args) = parser.parse_args()
    test_fold = options.test_fold
    exp_dir = args[0]

    results = pd.DataFrame(columns=('masked', 'test', 'valid', 'dir'))

    run_dirs = glob.glob(os.path.join(exp_dir, 'bayes*reuse*'))
    for i, run_dir in enumerate(run_dirs):
        run_num = int(fh.get_basename_wo_ext(run_dir).split('_')[-1])
        if run_num <= 40 and '1_' not in fh.get_basename_wo_ext(run_dir):
            results_dir = os.path.join(run_dir, 'results')
            test_file = fh.make_filename(results_dir, 'test_macro_f1', 'csv')
            valid_file = fh.make_filename(results_dir, 'valid_cv_macro_f1', 'csv')
            masked_valid_file = fh.make_filename(results_dir, 'masked_valid_cv_macro_f1', 'csv')
            try:
                # header=0 (rather than header=False): these files carry a
                # header row with an 'overall' column, which is used below
                test = pd.read_csv(test_file, header=0, index_col=0)
                valid = pd.read_csv(valid_file, header=0, index_col=0)
                masked_valid = pd.read_csv(masked_valid_file, header=0, index_col=0)
                #results.loc[run_num, 'iteration'] = run_num
                results.loc[i, 'masked'] = masked_valid['overall'].mean()
                results.loc[i, 'test'] = test['overall'].mean()
                results.loc[i, 'valid'] = valid['overall'].mean()
                results.loc[i, 'dir'] = fh.get_basename_wo_ext(run_dir)
            except Exception:
                # skip runs with missing or malformed result files
                continue

    results.to_csv(fh.make_filename(exp_dir, 'summary', 'csv'), columns=results.columns)

    # sort_values replaces the long-deprecated DataFrame.sort; also renamed
    # the variable so it no longer shadows the sorted() builtin
    sorted_results = results.sort_values('valid')
    print sorted_results

    print "best by masked"
    sorted_results = results.sort_values('masked')
    print sorted_results.values[-1, :]

    print "best by valid"
    sorted_results = results.sort_values('valid')
    print sorted_results.values[-1, :]
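# For reference, the directory layout main() above assumes; names not
# constructed in the code are illustrative only:
#
#   <exp_dir>/
#       bayes_opt_LR_alphas_reuse_23/        # matched by 'bayes*reuse*';
#           results/                         # run number taken from the suffix
#               test_macro_f1.csv            # header row with an 'overall' column
#               valid_cv_macro_f1.csv
#               masked_valid_cv_macro_f1.csv
#       summary.csv                          # written by this script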
def find_entities(n_files=None, use_lemmas=False):
    parsed_dir = os.path.join(dirs.data_stanford_dir, 'parsed')
    parsed_files = glob.glob(os.path.join(parsed_dir, '*.json'))

    dependencies_file = os.path.join(dirs.data_stanford_dir, 'dependency_tuple_ids.json')
    dependencies = fh.read_json(dependencies_file)

    coref_file = os.path.join(dirs.data_stanford_dir, 'coref_heads.json')
    coref_heads = fh.read_json(coref_file)

    supersense_tags = fh.read_json(os.path.join(dirs.data_amalgram_dir, 'all_tags.json'))

    trees = {}
    clustered_indices = {}
    print "Building trees and finding story elements"
    if n_files is None:
        n_files = len(parsed_files)
    else:
        n_files = int(n_files)
    for f_i, f in enumerate(parsed_files[:n_files]):
        sentences = fh.read_json(f)
        basename = fh.get_basename_wo_ext(f)
        trees[f] = build_tree(sentences, dependencies[basename], coref_heads[basename],
                              supersense_tags[basename], basename, use_lemmas)
        clustered_indices[f] = find_entities_in_article(trees[f])
        if f_i % 1000 == 0 and f_i > 0:
            print f_i
    return trees, clustered_indices
def parse_tokens(tokenized_dir):
    # map basename -> list of token lists, one per sentence
    sentences = {}
    files = glob.glob(os.path.join(tokenized_dir, '*.tokenized'))
    for f in files:
        basename = fh.get_basename_wo_ext(f)
        sentences[basename] = []
        # encoding specified for consistency with the other codecs.open calls
        with codecs.open(f, 'r', encoding='utf-8') as input_file:
            for line in input_file:
                tokens = line.split()
                sentences[basename].append(tokens)
    return sentences
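# A minimal usage sketch for parse_tokens, assuming a directory of
# whitespace-tokenized *.tokenized files with one sentence per line (the
# path here is hypothetical):
#
#   sentences = parse_tokens('/path/to/tokenized')
#   # sentences['doc1'] -> [['First', 'sentence', '.'], ['Second', ...], ...]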
def write_tagged_text(parsed_filename, output_filename):
    data = fh.read_json(parsed_filename)
    tagged_text = {}
    for key, sentences in data.items():
        tagged_sentences = []
        for sentence in sentences:
            tagged_tokens = []
            for token in sentence:
                # only POS is used below; the other fields are read for completeness
                word = token.get('word', '__MISSING__')
                POS = token.get('POS', '__MISSING__')
                lemma = token.get('lemma', '__MISSING__')
                NER = token.get('NER', '__MISSING__')
                #tagged = word + '_' + POS
                tagged = POS + '_POS_'
                tagged_tokens.append(tagged)
            tagged_sentence = ' '.join(tagged_tokens)
            tagged_sentences.append(tagged_sentence)
        tagged_text[fh.get_basename_wo_ext(key)] = ' '.join(tagged_sentences)
    fh.write_to_json(tagged_text, output_filename, sort_keys=False)
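# For illustration, the token dicts consumed by write_tagged_text appear to
# match the parsed sentences written by parse_xml_files below, e.g. (the
# example values are hypothetical):
#
#   {"word": "Guns", "POS": "NNS", "lemma": "gun", "NER": "O"}
#
# which becomes the output token 'NNS_POS_'; each document is reduced to a
# single space-joined string of such tokens.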
def main():
    exp_dir = defines.exp_dir
    exp_name = 'bayes_opt_LR_alphas_reuse'
    df = pd.DataFrame()

    basenames = ['test_acc.csv', 'test_micro_f1.csv', 'test_macro_f1.csv', 'test_pp.csv']
    rownames = ['model accuracy', 'model micro f1', 'model macro f1', 'model percent perfect']
    for basename, rowname in zip(basenames, rownames):
        files = glob.glob(os.path.join(exp_dir, '*', 'test_fold_0', exp_name, 'results', basename))
        gather_results(df, files, rowname)

    files = glob.glob(os.path.join(defines.data_raw_labels_dir, '*.csv'))
    for label_file in files:
        dataset = fh.get_basename_wo_ext(label_file)
        codes = label_reader.get_dataset_labels(dataset)
        if dataset in df.columns:
            df.loc['Number of responses', dataset] = codes.shape[0]
            df.loc['Number of labels', dataset] = codes.shape[1]

    output_dir = '/Users/dcard/Dropbox/CMU/DAP/results/'
    output_filename = fh.make_filename(output_dir, exp_name, 'csv')
    df.to_csv(output_filename)
def parse_semafor_output(fes_dir, sentences):
    semafor_dir = dirs.data_semafor_dir

    # four granularities of output token per frame instance
    frames = {}
    frame_target = {}
    frame_arguments = {}
    frame_target_arguments = {}
    dicts = [frames, frame_target, frame_arguments, frame_target_arguments]

    files = glob.glob(os.path.join(fes_dir, '*.fes'))
    for f in files:
        basename = fh.get_basename_wo_ext(f)
        # one (initially empty) token list per sentence, in each output dict
        for d in dicts:
            d[basename] = []
            for s in sentences[basename]:
                d[basename].append([])
        with codecs.open(f, 'r') as input_file:
            for line in input_file:
                parts = line.split('\t')
                n_args = int(parts[1])
                frame = parts[2]
                target_dot_pos = parts[3]
                target_index = parts[4]
                target_phrase = parts[5]
                sentence_num = int(parts[6])
                arg_types = []
                arg_indices = []
                # the count in parts[1] appears to include the target, so
                # n_args - 1 (type, index) pairs follow starting at parts[7]
                for j in range(n_args - 1):
                    arg_types.append(parts[7 + 2 * j])
                    arg_indices.append(parts[7 + 2 * j + 1])

                # save the frame
                output_token = '<' + frame + '>'
                frames[basename][sentence_num].append(output_token)

                # save the frame and target words
                output_token = '<' + frame + '>'
                target_indices = target_index.split('_')
                for ti in target_indices:
                    output_token += '_' + sentences[basename][sentence_num][int(ti)]
                frame_target[basename][sentence_num].append(output_token)

                # save the frame and arguments
                output_token = '<' + frame + '>'
                for j, arg_type in enumerate(arg_types):
                    output_token += '_<' + arg_type + '>'
                    arg_index = arg_indices[j]
                    indices = arg_index.split(':')
                    for ai in indices:
                        output_token += '_' + sentences[basename][sentence_num][int(ai)]
                frame_arguments[basename][sentence_num].append(output_token)

                # save the frame, target, and arguments
                output_token = '<' + frame + '>'
                target_indices = target_index.split('_')
                for ti in target_indices:
                    output_token += '_' + sentences[basename][sentence_num][int(ti)]
                for j, arg_type in enumerate(arg_types):
                    output_token += '_<' + arg_type + '>'
                    arg_index = arg_indices[j]
                    indices = arg_index.split(':')
                    for ai in indices:
                        output_token += '_' + sentences[basename][sentence_num][int(ai)]
                frame_target_arguments[basename][sentence_num].append(output_token)

    output_filename = os.path.join(semafor_dir, 'frames.json')
    print output_filename
    fh.write_to_json(frames, output_filename)

    output_filename = os.path.join(semafor_dir, 'frames_targets.json')
    print output_filename
    fh.write_to_json(frame_target, output_filename)

    output_filename = os.path.join(semafor_dir, 'frame_arguments.json')
    print output_filename
    fh.write_to_json(frame_arguments, output_filename)

    output_filename = os.path.join(semafor_dir, 'frames_target_arguments.json')
    print output_filename
    fh.write_to_json(frame_target_arguments, output_filename)
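# For reference, the tab-separated .fes field layout that parse_semafor_output
# above relies on (0-based; fields the code does not read are omitted):
#
#   parts[1]  count of annotation sets (target plus arguments)
#   parts[2]  frame name
#   parts[3]  target word with POS suffix (e.g. "run.v")
#   parts[4]  target token index, or several indices joined by '_'
#   parts[5]  target phrase
#   parts[6]  sentence number
#   parts[7:] alternating (argument type, argument index span "i:j") pairs
#
# A hypothetical line, for illustration only:
#   0\t2\tMotion\trun.v\t3\trunning\t0\tTheme\t0:1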
def extract_story_elements():
    min_head_vocab = 5
    min_role_vocab = 4
    min_tuples = 3

    # tuple-role codes (SURFACE_FORM is retained but disabled below)
    ATTRIBUTE = 0
    AGENT_ROLE = 1
    PATIENT_ROLE = 2
    SURFACE_FORM = 3

    parsed_dir = os.path.join(dirs.data_stanford_dir, 'parsed')
    parsed_files = glob.glob(os.path.join(parsed_dir, '*.json'))

    dependencies_file = os.path.join(dirs.data_stanford_dir, 'dependency_tuple_ids.json')
    dependencies = fh.read_json(dependencies_file)

    coref_file = os.path.join(dirs.data_stanford_dir, 'coref_heads.json')
    coref_heads = fh.read_json(coref_file)

    supersense_tags = fh.read_json(os.path.join(dirs.data_amalgram_dir, 'all_tags.json'))

    heads = defaultdict(int)
    tokens = defaultdict(int)
    attributes = defaultdict(int)
    agent_roles = defaultdict(int)
    patient_roles = defaultdict(int)

    story_elements = {}
    print "Extracting story elements"
    for f_i, f in enumerate(parsed_files):
        sentences = fh.read_json(f)
        basename = fh.get_basename_wo_ext(f)
        element_list = extract_story_elements_from_article(sentences, dependencies[basename],
                                                           coref_heads[basename],
                                                           supersense_tags[basename], basename)
        story_elements[basename] = element_list
        for element in element_list:
            for h in element.head_words:
                heads[h] += 1
            for t in element.attributes:
                attributes[t] += 1
            for t in element.agent_roles:
                agent_roles[t] += 1
            for t in element.patient_roles:
                patient_roles[t] += 1

    print "Finding most common tokens"
    common_heads = [(v, k) for k, v in heads.items()]
    common_heads.sort()
    common_heads.reverse()
    output_filename = os.path.join(dirs.lda_dir, 'common_heads.json')
    fh.write_to_json(common_heads, output_filename, sort_keys=False)

    """
    common_tokens = [(v, k) for k, v in tokens.items()]
    common_tokens.sort()
    common_tokens.reverse()
    output_filename = os.path.join(dirs.lda_dir, 'common_tokens.json')
    fh.write_to_json(common_tokens, output_filename, sort_keys=False)
    """

    common_attributes = [(v, k) for k, v in attributes.items()]
    common_attributes.sort()
    common_attributes.reverse()
    output_filename = os.path.join(dirs.lda_dir, 'common_attributes.json')
    fh.write_to_json(common_attributes, output_filename, sort_keys=False)

    common_agent_roles = [(v, k) for k, v in agent_roles.items()]
    common_agent_roles.sort()
    common_agent_roles.reverse()
    output_filename = os.path.join(dirs.lda_dir, 'common_agent_roles.json')
    fh.write_to_json(common_agent_roles, output_filename, sort_keys=False)

    common_patient_roles = [(v, k) for k, v in patient_roles.items()]
    common_patient_roles.sort()
    common_patient_roles.reverse()
    output_filename = os.path.join(dirs.lda_dir, 'common_patient_roles.json')
    fh.write_to_json(common_patient_roles, output_filename, sort_keys=False)

    print pronoun_list
    #most_common_heads = {k: v for v, k in common_heads if v >= min_head_vocab and k not in pronoun_list}
    most_common_attributes = {k: v for v, k in common_attributes
                              if (v >= min_role_vocab and k not in pronoun_list)}
    most_common_agent_roles = {k: v for v, k in common_agent_roles
                               if (v >= min_role_vocab and k not in pronoun_list and k not in stopwords)}
    most_common_patient_roles = {k: v for v, k in common_patient_roles
                                 if (v >= min_role_vocab and k not in pronoun_list and k not in stopwords)}

    output_filename = os.path.join(dirs.lda_dir, 'most_common_attributes.json')
    fh.write_to_json(most_common_attributes, output_filename, sort_keys=False)
    output_filename = os.path.join(dirs.lda_dir, 'most_common_agent_roles.json')
    fh.write_to_json(most_common_agent_roles, output_filename, sort_keys=False)
    output_filename = os.path.join(dirs.lda_dir, 'most_common_patient_roles.json')
    fh.write_to_json(most_common_patient_roles, output_filename, sort_keys=False)

    print len(most_common_attributes)
    print len(most_common_agent_roles)
    print len(most_common_patient_roles)

    print "Filtering tuples"
    valid_elements = defaultdict(list)
    for basename, element_list in story_elements.items():
        for se in element_list:
            # require at least one head word that is not a pronoun
            se.valid_heads = [h for h in se.head_words if h not in pronoun_list]
            se.valid_phrases = [h for h in se.phrases if h not in pronoun_list]
            if len(se.valid_heads) > 0:
                se.valid_attributes = [t for t in se.attributes if t in most_common_attributes]
                se.valid_agent_roles = [t for t in se.agent_roles if t in most_common_agent_roles]
                se.valid_patient_roles = [t for t in se.patient_roles if t in most_common_patient_roles]
                se.tuples = [(ATTRIBUTE, t) for t in se.valid_attributes] + \
                            [(AGENT_ROLE, t) for t in se.valid_agent_roles] + \
                            [(PATIENT_ROLE, t) for t in se.valid_patient_roles]
                            #[(SURFACE_FORM, t) for t in se.valid_heads]
                if len(se.tuples) >= min_tuples:
                    valid_elements[basename].append(se)

    print "Constructing vocabulary"
    n_tuples = 0
    vocab = VocabWithCounts('', add_oov=False)
    n_entities = 0
    for basename, element_list in valid_elements.items():
        for se in element_list:
            tokens = [token for role, token in se.tuples]
            vocab.add_tokens(tokens)
            n_tuples += len(tokens)
            n_entities += 1

    head_word_vocab = VocabWithCounts('', add_oov=False)
    for basename, element_list in valid_elements.items():
        for se in element_list:
            head_word_vocab.add_tokens(se.valid_heads)

    head_phrase_vocab = VocabWithCounts('', add_oov=False)
    for basename, element_list in valid_elements.items():
        for se in element_list:
            head_phrase_vocab.add_tokens(se.valid_phrases)

    print "Building indices"
    tuple_vocab = np.zeros(n_tuples, dtype=int)    # vocab index of the ith tuple
    tuple_entity = np.zeros(n_tuples, dtype=int)   # entity index of the ith tuple
    tuple_role = []                                # role code of the ith tuple
    entity_doc = np.zeros(n_entities, dtype=int)   # document index of the ith entity
    docs = valid_elements.keys()
    docs.sort()
    vocab_counts = np.zeros(len(vocab), dtype=int)
    article_mapping = []
    entity_index = 0
    head_word_vocab_list = []
    head_word_entity_list = []
    head_phrase_vocab_list = []
    head_phrase_entity_list = []
    t_i = 0
    for d_i, d in enumerate(docs):
        element_list = valid_elements[d]
        for se in element_list:
            entity_doc[entity_index] = d_i
            for role, token in se.tuples:
                tuple_entity[t_i] = entity_index
                tuple_role.append(role)
                vocab_index = vocab.get_index(token)
                tuple_vocab[t_i] = vocab_index
                vocab_counts[vocab_index] += 1
                t_i += 1
            for token in se.valid_heads:
                head_word_vocab_list.append(head_word_vocab.get_index(token))
                head_word_entity_list.append(entity_index)
            for token in se.valid_phrases:
                head_phrase_vocab_list.append(head_phrase_vocab.get_index(token))
                head_phrase_entity_list.append(entity_index)
            article_mapping.append(str(entity_index) + ':' + d + ':' + ','.join(se.head_words) +
                                   ':' + ','.join(se.valid_attributes) +
                                   ':' + ','.join(se.valid_agent_roles) +
                                   ':' + ','.join(se.valid_patient_roles))
            entity_index += 1

    print len(docs), "valid documents"
    print entity_index, "entities"
    print t_i, "tuples"
    print len(vocab), "word types"
    print np.min(vocab_counts), np.max(vocab_counts), np.sum(vocab_counts)

    output_filename = os.path.join(dirs.lda_dir, 'tuple_vocab.json')
    fh.write_to_json(list(tuple_vocab), output_filename, sort_keys=False)
    output_filename = os.path.join(dirs.lda_dir, 'tuple_role.json')
    fh.write_to_json(list(tuple_role), output_filename, sort_keys=False)
    output_filename = os.path.join(dirs.lda_dir, 'tuple_entity.json')
    fh.write_to_json(list(tuple_entity), output_filename, sort_keys=False)
    output_filename = os.path.join(dirs.lda_dir, 'entity_doc.json')
    fh.write_to_json(list(entity_doc), output_filename, sort_keys=False)
    output_filename = os.path.join(dirs.lda_dir, 'vocab.json')
    fh.write_to_json(vocab.index2token, output_filename, sort_keys=False)
    output_filename = os.path.join(dirs.lda_dir, 'docs.json')
    fh.write_to_json(list(docs), output_filename, sort_keys=False)
    output_filename = os.path.join(dirs.lda_dir, 'article_map.json')
    fh.write_to_json(list(article_mapping), output_filename, sort_keys=False)
    output_filename = os.path.join(dirs.lda_dir, 'head_word_vocab.json')
    fh.write_to_json(head_word_vocab.index2token, output_filename, sort_keys=False)
    output_filename = os.path.join(dirs.lda_dir, 'head_phrase_vocab.json')
    fh.write_to_json(head_phrase_vocab.index2token, output_filename, sort_keys=False)
    output_filename = os.path.join(dirs.lda_dir, 'head_word_vocab_list.json')
    fh.write_to_json(head_word_vocab_list, output_filename, sort_keys=False)
    output_filename = os.path.join(dirs.lda_dir, 'head_word_entity_list.json')
    fh.write_to_json(head_word_entity_list, output_filename, sort_keys=False)
    output_filename = os.path.join(dirs.lda_dir, 'head_phrase_vocab_list.json')
    fh.write_to_json(head_phrase_vocab_list, output_filename, sort_keys=False)
    output_filename = os.path.join(dirs.lda_dir, 'head_phrase_entity_list.json')
    fh.write_to_json(head_phrase_entity_list, output_filename, sort_keys=False)
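# A minimal sketch of how the parallel index arrays written above line up:
# tuple i belongs to entity tuple_entity[i], which belongs to document
# entity_doc[tuple_entity[i]]. Assumptions: the JSON files exist in
# dirs.lda_dir, and vocab.json / docs.json are lists mapping index -> string.
def print_entity_tuples(entity_index):
    tuple_vocab = fh.read_json(os.path.join(dirs.lda_dir, 'tuple_vocab.json'))
    tuple_role = fh.read_json(os.path.join(dirs.lda_dir, 'tuple_role.json'))
    tuple_entity = fh.read_json(os.path.join(dirs.lda_dir, 'tuple_entity.json'))
    entity_doc = fh.read_json(os.path.join(dirs.lda_dir, 'entity_doc.json'))
    vocab = fh.read_json(os.path.join(dirs.lda_dir, 'vocab.json'))
    docs = fh.read_json(os.path.join(dirs.lda_dir, 'docs.json'))
    role_names = {0: 'ATTRIBUTE', 1: 'AGENT_ROLE', 2: 'PATIENT_ROLE'}
    # the source document for this entity
    print docs[entity_doc[entity_index]]
    # every (role, word) tuple attached to this entity
    for i in range(len(tuple_vocab)):
        if tuple_entity[i] == entity_index:
            print role_names[tuple_role[i]], vocab[tuple_vocab[i]]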
def parse_xml_files(xml_filelist_filename, output_dir):
    filelist = fh.read_text(xml_filelist_filename)
    parsed_files = {}
    sentiments = {}
    dependencies = {}
    dependency_tuples = {}
    entities = {}
    coref = {}
    coref_entities = {}
    coref_heads = {}
    all_groups = {}
    jk_grams = {}
    amalgram_pairs = {}
    for xml_file in filelist:
        xml_file = xml_file.rstrip('\n')
        print xml_file
        # peel off both .txt and .xml
        basename = fh.get_basename_wo_ext(fh.get_basename_wo_ext(xml_file))
        sentences, doc_sentiments, doc_dependencies, doc_dependency_tuples, doc_entities, doc_coref, groups, _,\
            doc_coref_entities, doc_coref_heads = parse_xml_output(xml_file)
        parsed_files[basename] = sentences
        sentiments[basename] = doc_sentiments
        dependencies[basename] = doc_dependencies
        dependency_tuples[basename] = doc_dependency_tuples
        entities[basename] = doc_entities
        coref[basename] = doc_coref
        coref_entities[basename] = doc_coref_entities
        coref_heads[basename] = doc_coref_heads

        doc_jk_grams, doc_jk_indices = find_jk_grams(sentences)
        jk_grams[basename] = doc_jk_grams

        # output documents to amalgram format
        #amalgram_dir = os.path.join(dirs.data_amalgram_dir, 'input')
        #if not os.path.exists(amalgram_dir):
        #    os.makedirs(amalgram_dir)

        # save word/tag pairs for amalgram; note that the second assignment
        # overwrites the tab-joined strings built by the first
        tagged_sents = ['\n'.join([t['word'] + '\t' + t['POS'] for t in s]) + '\n' for s in sentences]
        tagged_sents = [[(t['word'], t['POS']) for t in s] for s in sentences]
        amalgram_pairs[basename] = tagged_sents

        # uncomment for extracting story elements...
        parsed_dir = os.path.join(output_dir, 'parsed')
        if not os.path.exists(parsed_dir):
            os.makedirs(parsed_dir)
        parsed_filename = os.path.join(parsed_dir, basename + '.json')
        fh.write_to_json(sentences, parsed_filename, sort_keys=False)

    sentiment_filename = fh.make_filename(output_dir, 'sentiments', 'json')
    fh.write_to_json(sentiments, sentiment_filename, sort_keys=False)

    dependencies_filename = fh.make_filename(output_dir, 'dependency_tuple_ids', 'json')
    fh.write_to_json(dependency_tuples, dependencies_filename, sort_keys=False)

    coref_filename = fh.make_filename(output_dir, 'entities', 'json')
    fh.write_to_json(coref, coref_filename, sort_keys=False)

    jkgrams_filename = fh.make_filename(output_dir, 'jkgrams', 'json')
    fh.write_to_json(jk_grams, jkgrams_filename, sort_keys=False)

    coref_heads_filename = fh.make_filename(output_dir, 'coref_heads', 'json')
    fh.write_to_json(coref_heads, coref_heads_filename, sort_keys=False)

    amalgram_keys = amalgram_pairs.keys()
    amalgram_keys.sort()

    # one combined file with all word/tag pairs
    amalgram_data_file = os.path.join(dirs.data_amalgram_dir, 'input.txt')
    with codecs.open(amalgram_data_file, 'w', encoding='utf-8') as output_file:
        for k in amalgram_keys:
            sents = amalgram_pairs[k]
            for s in sents:
                for p in s:
                    output_file.write(p[0] + '\t' + p[1] + '\n')
                output_file.write('\n')

    # one file per document
    for k in amalgram_keys:
        amalgram_data_file = os.path.join(dirs.data_amalgram_dir, k + '.txt')
        with codecs.open(amalgram_data_file, 'w', encoding='utf-8') as output_file:
            sents = amalgram_pairs[k]
            for s in sents:
                for p in s:
                    output_file.write(p[0] + '\t' + p[1] + '\n')
                output_file.write('\n')

    # index mapping each sentence back to its source document
    amalgram_index_file = os.path.join(dirs.data_amalgram_dir, 'index.txt')
    with codecs.open(amalgram_index_file, 'w', encoding='utf-8') as output_file:
        for k in amalgram_keys:
            sents = amalgram_pairs[k]
            for s in sents:
                output_file.write(k + '\n')

    #all_groups_filename = fh.make_filename(output_dir, 'all_groups', 'json')
    #fh.write_to_json(all_groups, all_groups_filename)

    return parsed_files, dependencies
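# For reference, the amalgram input files written above hold one tab-separated
# word/tag pair per line, with a blank line between sentences (the exact
# blank-line placement is reconstructed here, as the original indentation was
# lost), e.g.:
#
#   Guns\tNNS
#   kill\tVBP
#   .\t.
#   <blank line>
#
# index.txt holds one line per sentence naming the document it came from.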
def identify_rnn_targets(output_data_filename):
    min_head_vocab = 5
    min_role_vocab = 4
    min_tuples = 3

    # tuple-role codes (SURFACE_FORM is retained but unused below)
    ATTRIBUTE = 0
    AGENT_ROLE = 1
    PATIENT_ROLE = 2
    SURFACE_FORM = 3

    # NOTE: the extraction, counting, and filtering steps below mirror
    # extract_story_elements() above
    parsed_dir = os.path.join(dirs.data_stanford_dir, 'parsed')
    parsed_files = glob.glob(os.path.join(parsed_dir, '*.json'))

    dependencies_file = os.path.join(dirs.data_stanford_dir, 'dependency_tuple_ids.json')
    dependencies = fh.read_json(dependencies_file)

    coref_file = os.path.join(dirs.data_stanford_dir, 'coref_heads.json')
    coref_heads = fh.read_json(coref_file)

    supersense_tags = fh.read_json(os.path.join(dirs.data_amalgram_dir, 'all_tags.json'))

    heads = defaultdict(int)
    tokens = defaultdict(int)
    attributes = defaultdict(int)
    agent_roles = defaultdict(int)
    patient_roles = defaultdict(int)

    story_elements = {}
    print "Extracting story elements"
    for f_i, f in enumerate(parsed_files):
        sentences = fh.read_json(f)
        basename = fh.get_basename_wo_ext(f)
        print f
        element_list = extract_story_elements_from_article(sentences, dependencies[basename],
                                                           coref_heads[basename],
                                                           supersense_tags[basename], basename)
        story_elements[basename] = element_list
        for element in element_list:
            for h in element.head_words:
                heads[h] += 1
            for t in element.attributes:
                attributes[t] += 1
            for t in element.agent_roles:
                agent_roles[t] += 1
            for t in element.patient_roles:
                patient_roles[t] += 1

    print "Finding most common tokens"
    common_heads = [(v, k) for k, v in heads.items()]
    common_heads.sort()
    common_heads.reverse()
    output_filename = os.path.join(dirs.lda_dir, 'common_heads.json')
    fh.write_to_json(common_heads, output_filename, sort_keys=False)

    """
    common_tokens = [(v, k) for k, v in tokens.items()]
    common_tokens.sort()
    common_tokens.reverse()
    output_filename = os.path.join(dirs.lda_dir, 'common_tokens.json')
    fh.write_to_json(common_tokens, output_filename, sort_keys=False)
    """

    common_attributes = [(v, k) for k, v in attributes.items()]
    common_attributes.sort()
    common_attributes.reverse()
    output_filename = os.path.join(dirs.lda_dir, 'common_attributes.json')
    fh.write_to_json(common_attributes, output_filename, sort_keys=False)

    common_agent_roles = [(v, k) for k, v in agent_roles.items()]
    common_agent_roles.sort()
    common_agent_roles.reverse()
    output_filename = os.path.join(dirs.lda_dir, 'common_agent_roles.json')
    fh.write_to_json(common_agent_roles, output_filename, sort_keys=False)

    common_patient_roles = [(v, k) for k, v in patient_roles.items()]
    common_patient_roles.sort()
    common_patient_roles.reverse()
    output_filename = os.path.join(dirs.lda_dir, 'common_patient_roles.json')
    fh.write_to_json(common_patient_roles, output_filename, sort_keys=False)

    print pronoun_list
    #most_common_heads = {k: v for v, k in common_heads if v >= min_head_vocab and k not in pronoun_list}
    most_common_attributes = {k: v for v, k in common_attributes
                              if (v >= min_role_vocab and k not in pronoun_list)}
    most_common_agent_roles = {k: v for v, k in common_agent_roles
                               if (v >= min_role_vocab and k not in pronoun_list and k not in stopwords)}
    most_common_patient_roles = {k: v for v, k in common_patient_roles
                                 if (v >= min_role_vocab and k not in pronoun_list and k not in stopwords)}

    output_filename = os.path.join(dirs.lda_dir, 'most_common_attributes.json')
    fh.write_to_json(most_common_attributes, output_filename, sort_keys=False)
    output_filename = os.path.join(dirs.lda_dir, 'most_common_agent_roles.json')
    fh.write_to_json(most_common_agent_roles, output_filename, sort_keys=False)
    output_filename = os.path.join(dirs.lda_dir, 'most_common_patient_roles.json')
    fh.write_to_json(most_common_patient_roles, output_filename, sort_keys=False)

    print len(most_common_attributes)
    print len(most_common_agent_roles)
    print len(most_common_patient_roles)

    print "Filtering tuples"
    valid_elements = defaultdict(list)
    for basename, element_list in story_elements.items():
        for se in element_list:
            # need at least one head word that is not a pronoun
            se.valid_heads = [h for h in se.head_words if h not in pronoun_list]
            if len(se.valid_heads) > 0:
                se.valid_attributes = [t for t in se.attributes if t in most_common_attributes]
                se.valid_agent_roles = [t for t in se.agent_roles if t in most_common_agent_roles]
                se.valid_patient_roles = [t for t in se.patient_roles if t in most_common_patient_roles]
                se.tuples = [(ATTRIBUTE, t) for t in se.valid_attributes] + \
                            [(AGENT_ROLE, t) for t in se.valid_agent_roles] + \
                            [(PATIENT_ROLE, t) for t in se.valid_patient_roles]
                            #[(SURFACE_FORM, t) for t in se.valid_heads]
                if len(se.tuples) >= min_tuples:
                    valid_elements[basename].append(se)

    output_data = []
    for basename, element_list in valid_elements.items():
        used_sentences = set()
        for se in element_list:
            for i in range(len(se.head_indices)):
                assert se.head_indices[i] < len(se.sentences[i].split())
                if se.head_words[i] not in pronoun_list:
                    if se.sentences[i] not in used_sentences:
                        output_data.append((se.head_indices[i], se.sentences[i], basename))
                        # THIS IS TRYING SOMETHING NEW: emit each sentence only once
                        used_sentences.add(se.sentences[i])

    with codecs.open(output_data_filename, 'w', encoding='utf-8') as output_file:
        json.dump(output_data, output_file, indent=2, sort_keys=False)
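# A minimal usage sketch (the output path is hypothetical):
# identify_rnn_targets writes a JSON list of (head_index, sentence, basename)
# triples, one per unique sentence containing a non-pronoun entity head; JSON
# serializes the tuples as lists:
#
#   identify_rnn_targets(os.path.join(dirs.lda_dir, 'rnn_targets.json'))
#   # rnn_targets.json: [[7, "The bill passed the House .", "doc1"], ...]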