Example No. 1
def main():

    amr_file = r'test-data/amrs.txt'
    sentence_file = r'test-data/sentences.txt'
    if len(sys.argv) > 2:
        amr_file = sys.argv[1]
        sentence_file = sys.argv[2]

    failed_amrs = Counter()
    failed_words = Counter()
    with open(sentence_file, 'r', encoding='utf8') as f1:
        sentences = [s for s in re.split(r'\n\s*\n', f1.read()) if s]
        with open(amr_file, 'r', encoding='utf8') as f2:
            for i, amr in enumerate(AMR.amr_iter(f2.read())):
                print('#' + str(i + 1))
                words = sentences[i].strip().split()
                amr = AMR(amr)
                # test_rules(amr, words)
                alignments, amr_unal, words_unal = align_amr(amr, words)
                print('# AMR:')
                print('\n'.join('# ' + l for l in str(amr).split('\n')))
                print('# Sentence:')
                print('# ' + ' '.join(words))
                print('# Alignments:')
                for a in alignments:
                    print('#', a.readible())
                for a in alignments:
                    print(a)
                print()
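A rough usage note (file layout inferred from the code above, names hypothetical): the sentence file is expected to hold one whitespace-tokenised sentence per blank-line-separated block, index-parallel to the AMR entries that AMR.amr_iter yields from the AMR file.

# sentences.txt (illustrative layout only):
#
#   The boy wants to go .
#
#   The girl slept .
#
# test-data/amrs.txt would hold the corresponding AMR graphs, presumably also
# separated by blank lines, in the same order.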
Example No. 2
def html(text, delete_x_ids=True):
    amr = AMR(text)
    elems = [e for e in amr.text_elements]
    nodes = [id for id in amr.node_ids()]
    edges = [id for id in amr.edge_ids()]
    node_indices = [i for i,e in enumerate(amr.text_elements) if amr.NODE_RE.match(e)]
    edge_indices = [i for i,e in enumerate(amr.text_elements) if amr.EDGE_RE.match(e)]
    Named_Entity_RE = re.compile('x[0-9]+/".*?"')
    for i,e in enumerate(elems):
        if i in node_indices:
            id = nodes.pop(0)
            frame = e.split('/')[-1] if '/' in e else '_'
            node = e
            if delete_x_ids:
                node = re.sub('^x[0-9]+/', '', e, 1)
            if frame in propbank_frames_dictionary:
                description = propbank_frames_dictionary[frame].replace('\t','\n')
                elems[i] = f'<span class="amr-frame" tok-id="{id}" title="{description}">{node}</span>'
            elif Named_Entity_RE.match(e):
                elems[i] = f'<span class="amr-entity" tok-id="{id}">{node}</span>'
            else:
                elems[i] = f'<span class="amr-node" tok-id="{id}">{node}</span>'
        elif i in edge_indices:
            id = edges.pop(0)
            elems[i] = f'<span class="amr-edge" tok-id="{id}">{e}</span>'
    text = ''.join(elems)
    return '\n<div class="amr-container">\n<pre>\n'+text+'\n</pre>\n</div>\n'
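For illustration only: assuming the surrounding module defines the AMR class and propbank_frames_dictionary used above, and that the AMR text carries xN/ variable prefixes on nodes (as the regexes in this function expect), a call might look like the following. The AMR string here is hypothetical.

snippet = html('(x1/want-01 :ARG0 (x2/boy))')
# 'snippet' should be a <div class="amr-container"> block wrapping a <pre> element,
# with each node and edge of the AMR wrapped in a <span> carrying its tok-id.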
Example No. 3
    def latex(text):
        amr = AMR(text)
        text = str(amr)
        for x in re.findall(r'x[0-9]+ ?/ ?[^()\s]+', text):
            text = text.replace(x, '(' + x + ')')
        edges = [(e, id) for e, id in zip(amr.edges(), amr.edge_ids())]
        elems = []
        max_depth = paren_utils.max_depth(text)
        prev_depth = 0
        depth = 0

        i = 0
        node_depth = {}
        for t in paren_utils.paren_iter(text):
            node = amr.NODE_RE.match(t).group()
            id = node.split('/')[0].strip()
            # clean node
            if re.match('x[0-9]+/', node):
                node = node.split('/')[1]
            node = node.replace('"', '``', 1).replace('"', "''", 1)
            prev_depth = depth
            depth = paren_utils.depth_at(text, text.index(t))
            if depth > prev_depth:
                i = 0
            node_depth[id] = depth
            num_nodes = paren_utils.mark_depth(text).count(f'<{depth}>')
            x = AMR_Latex.get_x(i, num_nodes)
            y = AMR_Latex.get_y(depth, max_depth)
            color = AMR_Latex.get_color(i)
            elems.append(f'\t\\node[{color}]({id}) at ({x},{y}) {{{node}}};')
            i += 1
        for edge, id in edges:
            source = id.split('_')[0]
            target = id.split('_')[2]
            dir1 = 'south'
            dir2 = 'north'
            if node_depth[source] > node_depth[target]:
                dir1 = 'north'
                dir2 = 'south'
            if node_depth[source] == node_depth[target]:
                dir1 = 'north'
                dir2 = 'north'
            elems.append(
                f'\t\\draw[->, thick] ({source}.{dir1}) -- ({target}.{dir2}) node[midway, above, sloped] {{{edge}}};'
            )
        latex = '\n\\begin{tikzpicture}[\n'
        latex += 'red/.style={rectangle, draw=red!60, fill=red!5, very thick, minimum size=7mm},\n'
        latex += 'blue/.style={rectangle, draw=blue!60, fill=blue!5, very thick, minimum size=7mm},\n'
        latex += 'green/.style={rectangle, draw=green!60, fill=green!5, very thick, minimum size=7mm},\n'
        latex += 'purple/.style={rectangle, draw=purple!60, fill=purple!5, very thick, minimum size=7mm},\n'
        latex += 'orange/.style={rectangle, draw=orange!60, fill=orange!5, very thick, minimum size=7mm},\n'
        latex += ']\n'
        latex += '\n'.join(elems)
        latex += '\n\\end{tikzpicture}\n'

        return latex
Example No. 4
def normalize_entity(root, nodes, edges):
    normalize_ids = {
        id: i
        for i, id in enumerate(sorted(nodes, key=lambda x: nodes[x]))
    }
    normalized_entity = AMR()
    for n in nodes:
        normalized_entity.nodes[normalize_ids[n]] = nodes[n]
    for s, r, t in edges:
        normalized_entity.edges.append((normalize_ids[s], r, normalize_ids[t]))
    normalized_entity.edges = sorted(normalized_entity.edges)
    normalized_entity.root = normalize_ids[root]
    return normalized_entity
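A small usage sketch with made-up data, assuming (from the function body) that nodes maps variable ids to concept labels and edges is a list of (source, relation, target) triples over those ids:

# Hypothetical inputs for illustration only.
nodes = {'c': 'city', 'n': 'name', 'x0': '"Rome"'}
edges = [('c', ':name', 'n'), ('n', ':op1', 'x0')]
entity = normalize_entity('c', nodes, edges)
# Variable ids are renumbered 0..len(nodes)-1 in order of their concept labels,
# the edges are re-expressed over the new ids and sorted, and the root is remapped.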
Example No. 5
    def __init__(self, tokens, verbose=False, add_unaligned=0):
        tokens = tokens.copy()

        # add unaligned
        if add_unaligned and '<unaligned>' not in tokens:
            for i in range(add_unaligned):
                tokens.append('<unaligned>')
        # add root
        if '<ROOT>' not in tokens:
            tokens.append("<ROOT>")
        # init stack, buffer
        self.stack = []
        self.buffer = list(
            reversed([
                i + 1 for i, tok in enumerate(tokens) if tok != '<unaligned>'
            ]))
        self.latent = list(
            reversed([
                i + 1 for i, tok in enumerate(tokens) if tok == '<unaligned>'
            ]))

        # init amr
        self.amr = AMR(tokens=tokens)
        for i, tok in enumerate(tokens):
            if tok != "<ROOT>":
                self.amr.nodes[i + 1] = tok
        # add root
        self.buffer[0] = -1
        self.amr.nodes[-1] = "<ROOT>"

        self.new_id = len(tokens) + 1
        self.verbose = verbose
        # parser target output
        self.actions = []
        self.labels = []
        self.labelsA = []
        self.predicates = []

        # information for oracle
        self.merged_tokens = {}
        self.entities = []
        self.is_confirmed = set()
        self.is_confirmed.add(-1)
        self.swapped_words = {}

        if self.verbose:
            print('INIT')
            print(self.printStackBuffer())
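A worked illustration (not from the source) of how the buffer and latent lists come out for a small token sequence; token positions are 1-based and the final <ROOT> slot is re-labelled as node -1, mirroring the __init__ above:

tokens = ['The', 'boy', 'runs', '<unaligned>', '<ROOT>']
buffer = list(reversed([i + 1 for i, tok in enumerate(tokens) if tok != '<unaligned>']))
latent = list(reversed([i + 1 for i, tok in enumerate(tokens) if tok == '<unaligned>']))
buffer[0] = -1   # the <ROOT> position becomes node id -1
print(buffer)    # [-1, 3, 2, 1]
print(latent)    # [4]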
Example No. 6
def main(args):

    # First, let's read the graphs and surface forms
    with open(args.input_amr) as f:
        amrs = f.readlines()
    with open(args.input_surface) as f:
        surfs = f.readlines()

    if args.triples_output is not None:
        triples_out = open(args.triples_output, 'w')

    # Iterate
    anon_surfs = []
    anon_maps = []
    anon_surfs_scope = []
    i = 0
    with open(args.output, 'w') as out, open(args.output_surface,
                                             'w') as surf_out:
        for amr, surf in zip(amrs, surfs):
            graph = AMR(amr, surf.split())

            # Get variable: concept map for reentrancies
            v2c = graph.var2concept()

            if args.mode == 'LIN':
                # Linearisation mode for seq2seq

                tokens = amr.split()
                new_tokens = simplify(tokens, v2c)
                out.write(' '.join(new_tokens) + '\n')

            elif args.mode == 'GRAPH':
                # Triples mode for graph2seq
                #import ipdb; ipdb.set_trace()
                # Get concepts and generate IDs
                v_ids, rev_v_ids = get_nodes2(graph)

                # Triples
                triples = get_triples(graph, v_ids, rev_v_ids)

                # Print concepts/constants and triples
                #cs = [get_name(c) for c in rev_c_ids]
                cs = [get_name(v, v2c) for v in rev_v_ids]
                out.write(' '.join(cs) + '\n')
                triples_out.write(
                    ' '.join(['(' + ','.join(adj) + ')'
                              for adj in triples]) + '\n')

            elif args.mode == 'LINE_GRAPH':
                # Similar to GRAPH, but with edges as extra nodes
                #import ipdb; ipdb.set_trace()
                print(i)
                i += 1
                #if i == 98:
                #    import ipdb; ipdb.set_trace()
                nodes, triples, anon_surf, anon_map, anon_surf_scope = get_line_graph(
                    graph, surf, anon=args.anon)
                out.write(' '.join(nodes) + '\n')
                triples_out.write(
                    ' '.join(['(%d,%d,%s)' % adj for adj in triples]) + '\n')
                #surf = ' '.join(new_surf)
                anon_surfs.append(anon_surf)
                anon_maps.append(json.dumps(anon_map))
                anon_surfs_scope.append(anon_surf_scope)

            # Process the surface form
            surf_out.write(surf.lower())
    if args.anon:
        with open(args.anon_surface, 'w') as f:
            for anon_surf in anon_surfs:
                f.write(anon_surf + '\n')
        with open(args.map_output, 'w') as f:
            for anon_map in anon_maps:
                f.write(anon_map + '\n')
        with open(args.anon_surface_scope, 'w') as f:
            for anon_surf_scope in anon_surfs_scope:
                f.write(anon_surf_scope + '\n')
Example No. 7
#!/usr/bin/env python2.7
#coding=utf-8
'''

@author: Nathan Schneider ([email protected])
@since: 2015-05-06
'''
from __future__ import print_function
import sys, re, fileinput, codecs
from collections import Counter, defaultdict

from amr import AMR, AMRSyntaxError, AMRError, Concept, AMRConstant

c = Counter()
for ln in fileinput.input():
    try:
        a = AMR(ln)
        c.update(map(repr, a.nodes.keys()))    # vars, concepts, constants: count once per AMR
        c.update('.'+repr(x) for _,r,x in a.triples(rel=':instance-of'))  # concepts count once per variable
        c.update(map((lambda x: x[1]), a.triples()))    # relations
        c.update('.'+repr(x) for _,_,x in a.triples() if isinstance(x,AMRConstant))  # constants count once per relation
    except AMRSyntaxError as ex:
        print(ex, file=sys.stderr)
    except AMRError as ex:
        print(ex, file=sys.stderr)
    
for k,n in c.most_common():
    print(k,n, sep='\t')
Example No. 8
            args.p_ctx), str(args.p_proj))
    logging.basicConfig(filename=logfilename,
                        level=logging.INFO,
                        format='%(asctime)s :: %(levelname)s :: %(message)s')
    logging.info('log info to ' + logfilename)

logging.info(args)
if args.dataset == 'amazon':
    ds = ds_amazon(logging, args)
else:
    raise Exception('no dataset' + args.dataset)

if args.model == 'bpr':
    model = BPR(ds, args, logging)
elif args.model == 'cbpr':
    model = CBPR(ds, args, logging)
elif args.model == 'vbpr':
    model = VBPR(ds, args, logging)
elif args.model == 'amr':
    model = AMR(ds, args, logging)
elif args.model == 'mtpr':
    model = MTPR(ds, args, logging)
else:
    raise Exception('unknown model type', args.model)

model.train()

weight_filename = 'weights/%s_%s_%s_%s_%s.npy' % (
    args.dataset, args.model, str(args.p_emb), str(args.p_ctx), str(
        args.p_proj))
model.save(weight_filename)
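For concreteness, with made-up argument values the format string above yields a name like this:

# e.g. args.dataset='amazon', args.model='amr', p_emb=0.1, p_ctx=0.01, p_proj=0.001
# -> 'weights/amazon_amr_0.1_0.01_0.001.npy'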
Example No. 9
def main(arguments):
	parser = argparse.ArgumentParser(
		description=__doc__,
		formatter_class=argparse.ArgumentDefaultsHelpFormatter)
	parser.add_argument('--input_file', help="Path of the file containing AMRs of each sentence", type=str, 
				default='/home/shibhansh/UGP-2/data/LDC2015E86_DEFT_Phase_2_AMR_Annotation_R1/' + \
				'data/amrs/split/test/deft-p2-amr-r1-amrs-test-alignments-proxy.txt')
	parser.add_argument('--dataset', help="Name of dataset",
				type=str, default='')
	parser.add_argument('--display', help="Path of the file containing AMRs of each sentence",
				type=bool, default=False)

	args = parser.parse_args(arguments)

	input_file = args.input_file
	dataset = args.dataset

	'''
	'docs' is a list of 'documents'; each 'document' is a list of dictionaries, one per
	sentence. Each dictionary has keys such as 'alignments', 'amr', 'tok', etc., and under
	each key is the corresponding information: the AMR, the text, the alignments, and so on.
	'''
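	# Illustration only (hypothetical values): under the description above, 'docs'
	# is assumed to look roughly like
	#   docs = [
	#       [   # one document
	#           {'snt-type': 'body', 'tok': 'The boy ran .', 'amr': [...], 'alignments': [...]},
	#           {'snt-type': 'summary', 'tok': 'Boy ran .', 'amr': [...], 'alignments': [...]},
	#       ],
	#   ]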

	# Remove alignments from the new file
	os.system('cp '+ input_file +' auxiliary/temp')
	with codecs.open('auxiliary/temp', 'r') as data_file:
		original_data = data_file.readlines()

	os.system('sed -i \'s/~e.[	0-9]*//g\' auxiliary/temp')
	os.system('sed -i \'s/,[	0-9]*//g\' auxiliary/temp')

	with codecs.open('auxiliary/temp', 'r') as data_file:
		data = data_file.readlines()
	for index_line,line in enumerate(data):
		if line.startswith('#'):
			data[index_line] = original_data[index_line]

	with codecs.open('auxiliary/temp', 'w') as data_file:
		for line in data:
			data_file.write(line)

	input_file = 'auxiliary/temp'

	docs, target_summaries, stories = read_data(input_file)

	os.system('rm auxiliary/temp')
	save_stories(stories,'auxiliary/stories.txt')

	with open('auxiliary/target_summaries.txt','w') as f:
		for summary in target_summaries:
			f.write(tok_to_std_format_convertor(summary)+'\n')
	idf = {}
	with open('auxiliary/'+dataset+'_idf.txt','r') as f:
		idf = pickle.load(f) 

	f = open('auxiliary/predicted_summaries.txt','w')
	summary_sentences_per_story = []
	# currently all the information of a node is stored as a list, changing it to a dictionary
	debug = False
	# 'document_amrs' is the list of document amrs formed after joining nodes and collapsing same entities etc.
	target_summaries_amrs = []
	predicted_summaries_amrs = []
	document_amrs = []
	selected_sents = []
	for index_doc, doc in enumerate(docs):
		current_doc_sent_amr_list = []
		current_target_summary_sent_amr_list = []
		for index_dict, dict_sentence in enumerate(doc):
			if dict_sentence['amr'] != []:
				if dict_sentence['tok'].strip()[-1] != '.': dict_sentence['tok'] = dict_sentence['tok'] + ' .' 
				# Get the AMR class for each sentence using just the text
				if dict_sentence['snt-type'] == 'summary':
					current_target_summary_sent_amr_list.append(AMR(dict_sentence['amr'],
													amr_with_attributes=False,
													text=dict_sentence['tok'],
													alignments=dict_sentence['alignments']))
				if dict_sentence['snt-type'] == 'body':
					docs[index_doc][index_dict]['amr'] = AMR(dict_sentence['amr'],
														amr_with_attributes=False,
														text=dict_sentence['tok'],
														alignments=dict_sentence['alignments'])
					current_doc_sent_amr_list.append(docs[index_doc][index_dict]['amr'])
		# merging the sentence AMRs to form a single AMR
		amr_as_list, document_text, document_alignments,var_to_sent = \
												merge_sentence_amrs(current_doc_sent_amr_list,debug=False)
		new_document_amr = AMR(text_list=amr_as_list,
							text=document_text,
							alignments=document_alignments,
							amr_with_attributes=True,
							var_to_sent=var_to_sent)
		document_amrs.append(new_document_amr)
		target_summaries_amrs.append(current_target_summary_sent_amr_list)

		# number of nodes required in summary

		imp_doc = index_doc
		# imp_doc = 1000
		if imp_doc == 1000:
			# just the first sentence of the story is the summary
			predicted_summaries_amrs.append([current_doc_sent_amr_list[0]])
		if imp_doc == 2000:
			# just the first two sentences of the story are the summary
			predicted_summaries_amrs.append([current_doc_sent_amr_list[0],current_doc_sent_amr_list[1]])
		if imp_doc == 3000:
			# just the first three sentences of the story are the summary
			predicted_summaries_amrs.append([current_doc_sent_amr_list[0],current_doc_sent_amr_list[1]\
												,current_doc_sent_amr_list[2]])
		if imp_doc == -1:
			# all sentences of the story are the summary
			predicted_summaries_amrs.append(current_doc_sent_amr_list)
		if index_doc == imp_doc:
			document_amrs[index_doc], phrases,idf_vars = resolve_coref_doc_AMR(amr=document_amrs[index_doc], 
									resolved=True,story=' '.join(document_amrs[index_doc].text),
									location_of_resolved_story='auxiliary/'+dataset+'_predicted_resolutions.txt',
									location_of_story_in_file=index_doc,
									location_of_resolver='.',
									idf=idf,
									debug=False)

			cn_freq_dict,cn_sent_lists,cn_var_lists=document_amrs[index_doc].get_common_nouns(phrases=phrases)
			idf_vars = document_amrs[index_doc].get_idf_vars(idf_vars=idf_vars,idf=idf)
		
			# range equal to the std_deviation of the summary size in the dataset
			if dataset == '':
				current_summary_nodes = []
				for target_summary_amr in current_target_summary_sent_amr_list:
					current_summary_nodes.extend(target_summary_amr.get_nodes() )

				num_summary_nodes = len(current_summary_nodes)
				range_num_nodes = 0
				range_num_nodes = int((len(document_amrs[index_doc].get_nodes())*4)/100)

			document_amrs[index_doc].get_concept_relation_list(story_index=index_doc,debug=False)

			pr = document_amrs[index_doc].directed_graph.rank_sent_in_degree()

			# rank the nodes with the 'meta_nodes'
			pr = document_amrs[index_doc].directed_graph.rank_with_meta_nodes(var_freq_list=pr,
																			cn_freq_dict=cn_freq_dict,
																			cn_sent_lists=cn_sent_lists,
																			cn_var_dict=cn_var_lists)
			ranks, weights, _ = zip(*pr)
			print ranks
			print weights

			pr = document_amrs[index_doc].directed_graph.add_idf_ranking(var_freq_list=pr,
																		default_idf=5.477,
																		idf_vars=idf_vars,
																		num_vars_to_add=5)

			ranks, weights, _ = zip(*pr)
			print ranks
			print weights

			new_graph = document_amrs[index_doc].directed_graph.construct_greedily_first(ranks=ranks,weights=weights,
							concept_relation_list=document_amrs[index_doc].concept_relation_list,
							use_true_sent_rank=False,num_nodes=num_summary_nodes,range_num_nodes=range_num_nodes)

			# generate AMR from the graphical representation
			new_amr_graph = document_amrs[index_doc].get_AMR_from_directed_graph(sub_graph=new_graph)
			new_amr_graph.print_amr()
			predicted_summaries_amrs.append([new_amr_graph])

	with open('auxiliary/'+dataset+'_eos_stories.txt','w') as f:
		for document_amr in document_amrs:
			f.write(' <eos> '.join(document_amr.text)+'\n')

	f.close()
	with open('auxiliary/num_sent_per_story.txt','w') as f3:
		pickle.dump(summary_sentences_per_story,f3)
	# save document AMR in file
	with open('auxiliary/text_amr.txt','w') as f2:
		f2.write('# :id PROXY_AFP_ENG_20050317_010.10 ::amr-annotator SDL-AMR-09  ::preferred ::snt-type body\n')
		f2.write('# ::snt On 21 March 2005\n')
		f2.write('# ::tok On 21 March 2005\n')
		if imp_doc >= 0 and imp_doc < len(document_amrs):
			for index_node, node in enumerate(document_amrs[imp_doc].amr):
				f2.write('\t'*node['depth']+node['text']+'\n')

	target_summaries_nodes = []
	for target_summary_amrs in target_summaries_amrs:
		current_summary_nodes = []
		for target_summary_amr in target_summary_amrs:
			# current_summary_nodes.extend(target_summary_amr.get_edge_tuples() )
			current_summary_nodes.extend(target_summary_amr.get_nodes() )
		target_summaries_nodes.append(current_summary_nodes)

	target_summary_lengths = [len(i) for i in target_summaries_nodes]
	document_lengths = [len(i.get_nodes()) for i in document_amrs]

	ratios = []
	for i in range(len(document_lengths)):
		ratios.append(float(target_summary_lengths[i]) / document_lengths[i] * 100)

	average_ratio = (float(sum(ratios)) / len(ratios))
	deviations = [abs(ratio - average_ratio) for ratio in ratios]

	mean_deviation = (float(sum(deviations)) / len(deviations))

	# average ratio in 'gold' dataset is 9%, and deviation is 4%
	print 'average_ratio', average_ratio, 'mean_deviation', mean_deviation

	with open('auxiliary/target_summary_nodes.txt','w') as f6:
		for node_list in target_summaries_nodes:
			f6.write(' '.join([node for node in node_list]) + '\n')

	predicted_summaries_nodes = []
	for predicted_summary_amrs in predicted_summaries_amrs:
		current_summary_nodes = []
		for predicted_summary_amr in predicted_summary_amrs:
			# current_summary_nodes.extend(predicted_summary_amr.get_edge_tuples() )
			current_summary_nodes.extend(predicted_summary_amr.get_nodes() )
		predicted_summaries_nodes.append(current_summary_nodes)

	with open('auxiliary/predicted_summary_nodes.txt','w') as f7:
		for node_list in predicted_summaries_nodes:
			f7.write(' '.join([node for node in node_list]) + '\n')
Example No. 10
def main(arguments):
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--input_file', help="Path of the file containing AMRs of each sentence", type=str,
       default='/home/prerna/Documents/thesis_work/LDC2015E86_DEFT_Phase_2_AMR_Annotation_R1/' + \
       'data/amrs/split/test/deft-p2-amr-r1-amrs-test-alignments-proxy.txt')
    parser.add_argument('--dataset',
                        help="Name of dataset",
                        type=str,
                        default='')
    parser.add_argument(
        '--display',
        help="Path of the file containing AMRs of each sentence",
        type=bool,
        default=False)

    args = parser.parse_args(arguments)

    input_file = args.input_file
    dataset = args.dataset

    # '''
    # 'docs' is a list of 'documents'; each 'document' is a list of dictionaries, one per
    # sentence. Each dictionary has keys such as 'alignments', 'amr', 'tok', etc., and under
    # each key is the corresponding information: the AMR, the text, the alignments, and so on.
    # '''

    # Remove alignments from the new file
    os.system('cp ' + input_file + ' auxiliary/temp')
    with codecs.open('auxiliary/temp', 'r') as data_file:
        original_data = data_file.readlines()

    os.system('sed -i \'s/~e.[	0-9]*//g\' auxiliary/temp')
    os.system('sed -i \'s/,[	0-9]*//g\' auxiliary/temp')

    with codecs.open('auxiliary/temp', 'r') as data_file:
        data = data_file.readlines()
    for index_line, line in enumerate(data):
        if line.startswith('#'):
            data[index_line] = original_data[index_line]

    with codecs.open('auxiliary/temp', 'w') as data_file:
        for line in data:
            data_file.write(line)

    input_file = 'auxiliary/temp'

    docs, target_summaries, stories = read_data(input_file)

    os.system('rm auxiliary/temp')
    save_stories(stories, 'auxiliary/stories.txt')

    with open('auxiliary/target_summaries.txt', 'w') as f:
        for summary in target_summaries:
            f.write(tok_to_std_format_convertor(summary) + '\n')

    f = open('auxiliary/predicted_summaries.txt', 'w')
    summary_sentences_per_story = []
    # currently all the information of a node is stored as a list, changing it to a dictionary
    debug = False
    # 'document_amrs' is the list of document amrs formed after joining nodes and collapsing same entities etc.
    target_summaries_amrs = []
    predicted_summaries_amrs = []
    document_amrs = []
    selected_sents = []
    for index_doc, doc in enumerate(docs):
        current_doc_sent_amr_list = []
        current_target_summary_sent_amr_list = []
        for index_dict, dict_sentence in enumerate(doc):
            if dict_sentence['amr'] != []:
                if dict_sentence['tok'].strip()[-1] != '.':
                    dict_sentence['tok'] = dict_sentence['tok'] + ' .'
                # Get the AMR class for each sentence using just the text
                if dict_sentence['snt-type'] == 'summary':
                    current_target_summary_sent_amr_list.append(
                        AMR(dict_sentence['amr'],
                            amr_with_attributes=False,
                            text=dict_sentence['tok'],
                            alignments=dict_sentence['alignments']))
                if dict_sentence['snt-type'] == 'body':
                    docs[index_doc][index_dict]['amr'] = AMR(
                        dict_sentence['amr'],
                        amr_with_attributes=False,
                        text=dict_sentence['tok'],
                        alignments=dict_sentence['alignments'])
                    current_doc_sent_amr_list.append(
                        docs[index_doc][index_dict]['amr'])
        # merging the sentence AMRs to form a single AMR
        amr_as_list, document_text, document_alignments,var_to_sent = \
                  merge_sentence_amrs(current_doc_sent_amr_list,debug=False)
        new_document_amr = AMR(text_list=amr_as_list,
                               text=document_text,
                               alignments=document_alignments,
                               amr_with_attributes=True,
                               var_to_sent=var_to_sent)
        document_amrs.append(new_document_amr)
        target_summaries_amrs.append(current_target_summary_sent_amr_list)
        imp_doc = index_doc
        if imp_doc == 1000:
            # just the first sentence of the story is the summary
            predicted_summaries_amrs.append([current_doc_sent_amr_list[0]])

        print index_doc
        if index_doc == imp_doc:
            document_amrs[index_doc] = resolve_coref_doc_AMR(
                amr=document_amrs[index_doc],
                resolved=True,
                story=' '.join(document_amrs[index_doc].text),
                # location_of_resolved_story='auxiliary/human_corefs.txt',
                location_of_resolved_story='auxiliary/' + dataset +
                '_predicted_resolutions.txt',
                location_of_story_in_file=index_doc,
                location_of_resolver='.',
                debug=False)

            pr = document_amrs[index_doc].directed_graph.rank_sent_in_degree()
            ranks, weights = zip(*pr)
            print ranks
            print weights

            # get pairs in order of importance
            ranked_pairs = document_amrs[index_doc].directed_graph.rank_pairs(
                ranks=ranks, weights=weights, pairs_to_rank=3)
            # print 'ranked_pairs', ranked_pairs
            paths_and_sub_graphs = document_amrs[
                index_doc].directed_graph.max_imp_path(
                    ordered_pairs=ranked_pairs)

            # add method to check no repeated sub_graph
            summary_paths = []
            summary_amrs = []
            summary_amrs_text = []
            for path_and_sub_graph in paths_and_sub_graphs:
                path, sub_graph, sent = path_and_sub_graph

                path_sent_dict = {}
                if sent == -1:
                    path_sent_dict = document_amrs[
                        index_doc].break_path_by_sentences(path=path)
                else:
                    path_sent_dict[sent] = path

                for key in path_sent_dict.keys():
                    temp_path = path_sent_dict[key]

                    # path = document_amrs[index_doc].concept_relation_list.get_concepts_given_path(sent_index=key,path=temp_path)
                    path = -1
                    # key = 0
                    if path == -1:
                        path = document_amrs[index_doc].get_sent_amr(
                            sent_index=key)

                    nodes, sub_graph = document_amrs[
                        index_doc].directed_graph.get_name_path(nodes=path)

                    new_amr_graph = document_amrs[
                        index_doc].get_AMR_from_directed_graph(
                            sub_graph=sub_graph)

                    repeated_path = False
                    # removing repeated sents/amrs
                    for var_set in summary_paths:
                        if set(var_set) == set(nodes): repeated_path = True

                    if repeated_path: continue

                    summary_paths.append(list(nodes))
                    summary_amrs_text.append(
                        new_amr_graph.print_amr(file=f,
                                                print_indices=False,
                                                write_in_file=True,
                                                one_line_output=True,
                                                return_str=True,
                                                to_print=False))
                    print ''
                    summary_amrs.append(new_amr_graph)

            final_summary_amrs_text = []
            final_summary_amrs = []
            for index, path in enumerate(summary_paths):
                indices_to_search_at = range(len(summary_paths))
                indices_to_search_at.remove(index)
                to_print = True
                for index_2 in indices_to_search_at:
                    if set(path) < set(summary_paths[index_2]):
                        to_print = False
                if to_print:
                    final_summary_amrs_text.append(summary_amrs_text[index])
                    final_summary_amrs.append(summary_amrs[index])

            for summary_amr in final_summary_amrs_text:
                try:
                    summary_sentences_per_story[index_doc] += 1
                except:
                    summary_sentences_per_story.append(1)

                print summary_amr

            predicted_summaries_amrs.append(final_summary_amrs)

    with open('auxiliary/' + dataset + '_eos_stories.txt', 'w') as f:
        for document_amr in document_amrs:
            f.write(' <eos> '.join(document_amr.text) + '\n')

    f.close()
    with open('auxiliary/num_sent_per_story.txt', 'w') as f3:
        pickle.dump(summary_sentences_per_story, f3)
    # save document AMR in file
    with open('auxiliary/text_amr.txt', 'w') as f2:
        f2.write(
            '# :id PROXY_AFP_ENG_20050317_010.10 ::amr-annotator SDL-AMR-09  ::preferred ::snt-type body\n'
        )
        f2.write('# ::snt On 21 March 2005\n')
        f2.write('# ::tok On 21 March 2005\n')
        if imp_doc >= 0 and imp_doc < len(document_amrs):
            for index_node, node in enumerate(document_amrs[imp_doc].amr):
                f2.write('\t' * node['depth'] + node['text'] + '\n')

        # an option to generate the graphical representations
        # return document_amrs
    target_summaries_nodes = []
    for target_summary_amrs in target_summaries_amrs:
        current_summary_nodes = []
        for target_summary_amr in target_summary_amrs:
            current_summary_nodes.extend(target_summary_amr.get_nodes())
        target_summaries_nodes.append(current_summary_nodes)

    with open('auxiliary/target_summary_nodes.txt', 'w') as f6:
        for node_list in target_summaries_nodes:
            f6.write(' '.join([node for node in node_list]) + '\n')

    predicted_summaries_nodes = []
    for predicted_summary_amrs in predicted_summaries_amrs:
        current_summary_nodes = []
        for predicted_summary_amr in predicted_summary_amrs:
            current_summary_nodes.extend(predicted_summary_amr.get_nodes())
        predicted_summaries_nodes.append(current_summary_nodes)

    with open('auxiliary/predicted_summary_nodes.txt', 'w') as f7:
        for node_list in predicted_summaries_nodes:
            f7.write(' '.join([node for node in node_list]) + '\n')