# NOTE(review): extraction residue — this chunk is Python source collapsed onto a
# single physical line (newlines/indentation stripped), so it is NOT valid Python
# as written; restore formatting from the original file before editing.
# Visible logic, in order:
#   * tail of a neighbor-lookup helper: appends the sentinel '*ST*' when
#     direction == 'left', otherwise '*EN*', then returns `neighbor`
#     (the enclosing `def` is outside this view — TODO confirm its signature);
#   * module-level `ed = EditDistance(None)` (EditDistance is a project type,
#     not visible here);
#   * a __main__ script that reads '../web/annotation.txt' (UTF-8), splits it
#     into sentence records on '===', and for each non-empty record builds a
#     Sentence, then for each non-empty line takes the text after the first ':'
#     as `action` and collects '[...]'-bracketed spans (brackets stripped).
#     When `action` is empty it rebuilds S.graphs: one Graph per match, one
#     Node per whitespace token (en_id = token index, lang='en', visible=True).
#   * truncated mid-loop — the Node is constructed but never appended to
#     g.nodes in the visible text; the remainder is outside this chunk.
if direction == 'left': neighbor.append('*ST*') else: neighbor.append('*EN*') return neighbor ed = EditDistance(None) if __name__ == '__main__': # annotation_file = sys.argv[1] annotations = codecs.open('../web/annotation.txt', 'r', 'utf-8').read().strip() sentence_obj_list = [] for s_idx, sentence in enumerate(annotations.split('===')): if sentence.strip() != '': S = Sentence(s_idx, '', '', '') prev_matches = None for sent in sentence.split('\n'): if sent.strip() != '': action = sent.split(':')[1].strip() matches = re.findall(r'\[.*?\]', sent.strip()) matches = [m[1:-1] for m in matches] if action.strip() == '': S.graphs = [] nodes_in_visible_order = [] for m_idx, m in enumerate(matches): g = Graph(m_idx) S.graphs.append(g) for w_idx, w in enumerate(m.split()): n = Node(id=len(g.nodes), s=w, en_id=w_idx, de_id=None, lang='en', visible=True) n.visible = True
# NOTE(review): the line below is a duplicated shingle of the line above
# (same text minus the leading clause, truncated earlier at `s=w,`) — an
# overlap artifact of whatever extracted this chunk, not distinct code.
neighbor.append('*ST*') else: neighbor.append('*EN*') return neighbor ed = EditDistance(None) if __name__ == '__main__': # annotation_file = sys.argv[1] annotations = codecs.open('../web/annotation.txt', 'r', 'utf-8').read().strip() sentence_obj_list = [] for s_idx, sentence in enumerate(annotations.split('===')): if sentence.strip() != '': S = Sentence(s_idx, '', '', '') prev_matches = None for sent in sentence.split('\n'): if sent.strip() != '': action = sent.split(':')[1].strip() matches = re.findall(r'\[.*?\]', sent.strip()) matches = [m[1:-1] for m in matches] if action.strip() == '': S.graphs = [] nodes_in_visible_order = [] for m_idx, m in enumerate(matches): g = Graph(m_idx) S.graphs.append(g) for w_idx, w in enumerate(m.split()): n = Node(id=len(g.nodes), s=w,
# NOTE(review): second collapsed fragment, also not valid Python as written —
# interior of a per-sentence loop whose header (and the definitions of
# `input_line`/`output_line`/`sent_idx`/`VIS_LANG`) lies outside this view.
# Visible logic: parses a phrase-segmented MT output line of the form
#   "phrase | span,wa=alignments | phrase | span,wa=alignments | ..."
# — even-indexed '|' fields are output phrases, odd-indexed fields carry the
# input span ("a-b") and word alignments ("i-j ..." pairs) for the preceding
# phrase. It builds per-phrase input spans, per-span alignment tuples, -1-filled
# token-group arrays, a Sentence object (initial_order_by = VIS_LANG), asserts
# the three span lists are parallel, then iterates spans calling project helpers
# insert_epsilon_edge(...) and make_symmetric(...) (semantics not visible here —
# presumably NULL-alignment insertion and alignment symmetrization; confirm
# against their definitions). Truncated mid inner loop.
sys.stderr.write('SENT' + str(sent_idx) + '\n') input_sent = input_line.strip().split() output_items = output_line.strip().split('|') output_phrases = [oi.strip() for idx, oi in enumerate(output_items) if idx % 2 == 0 and oi.strip() != ''] output_sent = ' '.join(output_phrases).split() output_spans = get_output_phrase_as_spans(output_phrases) output_meta = [tuple(om.split(',wa=')) for idx, om in enumerate(output_items) if idx % 2 != 0] input_spans = [tuple([int(i) for i in om[0].split('-')]) for om in output_meta] wa_per_span = [[tuple([int(i) for i in a.split('-')]) for a in om[1].split()] for om in output_meta] input_tok_group = [-1] * len(input_sent) output_tok_group = [-1] * len(output_sent) sys.stderr.write('input sent:' + ' '.join(input_sent) + '\n') sys.stderr.write('output sent:' + ' '.join(output_sent) + '\n') coe_sentence = Sentence(sent_idx, ' '.join(input_sent), ' '.join(output_sent), None) coe_sentence.initial_order_by = VIS_LANG sent_idx += 1 assert len(wa_per_span) == len(input_spans) == len(output_spans) phrase_dict = {} input_coverage = [0] * len(input_sent) group_idx = 0 for idx, (out_span, inp_span, wa) in enumerate(zip(output_spans, input_spans, wa_per_span)): out_phrase = output_sent[out_span[0]:out_span[1] + 1] inp_phrase = input_sent[inp_span[0]:inp_span[1] + 1] # print '\t phrases:', input_sent[inp_span[0]:inp_span[1] + 1], '-', output_sent[out_span[0]:out_span[1] + 1] # print '\t phrase spans:', inp_span, '-', out_span # print '\twa:', wa wa_no_null = insert_epsilon_edge(wa, input_sent[inp_span[0]:inp_span[1] + 1], output_sent[out_span[0]:out_span[1] + 1]) sym_coverage, sym_wa = make_symmetric(wa_no_null)
# NOTE(review): the line below is a duplicated shingle of the line above
# (same text starting mid `output_meta` comprehension, truncated at the open
# `insert_epsilon_edge(` call) — an overlap artifact, not distinct code.
tuple(om.split(',wa=')) for idx, om in enumerate(output_items) if idx % 2 != 0 ] input_spans = [ tuple([int(i) for i in om[0].split('-')]) for om in output_meta ] wa_per_span = [[ tuple([int(i) for i in a.split('-')]) for a in om[1].split() ] for om in output_meta] input_tok_group = [-1] * len(input_sent) output_tok_group = [-1] * len(output_sent) sys.stderr.write('input sent:' + ' '.join(input_sent) + '\n') sys.stderr.write('output sent:' + ' '.join(output_sent) + '\n') coe_sentence = Sentence(sent_idx, ' '.join(input_sent), ' '.join(output_sent), None) coe_sentence.initial_order_by = VIS_LANG sent_idx += 1 assert len(wa_per_span) == len(input_spans) == len(output_spans) phrase_dict = {} input_coverage = [0] * len(input_sent) group_idx = 0 for idx, (out_span, inp_span, wa) in enumerate(zip(output_spans, input_spans, wa_per_span)): out_phrase = output_sent[out_span[0]:out_span[1] + 1] inp_phrase = input_sent[inp_span[0]:inp_span[1] + 1] # print '\t phrases:', input_sent[inp_span[0]:inp_span[1] + 1], '-', output_sent[out_span[0]:out_span[1] + 1] # print '\t phrase spans:', inp_span, '-', out_span # print '\twa:', wa wa_no_null = insert_epsilon_edge(