def full_trajectories(partial_trajectories):
    """
    Expand (initial state, actions) pairs into one tuple per transition.

    `partial_trajectories` is an iterable of ``((src, trg), actions)`` items.
    For each item a `State` is built from ``src`` and ``trg`` and the actions
    are applied one after another through `State.transition`.

    Yields ``(k, src, trg_before, trg_after, action)`` for every action, where
    ``k`` is the position of the item in `partial_trajectories`, ``src`` and
    ``trg_before`` come from the state the action is applied to, and
    ``trg_after`` is the ``trg`` of the state returned by the transition.
    An item whose action list is empty yields nothing, so its index ``k``
    never appears in the output.
    """
    for index, (pair, steps) in enumerate(partial_trajectories):
        source, target = pair
        current = State(source, target)
        for step in steps:
            successor = current.transition(step)
            yield index, current.src, current.trg, successor.trg, step
            # The next action is applied to the state we just produced.
            current = successor
def extract_from_ter(ter_filename, src_filename):
    """
    Yields trajectories (lists of operations) read from a TERCOM output file:
        SUB index word
        DEL index
        INS index word
        MOVE index position

    operations are applied from left to right (indices refer to the current
    state of the hypothesis). Every trajectory ends with a ('STOP',) operation.

    The TER file is read as records of consecutive non-blank lines separated
    by a blank line. Within a record the first line is ignored, and the next
    lines must be, in order: 'Original Ref:', 'Original Hyp:',
    'Hyp After Shift:', 'Alignment: (...)', 'NumShifts: N', followed by N
    shift lines. One line of `src_filename` is consumed per record.

    Each yielded item is ``((src_sent, hyp), ops)``. When replaying ``ops`` on
    ``State(src_sent, hyp)`` does not reproduce the reference, ``ops`` is
    replaced by an empty list.

    The generator ends at the first empty record (end of the TER file, or two
    consecutive blank lines) or when the source file has no more lines.

    A record that does not match the expected layout raises from the failed
    ``re.match(...)`` (AttributeError on ``None``). An alignment longer than
    the hypothesis or reference exhausts ``next()`` inside this generator,
    which Python 3.7+ reports as RuntimeError.
    """
    # Compiled once: the pattern is the same for every shift line of every record.
    shift_regex = re.compile(r'\s*\[(\d+), (\d+), .*?/(.*?)\] \(\[(.*?)\]\)')

    with uopen(ter_filename) as f, uopen(src_filename) as src_file:
        while True:
            # Read one record first, so that reaching the end of the TER file
            # finishes the generator before the source file is touched.
            lines = list(takewhile(lambda line: line.strip(), f))
            if not lines:
                break

            # A bare next() here would raise StopIteration inside a generator,
            # which Python 3.7+ (PEP 479) turns into RuntimeError.
            src_line = next(src_file, None)
            if src_line is None:
                break
            src_sent = src_line.strip()

            ops = []

            ref = re.match(r'Original Ref:\s*(.*?)\n', lines[1]).group(1)
            hyp = re.match(r'Original Hyp:\s*(.*?)\n', lines[2]).group(1)
            hyp_after_shift = re.match(r'Hyp After Shift:\s*(.*?)\n', lines[3]).group(1)
            align = re.match(r'Alignment:\s*\((.*?)\)', lines[4]).group(1)
            numshifts = int(re.match(r'NumShifts: (\d+)', lines[5]).group(1))

            # Shift lines start right after the 'NumShifts' line (index 6).
            raw_shifts = (
                shift_regex.match(lines[6 + n]).groups()
                for n in range(numshifts)
            )
            shifts = [
                (int(i), int(j), int(k), re.sub(r',\s+', ' ', words))
                for i, j, k, words in raw_shifts
            ]

            shift_indices = get_shifts(shifts, hyp.split(), hyp_after_shift.split())

            # Each shift (i, j, k) is emitted as (j - i) MOVE operations.
            for i, j, k in shift_indices:
                length = j - i
                for x in range(length):
                    if k >= i:
                        # Same operation repeated `length` times.
                        op = ('MOVE', i, k + length - 1)
                    else:
                        # Both indices advance with each emitted operation.
                        op = ('MOVE', i + x, k + x)
                    ops.append(op)

            ref_iter = iter(ref.split())
            # Only advanced to stay in step with the alignment; its words are
            # never used.
            hyp_iter = iter(hyp_after_shift.split())
            i = 0
            for code in align:
                # insert and delete are reversed in TERCOM
                if code != 'D':
                    next(hyp_iter)
                if code != 'I':
                    inserted = next(ref_iter)

                if code == 'S':
                    ops.append(('SUB', i, inserted))
                elif code == 'D':
                    ops.append(('INS', i, inserted))
                elif code == 'I':
                    ops.append(('DEL', i))
                    # A deletion does not advance the index (cancels the
                    # increment below).
                    i -= 1
                # Any other character consumes one word on each side and
                # emits nothing.
                i += 1

            ops.append(('STOP', ))

            # try to reconstruct reference
            state = State(src_sent, hyp)
            for op in ops:
                state = state.transition(op)

            if state.trg != ref:
                # in some weird and rare cases (likely due to a bug in TERCOM)
                # empty trajectory (index is skipped in the output)
                yield (src_sent, hyp), []
                continue

            yield (src_sent, hyp), ops