Пример #1
0
def tercom_scores_unicode(hypotheses, references):
    """
    Returns a list of TERCOM scores, one per (hypothesis, reference) pair.

    Each pair is written to two temporary UTF-8 files as ``<sentence> (<index>)``,
    then ``java -jar tercom.jar`` is run on them (``tercom.jar`` is a relative
    path, so it is resolved against the current working directory). The scores
    are read back from the ``<prefix>.ter`` report produced by the tool.

    :param hypotheses: iterable of hypothesis sentences
    :param references: iterable of reference sentences, paired with
        ``hypotheses`` by position (pairing stops at the shorter one)
    :return: list of floats, the last space-separated field of every report
        line after the first two
    """
    writer = codecs.getwriter('utf-8')

    # TERCOM appends '.ter' to the prefix given with '-n'.
    filename = mktemp()
    report_filename = filename + '.ter'

    try:
        with NamedTemporaryFile('w') as hypothesis_file, NamedTemporaryFile('w') as reference_file:
            hypothesis_file = writer(hypothesis_file)
            reference_file = writer(reference_file)

            for i, (hypothesis, reference) in enumerate(zip(hypotheses, references)):
                hypothesis_file.write(u'{} ({})\n'.format(hypothesis, i))
                reference_file.write(u'{} ({})\n'.format(reference, i))

            # the external process reads the files by name, so the content
            # must be on disk before it starts
            hypothesis_file.flush()
            reference_file.flush()

            cmd = ['java', '-jar', 'tercom.jar', '-h', hypothesis_file.name, '-r', reference_file.name,
                   '-o', 'ter', '-n', filename]

            # discard the tool's console output; the handle is closed as soon
            # as the process has finished instead of being leaked
            with uopen('/dev/null', 'w') as output:
                call(cmd, stdout=output, stderr=output)

        with uopen(report_filename) as f:
            lines = list(f)

        # the first two lines of the report are skipped (presumably a header)
        return [float(line.split(' ')[-1]) for line in lines[2:]]
    finally:
        # remove the report even when reading or parsing it failed; it may not
        # exist at all if the tool could not run
        if os.path.exists(report_filename):
            os.remove(report_filename)
Пример #2
0
def extract_from_ibm(filename1, filename2):
    """
    Takes two GIZA++ alignment files (first file is source->target, second file is target->source),
    and yields trajectories.

    Every third line of each file (starting with the third one) is parsed with
    `get_pairs`. For each pair of lines, one tuple `(sentence, ops)` is yielded:
    `sentence` is the words of the first file joined by spaces (before any edit),
    and `ops` is a list of operations that are applied in place to those words
    while walking over the words of the second file:
        ('DEL', index)
        ('SUB', index, word)
        ('MOVE', from_index, to_index)
        ('INS', index, word)
        ('STOP',)   always the last element
    """
    with uopen(filename1) as file1, uopen(filename2) as file2:
        # only lines 3, 6, 9, ... of each file are used
        for line1, line2 in zip(islice(file1, 2, None, 3),
                                islice(file2, 2, None, 3)):
            # NOTE(review): get_pairs is defined elsewhere; it is used here as
            # returning (list of index pairs, list of words) -- confirm
            pairs1, words1 = get_pairs(line1.strip())
            pairs2, words2 = get_pairs(line2.strip())
            # flip the pairs of the second file so both lists have the same orientation
            pairs2 = [pair[::-1] for pair in pairs2]
            # captured now, because words1 is modified in place below
            sentence = ' '.join(words1)

            # keep only the links present in both directions
            intersection = list(set(pairs1).intersection(set(pairs2)))
            ops = []
            pairs = intersection[:]
            # sentinel pair that links the two end-of-sentence positions
            pairs.append((len(words1), len(words2)))
            # i is the current position in words2 (and in the edited words1)
            i = 0
            while i < len(words2):
                # first component of a pair whose second component is i (None if there is none)
                x = next((x for x, y in pairs if y == i), None)

                if not any(s == i for s, _ in pairs):
                    # no pair starts at position i: drop that word and shift
                    # the first components located after it; i is not advanced
                    ops.append(('DEL', i))
                    del words1[i]
                    pairs = [(k - int(i < k), l) for k, l in pairs]
                elif x == i:
                    # position i is linked to itself: substitute if the words differ,
                    # then consume the pair and move on
                    if words1[i] != words2[i]:
                        ops.append(('SUB', i, words2[i]))
                    pairs.remove((i, i))
                    words1[i] = words2[i]
                    i += 1
                elif x is not None:
                    # move the word at position x to position i; i is not advanced,
                    # the (i, i) pair created here is handled by the next iteration
                    # TODO: fix
                    ops.append(('MOVE', x, i))
                    words1.insert(i, words1.pop(x))
                    # (x, i) becomes (i, i); first components in [i, x) are shifted by one
                    pairs = [(i, l) if k == x and l == i else
                             (k + int(i <= k < x), l) for k, l in pairs]
                else:
                    # insertion
                    # nothing is linked to position i of words2: insert that word
                    # and shift the first components located at or after i
                    ops.append(('INS', i, words2[i]))
                    words1.insert(i, words2[i])
                    pairs = [(k + int(i <= k), l) for k, l in pairs]
                    i += 1

            ops.append(('STOP', ))

            yield sentence, ops
Пример #3
0
    def read_trajectories(filename):
        """
        Read a trajectory file and return the list of its trajectories.

        A trajectory file contains entries each corresponding to a transition, whose
         fields are delimited by '|||'.
         Transitions contain those fields: id, src, s, s', a, score(s), score(s')
         The first field is the id of this transition's trajectory.

        Consecutive lines with the same id belong to the same trajectory. The
        action field is split on whitespace: for SUB, INS and DEL the second token
        is converted to int, for MOVE all remaining tokens are, STOP is kept as is.

        :param filename: path of the trajectory file
        :return: list of trajectories, each a list of
            (state1, state2, action, score1, score2) tuples
        :raises Exception: if an action is not SUB, INS, DEL, MOVE or STOP
        :raises ValueError: if a line does not have exactly seven fields or a
            score is not a valid float
        """
        trajectories = []

        with uopen(filename) as f:
            current_idx = None
            current_traj = []
            for line in f:
                idx, src, trg1, trg2, action, score1, score2 = line.split(
                    '|||')
                score1, score2 = float(score1), float(score2)

                action = action.split()
                op = action[0]
                if op in ['SUB', 'INS', 'DEL']:
                    action[1] = int(action[1])
                elif op == 'MOVE':
                    action[1:] = map(int, action[1:])
                elif op != 'STOP':
                    raise Exception('Unknown action type')

                action = tuple(action)

                # a state is a pair (source sentence, translation hypothesis)
                # source sentence is the same for all transitions in a given trajectory
                state1 = State(src, trg1)
                state2 = State(src, trg2)
                transition = (state1, state2, action, score1, score2)

                # a different index marks the end of a trajectory
                if current_idx is not None and idx != current_idx:
                    trajectories.append(current_traj)
                    current_traj = []

                current_traj.append(transition)
                current_idx = idx

            # the last trajectory is not followed by a line with a different
            # index, so it has to be stored explicitly (it used to be dropped)
            if current_traj:
                trajectories.append(current_traj)

        return trajectories
Пример #4
0
    def read_trajectories(filename):
        """
        Read a trajectory file and return the list of its trajectories.

        A trajectory file contains entries each corresponding to a transition, whose
         fields are delimited by '|||'.
         Transitions contain those fields: id, src, s, s', a, score(s), score(s')
         The first field is the id of this transition's trajectory.

        Consecutive lines with the same id belong to the same trajectory. The
        action field is split on whitespace: for SUB, INS and DEL the second token
        is converted to int, for MOVE all remaining tokens are, STOP is kept as is.

        :param filename: path of the trajectory file
        :return: list of trajectories, each a list of
            (state1, state2, action, score1, score2) tuples
        :raises Exception: if an action is not SUB, INS, DEL, MOVE or STOP
        :raises ValueError: if a line does not have exactly seven fields or a
            score is not a valid float
        """
        trajectories = []

        with uopen(filename) as f:
            current_idx = None
            current_traj = []
            for line in f:
                idx, src, trg1, trg2, action, score1, score2 = line.split('|||')
                score1, score2 = float(score1), float(score2)

                action = action.split()
                op = action[0]
                if op in ['SUB', 'INS', 'DEL']:
                    action[1] = int(action[1])
                elif op == 'MOVE':
                    action[1:] = map(int, action[1:])
                elif op != 'STOP':
                    raise Exception('Unknown action type')

                action = tuple(action)

                # a state is a pair (source sentence, translation hypothesis)
                # source sentence is the same for all transitions in a given trajectory
                state1 = State(src, trg1)
                state2 = State(src, trg2)
                transition = (state1, state2, action, score1, score2)

                # a different index marks the end of a trajectory
                if current_idx is not None and idx != current_idx:
                    trajectories.append(current_traj)
                    current_traj = []

                current_traj.append(transition)
                current_idx = idx

            # end of file also ends the trajectory being built; without this
            # the final trajectory of the file was silently lost
            if current_traj:
                trajectories.append(current_traj)

        return trajectories
Пример #5
0
def extract_from_ter(ter_filename, src_filename):
    """
    Yields trajectories (lists of operations):
        SUB index word
        DEL index
        INS index word
        MOVE index position
    operations are applied from left to right (indices refer to the current state of the hypothesis)

    The TER file is read as blocks of lines separated by an empty line, and one
    line of the source file is consumed per block. For each block a tuple
    `((src_sent, hyp), ops)` is yielded, where `ops` ends with `('STOP',)`.
    When replaying `ops` on the hypothesis does not give back the reference,
    `ops` is an empty list instead.

    Iteration stops when either file is exhausted.

    :param ter_filename: path of the TERCOM output to parse
    :param src_filename: path of the file with one source sentence per line
    """
    with uopen(ter_filename) as f, uopen(src_filename) as src_file:
        while True:
            # A bare next() would raise StopIteration at the end of the source
            # file; inside a generator this is turned into a RuntimeError
            # (PEP 479), so end the generator explicitly instead.
            src_line = next(src_file, None)
            if src_line is None:
                break
            src_sent = src_line.strip()

            ops = []
            # one block: every line up to (and consuming) the next blank line
            lines = list(takewhile(lambda line: line.strip(), f))
            if not lines:
                break
            ref = re.match(r'Original Ref:\s*(.*?)\n', lines[1]).group(1)
            hyp = re.match(r'Original Hyp:\s*(.*?)\n', lines[2]).group(1)
            hyp_after_shift = re.match(r'Hyp After Shift:\s*(.*?)\n',
                                       lines[3]).group(1)
            align = re.match(r'Alignment:\s*\((.*?)\)', lines[4]).group(1)

            numshifts = int(re.match(r'NumShifts: (\d+)', lines[5]).group(1))
            regex = re.compile(r'\s*\[(\d+), (\d+), .*?/(.*?)\] \(\[(.*?)\]\)')
            shifts = [
                regex.match(lines[6 + n]).groups() for n in range(numshifts)
            ]
            shifts = [(int(i), int(j), int(k), re.sub(r',\s+', ' ', words))
                      for i, j, k, words in shifts]

            # NOTE(review): get_shifts is defined elsewhere; its result is used
            # here as (i, j, k) triples where j - i words are moved -- confirm
            shift_indices = get_shifts(shifts, hyp.split(),
                                       hyp_after_shift.split())

            # a shift is decomposed into one MOVE operation per word
            for i, j, k in shift_indices:
                length = j - i
                for x in range(length):
                    if k >= i:
                        op = ('MOVE', i, k + length - 1)
                    else:
                        op = ('MOVE', i + x, k + x)
                    ops.append(op)

            ref_iter = iter(ref.split())
            hyp_iter = iter(hyp_after_shift.split())
            i = 0
            # every character of the alignment string is one operation
            for op in align:
                # insert and delete are reversed in TERCOM
                if op != 'D':
                    next(hyp_iter)
                if op != 'I':
                    inserted = next(ref_iter)

                if op == 'S':
                    ops.append(('SUB', i, inserted))
                elif op == 'D':
                    ops.append(('INS', i, inserted))
                elif op == 'I':
                    ops.append(('DEL', i))
                    # the word is removed, so the position does not advance
                    i -= 1

                i += 1

            ops.append(('STOP', ))

            # try to reconstruct reference
            state = State(src_sent, hyp)
            for op in ops:
                state = state.transition(op)

            if state.trg != ref:  # in some weird and rare cases (likely due to a bug in TERCOM)
                # empty trajectory (index is skipped in the output)
                yield (src_sent, hyp), []
                continue

            yield (src_sent, hyp), ops