Exemplo n.º 1
0
def fetch_alignment(basename, langs, backend='hunalign'):
    """Build an Alignment whose columns follow the order of *langs*.

    Only the first two characters of every entry in *langs* are used to
    identify a language, so several entries may map to the same code.

    * One distinct code: a straight alignment is created from the number
      of sentences returned by ``fetch_sentences`` and ``len(langs)``.
    * Two distinct codes: ``<basename>/<x>-<y>.<backend>`` is loaded; if
      that raises IOError the two codes are swapped and the load is tried
      once more (a second IOError propagates).
    * Otherwise: the fixed ``pl-cu``, ``cu-el`` and ``pl-el`` files are
      loaded as ladders and combined with ``merge_3_alignments``.

    For the last two cases the loaded data is transposed into columns, one
    column is picked per entry of *langs*, and the column at index 2 is
    appended before transposing back.
    """
    assert langs
    codes = list({name[:2] for name in langs})
    distinct = len(codes)

    if distinct == 1:
        sentence_count = len(fetch_sentences(basename, codes[0]))
        return Alignment.create_straight(sentence_count, len(langs))

    if distinct == 2:
        try:
            loaded = Alignment.from_file(
                "%s/%s-%s.%s" % (basename, codes[0], codes[1], backend))
        except IOError:
            # The file may be named with the two codes the other way round.
            codes.reverse()
            loaded = Alignment.from_file(
                "%s/%s-%s.%s" % (basename, codes[0], codes[1], backend))
    else:
        # Three languages: merge the three pairwise ladders.
        pl_cu = Alignment.from_file('%s/pl-cu.%s' % (basename, backend)).as_ladder()
        cu_el = Alignment.from_file('%s/cu-el.%s' % (basename, backend)).as_ladder()
        pl_el = Alignment.from_file('%s/pl-el.%s' % (basename, backend)).as_ladder()
        # Swap the members of every pl-el pair before merging.
        el_pl = [(right, left) for (left, right) in pl_el]
        loaded = merge_3_alignments(pl_cu, cu_el, el_pl)
        # Column order of the merged data, needed for the lookup below.
        codes = ['pl', 'cu', 'el']

    columns = _transpose(loaded.data)
    column_by_code = {}
    for position, code in enumerate(codes):
        column_by_code[code] = columns[position]

    # Shared tail for the two- and three-language cases.
    picked = []
    for name in langs:
        picked.append(column_by_code[name[:2]])
    # NOTE(review): index 2 is appended unconditionally; presumably this is
    # a cost column in the two-language case -- confirm for three languages.
    picked.append(columns[2])
    return Alignment(_transpose(picked))
Exemplo n.º 2
0
def extract_bisents(file1, lang1, file2, lang2, alignment_file):
    """Pair up the sentences of two texts according to an alignment file.

    *lang1* and *lang2* must be two-character strings. Both texts are
    loaded with ``Text.from_file``, the alignment with
    ``Alignment.from_file``, and the result of ``as_pairs`` applied to the
    two flat sentence lists is returned.
    """
    text_types = (unicode, str)
    for code in (lang1, lang2):
        assert isinstance(code, text_types) and len(code) == 2

    first_text = Text.from_file(file1, lang1)
    second_text = Text.from_file(file2, lang2)
    loaded = Alignment.from_file(alignment_file)
    return loaded.as_pairs(first_text.as_sentences_flat(),
                           second_text.as_sentences_flat())
Exemplo n.º 3
0
    def get_alignment(self, langs, backend=None):
        """Return an Alignment whose columns follow the order of *langs*.

        Only the first two characters of every entry in *langs* identify a
        language. *backend* selects the alignment file extension; when it
        is falsy every entry of ``possible_backends`` is tried in order.

        * One distinct code: a straight alignment is returned.
        * Two distinct codes: the file ``<x>-<y>.<backend>`` (path built
          with ``self._p``) is looked up for each backend, first with the
          codes in one order and then swapped.
        * Otherwise: the ``pl``/``cu``/``el`` pairwise alignments are
          fetched recursively and combined with ``merge_3_alignments``.

        Raises:
            IOError: if no file could be loaded for a two-language request.
        """
        assert len(langs) >= 2
        assert not backend or backend in possible_backends

        real_langs = list(set(lang[:2]
                              for lang in langs))

        if len(real_langs) == 1:
            # NOTE(review): `basename` is neither a parameter nor a local of
            # this method; unless it exists at module level this branch
            # raises NameError -- confirm what should be passed here.
            text_len = len(fetch_sentences(basename, real_langs[0]))
            return Alignment.create_straight(text_len, len(langs))
        elif len(real_langs) == 2:
            backends = [backend] if backend else possible_backends
            a = None
            for _ in range(2):
                langs_string = '-'.join(str(l) for l in real_langs)
                for b in backends:
                    try:
                        a = Alignment.from_file(self._p(langs_string + '.' + b))
                        break
                    except IOError:
                        continue
                # Compare against None so that a successfully loaded
                # alignment is never mistaken for a missing file, whatever
                # its truth value is.
                if a is not None:
                    break
                # Nothing found: retry with the two codes swapped.
                real_langs.reverse()
            if a is None:
                raise IOError(
                    "no alignment file found for languages %s (backends tried: %s)"
                    % ('-'.join(str(l) for l in real_langs),
                       ', '.join(str(b) for b in backends)))

        else: # len(real_langs) == 3 :(
            # The recursive calls already return columns in the requested
            # order, so none of the ladders needs to be reversed here.
            a1 = self.get_alignment(['pl', 'cu'], backend).as_ladder()
            a2 = self.get_alignment(['cu', 'el'], backend).as_ladder()
            a3 = self.get_alignment(['pl', 'el'], backend).as_ladder()
            a = merge_3_alignments(a1, a2, a3)
            real_langs = ['pl', 'cu', 'el'] # needed later

        columns = _transpose(a.data)
        columns_map = { lang : columns[i]
                        for i, lang in enumerate(real_langs) }

        # common part for 2 and 3
        chosen_columns = [columns_map[lang[:2]] for lang in langs]
        # NOTE(review): index 2 is appended unconditionally; presumably a
        # cost column in the two-language case -- confirm.
        chosen_columns.append(columns[2])
        return Alignment(_transpose(chosen_columns))
Exemplo n.º 4
0
    def gen():
        """Yield the rows of the merged alignment as ``(i1, i2, i3)`` triples.

        ``al12`` and ``map23`` come from the enclosing scope (not defined
        in this generator). The first row is always ``(0, 0, 0)``. After
        that, for every ``(i1, i2)`` pair in ``al12``, each index from just
        past the previous pair's ``i2`` up to and including the current
        ``i2`` is looked up in ``map23``, and one triple is yielded per
        value found there.
        """
        yield (0, 0, 0)
        prev_i2 = 0
        for (i1, i2) in al12:
            # Walk every middle index since the previous pair, inclusive of
            # i2, so indices skipped by al12 are still visited.
            for _i2 in range(prev_i2+1, i2+1):
                try:
                    i3s = map23[_i2]
                    for i3 in i3s:
                        # Disabled consistency check against map31:
                        # if map31[i3] == i1:
                        yield (i1, _i2, i3)
                except KeyError:
                    # No entry in map23 for this index: emit nothing for it.
                    pass
            prev_i2 = i2
    return Alignment(list(gen()), no_costs=True)

if __name__ == '__main__':
    # Command-line driver: merge the three pairwise '.my' alignments found
    # in the directory given as the first argument and pretty-print the
    # result next to the sentences of the three texts.
    import sys

    corpus_dir = sys.argv[1]

    ladder_pl_cu = Alignment.from_file(corpus_dir + '/pl-cu.my').as_ladder()
    ladder_cu_el = Alignment.from_file(corpus_dir + '/cu-el.my').as_ladder()
    ladder_pl_el = Alignment.from_file(corpus_dir + '/pl-el.my').as_ladder()
    # Swap the members of every pl-el pair before merging.
    ladder_el_pl = [(right, left) for (left, right) in ladder_pl_el]

    merged = merge_3_alignments(ladder_pl_cu, ladder_cu_el, ladder_el_pl)

    merged.pretty_print(
        Text.from_file(corpus_dir + '/pl.txt', lang='pl').as_sentences_flat(),
        Text.from_file(corpus_dir + '/cu.txt', lang='cu').as_sentences_flat(),
        Text.from_file(corpus_dir + '/el.txt', lang='el').as_sentences_flat())