def fetch_alignment(basename, langs, backend='hunalign'):
    """Build an Alignment holding one column per entry of `langs`.

    Every entry of `langs` is reduced to its first two characters, and the
    number of distinct two-letter codes decides how the data is obtained:

    * one code: a straight alignment (`Alignment.create_straight`) sized by
      the number of sentences fetched for that language and `len(langs)`;
    * two codes: the file "<basename>/<x>-<y>.<backend>" is loaded; on
      IOError the two codes are swapped and the load is retried once;
    * otherwise: the pl-cu, cu-el and pl-el files are loaded as ladders and
      combined with `merge_3_alignments`.

    In the last two cases the result contains the columns matching `langs`
    (in the order given) followed by the third column of the loaded data.
    """
    assert langs
    # Only the two-letter prefix of each entry identifies the language.
    codes = list(set(entry[:2] for entry in langs))
    how_many = len(codes)

    if how_many == 1:
        sentences = fetch_sentences(basename, codes[0])
        return Alignment.create_straight(len(sentences), len(langs))

    if how_many == 2:
        # The set gives no ordering guarantee, so the file may be named with
        # the codes in either order: try one, then the other.
        try:
            loaded = Alignment.from_file(
                "%s/%s-%s.%s" % (basename, codes[0], codes[1], backend))
        except IOError:
            codes.reverse()
            loaded = Alignment.from_file(
                "%s/%s-%s.%s" % (basename, codes[0], codes[1], backend))
    else:
        # Anything else is handled as the fixed pl / cu / el triple.
        pl_cu = Alignment.from_file('%s/pl-cu.%s' % (basename, backend)).as_ladder()
        cu_el = Alignment.from_file('%s/cu-el.%s' % (basename, backend)).as_ladder()
        pl_el = Alignment.from_file('%s/pl-el.%s' % (basename, backend)).as_ladder()
        # Swap each pair so the third ladder runs el -> pl.
        el_pl = [(right, left) for (left, right) in pl_el]
        loaded = merge_3_alignments(pl_cu, cu_el, el_pl)
        codes = ['pl', 'cu', 'el']  # column order of the merged data

    # Shared tail for the two- and three-language cases.
    columns = _transpose(loaded.data)
    column_for = {}
    for position, code in enumerate(codes):
        column_for[code] = columns[position]

    selected = [column_for[entry[:2]] for entry in langs]
    # The third column of the loaded data is always appended last
    # (presumably the cost column for pairwise files - TODO confirm).
    selected.append(columns[2])
    return Alignment(_transpose(selected))
def extract_bisents(file1, lang1, file2, lang2, alignment_file):
    """Pair up the sentences of two texts according to a stored alignment.

    Both texts are loaded with `Text.from_file`, the alignment with
    `Alignment.from_file`, and the result of `Alignment.as_pairs` applied to
    the two flat sentence lists is returned.

    `lang1` and `lang2` must be two-character strings; this is enforced
    with assertions.
    """
    for code in (lang1, lang2):
        assert isinstance(code, (unicode, str)) and len(code)==2

    first_text = Text.from_file(file1, lang1)
    second_text = Text.from_file(file2, lang2)
    loaded = Alignment.from_file(alignment_file)

    return loaded.as_pairs(
        first_text.as_sentences_flat(),
        second_text.as_sentences_flat(),
    )
def get_alignment(self, langs, backend=None):
    """Return an Alignment with one column per entry of `langs` (like the fetcher).

    Every entry of `langs` is reduced to its first two characters, and the
    number of distinct two-letter codes decides how the data is obtained:

    * one code: a straight alignment built with `Alignment.create_straight`;
    * two codes: the file "<x>-<y>.<backend>" (path resolved by `self._p`)
      is looked up for both orders of the codes and, when `backend` is not
      given, for every entry of `possible_backends`; the first file that
      loads wins;
    * otherwise: the pl-cu, cu-el and pl-el alignments are obtained
      recursively and combined with `merge_3_alignments`.

    In the last two cases the result contains the columns matching `langs`
    (in the order given) followed by the third column of the loaded data.

    Raises:
        IOError: in the two-language case, when no alignment file could be
            loaded for either order of the codes with any candidate backend.
    """
    assert len(langs) >= 2
    assert not backend or backend in possible_backends
    real_langs = list(set(lang[:2] for lang in langs))
    if len(real_langs) == 1:
        # NOTE(review): `basename` is neither a parameter nor a local of this
        # method; unless a module-level name provides it, this branch raises
        # NameError. Confirm which directory/attribute was intended.
        text_len = len(fetch_sentences(basename, real_langs[0]))
        return Alignment.create_straight(text_len, len(langs))
    elif len(real_langs) == 2:
        backends = [backend] if backend else possible_backends
        a = None
        # The set gives no ordering guarantee, so try both "x-y" and "y-x".
        for _ in range(2):
            langs_string = '-'.join(str(l) for l in real_langs)
            for b in backends:
                try:
                    a = Alignment.from_file(self._p(langs_string + '.' + b))
                except IOError:
                    continue
                break
            # Compare against None rather than relying on truthiness, so an
            # alignment that loaded fine but evaluates falsy (e.g. an empty
            # one) is not mistaken for a missing file.
            if a is not None:
                break
            real_langs.reverse()
        if a is None:
            raise IOError(
                'no alignment file found for languages %s (backends tried: %s)'
                % ('-'.join(str(l) for l in real_langs),
                   ', '.join(str(b) for b in backends)))
    else:
        # Anything else is handled as the fixed pl / cu / el triple.
        a1 = self.get_alignment(['pl', 'cu'], backend).as_ladder()
        a2 = self.get_alignment(['cu', 'el'], backend).as_ladder()
        a3 = self.get_alignment(['pl', 'el'], backend).as_ladder()
        # NOTE(review): swapping the pl-el pairs is disabled here
        # (was: a3 = [(b, a) for (a, b) in a3]); confirm that
        # merge_3_alignments does not need the reversed ladder.
        a = merge_3_alignments(a1, a2, a3)
        real_langs = ['pl', 'cu', 'el']  # column order of the merged data

    # Shared tail for the two- and three-language cases.
    columns = _transpose(a.data)
    columns_map = {lang: columns[i] for i, lang in enumerate(real_langs)}
    chosen_columns = [columns_map[lang[:2]] for lang in langs]
    # The third column of the loaded data is always appended last
    # (presumably the cost column for pairwise files - TODO confirm).
    chosen_columns.append(columns[2])
    return Alignment(_transpose(chosen_columns))
# NOTE(review): the header of the enclosing function is outside this view.
# `al12` and `map23` are free variables supplied by that enclosing scope, and
# the `return` statement after `gen` belongs to the enclosing function too.
def gen():
    # Generator of (i1, i2, i3) index triples; the first one is always (0, 0, 0).
    yield (0, 0, 0)
    prev_i2 = 0
    for (i1, i2) in al12:
        # Walk every second index after the previous pair's i2, up to and
        # including the current i2.
        for _i2 in range(prev_i2+1, i2+1):
            try:
                i3s = map23[_i2]
                for i3 in i3s:
                    # Disabled filter, kept from the original:
                    # if map31[i3] == i1:
                    yield (i1, _i2, i3)
            except KeyError:
                # Raised when _i2 is not a key of map23: nothing is emitted for it.
                pass
        prev_i2 = i2
# The triples are materialised into a list and wrapped in an Alignment
# created with no_costs=True.
return Alignment(list(gen()), no_costs=True)

# Script entry point: the first command-line argument is a directory name.
if __name__ == '__main__':
    import sys
    name = sys.argv[1]
    # Load the three pairwise '.my' alignment files as ladders.
    a1 = Alignment.from_file(name + '/pl-cu.my').as_ladder()
    a2 = Alignment.from_file(name + '/cu-el.my').as_ladder()
    a3 = Alignment.from_file(name + '/pl-el.my').as_ladder()
    # Swap each pl-el pair so the third ladder runs el -> pl.
    a3 = [(b, a) for (a, b) in a3]
    ma = merge_3_alignments(a1, a2, a3)
    # Print the merged alignment next to the flat sentence lists of the
    # pl, cu and el texts read from the same directory.
    ma.pretty_print(Text.from_file(name + '/pl.txt', lang='pl').as_sentences_flat(),
                    Text.from_file(name + '/cu.txt', lang='cu').as_sentences_flat(),
                    Text.from_file(name + '/el.txt', lang='el').as_sentences_flat())