示例#1
0
文件: align.py 项目: filipok/eunlp
def jsalign_with_error(texts, s_lang, t_lang, note, align_file):
    """
    Write a manual-alignment HTML page for a text pair without aligning it.

    Both texts are converted to lists with ``convert.file_to_list`` and
    passed through ``text_sent_splitter`` with the language's sentence
    splitter. The two lists are then rendered with ``convert.jsalign_table``
    using a placeholder ``'none'`` tag for every row, and the result is
    written to ``align_file + '_manual.html'`` as UTF-8.

    :param texts: pair of texts; ``texts[0]`` is the source, ``texts[1]``
        the target
    :type texts: list
    :param s_lang: source language code
    :type s_lang: str
    :param t_lang: target language code
    :type t_lang: str
    :param note: document identifier; its first character is compared with
        ``'6'`` to decide whether French quote handling applies
    :type note: str
    :param align_file: output path prefix (``'_manual.html'`` is appended)
    :type align_file: str
    :rtype: None
    """
    # NOTE(review): a leading '6' in ``note`` presumably marks a document
    # type that needs special French quote handling -- confirm with callers.
    needs_french_quotes = note[0] == '6'

    source_list = convert.file_to_list(texts[0], s_lang)
    s_sentence_splitter = util.sentence_splitter(s_lang)
    source_list = text_sent_splitter(source_list, s_sentence_splitter)
    if s_lang == 'fr' and needs_french_quotes:
        # Return value is ignored, so this presumably edits the list in place.
        french_quotes(source_list)

    target_list = convert.file_to_list(texts[1], t_lang)
    t_sentence_splitter = util.sentence_splitter(t_lang)
    target_list = text_sent_splitter(target_list, t_sentence_splitter)
    # Compare against the string '6', exactly as for the source side; the
    # previous comparison with the integer 6 could never match a str note.
    if t_lang == 'fr' and needs_french_quotes:
        french_quotes(target_list)

    # One placeholder tag per row of the longer list.
    tag_list = ['none'] * max(len(source_list), len(target_list))
    jsalign = convert.jsalign_table(source_list, target_list, tag_list, s_lang,
                                    t_lang, note)
    with codecs.open(align_file + '_manual.html', 'w', 'utf-8') as fout:
        fout.write(jsalign)
示例#2
0
文件: tests.py 项目: filipok/eunlp
 def test_file_to_list_tries_3(self):
     """
     Check ``convert.file_to_list(text, 'ro', 3)`` on whitespace-heavy input.

     The expected output keeps only 'non-breaking space' and
     'Another line!': the no-break space and the whitespace runs show up
     as single spaces, while the 'u', 'uu', 'uuu' lines and the blank
     lines are absent.
     """
     # Not testing numbering_separator here.
     expected = ['non-breaking space', 'Another line!']
     raw = (' \t\nnon-breaking'
            u"\u00A0"
            'space \n  \t  '
            'u\n'
            'uu\n'
            'uuu\n'
            '    Another  line!\n \n \n \n \n \n \n \n \n \n ')
     actual = convert.file_to_list(raw, 'ro', 3)
     self.assertEqual(expected, actual)
示例#3
0
文件: align.py 项目: filipok/eunlp
def smart_aligner(texts, s_lang, t_lang, dictionary,
                  align_file, note, over=True, para_size=PARA_MAX,
                  para_size_small=PARA_MIN, make_dic=True, compress=False):
    """
    Align one source/target text pair and write the results to disk.

    On success two files are written: ``align_file + '.tmx'`` (output of
    ``convert.tab_to_tmx``) and ``align_file + '_manual.html'`` (output of
    ``convert.jsalign_table``). If the two lists returned by
    ``convert.file_to_list`` differ in length, or a ``StopIteration``
    escapes the alignment step, an error is logged and
    ``jsalign_with_error`` writes the HTML page instead.

    :param texts: pair of texts; ``texts[0]`` is the source, ``texts[1]``
        the target
    :type texts: list
    :param s_lang: source language code
    :type s_lang: str
    :param t_lang: target language code
    :type t_lang: str
    :param dictionary: forwarded unchanged to ``parallel_aligner``
    :type dictionary: str
    :param align_file: output path prefix for the generated files
    :type align_file: str
    :param note: document identifier used in log messages and forwarded to
        the conversion helpers
    :type note: str
    :param over: when false, do nothing if the ``.tmx``, ``_manual.html``
        or ``.tmx.gz`` output already exists
    :type over: bool
    :param para_size: forwarded to ``parallel_aligner``
    :type para_size: int
    :param para_size_small: forwarded to ``parallel_aligner``
    :type para_size_small: int
    :param make_dic: forwarded to ``parallel_aligner``
    :type make_dic: bool
    :param compress: when true, run ``convert.gzipper`` on both outputs
    :type compress: bool
    :rtype: None
    """
    tmx_path = align_file + '.tmx'
    html_path = align_file + '_manual.html'

    if not over:
        # Any one of these files means the pair was handled on an earlier run.
        previous_outputs = (tmx_path, html_path, tmx_path + '.gz')
        if any(os.path.isfile(path) for path in previous_outputs):
            logging.warning("File pair already aligned: %s", align_file)
            return

    src_items = convert.file_to_list(texts[0], s_lang)
    tgt_items = convert.file_to_list(texts[1], t_lang)

    # The aligner is only attempted on lists of equal length.
    if len(src_items) != len(tgt_items):
        logging.error('Smart alignment failed in %s: %s-%s', note,
                      s_lang, t_lang)
        jsalign_with_error(texts, s_lang, t_lang, note, align_file)
        return

    try:
        tab_file = parallel_aligner(src_items, tgt_items, s_lang, t_lang,
                                    dictionary, para_size=para_size,
                                    para_size_small=para_size_small,
                                    note=note, make_dic=make_dic)

        # TMX output.
        tmx_text = convert.tab_to_tmx(tab_file, s_lang, t_lang, note)
        with codecs.open(tmx_path, "w", "utf-8") as tmx_out:
            tmx_out.write(tmx_text)

        # HTML page for manual review of the alignment.
        src_segs, tgt_segs, tags = convert.tab_to_separate(tab_file)
        html_text = convert.jsalign_table(src_segs, tgt_segs, tags,
                                          s_lang, t_lang, note)
        with codecs.open(html_path, 'w', 'utf-8') as html_out:
            html_out.write(html_text)

        if compress:
            for produced in (tmx_path, html_path):
                convert.gzipper(produced)
    except StopIteration:
        logging.error('StopIteration in %s -> %s, %s', note, s_lang, t_lang)
        jsalign_with_error(texts, s_lang, t_lang, note, align_file)