예제 #1
0
def get_alignments(src_file,
                   tg_file,
                   trained_model=None,
                   src_train='',
                   tg_train='',
                   align_model='align_model',
                   label='alignments'):
    """Align two parallel files line by line and store the result on disk.

    The output file is named ``<src_file>_<basename of tg_file>.aligned`` and
    receives one line per sentence pair, containing whatever
    ``Aligner.align`` returns for ``"<source> ||| <target>"``.

    :param src_file: path to the source-language file (UTF-8, one sentence
        per line).
    :param tg_file: path to the target-language file, parallel to
        ``src_file``.
    :param trained_model: prefix of an existing alignment model; the files
        ``<prefix>.fwd_params``, ``.fwd_err``, ``.rev_params`` and
        ``.rev_err`` are passed to ``Aligner``. If ``None``, a model is
        trained with ``train_alignments``.
    :param src_train: source training file, used only when training.
    :param tg_train: target training file, used only when training.
    :param align_model: model prefix handed to ``train_alignments``.
    :param label: label returned alongside the output path.
    :returns: ``(label, align_file)`` on success, or ``[]`` when no model was
        supplied and training returned an empty prefix.
    """
    if trained_model is None:
        trained_model = train_alignments(src_train, tg_train, align_model)
        if trained_model == '':
            sys.stderr.write('No alignment model trained\n')
            return []

    align_file = src_file + '_' + os.path.basename(tg_file) + '.aligned'
    aligner = Aligner(trained_model + '.fwd_params',
                      trained_model + '.fwd_err',
                      trained_model + '.rev_params',
                      trained_model + '.rev_err')
    try:
        # All three files are closed even if alignment fails half-way.
        with open(src_file) as src, open(tg_file) as tg, \
                open(align_file, 'w') as aligned:
            for src_line, tg_line in zip(src, tg):
                # rstrip('\n') instead of [:-1]: a final line without a
                # trailing newline must not lose its last character.
                sentence_pair = (src_line.rstrip('\n').decode('utf-8') +
                                 u' ||| ' +
                                 tg_line.rstrip('\n').decode('utf-8'))
                aligned.write(aligner.align(sentence_pair) + u'\n')
    finally:
        # Always release the aligner, including on error.
        aligner.close()

    return (label, align_file)
 def get_alignments(self, src, tg, align_model):
     """Compute word alignments for already tokenized sentence pairs.

     :param src: sequence of source sentences, each a list of tokens.
     :param tg: sequence of target sentences, each a list of tokens.
     :param align_model: alignment model prefix; ``.fwd_params``,
         ``.fwd_err``, ``.rev_params`` and ``.rev_err`` are appended to it
         to build the ``Aligner``.
     :returns: a list with one entry per target sentence; each entry holds
         one list per target token with the indices (ints) of the source
         tokens aligned to it.
     """
     # One empty list per target token of every target sentence.
     alignments = [[[] for _ in tg_sentence] for tg_sentence in tg]
     aligner = Aligner(align_model + '.fwd_params', align_model + '.fwd_err',
                       align_model + '.rev_params', align_model + '.rev_err')
     try:
         for idx, (src_list, tg_list) in enumerate(zip(src, tg)):
             align_string = aligner.align(
                 ' '.join(src_list) + ' ||| ' + ' '.join(tg_list))
             # The aligner output is parsed as whitespace-separated
             # "<source index>-<target index>" pairs.
             for p_str in align_string.split():
                 p = p_str.split('-')
                 alignments[idx][int(p[1])].append(int(p[0]))
     finally:
         # Release the aligner even if alignment or parsing fails.
         aligner.close()
     return alignments
예제 #3
0
def align_files(src_file, tg_file, align_model, align_file):
    """Align two parallel files and write the alignments to a new file.

    Each pair of lines is sent to the aligner as ``"<source> ||| <target>"``
    and the returned string is written as one line of ``align_file``.

    :param src_file: path to the source file (UTF-8, one sentence per line).
    :param tg_file: path to the target file, parallel to ``src_file``.
    :param align_model: alignment model prefix; ``.fwd_params``,
        ``.fwd_err``, ``.rev_params`` and ``.rev_err`` are appended to it.
    :param align_file: path of the new file that stores the alignments.
    """
    aligner = Aligner(align_model + '.fwd_params', align_model + '.fwd_err',
                      align_model + '.rev_params', align_model + '.rev_err')
    try:
        # The inputs used to be opened inline inside zip() and never closed.
        with open(src_file) as src, open(tg_file) as tg, \
                open(align_file, 'w') as align_out:
            for src_line, tg_line in zip(src, tg):
                # Strip only the newline: [:-1] would cut a real character
                # from a last line that has no trailing newline.
                sentence_pair = (src_line.rstrip('\n').decode('utf-8') +
                                 u' ||| ' +
                                 tg_line.rstrip('\n').decode('utf-8'))
                align_out.write('%s\n' % aligner.align(sentence_pair))
    finally:
        # Release the aligner even when alignment fails.
        aligner.close()
예제 #4
0
def align_files(src_file, tg_file, align_model, align_file):
    """Write the alignments of two parallel files into ``align_file``.

    For every pair of lines the string ``"<source> ||| <target>"`` is passed
    to ``Aligner.align`` and the result is written on its own line.

    :param src_file: path to the source file (UTF-8, one sentence per line).
    :param tg_file: path to the target file, parallel to ``src_file``.
    :param align_model: alignment model prefix (the ``.fwd_params``,
        ``.fwd_err``, ``.rev_params`` and ``.rev_err`` files are derived
        from it).
    :param align_file: path of the new file to store the alignments in.
    """
    aligner = Aligner(align_model + '.fwd_params', align_model + '.fwd_err',
                      align_model + '.rev_params', align_model + '.rev_err')
    try:
        with open(align_file, 'w') as align_out:
            # Context managers close both inputs, which were leaked before.
            with open(src_file) as src, open(tg_file) as tg:
                for src_line, tg_line in zip(src, tg):
                    # rstrip('\n') keeps the last character of a final
                    # line that lacks a trailing newline ([:-1] did not).
                    src_text = src_line.rstrip('\n').decode('utf-8')
                    tg_text = tg_line.rstrip('\n').decode('utf-8')
                    align_out.write(
                        '%s\n' % aligner.align(src_text + u' ||| ' + tg_text))
    finally:
        # The aligner must be closed on success and on failure alike.
        aligner.close()
예제 #5
0
def align_sentence(src_line, tg_line, align_model):
    """Align a single tokenized sentence pair.

    :param src_line: list of source tokens.
    :param tg_line: list of target tokens.
    :param align_model: alignment model prefix; ``.fwd_params``,
        ``.fwd_err``, ``.rev_params`` and ``.rev_err`` are appended to it.
    :returns: a list with one entry per target token, each entry being the
        list of source token indices (ints) aligned to that token.
    """
    # TODO: a sentence cannot be aligned when src_line or tg_line is missing;
    # raise an error prompting the user to specify another dataset or
    # context creator instead of failing further down.

    cur_alignments = [[] for _ in tg_line]

    aligner = Aligner(align_model + '.fwd_params', align_model + '.fwd_err',
                      align_model + '.rev_params', align_model + '.rev_err')
    try:
        align_str = aligner.align(
            ' '.join(src_line) + u' ||| ' + ' '.join(tg_line))
        # The aligner output is parsed as whitespace-separated
        # "<source index>-<target index>" pairs.
        for pair in align_str.split():
            pair = pair.split('-')
            cur_alignments[int(pair[1])].append(int(pair[0]))
    finally:
        # Release the aligner even if alignment or parsing raises.
        aligner.close()

    return cur_alignments
예제 #6
0
def force_alignments(src_file, tg_file, trained_model):
    """Align two parallel files with an existing model and return the result.

    :param src_file: path to the source file (UTF-8, one sentence per line).
    :param tg_file: path to the target file, parallel to ``src_file``.
    :param trained_model: alignment model prefix; ``.fwd_params``,
        ``.fwd_err``, ``.rev_params`` and ``.rev_err`` are appended to it.
    :returns: a list with one entry per sentence pair. Each entry has one
        list per whitespace-separated target token, holding the aligned
        source indices. The indices are kept as strings, exactly as they
        appear in the aligner output.
    """
    alignments = []
    aligner = Aligner(trained_model + '.fwd_params',
                      trained_model + '.fwd_err',
                      trained_model + '.rev_params',
                      trained_model + '.rev_err')
    try:
        # Both files are closed even if alignment fails half-way.
        with open(src_file) as src, open(tg_file) as tg:
            for src_line, tg_line in zip(src, tg):
                # rstrip('\n') instead of [:-1]: do not cut a character off
                # a final line that has no trailing newline.
                align_str = aligner.align(
                    src_line.rstrip('\n').decode('utf-8') + u' ||| ' +
                    tg_line.rstrip('\n').decode('utf-8'))
                cur_alignments = [[] for _ in tg_line.split()]
                # Parsed as "<source index>-<target index>" pairs.
                for pair in align_str.split():
                    pair = pair.split('-')
                    cur_alignments[int(pair[1])].append(pair[0])
                alignments.append(cur_alignments)
    finally:
        # Always release the aligner, including on error.
        aligner.close()

    return alignments
예제 #7
0
def get_alignments(src_file, tg_file, trained_model=None, src_train='', tg_train='', align_model='align_model', label='alignments'):
    """Align two parallel files and write the alignments next to the source.

    The result is written to ``<src_file>_<basename of tg_file>.aligned``,
    one line per sentence pair, each line being the value returned by
    ``Aligner.align`` for ``"<source> ||| <target>"``.

    :param src_file: path to the source file (UTF-8, one sentence per line).
    :param tg_file: path to the target file, parallel to ``src_file``.
    :param trained_model: prefix of an existing alignment model, or ``None``
        to train one with ``train_alignments``.
    :param src_train: source training file, used only when training.
    :param tg_train: target training file, used only when training.
    :param align_model: model prefix handed to ``train_alignments``.
    :param label: label returned together with the output path.
    :returns: ``(label, align_file)``, or ``[]`` if no model was given and
        training produced an empty prefix.
    """
    if trained_model is None:
        trained_model = train_alignments(src_train, tg_train, align_model)
        if trained_model == '':
            sys.stderr.write('No alignment model trained\n')
            return []

    align_file = src_file + '_' + os.path.basename(tg_file) + '.aligned'
    aligner = Aligner(trained_model + '.fwd_params', trained_model + '.fwd_err',
                      trained_model + '.rev_params', trained_model + '.rev_err')
    try:
        # The source and target handles were never closed before; the
        # context managers close all three files, also on error.
        with open(src_file) as src, open(tg_file) as tg, \
                open(align_file, 'w') as aligned:
            for src_line, tg_line in zip(src, tg):
                # Strip just the newline so an unterminated last line keeps
                # its final character ([:-1] dropped it).
                src_text = src_line.rstrip('\n').decode('utf-8')
                tg_text = tg_line.rstrip('\n').decode('utf-8')
                aligned.write(
                    aligner.align(src_text + u' ||| ' + tg_text) + u'\n')
    finally:
        # Release the aligner whether or not alignment succeeded.
        aligner.close()

    return (label, align_file)
예제 #8
0
def align_sentence(src_line, tg_line, align_model):
    """Return the word alignment of one tokenized sentence pair.

    :param src_line: list of source tokens.
    :param tg_line: list of target tokens.
    :param align_model: alignment model prefix (the ``.fwd_params``,
        ``.fwd_err``, ``.rev_params`` and ``.rev_err`` files are derived
        from it).
    :returns: one list per target token containing the indices (ints) of
        the source tokens aligned to it.
    """
    # TODO: there is an error here if one or both fields are missing -- a
    # sentence cannot be aligned without both src_line and tg_line. Raise an
    # error prompting the user to specify another dataset or context creator.

    cur_alignments = [[] for _ in tg_line]

    aligner = Aligner(align_model + '.fwd_params', align_model + '.fwd_err',
                      align_model + '.rev_params', align_model + '.rev_err')
    try:
        sentence_pair = ' '.join(src_line) + u' ||| ' + ' '.join(tg_line)
        # Parse the aligner's return value: whitespace-separated
        # "<source index>-<target index>" pairs.
        for pair in aligner.align(sentence_pair).split():
            pair = pair.split('-')
            cur_alignments[int(pair[1])].append(int(pair[0]))
    finally:
        # Close the aligner even when alignment or parsing fails.
        aligner.close()

    return cur_alignments
예제 #9
0
def force_alignments(src_file, tg_file, trained_model):
    """Align every line pair of two parallel files with a trained model.

    :param src_file: path to the source file (UTF-8, one sentence per line).
    :param tg_file: path to the target file, parallel to ``src_file``.
    :param trained_model: alignment model prefix (the ``.fwd_params``,
        ``.fwd_err``, ``.rev_params`` and ``.rev_err`` files are derived
        from it).
    :returns: list of per-sentence alignments; each is a list with one entry
        per whitespace-separated target token, holding the source indices
        aligned to it. Indices are left as strings, as found in the aligner
        output.
    """
    alignments = []
    aligner = Aligner(trained_model + '.fwd_params',
                      trained_model + '.fwd_err',
                      trained_model + '.rev_params',
                      trained_model + '.rev_err')
    try:
        with open(src_file) as src:
            with open(tg_file) as tg:
                for src_line, tg_line in zip(src, tg):
                    # Only the newline is removed; [:-1] would also eat the
                    # last character of an unterminated final line.
                    src_text = src_line.rstrip('\n').decode('utf-8')
                    tg_text = tg_line.rstrip('\n').decode('utf-8')
                    align_str = aligner.align(src_text + u' ||| ' + tg_text)
                    cur_alignments = [[] for _ in tg_line.split()]
                    # Pairs are parsed as "<source index>-<target index>".
                    for pair in align_str.split():
                        pair = pair.split('-')
                        cur_alignments[int(pair[1])].append(pair[0])
                    alignments.append(cur_alignments)
    finally:
        # Files are closed by the with-blocks; the aligner is released here
        # on both the success and the error path.
        aligner.close()

    return alignments