def get_alignments(src_file, tg_file, trained_model=None, src_train='', tg_train='', align_model='align_model', label='alignments'):
    """Align two parallel files, writing one alignment string per line to a new file.

    :param src_file: path to the source-language file
    :param tg_file: path to the target-language file
    :param trained_model: prefix of an existing alignment model; when None,
        a model is trained from ``src_train``/``tg_train`` first
    :param src_train: source-side training file (used only when training)
    :param tg_train: target-side training file (used only when training)
    :param align_model: prefix under which a newly trained model is saved
    :param label: tag returned alongside the output file name
    :return: ``(label, align_file)`` on success, ``[]`` if no model could be trained
    """
    if trained_model is None:
        trained_model = train_alignments(src_train, tg_train, align_model)
    if trained_model == '':
        sys.stderr.write('No alignment model trained\n')
        return []
    aligner = Aligner(trained_model + '.fwd_params',
                      trained_model + '.fwd_err',
                      trained_model + '.rev_params',
                      trained_model + '.rev_err')
    align_file = src_file + '_' + os.path.basename(tg_file) + '.aligned'
    src = open(src_file)
    tg = open(tg_file)
    aligned = open(align_file, 'w')
    try:
        for src_line, tg_line in zip(src, tg):
            # strip the trailing newline from each side before aligning
            aligned.write(aligner.align(src_line[:-1].decode('utf-8') + u' ||| ' + tg_line[:-1].decode('utf-8')) + u'\n')
    finally:
        # the original leaked the src/tg handles and could leak aligned/aligner
        # if align() raised; close everything deterministically
        src.close()
        tg.close()
        aligned.close()
        aligner.close()
    return (label, align_file)
def get_alignments(self, src, tg, align_model):
    """Collect, for every sentence pair, the source token indices aligned
    to each target token.

    :param src: tokenized source sentences (list of token lists)
    :param tg: tokenized target sentences (list of token lists)
    :param align_model: alignment model prefix
    :return: nested list where ``result[s][t]`` is the list of source
        indices aligned to target token ``t`` of sentence ``s``
    """
    result = [[[] for _ in sentence] for sentence in tg]
    aligner = Aligner(align_model + '.fwd_params',
                      align_model + '.fwd_err',
                      align_model + '.rev_params',
                      align_model + '.rev_err')
    for sent_idx, (src_tokens, tg_tokens) in enumerate(zip(src, tg)):
        query = ' '.join(src_tokens) + ' ||| ' + ' '.join(tg_tokens)
        # each link has the form "src_idx-tg_idx"
        for link in aligner.align(query).split():
            indices = link.split('-')
            result[sent_idx][int(indices[1])].append(int(indices[0]))
    aligner.close()
    return result
def align_files(src_file, tg_file, align_model, align_file):
    '''Align two parallel files and put the alignments in a new file.

    :param src_file: path to the source-language file
    :param tg_file: path to the target-language file
    :param align_model: alignment model prefix
    :param align_file: new file to store the alignments
    '''
    aligner = Aligner(align_model + '.fwd_params',
                      align_model + '.fwd_err',
                      align_model + '.rev_params',
                      align_model + '.rev_err')
    src = open(src_file)
    tg = open(tg_file)
    align_out = open(align_file, 'w')
    try:
        for src_line, tg_line in zip(src, tg):
            # strip the trailing newline from each side before aligning
            align_out.write('%s\n' % aligner.align(src_line[:-1].decode('utf-8') + u' ||| ' + tg_line[:-1].decode('utf-8')))
    finally:
        # the original never closed the two input handles and leaked
        # align_out/aligner on exception; close everything deterministically
        src.close()
        tg.close()
        align_out.close()
        aligner.close()
def align_files(src_file, tg_file, align_model, align_file):
    '''Align two parallel files and put the alignments in a new file.

    :param src_file: path to the source-language file
    :param tg_file: path to the target-language file
    :param align_model: alignment model prefix
    :param align_file: new file to store the alignments
    '''
    aligner = Aligner(align_model + '.fwd_params',
                      align_model + '.fwd_err',
                      align_model + '.rev_params',
                      align_model + '.rev_err')
    src_in = open(src_file)
    tg_in = open(tg_file)
    align_out = open(align_file, 'w')
    try:
        for src_line, tg_line in zip(src_in, tg_in):
            # drop each line's trailing newline before handing it to the aligner
            pair = src_line[:-1].decode('utf-8') + u' ||| ' + tg_line[:-1].decode('utf-8')
            align_out.write('%s\n' % aligner.align(pair))
    finally:
        # fix: the input handles were opened inline and never closed, and the
        # output/aligner leaked when align() raised mid-loop
        src_in.close()
        tg_in.close()
        align_out.close()
        aligner.close()
def align_sentence(src_line, tg_line, align_model):
    """Align a single tokenized sentence pair.

    :param src_line: source sentence as a list of tokens
    :param tg_line: target sentence as a list of tokens
    :param align_model: alignment model prefix
    :return: list where entry ``t`` holds the source token indices aligned
        to target token ``t``
    """
    # TODO: there is an error here if one or both fields are missing -- we
    # cannot align a sentence without both src_line and tg_line; throw an
    # error prompting the user to specify another dataset or context creator
    # if not src_line or not tg_line:
    alignments = [[] for _ in tg_line]
    aligner = Aligner(align_model + '.fwd_params',
                      align_model + '.fwd_err',
                      align_model + '.rev_params',
                      align_model + '.rev_err')
    query = ' '.join(src_line) + u' ||| ' + ' '.join(tg_line)
    # the aligner returns whitespace-separated "src_idx-tg_idx" links
    for link in aligner.align(query).split():
        indices = link.split('-')
        alignments[int(indices[1])].append(int(indices[0]))
    aligner.close()
    return alignments
def force_alignments(src_file, tg_file, trained_model):
    """Align two parallel files with a pre-trained model.

    :param src_file: path to the source-language file
    :param tg_file: path to the target-language file
    :param trained_model: alignment model prefix
    :return: per-sentence list where ``[s][t]`` is the list of source token
        indices (ints) aligned to target token ``t``
    """
    alignments = []
    aligner = Aligner(trained_model + '.fwd_params',
                      trained_model + '.fwd_err',
                      trained_model + '.rev_params',
                      trained_model + '.rev_err')
    src = open(src_file)
    tg = open(tg_file)
    try:
        for src_line, tg_line in zip(src, tg):
            align_str = aligner.align(src_line[:-1].decode('utf-8') + u' ||| ' + tg_line[:-1].decode('utf-8'))
            cur_alignments = [[] for i in range(len(tg_line.split()))]
            for pair in align_str.split():
                pair = pair.split('-')
                # store indices as ints, consistent with align_sentence and
                # get_alignments (the original appended the raw string here)
                cur_alignments[int(pair[1])].append(int(pair[0]))
            alignments.append(cur_alignments)
    finally:
        # close handles even if align() raises mid-loop
        src.close()
        tg.close()
        aligner.close()
    return alignments
def get_alignments(src_file, tg_file, trained_model=None, src_train='', tg_train='', align_model='align_model', label='alignments'):
    """Align two parallel files, writing one alignment string per line to a new file.

    :param src_file: path to the source-language file
    :param tg_file: path to the target-language file
    :param trained_model: prefix of an existing alignment model; when None,
        a model is trained from ``src_train``/``tg_train`` first
    :param src_train: source-side training file (used only when training)
    :param tg_train: target-side training file (used only when training)
    :param align_model: prefix under which a newly trained model is saved
    :param label: tag returned alongside the output file name
    :return: ``(label, align_file)`` on success, ``[]`` if no model could be trained
    """
    if trained_model is None:
        trained_model = train_alignments(src_train, tg_train, align_model)
    if trained_model == '':
        sys.stderr.write('No alignment model trained\n')
        return []
    aligner = Aligner(trained_model + '.fwd_params',
                      trained_model + '.fwd_err',
                      trained_model + '.rev_params',
                      trained_model + '.rev_err')
    align_file = src_file + '_' + os.path.basename(tg_file) + '.aligned'
    src = open(src_file)
    tg = open(tg_file)
    aligned = open(align_file, 'w')
    try:
        for src_line, tg_line in zip(src, tg):
            # strip the trailing newline from each side before aligning
            aligned.write(aligner.align(src_line[:-1].decode('utf-8') + u' ||| ' + tg_line[:-1].decode('utf-8')) + u'\n')
    finally:
        # fix: src/tg were never closed, and aligned/aligner leaked when
        # align() raised mid-loop
        src.close()
        tg.close()
        aligned.close()
        aligner.close()
    return (label, align_file)
def align_sentence(src_line, tg_line, align_model):
    """Return per-target-token source alignments for one sentence pair.

    :param src_line: tokenized source sentence
    :param tg_line: tokenized target sentence
    :param align_model: alignment model prefix
    :return: list of lists; position ``t`` contains the source indices
        aligned to target token ``t``
    """
    # TODO: there is an error here if one or both fields are missing -- we
    # cannot align a sentence without both src_line and tg_line; throw an
    # error prompting the user to specify another dataset or context creator
    # if not src_line or not tg_line:
    per_target = [[] for _ in range(len(tg_line))]
    aligner = Aligner(align_model + '.fwd_params',
                      align_model + '.fwd_err',
                      align_model + '.rev_params',
                      align_model + '.rev_err')
    result = aligner.align(' '.join(src_line) + u' ||| ' + ' '.join(tg_line))
    # parse the "src-tg" index pairs returned by the aligner
    for token in result.split():
        parts = token.split('-')
        per_target[int(parts[1])].append(int(parts[0]))
    aligner.close()
    return per_target
def force_alignments(src_file, tg_file, trained_model):
    """Align two parallel files with a pre-trained model.

    :param src_file: path to the source-language file
    :param tg_file: path to the target-language file
    :param trained_model: alignment model prefix
    :return: per-sentence list where ``[s][t]`` is the list of source token
        indices (ints) aligned to target token ``t``
    """
    alignments = []
    aligner = Aligner(trained_model + '.fwd_params',
                      trained_model + '.fwd_err',
                      trained_model + '.rev_params',
                      trained_model + '.rev_err')
    src = open(src_file)
    tg = open(tg_file)
    try:
        for src_line, tg_line in zip(src, tg):
            align_str = aligner.align(src_line[:-1].decode('utf-8') + u' ||| ' + tg_line[:-1].decode('utf-8'))
            cur_alignments = [[] for i in range(len(tg_line.split()))]
            for pair in align_str.split():
                pair = pair.split('-')
                # store indices as ints for consistency with align_sentence
                # (the original appended the raw string fragment here)
                cur_alignments[int(pair[1])].append(int(pair[0]))
            alignments.append(cur_alignments)
    finally:
        # close handles even if align() raises mid-loop
        src.close()
        tg.close()
        aligner.close()
    return alignments