def main():
    """Log recall@k of the translation model against the gold file.

    Reads three line-aligned files: ``args.input`` (source lines passed to
    the model), ``args.input + '.nounk'`` and ``args.gold + '.nounk'``.
    For every line the model's ``args.k`` translations are passed through
    ``replace_symbols`` with the map built by ``build_unk_map`` and then
    compared with the gold line via ``find_match``.  A line counts as found
    when ``find_match`` returns anything other than -1.

    The final log line reports ``found / total``; an empty input reports a
    recall of 0.0 instead of raising ``ZeroDivisionError``.
    """
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    args = setup_args()
    logging.info(args)

    tm = TranslationModel(args.model)

    total = 0
    found = 0
    # Context managers guarantee the three inputs are closed even when
    # translation fails part-way through.
    with codecs.open(args.input, 'r', 'utf-8') as input_lines_symbols, \
            codecs.open(args.input + '.nounk', 'r', 'utf-8') as input_lines, \
            codecs.open(args.gold + '.nounk', 'r', 'utf-8') as gold_lines:
        aligned = zip(input_lines, input_lines_symbols, gold_lines)
        for index, (input_line, input_line_symbols, gold_line) in enumerate(aligned):
            unk_map = build_unk_map(input_line_symbols, input_line)
            translations_with_scores = tm.translate(input_line_symbols, k=args.k)
            # Element 1 of each result is used as the translation text.
            translations_replaced = [
                replace_symbols(result[1], unk_map)
                for result in translations_with_scores
            ]
            match_index = find_match(gold_line, translations_replaced)
            logging.info('Index: %d Match: %d' % (index, match_index))
            if match_index != -1:
                found += 1
            total = index + 1

    # Guard against empty input: no lines means nothing was recalled.
    recall_k = float(found) / total if total else 0.0
    logging.info('Recall@%d: %f (%d/%d)' % (args.k, recall_k, found, total))
def main():
    """Write ranking training data built from the model's n-best translations.

    Reads three line-aligned files: ``args.source`` (lines passed to the
    model), ``args.source + args.suffix`` and ``args.gold + args.suffix``.
    For each sentence the ``args.num`` translations are passed through
    ``replace_symbols`` with the map built by ``build_unk_map`` and scored
    against the gold line with ``get_bleu_score``.  Sentences whose scores
    sum to zero are skipped and counted; all others are handed to
    ``write_train_data`` together with the candidate indices sorted by
    descending score.  Output goes to ``args.model + SVM_RANK_DATA``.
    """
    args = setup_args()
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    logging.info(args)

    # Read the inputs eagerly, closing each handle as soon as it is consumed.
    with codecs.open(args.source, 'r', 'utf-8') as f_src:
        src_lines = f_src.readlines()
    with codecs.open(args.source + args.suffix, 'r', 'utf-8') as f_src_nounk:
        src_lines_nounk = f_src_nounk.readlines()
    with codecs.open(args.gold + args.suffix, 'r', 'utf-8') as f_gold:
        gold_lines = f_gold.readlines()

    num_all_zeros = 0
    train_id = 0
    # The context manager makes sure the training data is flushed and closed.
    with codecs.open(args.model + SVM_RANK_DATA, 'w', 'utf-8') as fw:
        tm = TranslationModel(args.model)
        aligned = zip(src_lines, src_lines_nounk, gold_lines)
        for sentence_idx, (src_line, src_line_nounk, gold_line) in enumerate(aligned):
            translations = tm.translate(src_line, k=args.num)
            logging.info('Source_line: %s' % src_line_nounk)
            logging.info('Gold_line: %s' % gold_line)
            unk_map = build_unk_map(src_line, src_line_nounk)
            logging.info('UNK_map: %s' % str(unk_map))

            # Element 1 of each result is used as the translation text.
            translations_nounk = [
                replace_symbols(translation[1], unk_map)
                for translation in translations
            ]
            scores = [
                get_bleu_score(gold_line, translation_nounk)
                for translation_nounk in translations_nounk
            ]

            # No candidate overlaps the gold line: nothing useful to rank.
            if sum(scores) == 0.0:
                num_all_zeros += 1
                continue

            scores_index = sorted(range(len(scores)),
                                  key=lambda k: scores[k], reverse=True)
            write_train_data(fw, sentence_idx, train_id, translations_nounk,
                             scores, scores_index, src_line_nounk)
            # Advance the id so every written sentence gets a distinct
            # train_id (the previous ``+= 0`` left it stuck at 0).
            train_id += 1

            for index in scores_index:
                logging.info('Tr: %d Text:%s Pr:%f BLEU:%f' % (
                    index, translations[index][1], translations[index][0],
                    scores[index]))

    logging.info('Num all zeros: %d' % num_all_zeros)
def main():
    """Translate every line of ``args.input`` and write one result per line.

    For each input line the model is asked for 20 translations.  When
    ``args.all`` is set, ``find_best_translation`` chooses the index (and
    reports its score); otherwise the first result is used and the logged
    score is the placeholder -1.0.  The chosen translation text (element 1
    of the result) is written to ``args.out`` followed by a newline.
    """
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    args = setup_args()
    logging.info(args)

    tm = TranslationModel(args.model)

    # Both files are closed on exit, including when translation raises.
    with codecs.open(args.out, 'w', 'utf-8') as fw_out, \
            codecs.open(args.input, 'r', 'utf-8') as f_in:
        for line_num, input_line in enumerate(f_in):
            results = tm.translate(input_line.strip(), k=20)
            if args.all:
                index, best_bleu_score = find_best_translation(input_line, results)
            else:
                best_bleu_score = -1.0
                index = 0
            logging.info('Line:%d best_index:%d best_bleu:%f' % (
                line_num, index, best_bleu_score))
            fw_out.write(results[index][1] + '\n')
def main():
    """Export source / translation / gold triples to CSV and a sentence file.

    Translates every line of ``args.input`` and writes two outputs:

    * ``<args.out>-<args.suffix>.csv`` with a header row and one row per
      line: stripped source, top translation, stripped gold line.
    * ``<args.out>-<args.suffix>-sents.out`` with the top translation of
      each line, one per line.

    The top translation is element 1 of the first result returned by
    ``tm.translate``.  Input and gold files are paired line by line.
    """
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    args = setup_args()
    logging.info(args)

    tm = TranslationModel(args.model)

    with codecs.open('%s-%s.csv' % (args.out, args.suffix), 'w') as f:
        # NOTE(review): the ``encoding`` keyword is not accepted by the
        # standard-library csv.writer; presumably ``csv`` is bound to a
        # unicode-aware drop-in here -- confirm against the file's imports.
        csv_f = csv.writer(f, delimiter=',', encoding='utf-8')
        csv_f.writerow(['Src', 'Target', 'Gold Standard'])

        with codecs.open(args.input, 'r', 'utf-8') as f_in:
            input_lines = f_in.readlines()
        with codecs.open(args.gold, 'r', 'utf-8') as f_gold:
            gold_lines = f_gold.readlines()

        # The file name is now interpolated; previously the literal
        # '%s-%s-sents.out' was used as the path.
        sents_path = '%s-%s-sents.out' % (args.out, args.suffix)
        with codecs.open(sents_path, 'w', 'utf-8') as fw_sents:
            for input_line, gold_line in zip(input_lines, gold_lines):
                source_text = input_line.strip()
                results = tm.translate(source_text)
                best_translation = results[0][1]
                csv_f.writerow([source_text, best_translation, gold_line.strip()])
                fw_sents.write(best_translation + '\n')
def main():
    """Log recall@k of the translation model against the gold file.

    Reads three line-aligned files: ``args.input`` (source lines passed to
    the model), ``args.input + '.nounk'`` and ``args.gold + '.nounk'``.
    For every line the model's ``args.k`` translations are passed through
    ``replace_symbols`` with the map built by ``build_unk_map`` and then
    compared with the gold line via ``find_match``.  A line counts as found
    when ``find_match`` returns anything other than -1.

    The final log line reports ``found / total``; an empty input reports a
    recall of 0.0 instead of raising ``ZeroDivisionError``.
    """
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    args = setup_args()
    logging.info(args)

    tm = TranslationModel(args.model)

    total = 0
    found = 0
    # Context managers guarantee the three inputs are closed even when
    # translation fails part-way through.
    with codecs.open(args.input, 'r', 'utf-8') as input_lines_symbols, \
            codecs.open(args.input + '.nounk', 'r', 'utf-8') as input_lines, \
            codecs.open(args.gold + '.nounk', 'r', 'utf-8') as gold_lines:
        aligned = zip(input_lines, input_lines_symbols, gold_lines)
        for index, (input_line, input_line_symbols, gold_line) in enumerate(aligned):
            unk_map = build_unk_map(input_line_symbols, input_line)
            translations_with_scores = tm.translate(input_line_symbols, k=args.k)
            # Element 1 of each result is used as the translation text.
            translations_replaced = [
                replace_symbols(result[1], unk_map)
                for result in translations_with_scores
            ]
            match_index = find_match(gold_line, translations_replaced)
            logging.info('Index: %d Match: %d' % (index, match_index))
            if match_index != -1:
                found += 1
            total = index + 1

    # Guard against empty input: no lines means nothing was recalled.
    recall_k = float(found) / total if total else 0.0
    logging.info('Recall@%d: %f (%d/%d)' % (args.k, recall_k, found, total))
def main():
    """Export source / translation / gold triples to CSV and a sentence file.

    Translates every line of ``args.input`` and writes two outputs:

    * ``<args.out>-<args.suffix>.csv`` with a header row and one row per
      line: stripped source, top translation, stripped gold line.
    * ``<args.out>-<args.suffix>-sents.out`` with the top translation of
      each line, one per line.

    The top translation is element 1 of the first result returned by
    ``tm.translate``.  Input and gold files are paired line by line.
    """
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    args = setup_args()
    logging.info(args)

    tm = TranslationModel(args.model)

    with codecs.open('%s-%s.csv' % (args.out, args.suffix), 'w') as f:
        # NOTE(review): the ``encoding`` keyword is not accepted by the
        # standard-library csv.writer; presumably ``csv`` is bound to a
        # unicode-aware drop-in here -- confirm against the file's imports.
        csv_f = csv.writer(f, delimiter=',', encoding='utf-8')
        csv_f.writerow(['Src', 'Target', 'Gold Standard'])

        with codecs.open(args.input, 'r', 'utf-8') as f_in:
            input_lines = f_in.readlines()
        with codecs.open(args.gold, 'r', 'utf-8') as f_gold:
            gold_lines = f_gold.readlines()

        # The file name is now interpolated; previously the literal
        # '%s-%s-sents.out' was used as the path.
        sents_path = '%s-%s-sents.out' % (args.out, args.suffix)
        with codecs.open(sents_path, 'w', 'utf-8') as fw_sents:
            for input_line, gold_line in zip(input_lines, gold_lines):
                source_text = input_line.strip()
                results = tm.translate(source_text)
                best_translation = results[0][1]
                csv_f.writerow([source_text, best_translation, gold_line.strip()])
                fw_sents.write(best_translation + '\n')