Exemplo n.º 1
0
def main():
    """Log recall@k of the model's k-best translations against the gold file.

    Reads three line-aligned files: ``args.input`` (the text handed to the
    model), ``args.input + '.nounk'`` and ``args.gold + '.nounk'``.  For each
    line, the top ``args.k`` translations are passed through
    ``replace_symbols`` using the map returned by ``build_unk_map`` and then
    searched for the gold line with ``find_match``.  Any result other than
    ``-1`` is counted as a hit.
    """
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    args = setup_args()
    logging.info(args)

    tm = TranslationModel(args.model)

    total = 0
    found = 0
    # Context managers guarantee the inputs are closed even if translation
    # raises part-way through the file.
    with codecs.open(args.input, 'r', 'utf-8') as input_lines_symbols, \
            codecs.open(args.input + '.nounk', 'r', 'utf-8') as input_lines, \
            codecs.open(args.gold + '.nounk', 'r', 'utf-8') as gold_lines:
        for input_line, input_line_symbols, gold_line in zip(
                input_lines, input_lines_symbols, gold_lines):
            unk_map = build_unk_map(input_line_symbols, input_line)

            translations_with_scores = tm.translate(input_line_symbols, k=args.k)
            translations_replaced = [
                replace_symbols(scored[1], unk_map)
                for scored in translations_with_scores
            ]
            match_index = find_match(gold_line, translations_replaced)
            logging.info('Index: %d Match: %d', total, match_index)

            if match_index != -1:
                found += 1

            total += 1

    # Empty input would otherwise divide by zero; report a recall of 0.0.
    recall_k = float(found) / total if total else 0.0
    logging.info('Recall@%d: %f (%d/%d)', args.k, recall_k, found, total)
Exemplo n.º 2
0
def main():
    """Build ranking training data from BLEU-scored k-best translations.

    For every source line the model's top ``args.num`` translations are
    passed through ``replace_symbols``, scored against the gold line with
    ``get_bleu_score`` and handed to ``write_train_data``, which writes to
    ``args.model + SVM_RANK_DATA``.  Sentences whose candidates all score
    zero are skipped and only counted.
    """
    args = setup_args()
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    logging.info(args)

    # Read the three line-aligned inputs up front and close them immediately.
    with codecs.open(args.source, 'r', 'utf-8') as f_src:
        src_lines = f_src.readlines()
    with codecs.open(args.source + args.suffix, 'r', 'utf-8') as f_src_nounk:
        src_lines_nounk = f_src_nounk.readlines()
    with codecs.open(args.gold + args.suffix, 'r', 'utf-8') as f_gold:
        gold_lines = f_gold.readlines()

    num_all_zeros = 0
    train_id = 0

    # The `with` block flushes and closes the training file on every exit path.
    with codecs.open(args.model + SVM_RANK_DATA, 'w', 'utf-8') as fw:
        tm = TranslationModel(args.model)

        for sentence_idx, (src_line, src_line_nounk, gold_line) in enumerate(
                zip(src_lines, src_lines_nounk, gold_lines)):
            translations = tm.translate(src_line, k=args.num)
            logging.info('Source_line: %s', src_line_nounk)
            logging.info('Gold_line: %s', gold_line)

            unk_map = build_unk_map(src_line, src_line_nounk)
            logging.info('UNK_map: %s', str(unk_map))

            # translation[1] is the text; translation[0] is logged as 'Pr' below.
            translations_nounk = [
                replace_symbols(translation[1], unk_map)
                for translation in translations
            ]
            scores = [
                get_bleu_score(gold_line, translation_nounk)
                for translation_nounk in translations_nounk
            ]

            # No candidate overlaps the gold line at all: skip this sentence.
            if sum(scores) == 0.0:
                num_all_zeros += 1
                continue

            # Candidate positions ordered from best to worst BLEU.
            scores_index = sorted(range(len(scores)), key=lambda k: scores[k], reverse=True)
            write_train_data(fw, sentence_idx, train_id, translations_nounk,
                             scores, scores_index, src_line_nounk)
            # Each written sentence gets its own id; skipped sentences do not
            # consume one.
            train_id += 1

            for index in scores_index:
                logging.info('Tr: %d Text:%s Pr:%f BLEU:%f', index, translations[index][1],
                             translations[index][0], scores[index])

    logging.info('Num all zeros: %d', num_all_zeros)
Exemplo n.º 3
0
def main():
    """Translate ``args.input`` line by line and write one output per line.

    Each stripped input line is translated with ``k = 20``.  When
    ``args.all`` is set, ``find_best_translation`` chooses which candidate to
    emit; otherwise the first candidate is written and the logged BLEU score
    is the placeholder ``-1.0``.  Results go to ``args.out``.
    """
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    args = setup_args()
    logging.info(args)

    tm = TranslationModel(args.model)

    # Both files are closed on every exit path, including a failure
    # part-way through translation.
    with codecs.open(args.out, 'w', 'utf-8') as fw_out, \
            codecs.open(args.input, 'r', 'utf-8') as f_in:
        for line_num, input_line in enumerate(f_in):
            results = tm.translate(input_line.strip(), k=20)
            if args.all:
                index, best_bleu_score = find_best_translation(input_line, results)
            else:
                # No scoring requested: take the first candidate.
                best_bleu_score = -1.0
                index = 0

            logging.info('Line:%d best_index:%d best_bleu:%f',
                         line_num, index, best_bleu_score)
            fw_out.write(results[index][1] + '\n')
Exemplo n.º 4
0
def main():
    """Write source / top translation / gold triples to a CSV file.

    Outputs:
      * ``<args.out>-<args.suffix>.csv`` with the columns
        ``Src, Target, Gold Standard``;
      * ``<args.out>-<args.suffix>-sents.out`` with one top translation per
        line.
    """
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    args = setup_args()
    logging.info(args)

    tm = TranslationModel(args.model)

    with codecs.open('%s-%s.csv' % (args.out, args.suffix), 'w') as f:
        # NOTE(review): the standard-library csv.writer has no `encoding`
        # keyword, so `csv` is presumably a unicode-aware replacement
        # imported elsewhere in the file; confirm.
        csv_f = csv.writer(f, delimiter=',', encoding='utf-8')
        csv_f.writerow(['Src', 'Target', 'Gold Standard'])

        with codecs.open(args.input, 'r', 'utf-8') as f_in:
            input_lines = f_in.readlines()
        with codecs.open(args.gold, 'r', 'utf-8') as f_gold:
            gold_lines = f_gold.readlines()

        # The name must be interpolated; otherwise the output lands in a file
        # literally called '%s-%s-sents.out'.
        sents_path = '%s-%s-sents.out' % (args.out, args.suffix)
        with codecs.open(sents_path, 'w', 'utf-8') as fw_sents:
            for input_line, gold_line in zip(input_lines, gold_lines):
                source = input_line.strip()
                results = tm.translate(source)
                # results[0][1] is the text of the first candidate.
                best = results[0][1]
                csv_f.writerow([source, best, gold_line.strip()])
                fw_sents.write(best + '\n')
Exemplo n.º 5
0
def main():
    """Log recall@k of the model's k-best translations against the gold file.

    Reads three line-aligned files: ``args.input`` (the text handed to the
    model), ``args.input + '.nounk'`` and ``args.gold + '.nounk'``.  For each
    line, the top ``args.k`` translations are passed through
    ``replace_symbols`` using the map returned by ``build_unk_map`` and then
    searched for the gold line with ``find_match``.  Any result other than
    ``-1`` is counted as a hit.
    """
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    args = setup_args()
    logging.info(args)

    tm = TranslationModel(args.model)

    total = 0
    found = 0
    # Context managers guarantee the inputs are closed even if translation
    # raises part-way through the file.
    with codecs.open(args.input, 'r', 'utf-8') as input_lines_symbols, \
            codecs.open(args.input + '.nounk', 'r', 'utf-8') as input_lines, \
            codecs.open(args.gold + '.nounk', 'r', 'utf-8') as gold_lines:
        for input_line, input_line_symbols, gold_line in zip(
                input_lines, input_lines_symbols, gold_lines):
            unk_map = build_unk_map(input_line_symbols, input_line)

            translations_with_scores = tm.translate(input_line_symbols,
                                                    k=args.k)
            translations_replaced = [
                replace_symbols(scored[1], unk_map)
                for scored in translations_with_scores
            ]
            match_index = find_match(gold_line, translations_replaced)
            logging.info('Index: %d Match: %d', total, match_index)

            if match_index != -1:
                found += 1

            total += 1

    # Empty input would otherwise divide by zero; report a recall of 0.0.
    recall_k = float(found) / total if total else 0.0
    logging.info('Recall@%d: %f (%d/%d)', args.k, recall_k, found, total)
Exemplo n.º 6
0
def main():
    """Write source / top translation / gold triples to a CSV file.

    Outputs:
      * ``<args.out>-<args.suffix>.csv`` with the columns
        ``Src, Target, Gold Standard``;
      * ``<args.out>-<args.suffix>-sents.out`` with one top translation per
        line.
    """
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    args = setup_args()
    logging.info(args)

    tm = TranslationModel(args.model)

    with codecs.open('%s-%s.csv' % (args.out, args.suffix), 'w') as f:
        # NOTE(review): the standard-library csv.writer has no `encoding`
        # keyword, so `csv` is presumably a unicode-aware replacement
        # imported elsewhere in the file; confirm.
        csv_f = csv.writer(f, delimiter=',', encoding='utf-8')
        csv_f.writerow(['Src', 'Target', 'Gold Standard'])

        with codecs.open(args.input, 'r', 'utf-8') as f_in:
            input_lines = f_in.readlines()
        with codecs.open(args.gold, 'r', 'utf-8') as f_gold:
            gold_lines = f_gold.readlines()

        # The name must be interpolated; otherwise the output lands in a file
        # literally called '%s-%s-sents.out'.
        sents_path = '%s-%s-sents.out' % (args.out, args.suffix)
        with codecs.open(sents_path, 'w', 'utf-8') as fw_sents:
            for input_line, gold_line in zip(input_lines, gold_lines):
                source = input_line.strip()
                results = tm.translate(source)
                # results[0][1] is the text of the first candidate.
                best = results[0][1]
                csv_f.writerow([source, best, gold_line.strip()])
                fw_sents.write(best + '\n')