Exemplo n.º 1
0
        num_iterations,
        src_corpus_tags=src_corpus_tags,
        trg_corpus_tags=trg_corpus_tags)
    return align_corpus_given_models(src_corpus,
                                     trg_corpus,
                                     prior_model,
                                     translation_model,
                                     src_corpus_tags=src_corpus_tags,
                                     trg_corpus_tags=trg_corpus_tags)


if __name__ == "__main__":
    if not len(sys.argv) == 6:
        print "Usage ./word_alignment.py src_corpus trg_corpus iterations output_prefix."
        sys.exit(0)
    src_corpus = utils.read_all_tokens(sys.argv[1])
    trg_corpus = utils.read_all_tokens(sys.argv[2])
    with_tags = sys.argv[5] == 'tags'
    num_iterations = int(sys.argv[3])
    output_prefix = sys.argv[4]
    assert len(src_corpus) == len(trg_corpus), "Corpora should be same size!"
    if with_tags:
        src_corpus_tags = utils.read_all_tokens(
            sys.argv[1][:sys.argv[1].find('tokens')] + 'tags.' +
            sys.argv[1][sys.argv[1].find('1'):])

        if sys.argv[2].find('tokens') == -1:
            trg_corpus_tags = utils.read_all_tokens(
                sys.argv[2][:sys.argv[2].find('lemmas')] + 'tags.' +
                sys.argv[2][sys.argv[2].find('1'):])
        else:
Exemplo n.º 2
0
def initialize_models(src_corpus, trg_corpus):
    """Construct the two models used for alignment from the given corpora.

    Both ``PriorModel`` and ``TranslationModel`` are built from the same
    ``(src_corpus, trg_corpus)`` pair, the prior model first.

    Returns:
        A ``(prior_model, translation_model)`` tuple.
    """
    return (PriorModel(src_corpus, trg_corpus),
            TranslationModel(src_corpus, trg_corpus))


def normalize(src_corpus, trg_corpus):
    """Return the two corpora as a ``(src_corpus, trg_corpus)`` tuple.

    No transformation is applied: the very same objects that were passed in
    are handed back.
    NOTE(review): looks like a placeholder hook for future normalization --
    confirm with the author.
    """
    corpora = (src_corpus, trg_corpus)
    return corpora


if __name__ == "__main__":
    # Command-line entry point. Expected arguments:
    #   src_corpus trg_corpus iterations output_prefix
    # Reads both corpora, fits the prior and translation models, aligns the
    # corpora and hands the alignments to output_alignments_per_test_set.
    if len(sys.argv) != 5:
        # A bad invocation is an error: sys.exit() with a string writes the
        # message to stderr and terminates with a non-zero status, so calling
        # scripts can detect the failure.
        sys.exit(
            "Usage ./align.py src_corpus trg_corpus iterations output_prefix.")
    src_corpus = read_all_tokens(sys.argv[1])
    trg_corpus = read_all_tokens(sys.argv[2])
    src_corpus, trg_corpus = normalize(src_corpus, trg_corpus)
    num_iterations = int(sys.argv[3])
    output_prefix = sys.argv[4]
    # Validate explicitly rather than with `assert`: assertions are removed
    # when Python runs with -O, which would silently skip this check.
    if len(src_corpus) != len(trg_corpus):
        sys.exit("Corpora should be same size!")
    prior_model, translation_model = initialize_models(src_corpus, trg_corpus)
    prior_model, translation_model = estimate_models(src_corpus, trg_corpus,
                                                     prior_model,
                                                     translation_model,
                                                     num_iterations)
    alignments = align_corpus(src_corpus, trg_corpus, prior_model,
                              translation_model)
    output_alignments_per_test_set(alignments, output_prefix)
Exemplo n.º 3
0
        these_alignments = align_sentence_pair(src_corpus[i], trg_corpus[i],
                                               prior_model, translation_model)
        alignments.append(these_alignments)
    return alignments


def align_corpus(src_corpus, trg_corpus, num_iterations):
    """Learn the models from the corpora and use them to align the corpora.

    The models come from ``initialize_models`` and are then passed, together
    with ``num_iterations``, through ``estimate_models``.

    Returns:
        The result of ``align_corpus_given_models`` called with the corpora
        and the estimated models.
    """
    initial_prior, initial_translation = initialize_models(
        src_corpus, trg_corpus)
    fitted_prior, fitted_translation = estimate_models(
        src_corpus, trg_corpus, initial_prior, initial_translation,
        num_iterations)
    return align_corpus_given_models(
        src_corpus, trg_corpus, fitted_prior, fitted_translation)


if __name__ == "__main__":
    # Command-line entry point. Expected arguments:
    #   src_corpus trg_corpus iterations output_prefix
    # Reads both corpora, aligns them and hands the alignments to
    # utils.output_alignments_per_test_set.
    if len(sys.argv) != 5:
        # A bad invocation is an error: sys.exit() with a string writes the
        # message to stderr and terminates with a non-zero status, so calling
        # scripts can detect the failure.
        sys.exit(
            "Usage: python word_alignment.py src_corpus trg_corpus iterations output_prefix."
        )
    src_corpus = utils.read_all_tokens(sys.argv[1])
    trg_corpus = utils.read_all_tokens(sys.argv[2])
    num_iterations = int(sys.argv[3])
    output_prefix = sys.argv[4]
    # Validate explicitly rather than with `assert`: assertions are removed
    # when Python runs with -O, which would silently skip this check.
    if len(src_corpus) != len(trg_corpus):
        sys.exit("Corpora should be same size!")
    alignments = align_corpus(src_corpus, trg_corpus, num_iterations)
    utils.output_alignments_per_test_set(alignments, output_prefix)