示例#1
0
def main():
    """Align every paired article and write the aligned pairs to stdout.

    Positional command-line arguments (read from ``sys.argv``):

    1. language identifier for the source side (``lang_a``)
    2. language identifier for the target side (``lang_b``)
    3. path handed to ``YalignModel.load``; it is also appended to the
       NLTK data search path
    4. pairing file, parsed by ``read_pairing``
    5. source articles file, parsed by ``read_articles``
    6. target articles file, parsed by ``read_articles``

    For each (source, target) pairing the two articles are aligned and the
    result is written with ``write_plaintext``.  The number of pairs found
    is reported on stderr.  A pairing that raises ``KeyError`` is reported
    on stderr and skipped.
    """
    lang_a = sys.argv[1]
    lang_b = sys.argv[2]
    model_path = os.path.abspath(sys.argv[3])
    nltk.data.path += [model_path]
    model = YalignModel.load(model_path)

    # Each input is read inside a ``with`` block so its handle is closed as
    # soon as the data is in memory instead of leaking until process exit.
    with open(sys.argv[4]) as pairing_file:
        # ``pairing`` is iterated three times below, so materialise it.
        pairing = list(read_pairing(pairing_file, lang_a, lang_b))
    src_needed = set(a for a, _ in pairing)
    tgt_needed = set(b for _, b in pairing)
    with open(sys.argv[5]) as src_file:
        src_articles = read_articles(src_file, src_needed)
    with open(sys.argv[6]) as tgt_file:
        tgt_articles = read_articles(tgt_file, tgt_needed)

    for src, tgt in pairing:
        try:
            text_a = "\n".join(src_articles[src])
            text_b = "\n".join(tgt_articles[tgt])
            document_a = text_to_document(text_a, lang_a)
            document_b = text_to_document(text_b, lang_b)
            pairs = model.align(document_a, document_b)
            # NOTE(review): writing encoded bytes to stderr is a Python 2
            # idiom; revisit if this script is ever run on Python 3.
            sys.stderr.write(u"{0} pairs in {1}-{2}\n".format(len(pairs), src, tgt).encode("utf-8"))

            write_plaintext(sys.stdout, pairs)
        except KeyError:
            # Best effort: report the failing pairing and move on.
            sys.stderr.write(u"KeyError with {0}-{1}\n".format(src, tgt).encode("utf-8"))
            continue
示例#2
0
 def setUp(self):
     """Train a sentence scorer on 25 corpus lines and build ``self.model``."""
     # Seed the global RNG first, before any helper below can draw from it.
     random.seed(hash("Y U NO?"))
     data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                             "data")
     scores_csv = os.path.join(data_dir, "test_word_scores_big.csv")
     corpus_txt = os.path.join(data_dir, "parallel-en-es.txt")

     docs_a, docs_b = parallel_corpus_to_documents(corpus_txt)
     docs_a, docs_b = docs_a[:25], docs_b[:25]
     self.alignments = list(training_alignments_from_documents(docs_a,
                                                               docs_b))
     self.A, self.B, self.correct_alignments = \
         training_scrambling_from_documents(docs_a, docs_b)

     # Word-level scorer feeds the sentence-level scorer.
     word_scorer = WordPairScore(scores_csv)
     sentence_scorer = SentencePairScore()
     sentence_scorer.train(self.alignments, word_scorer)

     # The model starts with a gap penalty halfway between the score bounds.
     self.min_, self.max_ = (sentence_scorer.min_bound,
                             sentence_scorer.max_bound)
     midpoint = (self.min_ + self.max_) / 2.0
     self.model = YalignModel(SequenceAligner(sentence_scorer, midpoint), 1)
示例#3
0
    def test_save_load_and_align(self):
        """A model reloaded from disk aligns and is configured like the original.

        Compares the alignment result, the threshold and the aligner's gap
        penalty before saving and after loading.
        """
        import shutil  # local import: the file's import block is not in view

        doc1 = [Sentence([u"House"]), Sentence([u"asoidfhuioasgh"])]
        doc2 = [Sentence([u"Casa"])]
        result_before_save = self.model.align(doc1, doc2)

        # Save
        tmp_folder = tempfile.mkdtemp()
        try:
            self.model.save(tmp_folder)

            # Load
            new_model = YalignModel.load(tmp_folder)
            result_after_load = new_model.align(doc1, doc2)
            self.assertEqual(result_before_save, result_after_load)
            self.assertEqual(self.model.threshold, new_model.threshold)
            self.assertEqual(self.model.document_pair_aligner.penalty,
                             new_model.document_pair_aligner.penalty)
        finally:
            # mkdtemp() leaves deletion to the caller; do not leak the folder.
            shutil.rmtree(tmp_folder, ignore_errors=True)
示例#4
0
    def test_save_load_and_align(self):
        """Saving then loading the model must not change how it aligns.

        Checks that the alignment of two tiny documents, the threshold and
        the aligner's gap penalty are equal before saving and after loading.
        """
        import shutil  # local import: the file's import block is not in view

        doc1 = [Sentence([u"House"]), Sentence([u"asoidfhuioasgh"])]
        doc2 = [Sentence([u"Casa"])]
        result_before_save = self.model.align(doc1, doc2)

        # Save
        tmp_folder = tempfile.mkdtemp()
        try:
            self.model.save(tmp_folder)

            # Load
            new_model = YalignModel.load(tmp_folder)
            result_after_load = new_model.align(doc1, doc2)
            self.assertEqual(result_before_save, result_after_load)
            self.assertEqual(self.model.threshold, new_model.threshold)
            self.assertEqual(self.model.document_pair_aligner.penalty,
                             new_model.document_pair_aligner.penalty)
        finally:
            # The temporary folder is ours to delete; remove it even when an
            # assertion above fails.
            shutil.rmtree(tmp_folder, ignore_errors=True)
示例#5
0
 def setUp(self):
     """Build ``self.model`` from the first 30 lines of the parallel corpus."""
     self.parallel_corpus = os.path.join(data_path, "parallel-en-es.txt")
     scores_csv = os.path.join(data_path, "test_word_scores_big.csv")

     # Keep only the first 30 sentences of each side as the test documents.
     docs_a, docs_b = parallel_corpus_to_documents(self.parallel_corpus)
     self.document_a, self.document_b = docs_a[:30], docs_b[:30]
     training_pairs = training_alignments_from_documents(self.document_a,
                                                         self.document_b)

     # Word-level scorer feeds the sentence-level scorer.
     word_scorer = WordPairScore(scores_csv)
     sentence_scorer = SentencePairScore()
     sentence_scorer.train(training_pairs, word_scorer)

     # Model under test, with a fixed gap penalty of 0.49.
     self.model = YalignModel(SequenceAligner(sentence_scorer, 0.49))
示例#6
0
    def test_command_tool(self):
        """The command-line tool must print the score computed in-process.

        Saves the model, runs ``self.cmdline`` (formatted with the corpus and
        model paths) through the shell, and checks that the captured stdout
        contains ``"<value>%"``, where ``value`` comes from
        ``self.alignment_function``.  Does nothing when ``self.cmdline`` is
        ``None``.
        """
        import os      # local imports: the file's import block is not in view
        import shutil

        if self.cmdline is None:
            return

        tmpdir = tempfile.mkdtemp()
        fd, tmpfile = tempfile.mkstemp()
        # mkstemp() hands back an open descriptor; close it so it does not
        # leak. The file is reopened by name below.
        os.close(fd)
        try:
            self.model.save(tmpdir)

            cmd = self.cmdline.format(corpus=self.parallel_corpus,
                                      model=tmpdir)
            with open(tmpfile, "w") as outputfh:
                subprocess.call(cmd, shell=True, stdout=outputfh)
            with open(tmpfile) as outputfh:
                output = outputfh.read()

            A, B = parallel_corpus_to_documents(self.parallel_corpus)
            model = YalignModel.load(tmpdir)
            value = self.alignment_function(A, B, model)

            self.assertIn("{}%".format(value), output)
        finally:
            # Neither mkdtemp() nor mkstemp() cleans up after itself.
            shutil.rmtree(tmpdir, ignore_errors=True)
            os.remove(tmpfile)
示例#7
0
    def test_command_tool(self):
        """Check the command-line tool reports the same score as the API.

        The model is saved to a temporary folder, ``self.cmdline`` is
        formatted with the corpus and model paths and run through the shell
        with stdout captured to a temporary file.  The captured text must
        contain ``"<value>%"``, where ``value`` is what
        ``self.alignment_function`` returns for the reloaded model.  Does
        nothing when ``self.cmdline`` is ``None``.
        """
        import os      # local imports: the file's import block is not in view
        import shutil

        if self.cmdline is None:
            return

        tmpdir = tempfile.mkdtemp()
        fd, tmpfile = tempfile.mkstemp()
        # Close the descriptor mkstemp() opened; only the path is needed.
        os.close(fd)
        try:
            self.model.save(tmpdir)

            cmd = self.cmdline.format(corpus=self.parallel_corpus,
                                      model=tmpdir)
            with open(tmpfile, "w") as outputfh:
                subprocess.call(cmd, shell=True, stdout=outputfh)
            with open(tmpfile) as outputfh:
                output = outputfh.read()

            A, B = parallel_corpus_to_documents(self.parallel_corpus)
            model = YalignModel.load(tmpdir)
            value = self.alignment_function(A, B, model)

            self.assertIn("{}%".format(value), output)
        finally:
            # Remove the temporary folder and file even if the test fails.
            shutil.rmtree(tmpdir, ignore_errors=True)
            os.remove(tmpfile)
示例#8
0
 def setUp(self):
     """Prepare training data, score bounds and the model used by the tests."""
     # Seeding comes first so that it precedes every helper call below.
     random.seed(hash("Y U NO?"))
     here = os.path.dirname(os.path.abspath(__file__))
     data_dir = os.path.join(here, "data")
     scores_file = os.path.join(data_dir, "test_word_scores_big.csv")
     corpus_file = os.path.join(data_dir, "parallel-en-es.txt")

     # Only the first 25 sentences of each side are used.
     side_a, side_b = parallel_corpus_to_documents(corpus_file)
     side_a = side_a[:25]
     side_b = side_b[:25]
     self.alignments = list(training_alignments_from_documents(side_a, side_b))
     scrambled = training_scrambling_from_documents(side_a, side_b)
     self.A, self.B, self.correct_alignments = scrambled

     # Word score
     word_scorer = WordPairScore(scores_file)
     # Sentence score
     pair_scorer = SentencePairScore()
     pair_scorer.train(self.alignments, word_scorer)

     # Yalign model: gap penalty starts at the midpoint of the score bounds.
     self.min_ = pair_scorer.min_bound
     self.max_ = pair_scorer.max_bound
     aligner = SequenceAligner(pair_scorer, (self.min_ + self.max_) / 2.0)
     self.model = YalignModel(aligner, 1)
示例#9
0
class TestYalignModel(unittest.TestCase):
    """Tests for saving, loading, aligning with and tuning a YalignModel."""

    def setUp(self):
        """Train a sentence scorer on 25 corpus lines and build ``self.model``."""
        # NOTE(review): str hashes are only stable across runs when hash
        # randomization is off (the Python 2 default); confirm before relying
        # on this seed for reproducibility elsewhere.
        random.seed(hash("Y U NO?"))
        base_path = os.path.dirname(os.path.abspath(__file__))
        word_scores = os.path.join(base_path, "data",
                                   "test_word_scores_big.csv")
        parallel_corpus = os.path.join(base_path, "data", "parallel-en-es.txt")
        A, B = parallel_corpus_to_documents(parallel_corpus)
        A = A[:25]
        B = B[:25]
        self.alignments = list(training_alignments_from_documents(A, B))
        # Unpacking works on any iterable, so no intermediate list is needed.
        self.A, self.B, self.correct_alignments = \
            training_scrambling_from_documents(A, B)
        # Word score
        word_pair_score = WordPairScore(word_scores)
        # Sentence Score
        sentence_pair_score = SentencePairScore()
        sentence_pair_score.train(self.alignments, word_pair_score)
        # Yalign model: the gap penalty starts halfway between the bounds.
        self.min_ = sentence_pair_score.min_bound
        self.max_ = sentence_pair_score.max_bound
        gap_penalty = (self.min_ + self.max_) / 2.0
        document_aligner = SequenceAligner(sentence_pair_score, gap_penalty)
        self.model = YalignModel(document_aligner, 1)

    def _make_tmp_folder(self):
        """Return a new temporary folder that is deleted when the test ends."""
        import shutil  # local import: the file's import block is not in view

        tmp_folder = tempfile.mkdtemp()
        # mkdtemp() leaves deletion to the caller; register it as cleanup so
        # the folder is removed even when the test fails.
        self.addCleanup(shutil.rmtree, tmp_folder, ignore_errors=True)
        return tmp_folder

    def test_save_file_created(self):
        """Saving writes ``aligner.pickle`` and ``metadata.json`` to the folder."""
        tmp_folder = self._make_tmp_folder()
        self.model.save(tmp_folder)
        model_path = os.path.join(tmp_folder, "aligner.pickle")
        metadata_path = os.path.join(tmp_folder, "metadata.json")
        self.assertTrue(os.path.exists(model_path))
        self.assertTrue(os.path.exists(metadata_path))

    def test_save_load_and_align(self):
        """A reloaded model aligns and is configured like the original."""
        doc1 = [Sentence([u"House"]),
                Sentence([u"asoidfhuioasgh"])]
        doc2 = [Sentence([u"Casa"])]
        result_before_save = self.model.align(doc1, doc2)

        # Save
        tmp_folder = self._make_tmp_folder()
        self.model.save(tmp_folder)

        # Load
        new_model = YalignModel.load(tmp_folder)
        result_after_load = new_model.align(doc1, doc2)
        self.assertEqual(result_before_save, result_after_load)
        self.assertEqual(self.model.threshold, new_model.threshold)
        self.assertEqual(self.model.document_pair_aligner.penalty,
                         new_model.document_pair_aligner.penalty)

    def test_reasonable_alignment(self):
        """The pair (House, Casa) must appear in the alignment result."""
        doc1 = [Sentence([u"House"]),
                Sentence([u"asoidfhuioasgh"])]
        doc2 = [Sentence([u"Casa"])]
        result = self.model.align(doc1, doc2)
        # Compare as plain lists of tokens rather than Sentence objects.
        result = [(list(x), list(y)) for x, y in result]
        self.assertIn((list(doc1[0]), list(doc2[0])), result)

    def test_optimize_gap_penalty_and_threshold_finishes(self):
        """The optimization call must complete without raising."""
        self.model.optimize_gap_penalty_and_threshold(self.A, self.B,
                                                      self.correct_alignments)

    def test_optimize_gap_penalty_and_threshold_is_best(self):
        """The optimized parameters score at least as well as 50 random ones."""
        def evaluate(penalty, threshold):
            """Score the model after setting the given penalty and threshold."""
            self.model.document_pair_aligner.penalty = penalty
            self.model.threshold = threshold
            predicted = self.model.align_indexes(self.A, self.B)
            # Only the first value returned by F_score is compared.
            return F_score(predicted, self.correct_alignments)[0]

        random.seed(hash("12345"))
        self.model.optimize_gap_penalty_and_threshold(self.A, self.B,
                                                      self.correct_alignments)
        best_score = evaluate(self.model.document_pair_aligner.penalty,
                              self.model.threshold)
        for _ in xrange(50):
            penalty = random.uniform(self.min_, self.max_ / 2.0)
            threshold = random.uniform(self.min_, self.max_)
            score = evaluate(penalty, threshold)
            self.assertGreaterEqual(best_score, score)
示例#10
0
class TestYalignModel(unittest.TestCase):
    """Tests covering persistence, alignment and tuning of a YalignModel."""

    def setUp(self):
        """Build the training data, score bounds and model used by each test."""
        # NOTE(review): this seed is only reproducible while str hashing is
        # not randomized (the Python 2 default); confirm if that matters.
        random.seed(hash("Y U NO?"))
        base_path = os.path.dirname(os.path.abspath(__file__))
        word_scores = os.path.join(base_path, "data",
                                   "test_word_scores_big.csv")
        parallel_corpus = os.path.join(base_path, "data", "parallel-en-es.txt")
        A, B = parallel_corpus_to_documents(parallel_corpus)
        A = A[:25]
        B = B[:25]
        self.alignments = list(training_alignments_from_documents(A, B))
        # The result is unpacked directly; wrapping it in a list adds nothing.
        self.A, self.B, self.correct_alignments = \
            training_scrambling_from_documents(A, B)
        # Word score
        word_pair_score = WordPairScore(word_scores)
        # Sentence Score
        sentence_pair_score = SentencePairScore()
        sentence_pair_score.train(self.alignments, word_pair_score)
        # Yalign model: initial gap penalty is the midpoint of the bounds.
        self.min_ = sentence_pair_score.min_bound
        self.max_ = sentence_pair_score.max_bound
        gap_penalty = (self.min_ + self.max_) / 2.0
        document_aligner = SequenceAligner(sentence_pair_score, gap_penalty)
        self.model = YalignModel(document_aligner, 1)

    def _make_tmp_folder(self):
        """Create a temporary folder and schedule its removal after the test."""
        import shutil  # local import: the file's import block is not in view

        tmp_folder = tempfile.mkdtemp()
        # Without this the folder created by mkdtemp() would be left behind
        # after every test run.
        self.addCleanup(shutil.rmtree, tmp_folder, ignore_errors=True)
        return tmp_folder

    def test_save_file_created(self):
        """``save`` must create ``aligner.pickle`` and ``metadata.json``."""
        tmp_folder = self._make_tmp_folder()
        self.model.save(tmp_folder)
        model_path = os.path.join(tmp_folder, "aligner.pickle")
        metadata_path = os.path.join(tmp_folder, "metadata.json")
        self.assertTrue(os.path.exists(model_path))
        self.assertTrue(os.path.exists(metadata_path))

    def test_save_load_and_align(self):
        """Saving then loading must preserve alignment, threshold and penalty."""
        doc1 = [Sentence([u"House"]), Sentence([u"asoidfhuioasgh"])]
        doc2 = [Sentence([u"Casa"])]
        result_before_save = self.model.align(doc1, doc2)

        # Save
        tmp_folder = self._make_tmp_folder()
        self.model.save(tmp_folder)

        # Load
        new_model = YalignModel.load(tmp_folder)
        result_after_load = new_model.align(doc1, doc2)
        self.assertEqual(result_before_save, result_after_load)
        self.assertEqual(self.model.threshold, new_model.threshold)
        self.assertEqual(self.model.document_pair_aligner.penalty,
                         new_model.document_pair_aligner.penalty)

    def test_reasonable_alignment(self):
        """Aligning must pair the first sentence of each document."""
        doc1 = [Sentence([u"House"]), Sentence([u"asoidfhuioasgh"])]
        doc2 = [Sentence([u"Casa"])]
        result = self.model.align(doc1, doc2)
        # Convert to plain token lists so the comparison ignores the
        # Sentence wrapper.
        result = [(list(x), list(y)) for x, y in result]
        self.assertIn((list(doc1[0]), list(doc2[0])), result)

    def test_optimize_gap_penalty_and_threshold_finishes(self):
        """Smoke test: the optimization must return without raising."""
        self.model.optimize_gap_penalty_and_threshold(self.A, self.B,
                                                      self.correct_alignments)

    def test_optimize_gap_penalty_and_threshold_is_best(self):
        """No randomly drawn (penalty, threshold) may beat the optimized pair.

        Fifty candidates are sampled, the penalty from
        ``[min_, max_ / 2.0]`` and the threshold from ``[min_, max_]``.
        """
        def evaluate(penalty, threshold):
            """Apply the parameters to the model and return its score."""
            self.model.document_pair_aligner.penalty = penalty
            self.model.threshold = threshold
            predicted = self.model.align_indexes(self.A, self.B)
            # Only the first value returned by F_score is compared.
            return F_score(predicted, self.correct_alignments)[0]

        random.seed(hash("12345"))
        self.model.optimize_gap_penalty_and_threshold(self.A, self.B,
                                                      self.correct_alignments)
        best_score = evaluate(self.model.document_pair_aligner.penalty,
                              self.model.threshold)
        for _ in xrange(50):
            penalty = random.uniform(self.min_, self.max_ / 2.0)
            threshold = random.uniform(self.min_, self.max_)
            score = evaluate(penalty, threshold)
            self.assertGreaterEqual(best_score, score)