예제 #1
0
 def setUp(self):
     """Fixture: a SentencePairScore trained on the bundled en-es corpus."""
     here = os.path.dirname(os.path.abspath(__file__))
     data_dir = os.path.join(here, "data")
     # Word-level scores feed the sentence-level classifier.
     scores_csv = os.path.join(data_dir, "test_word_scores_big.csv")
     corpus = os.path.join(data_dir, "parallel-en-es.txt")
     docs_a, docs_b = parallel_corpus_to_documents(corpus)
     self.alignments = [pair for pair in
                        training_alignments_from_documents(docs_a, docs_b)]
     self.score = SentencePairScore()
     self.score.train(self.alignments, WordPairScore(scores_csv))
예제 #2
0
    def test_correlation_values(self):
        """Every feature correlation must be a valid coefficient in [-1, 1].

        NaN entries (e.g. constant features) are skipped.
        """
        base_path = os.path.dirname(os.path.abspath(__file__))
        parallel_corpus = os.path.join(base_path, "data", "parallel-en-es.txt")
        word_scores = os.path.join(base_path, "data", "test_word_scores_big.csv")
        A, B = parallel_corpus_to_documents(parallel_corpus)
        self.alignments = list(training_alignments_from_documents(A, B))
        # Word score
        word_pair_score = WordPairScore(word_scores)
        # Sentence Score
        sentence_pair_score = SentencePairScore()
        sentence_pair_score.train(self.alignments, word_pair_score)

        cor = correlation(sentence_pair_score.classifier)
        # Fixes: `value is not numpy.nan` is an *identity* check that only
        # catches the numpy.nan singleton, so computed NaNs slipped through
        # and made `-1 <= nan <= 1` fail; use numpy.isnan instead.
        # `iteritems()` is Python-2-only; `items()` works on both versions.
        for attr, value in cor.items():
            if not numpy.isnan(value):
                self.assertTrue(-1 <= value <= 1)
예제 #3
0
class TestSentencePairScore(unittest.TestCase):
    """Exercises SentencePairScore trained on the sample en-es corpus."""

    def setUp(self):
        here = os.path.dirname(os.path.abspath(__file__))
        data_dir = os.path.join(here, "data")
        corpus = os.path.join(data_dir, "parallel-en-es.txt")
        docs_a, docs_b = parallel_corpus_to_documents(corpus)
        self.alignments = [pair for pair in
                           training_alignments_from_documents(docs_a, docs_b)]
        word_level = WordPairScore(
            os.path.join(data_dir, "test_word_scores_big.csv"))
        self.score = SentencePairScore()
        self.score.train(self.alignments, word_level)

    def test_generates_numbers(self):
        # The score must be numeric both for in-vocabulary sentences and
        # for sentences made of words the model never saw.
        pairs = [
            (u"house you", u"casa usted"),
            (u"Valar Morghulis", u"Dracarys"),
        ]
        for left, right in pairs:
            value = self.score(Sentence(left.split()),
                               Sentence(right.split()))
            self.assertIsInstance(value, (int, float))

    def test_score_order(self):
        # A correct translation pair should score lower (better) than a
        # mismatched pair.
        good = self.score(Sentence(u"Call History .".split()),
                          Sentence(u"Historial de llamadas .".split()))
        bad = self.score(
            Sentence(u"Replace the cover .".split()),
            Sentence(u"Vuelva a ingresar un nuevo código de bloqueo .".split()))
        self.assertLess(good, bad)

    def test_score_in_bounds(self):
        for pair in self.alignments:
            value = self.score(*pair)
            self.assertGreaterEqual(value, self.score.min_bound)
            self.assertLessEqual(value, self.score.max_bound)

    def test_number_of_word_pair_scores_better_than_all_mismatchs(self):
        # A pair sharing one dictionary word yields more word-pair scores
        # than a pair made entirely of unknown tokens.
        count = self.score.problem.number_of_word_pair_scores
        with_match = count(SentencePair(Sentence(u"house µa µb µc µd".split()),
                                        Sentence(u"casa  µ1 µ2 µ3 µ4".split())))
        no_match = count(SentencePair(Sentence(u"µx µa µb µc µd".split()),
                                      Sentence(u"µ5 µ1 µ2 µ3 µ4".split())))
        self.assertGreater(with_match, no_match)
예제 #4
0
    def test_correlation_values(self):
        """Every feature correlation must be a valid coefficient in [-1, 1].

        NaN entries (e.g. constant features) are skipped.
        """
        base_path = os.path.dirname(os.path.abspath(__file__))
        parallel_corpus = os.path.join(base_path, "data", "parallel-en-es.txt")
        word_scores = os.path.join(base_path, "data",
                                   "test_word_scores_big.csv")
        A, B = parallel_corpus_to_documents(parallel_corpus)
        self.alignments = list(training_alignments_from_documents(A, B))
        # Word score
        word_pair_score = WordPairScore(word_scores)
        # Sentence Score
        sentence_pair_score = SentencePairScore()
        sentence_pair_score.train(self.alignments, word_pair_score)

        cor = correlation(sentence_pair_score.classifier)
        # Fixes: `value is not numpy.nan` is an *identity* check that only
        # catches the numpy.nan singleton, so computed NaNs slipped through
        # and made `-1 <= nan <= 1` fail; use numpy.isnan instead.
        # `iteritems()` is Python-2-only; `items()` works on both versions.
        for attr, value in cor.items():
            if not numpy.isnan(value):
                self.assertTrue(-1 <= value <= 1)
예제 #5
0
 def setUp(self):
     """Build a small YalignModel from the first 30 parallel sentences."""
     self.parallel_corpus = os.path.join(data_path, "parallel-en-es.txt")
     A, B = parallel_corpus_to_documents(self.parallel_corpus)
     # Keep the fixture fast: train on a 30-sentence prefix only.
     self.document_a = A[:30]
     self.document_b = B[:30]
     training = training_alignments_from_documents(self.document_a,
                                                   self.document_b)
     scorer = SentencePairScore()
     scorer.train(training,
                  WordPairScore(os.path.join(data_path,
                                             "test_word_scores_big.csv")))
     # 0.49 is the gap penalty handed to the sequence aligner.
     self.model = YalignModel(SequenceAligner(scorer, 0.49))
class TestSentencePairScore(unittest.TestCase):
    """Tests for SentencePairScore over the bundled en-es sample data."""

    def setUp(self):
        base = os.path.dirname(os.path.abspath(__file__))
        corpus_path = os.path.join(base, "data", "parallel-en-es.txt")
        scores_path = os.path.join(base, "data", "test_word_scores_big.csv")
        dictionary = WordPairScore(scores_path)
        doc_a, doc_b = parallel_corpus_to_documents(corpus_path)
        self.alignments = list(
            training_alignments_from_documents(doc_a, doc_b))
        self.score = SentencePairScore()
        self.score.train(self.alignments, dictionary)

    def test_generates_numbers(self):
        # Known vocabulary must produce a numeric score...
        result = self.score(Sentence(u"house you".split()),
                            Sentence(u"casa usted".split()))
        self.assertIsInstance(result, (int, float))
        # ...and so must completely unseen vocabulary.
        result = self.score(Sentence(u"Valar Morghulis".split()),
                            Sentence(u"Dracarys".split()))
        self.assertIsInstance(result, (int, float))

    def test_score_order(self):
        # Lower score means a better translation pair.
        matched = self.score(Sentence(u"Call History .".split()),
                             Sentence(u"Historial de llamadas .".split()))
        mismatched = self.score(
            Sentence(u"Replace the cover .".split()),
            Sentence(u"Vuelva a ingresar un nuevo código de bloqueo .".split()))
        self.assertLess(matched, mismatched)

    def test_score_in_bounds(self):
        for pair in self.alignments:
            value = self.score(*pair)
            self.assertGreaterEqual(value, self.score.min_bound)
            self.assertLessEqual(value, self.score.max_bound)

    def test_number_of_word_pair_scores_better_than_all_mismatchs(self):
        # One dictionary match beats zero matches in word-pair-score count.
        count = self.score.problem.number_of_word_pair_scores
        one_match = count(SentencePair(Sentence(u"house µa µb µc µd".split()),
                                       Sentence(u"casa  µ1 µ2 µ3 µ4".split())))
        zero_match = count(SentencePair(Sentence(u"µx µa µb µc µd".split()),
                                        Sentence(u"µ5 µ1 µ2 µ3 µ4".split())))
        self.assertGreater(one_match, zero_match)
예제 #7
0
 def setUp(self):
     """Assemble a YalignModel fixture from a 30-sentence corpus prefix."""
     self.parallel_corpus = os.path.join(data_path, "parallel-en-es.txt")
     A, B = parallel_corpus_to_documents(self.parallel_corpus)
     self.document_a = A[:30]
     self.document_b = B[:30]
     pairs = training_alignments_from_documents(self.document_a,
                                                self.document_b)
     word_score = WordPairScore(
         os.path.join(data_path, "test_word_scores_big.csv"))
     sentence_score = SentencePairScore()
     sentence_score.train(pairs, word_score)
     # Sequence aligner with a fixed gap penalty of 0.49.
     self.model = YalignModel(SequenceAligner(sentence_score, 0.49))
 def setUp(self):
     """Fixture: a SentencePairScore trained on the en-es test corpus."""
     here = os.path.dirname(os.path.abspath(__file__))
     corpus = os.path.join(here, "data", "parallel-en-es.txt")
     dictionary = WordPairScore(
         os.path.join(here, "data", "test_word_scores_big.csv"))
     doc_a, doc_b = parallel_corpus_to_documents(corpus)
     self.alignments = list(
         training_alignments_from_documents(doc_a, doc_b))
     self.score = SentencePairScore()
     self.score.train(self.alignments, dictionary)
예제 #9
0
def basic_model(corpus_filepath,
                word_scores_filepath,
                lang_a=None,
                lang_b=None):
    """
    Creates and trains a `YalignModel` with the basic configuration and
    default values.

    `corpus_filepath` is the path to a parallel corpus used for training,
    it can be:
        - a csv file with two sentences and alignment information, or
        - a tmx file with correct alignments (a regular parallel corpus), or
        - a text file with interleaved sentences (one line in language A, the
          next in language B)

    `word_scores_filepath` is the path to a csv file (possibly gzipped) with
    word dictionary data. (for ex. "house,casa,0.91").

    `lang_a` and `lang_b` are required for the tokenizer in the case of a tmx
    file. In the other cases it's not necessary because it's assumed that the
    words are already tokenized.
    """
    # Word score
    word_pair_score = WordPairScore(word_scores_filepath)

    # Only tmx input needs language codes (for tokenization); the other
    # formats are assumed to be pre-tokenized.
    if corpus_filepath.endswith(".tmx"):
        A, B = tmx_file_to_documents(corpus_filepath, lang_a, lang_b)
    else:
        A, B = parallel_corpus_to_documents(corpus_filepath)
    alignments = training_alignments_from_documents(A, B)

    sentence_pair_score = SentencePairScore()
    sentence_pair_score.train(alignments, word_pair_score)
    # Yalign model
    metadata = {"lang_a": lang_a, "lang_b": lang_b}
    # Initial values only; both are re-tuned by
    # optimize_gap_penalty_and_threshold below.
    gap_penalty = 0.49
    threshold = 1.0
    document_aligner = SequenceAligner(sentence_pair_score, gap_penalty)
    model = YalignModel(document_aligner, threshold, metadata=metadata)
    # Tune penalty/threshold on a scrambled sample so the stored values
    # reflect real alignment quality rather than the hard-coded defaults.
    A, B, correct = training_scrambling_from_documents(
        A[:OPTIMIZE_SAMPLE_SET_SIZE], B[:OPTIMIZE_SAMPLE_SET_SIZE])
    model.optimize_gap_penalty_and_threshold(A, B, correct)
    return model
예제 #10
0
 def setUp(self):
     """Train a YalignModel on a 25-sentence sample, deterministically."""
     # Fixed seed so the scrambling below is reproducible.
     random.seed(hash("Y U NO?"))
     here = os.path.dirname(os.path.abspath(__file__))
     data_dir = os.path.join(here, "data")
     A, B = parallel_corpus_to_documents(
         os.path.join(data_dir, "parallel-en-es.txt"))
     A, B = A[:25], B[:25]
     self.alignments = list(training_alignments_from_documents(A, B))
     self.A, self.B, self.correct_alignments = \
         list(training_scrambling_from_documents(A, B))
     scorer = SentencePairScore()
     scorer.train(self.alignments,
                  WordPairScore(os.path.join(data_dir,
                                             "test_word_scores_big.csv")))
     self.min_ = scorer.min_bound
     self.max_ = scorer.max_bound
     # Start the gap penalty midway between the score bounds.
     midpoint = (self.min_ + self.max_) / 2.0
     self.model = YalignModel(SequenceAligner(scorer, midpoint), 1)
예제 #11
0
def basic_model(corpus_filepath, word_scores_filepath,
                lang_a=None, lang_b=None):
    """
    Creates and trains a `YalignModel` with the basic configuration and
    default values.

    `corpus_filepath` is the path to a parallel corpus used for training,
    it can be:
        - a csv file with two sentences and alignment information, or
        - a tmx file with correct alignments (a regular parallel corpus), or
        - a text file with interleaved sentences (one line in language A, the
          next in language B)

    `word_scores_filepath` is the path to a csv file (possibly gzipped) with
    word dictionary data. (for ex. "house,casa,0.91").

    `lang_a` and `lang_b` are required for the tokenizer in the case of a tmx
    file. In the other cases it's not necessary because it's assumed that the
    words are already tokenized.
    """
    # Word score
    word_pair_score = WordPairScore(word_scores_filepath)

    # Only tmx input needs language codes (for tokenization); the other
    # formats are assumed to be pre-tokenized.
    if corpus_filepath.endswith(".tmx"):
        A, B = tmx_file_to_documents(corpus_filepath, lang_a, lang_b)
    else:
        A, B = parallel_corpus_to_documents(corpus_filepath)
    alignments = training_alignments_from_documents(A, B)

    sentence_pair_score = SentencePairScore()
    sentence_pair_score.train(alignments, word_pair_score)
    # Yalign model
    metadata = {"lang_a": lang_a, "lang_b": lang_b}
    # Initial values only; both are re-tuned by
    # optimize_gap_penalty_and_threshold below.
    gap_penalty = 0.49
    threshold = 1.0
    document_aligner = SequenceAligner(sentence_pair_score, gap_penalty)
    model = YalignModel(document_aligner, threshold, metadata=metadata)
    # Tune penalty/threshold on a scrambled sample so the stored values
    # reflect real alignment quality rather than the hard-coded defaults.
    A, B, correct = training_scrambling_from_documents(
        A[:OPTIMIZE_SAMPLE_SET_SIZE], B[:OPTIMIZE_SAMPLE_SET_SIZE])
    model.optimize_gap_penalty_and_threshold(A, B, correct)
    return model
예제 #12
0
 def setUp(self):
     """Deterministic YalignModel fixture over 25 parallel sentences."""
     # Seed the RNG first so the corpus scrambling is reproducible.
     random.seed(hash("Y U NO?"))
     base = os.path.dirname(os.path.abspath(__file__))
     corpus_file = os.path.join(base, "data", "parallel-en-es.txt")
     docs = parallel_corpus_to_documents(corpus_file)
     A, B = docs[0][:25], docs[1][:25]
     self.alignments = list(training_alignments_from_documents(A, B))
     self.A, self.B, self.correct_alignments = \
         list(training_scrambling_from_documents(A, B))
     dictionary = WordPairScore(os.path.join(base, "data",
                                             "test_word_scores_big.csv"))
     scorer = SentencePairScore()
     scorer.train(self.alignments, dictionary)
     self.min_ = scorer.min_bound
     self.max_ = scorer.max_bound
     # Use the midpoint of the score bounds as the initial gap penalty.
     penalty = (self.min_ + self.max_) / 2.0
     self.model = YalignModel(SequenceAligner(scorer, penalty), 1)
예제 #13
0
def default_sentence_pair_score():
    """Train a SentencePairScore on the small test fixtures and save it.

    Returns a (classifier, filepath) tuple where `filepath` is a temporary
    file the trained classifier was saved to; the caller is responsible
    for deleting it.
    """
    base_path = os.path.dirname(os.path.abspath(__file__))
    word_scores = os.path.join(base_path, "data", "test_word_scores.csv")
    # Bug fix: mkstemp() returns an *open* OS-level descriptor; the
    # original discarded it, leaking one fd per call. Close it — only
    # the path is needed here.
    fd, classifier_filepath = tempfile.mkstemp()
    os.close(fd)
    training_file = os.path.join(base_path, "data", "test_training.csv")
    pairs = parse_training_file(training_file)
    classifier = SentencePairScore()
    classifier.train(pairs, WordPairScore(word_scores))
    classifier.save(classifier_filepath)
    return classifier, classifier_filepath