Example #1
    def test_score_order(self):
        a = Sentence(u"Call History .".split())
        b = Sentence(u"Historial de llamadas .".split())
        score1 = self.score(a, b)
        a = Sentence(u"Replace the cover .".split())
        b = Sentence(u"Vuelva a ingresar un nuevo código de bloqueo .".split())
        score2 = self.score(a, b)
        self.assertLess(score1, score2)
    def test_sample_values(self):
        A, B = sentences([u'A', u'B']), sentences([u'Y', u'Z'])
        samples = list(_misaligned_samples(A, B, [(0, 0), (1, 1)]))
        s0 = SentencePair(Sentence([u'A']), Sentence([u'Z']))
        s1 = SentencePair(Sentence([u'B']), Sentence([u'Y']))
        self.assertEqual(2, len(samples))
        for sample in samples:
            self.assertIn(sample, [s0, s1])
    def test_sample_values(self):
        A, B = sentences([u'A', u'B']), sentences([u'Y', u'Z'])
        samples = list(_aligned_samples(A, B, [(0, 1), (1, 0)]))
        # Note alignments swapped so A -> Z and B -> Y
        s0 = SentencePair(Sentence([u'A']), Sentence([u'Z']), aligned=True)
        s1 = SentencePair(Sentence([u'B']), Sentence([u'Y']), aligned=True)
        self.assertEqual(2, len(samples))
        self.assertEqual([s0, s1], samples)
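
A minimal sketch of the SentencePair usage the two sample tests above rely on; the yalign.datatypes import path is an assumption inferred from the identifiers in these tests.

from yalign.datatypes import Sentence, SentencePair   # assumed import path

pair = SentencePair(Sentence([u"House"]), Sentence([u"Casa"]), aligned=True)
# aligned=True marks a gold (correct) pair; the _misaligned_samples test above
# simply omits the flag.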
Example #4
    def test_generates_numbers(self):
        a = Sentence(u"house you".split())
        b = Sentence(u"casa usted".split())
        x = self.score(a, b)
        self.assertIsInstance(x, (int, float))
        a = Sentence(u"Valar Morghulis".split())
        b = Sentence(u"Dracarys".split())
        x = self.score(a, b)
        self.assertIsInstance(x, (int, float))
Example #5
    def test_number_of_word_pair_scores_better_than_all_mismatchs(self):
        a = Sentence(u"house µa µb µc µd".split())
        b = Sentence(u"casa  µ1 µ2 µ3 µ4".split())
        s1 = self.score.problem.number_of_word_pair_scores(SentencePair(a, b))

        c = Sentence(u"µx µa µb µc µd".split())
        d = Sentence(u"µ5 µ1 µ2 µ3 µ4".split())
        s2 = self.score.problem.number_of_word_pair_scores(SentencePair(c, d))

        self.assertGreater(s1, s2)
Example #6
def tokenize(text, language="en"):
    """
    Returns a Sentence with Words (ie, a list of unicode objects)
    """
    if not isinstance(text, unicode):
        raise ValueError("Can only tokenize unicode strings")
    return Sentence(_tokenizers[language].tokenize(text), text=text)
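
A hedged usage sketch for tokenize() above; the yalign.input_conversion import path is an assumption, and the exact token split depends on the language tokenizer.

from yalign.input_conversion import tokenize   # assumed import path

sentence = tokenize(u"Replace the cover.", language="en")
# `sentence` is a Sentence (a list of unicode tokens), roughly
# [u"Replace", u"the", u"cover", u"."]; passing a non-unicode str raises
# ValueError, as the isinstance check above shows.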
Example #7
    def test_save_load_and_align(self):
        doc1 = [Sentence([u"House"]), Sentence([u"asoidfhuioasgh"])]
        doc2 = [Sentence([u"Casa"])]
        result_before_save = self.model.align(doc1, doc2)

        # Save
        tmp_folder = tempfile.mkdtemp()
        self.model.save(tmp_folder)

        # Load
        new_model = YalignModel.load(tmp_folder)
        result_after_load = new_model.align(doc1, doc2)
        self.assertEqual(result_before_save, result_after_load)
        self.assertEqual(self.model.threshold, new_model.threshold)
        self.assertEqual(self.model.document_pair_aligner.penalty,
                         new_model.document_pair_aligner.penalty)
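
A sketch of the save/load round trip this test exercises, assuming a pre-trained model is passed in and that YalignModel.load() is a classmethod, as the test implies.

import tempfile

from yalign.yalignmodel import YalignModel   # assumed import path


def align_roundtrip(model, doc1, doc2):
    """Persist `model`, reload it and align the same documents with both copies."""
    folder = tempfile.mkdtemp()
    model.save(folder)                    # keeps threshold, aligner penalty, etc.
    restored = YalignModel.load(folder)
    return model.align(doc1, doc2), restored.align(doc1, doc2)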
Example #8
def srt_to_document(text, lang="en"):
    """ Convert a string of srt into a list of Sentences. """
    text = UnicodeDammit(text).markup
    d = []
    for m in SRT_REGEX.finditer(text):
        sent = m.group(1)
        sent = SRT_PRE_IGNORE.sub("", sent)
        sent = Sentence(x for x in tokenize(sent, lang)
                        if x not in SRT_POST_IGNORE)
        d.append(sent)
    return d
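
A hedged usage sketch for srt_to_document(); the subtitle blob is illustrative and the exact tokens depend on the SRT_*_IGNORE patterns above.

srt_text = ("1\n"
            "00:00:01,000 --> 00:00:03,500\n"
            "Call History.\n"
            "\n"
            "2\n"
            "00:00:04,000 --> 00:00:06,500\n"
            "Replace the cover.\n")

document = srt_to_document(srt_text, lang="en")
# `document` is a list with one Sentence per subtitle block, roughly
# [Sentence([u"Call", u"History", u"."]),
#  Sentence([u"Replace", u"the", u"cover", u"."])]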
Example #9
def _sentence_from_csv_elem(elem, label, labels):
    words = elem[labels[label]].decode("utf-8").split()
    sentence = Sentence(words)
    sentence.check_is_tokenized()
    return sentence
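
An illustrative call, assuming `elem` is a raw csv row of UTF-8 byte strings and `labels` maps column names to indexes, as the indexing and decode("utf-8") above suggest.

labels = {"source": 0, "target": 1}   # column name -> index (assumed layout)
row = ["Call History .", "Historial de llamadas ."]

source = _sentence_from_csv_elem(row, "source", labels)
target = _sentence_from_csv_elem(row, "target", labels)
# each result is a Sentence of unicode words; check_is_tokenized() guards
# against untokenized input.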
Example #10
def _document(lines):
    doc = [Sentence(line.split()) for line in lines]
    for sentence in doc:
        sentence.check_is_tokenized()
    return doc
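
A minimal sketch of the input _document() expects: an iterable of already tokenized unicode lines, tokens separated by single spaces.

doc = _document([u"Call History .", u"Replace the cover ."])
# doc[0] == Sentence([u"Call", u"History", u"."]) and so on; each Sentence has
# already passed check_is_tokenized().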
Example #11
def _sentence_from_csv_elem(elem, label, labels):
    words = elem[labels[label]].decode("utf-8").split()
    sentence = Sentence(words)
    sentence.check_is_tokenized()
    return sentence
def sentences(xs):
    return [Sentence([unicode(x)]) for x in xs]
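
This is the helper behind the sample tests near the top of the page; a quick sketch of what it produces.

A, B = sentences(["A", "B"]), sentences(["Y", "Z"])
# A == [Sentence([u"A"]), Sentence([u"B"])]: one single-word Sentence per item,
# the shape expected by the _aligned_samples/_misaligned_samples tests above.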
Example #13
    def test_reasonable_alignment(self):
        doc1 = [Sentence([u"House"]), Sentence([u"asoidfhuioasgh"])]
        doc2 = [Sentence([u"Casa"])]
        result = self.model.align(doc1, doc2)
        result = [(list(x), list(y)) for x, y in result]
        self.assertIn((list(doc1[0]), list(doc2[0])), result)
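
A sketch of how the alignment result above can be unpacked, assuming model.align() yields (Sentence, Sentence) pairs as the list comprehension in the test implies.

def aligned_token_pairs(model, doc1, doc2):
    """Return the alignment as plain (token list, token list) tuples."""
    return [(list(a), list(b)) for a, b in model.align(doc1, doc2)]

# For the documents above the expected entry is ([u"House"], [u"Casa"]);
# the test only checks that this pair is present in the result.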