def test_score_order(self):
    # A real translation pair should score lower (better) than a
    # mismatched pair.
    a = Sentence(u"Call History .".split())
    b = Sentence(u"Historial de llamadas .".split())
    score1 = self.score(a, b)
    a = Sentence(u"Replace the cover .".split())
    b = Sentence(u"Vuelva a ingresar un nuevo código de bloqueo .".split())
    score2 = self.score(a, b)
    self.assertLess(score1, score2)
def test_sample_values(self):
    A, B = sentences([u'A', u'B']), sentences([u'Y', u'Z'])
    samples = list(_misaligned_samples(A, B, [(0, 0), (1, 1)]))
    s0 = SentencePair(Sentence([u'A']), Sentence([u'Z']))
    s1 = SentencePair(Sentence([u'B']), Sentence([u'Y']))
    self.assertEqual(2, len(samples))
    for sample in samples:
        self.assertIn(sample, [s0, s1])
def test_sample_values(self):
    A, B = sentences([u'A', u'B']), sentences([u'Y', u'Z'])
    # Note: alignments swapped so A -> Z and B -> Y
    samples = list(_aligned_samples(A, B, [(0, 1), (1, 0)]))
    s0 = SentencePair(Sentence([u'A']), Sentence([u'Z']), aligned=True)
    s1 = SentencePair(Sentence([u'B']), Sentence([u'Y']), aligned=True)
    self.assertEqual(2, len(samples))
    self.assertEqual([s0, s1], samples)
def test_generates_numbers(self):
    a = Sentence(u"house you".split())
    b = Sentence(u"casa usted".split())
    x = self.score(a, b)
    self.assertIsInstance(x, (int, float))
    a = Sentence(u"Valar Morghulis".split())
    b = Sentence(u"Dracarys".split())
    x = self.score(a, b)
    self.assertIsInstance(x, (int, float))
def test_number_of_word_pair_scores_better_than_all_mismatches(self):
    a = Sentence(u"house µa µb µc µd".split())
    b = Sentence(u"casa µ1 µ2 µ3 µ4".split())
    s1 = self.score.problem.number_of_word_pair_scores(SentencePair(a, b))
    c = Sentence(u"µx µa µb µc µd".split())
    d = Sentence(u"µ5 µ1 µ2 µ3 µ4".split())
    s2 = self.score.problem.number_of_word_pair_scores(SentencePair(c, d))
    self.assertGreater(s1, s2)
def tokenize(text, language="en"): """ Returns a Sentence with Words (ie, a list of unicode objects) """ if not isinstance(text, unicode): raise ValueError("Can only tokenize unicode strings") return Sentence(_tokenizers[language].tokenize(text), text=text)
def test_save_load_and_align(self):
    doc1 = [Sentence([u"House"]), Sentence([u"asoidfhuioasgh"])]
    doc2 = [Sentence([u"Casa"])]
    result_before_save = self.model.align(doc1, doc2)
    # Save
    tmp_folder = tempfile.mkdtemp()
    self.model.save(tmp_folder)
    # Load
    new_model = YalignModel.load(tmp_folder)
    result_after_load = new_model.align(doc1, doc2)
    self.assertEqual(result_before_save, result_after_load)
    self.assertEqual(self.model.threshold, new_model.threshold)
    self.assertEqual(self.model.document_pair_aligner.penalty,
                     new_model.document_pair_aligner.penalty)
def srt_to_document(text, lang="en"): """ Convert a string of srt into a list of Sentences. """ text = UnicodeDammit(text).markup d = [] for m in SRT_REGEX.finditer(text): sent = m.group(1) sent = SRT_PRE_IGNORE.sub("", sent) sent = Sentence(x for x in tokenize(sent, lang) if x not in SRT_POST_IGNORE) d.append(sent) return d
def _sentence_from_csv_elem(elem, label, labels):
    # elem is a CSV row; labels maps a column label to its index in the row.
    words = elem[labels[label]].decode("utf-8").split()
    sentence = Sentence(words)
    sentence.check_is_tokenized()
    return sentence
def _document(lines):
    doc = [Sentence(line.split()) for line in lines]
    for sentence in doc:
        sentence.check_is_tokenized()
    return doc
def sentences(xs):
    return [Sentence([unicode(x)]) for x in xs]
def test_reasonable_alignment(self):
    doc1 = [Sentence([u"House"]), Sentence([u"asoidfhuioasgh"])]
    doc2 = [Sentence([u"Casa"])]
    result = self.model.align(doc1, doc2)
    result = [(list(x), list(y)) for x, y in result]
    self.assertIn((list(doc1[0]), list(doc2[0])), result)