def test_language_shift(value, expected_language, normalized):
    """Check properties, serialization, equality and hashing of a LanguageShift."""
    token = LanguageShift.of(value)
    twin = LanguageShift.of(value)
    unrelated = ValueToken.of(r"%bar")

    # Plain token properties.
    assert token.value == value
    assert token.clean_value == value
    assert token.get_key() == f"LanguageShift⁝{value}"
    assert token.lemmatizable is False
    assert token.normalized == normalized
    assert token.language == expected_language

    # Serialized representation.
    assert_token_serialization(
        token,
        dict(
            type="LanguageShift",
            normalized=normalized,
            language=token.language.name,
        ),
    )

    # Equal to another shift with the same value, different from a
    # ValueToken even when that token carries the very same value.
    assert token == twin
    assert hash(token) == hash(twin)
    assert token != unrelated
    assert hash(token) != hash(unrelated)
    assert token != ValueToken.of(value)
def test_parse_normalized_akkadain_shift() -> None:
    """Parse a line that shifts to normalized Akkadian (%n) and then to %sux."""
    word = "ha"
    line = f"1. {word} %n {word} %sux {word}"

    expected_tokens = (
        Word.of((Reading.of_name(word),), DEFAULT_LANGUAGE),
        LanguageShift.normalized_akkadian(),
        AkkadianWord.of((ValueToken.of(word),)),
        LanguageShift.of("%sux"),
        Word.of((Reading.of_name(word),), Language.SUMERIAN),
    )
    expected_line = TextLine.of_iterable(LineNumber(1), expected_tokens)
    expected = Text((expected_line,))

    parsed = parse_atf_lark(line)

    assert parsed.lines == expected.lines
def test_text_line_of_iterable(code: str, language: Language) -> None:
    """Build a TextLine from raw tokens and compare content, key and ATF."""
    broken_away = frozenset({EnclosureType.BROKEN_AWAY})

    tokens = [
        Word.of([Reading.of_name("first")]),
        LanguageShift.of(code),
        Word.of([Reading.of_name("second")]),
        LanguageShift.of("%sb"),
        LoneDeterminative.of([Determinative.of([Reading.of_name("third")])]),
        Word.of([BrokenAway.open(), Reading.of_name("fourth")]),
        UnknownNumberOfSigns.of(),
        BrokenAway.close(),
    ]

    # The reading inside the open broken-away span is expected to carry the
    # BROKEN_AWAY enclosure both on itself and on its value token.
    enclosed_fourth = Reading.of(
        (ValueToken(broken_away, ErasureState.NONE, "fourth"),)
    ).set_enclosure_type(broken_away)

    expected_tokens = (
        Word.of([Reading.of_name("first")], DEFAULT_LANGUAGE),
        LanguageShift.of(code),
        Word.of([Reading.of_name("second")], language),
        LanguageShift.of("%sb"),
        LoneDeterminative.of(
            [Determinative.of([Reading.of_name("third")])],
            Language.AKKADIAN,
        ),
        Word.of([BrokenAway.open(), enclosed_fourth], DEFAULT_LANGUAGE),
        UnknownNumberOfSigns(broken_away, ErasureState.NONE),
        BrokenAway.close().set_enclosure_type(broken_away),
    )

    line = TextLine.of_iterable(LINE_NUMBER, tokens)

    assert line.line_number == LINE_NUMBER
    assert line.content == expected_tokens
    joined_keys = "⁚".join(token.get_key() for token in expected_tokens)
    assert line.key == f"TextLine⁞{line.atf}⟨{joined_keys}⟩"
    assert line.atf == f"1. first {code} second %sb {{third}} [fourth ...]"
def test_text_line_of_iterable_normalized() -> None:
    """Build a TextLine from a normalized-Akkadian shift followed by one word."""
    tokens = [
        LanguageShift.normalized_akkadian(),
        AkkadianWord.of((ValueToken.of("kur"),)),
    ]
    expected_tokens = (
        LanguageShift.normalized_akkadian(),
        AkkadianWord.of((ValueToken.of("kur"),)),
    )

    line = TextLine.of_iterable(LINE_NUMBER, tokens)

    assert line.content == expected_tokens
    joined_keys = "⁚".join(token.get_key() for token in expected_tokens)
    assert line.key == f"TextLine⁞{line.atf}⟨{joined_keys}⟩"
    assert line.atf == "1. %n kur"
def test_parse_atf_language_shifts(code: str, expected_language: Language) -> None:
    """Parse a line with two language shifts and check each word's language."""
    word = "ha-am"
    parts = [Reading.of_name("ha"), Joiner.hyphen(), Reading.of_name("am")]
    line = f"1. {word} {code} {word} %sb {word}"

    expected_tokens = (
        # Before any shift the default language applies.
        Word.of(parts, DEFAULT_LANGUAGE),
        LanguageShift.of(code),
        Word.of(parts, expected_language),
        LanguageShift.of("%sb"),
        Word.of(parts, Language.AKKADIAN),
    )
    expected = Text((TextLine.of_iterable(LineNumber(1), expected_tokens),))

    parsed = parse_atf_lark(line)

    assert parsed.lines == expected.lines
def expected_transliteration(language: Language) -> Sequence[Token]:
    """Return the expected tokens, with the first word set to ``language``.

    The word after the ``%es`` shift is built with ``Language.EMESAL`` and,
    like the tokens following it, carries the BROKEN_AWAY enclosure type.
    """
    broken_away = frozenset({EnclosureType.BROKEN_AWAY})

    first_word = Word.of([Reading.of_name("bu")], language)
    shift = LanguageShift.of("%es")
    enclosed_reading = Reading.of(
        (ValueToken(broken_away, ErasureState.NONE, "kur"),)
    ).set_enclosure_type(broken_away)
    emesal_word = Word.of([BrokenAway.open(), enclosed_reading], Language.EMESAL)
    unknown_signs = UnknownNumberOfSigns(broken_away, ErasureState.NONE)
    closing = BrokenAway.close().set_enclosure_type(broken_away)

    return (first_word, shift, emesal_word, unknown_signs, closing)
class LineVariantFactory(factory.Factory):
    """factory_boy factory that builds ``LineVariant`` objects.

    The reconstruction is a fixed token sequence; ``note`` and ``intertext``
    are picked by ``FuzzyChoice`` from two alternatives each, so they vary
    between generated objects.
    """

    class Meta:
        model = LineVariant

    class Params:
        # Helper declarations: a sequential manuscript id and a manuscript
        # line built with that id. ``manuscripts`` below refers to
        # ``manuscript`` through a SelfAttribute.
        manuscript_id = factory.Sequence(lambda n: n)
        manuscript = factory.SubFactory(
            ManuscriptLineFactory,
            manuscript_id=factory.SelfAttribute("..manuscript_id"),
        )

    # Fixed reconstruction starting with a normalized-Akkadian shift; the
    # last word spans the closing of the broken-away section and is flagged
    # as damaged.
    reconstruction = (
        LanguageShift.normalized_akkadian(),
        AkkadianWord.of((ValueToken.of("buāru"),)),
        MetricalFootSeparator.uncertain(),
        BrokenAway.open(),
        UnknownNumberOfSigns.of(),
        Caesura.certain(),
        AkkadianWord.of(
            (
                UnknownNumberOfSigns.of(),
                BrokenAway.close(),
                Joiner.hyphen(),
                ValueToken.of("buāru"),
            ),
            (Flag.DAMAGE,),
        ),
    )
    # Either no note or a single-part note line.
    note = factory.fuzzy.FuzzyChoice([None, NoteLine((StringPart("a note"),))])
    # One-element collection holding the manuscript declared in Params;
    # TupleFactory is passed as the list factory (presumably yielding a
    # tuple - confirm against TupleFactory's definition).
    manuscripts = factory.List([factory.SelfAttribute("..manuscript")], TupleFactory)
    # Either empty or a single string part.
    intertext = factory.fuzzy.FuzzyChoice([tuple(), (StringPart("bar"),)])
    # One parallel line from each of the three parallel-line factories.
    parallel_lines = factory.List(
        [
            factory.SubFactory(ParallelCompositionFactory),
            factory.SubFactory(ParallelTextFactory),
            factory.SubFactory(ParallelFragmentFactory),
        ],
        TupleFactory,
    )
LineNumber(1), [ Word.of(unique_lemma=(WordId("nu I"), ), parts=[Reading.of_name("bu")]) ], ), ), ( TextLine.of_iterable( LineNumber(1), [ Word.of(unique_lemma=(WordId("nu I"), ), parts=[Reading.of_name("bu")]) ], ), TextLine.of_iterable(LineNumber(1), [LanguageShift.of("%sux")]), TextLine.of_iterable(LineNumber(1), [LanguageShift.of("%sux")]), ), ( TextLine.of_iterable( LineNumber(1), [ Word.of(unique_lemma=(WordId("nu I"), ), parts=[Reading.of_name("bu")]) ], ), TextLine.of_iterable(LineNumber(1), [Word.of([Reading.of_name("mu")])]), TextLine.of_iterable(LineNumber(1), [Word.of([Reading.of_name("mu")])]), ),
class LemmatizedFragmentFactory(TransliteratedFragmentFactory):
    """Fragment factory whose fixed ``text`` has lemmatized words.

    Overrides ``text`` of ``TransliteratedFragmentFactory`` with a literal in
    which several words are created with a ``unique_lemma``. The text holds
    five text lines (numbered 1, 2, 3, 6 and 7) followed by one example each
    of the dollar-, at-, note- and parallel-line types.
    """

    text = Text((
        # Text line 1: no word carries a lemma.
        TextLine.of_iterable(
            LineNumber(1, True),
            (
                Word.of([UnidentifiedSign.of()]),
                Word.of([
                    Logogram.of_name(
                        "BA",
                        surrogate=[
                            Reading.of_name("ku"),
                            Joiner.hyphen(),
                            Reading.of_name("u", 4),
                        ],
                    )
                ]),
                Column.of(),
                Tabulation.of(),
                Word.of([
                    BrokenAway.open(),
                    UnknownNumberOfSigns.of(),
                    Joiner.hyphen(),
                    Reading.of_name("ku"),
                    BrokenAway.close(),
                    Joiner.hyphen(),
                    Reading.of_name("nu"),
                    Joiner.hyphen(),
                    Reading.of_name("ši"),
                ]),
                Variant.of(Divider.of(":"), Reading.of_name("ku")),
                Word.of([
                    BrokenAway.open(),
                    UnknownNumberOfSigns.of(),
                    BrokenAway.close(),
                ]),
                Column.of(2),
                Divider.of(":", ("@v", ), (Flag.DAMAGE, )),
                CommentaryProtocol.of("!qt"),
                Word.of([Number.of_name("10", flags=[Flag.DAMAGE])]),
            ),
        ),
        # Text line 2: three lemmatized words between the broken-away ends.
        TextLine.of_iterable(
            LineNumber(2, True),
            (
                Word.of([BrokenAway.open(), UnknownNumberOfSigns.of()]),
                Word.of([Logogram.of_name("GI", 6)], unique_lemma=(WordId("ginâ I"), )),
                Word.of([Reading.of_name("ana")], unique_lemma=(WordId("ana I"), )),
                Word.of(
                    [
                        Reading.of_name("u₄"),
                        Joiner.hyphen(),
                        Reading.of_name("š[u"),
                    ],
                    unique_lemma=(WordId("ūsu I"), ),
                ),
                Word.of([UnknownNumberOfSigns.of(), BrokenAway.close()]),
            ),
        ),
        # Text line 3: broken-away markers sit inside two of the readings.
        TextLine.of_iterable(
            LineNumber(3, True),
            (
                Word.of([BrokenAway.open(), UnknownNumberOfSigns.of()]),
                Word.of(
                    unique_lemma=(WordId("kīdu I"), ),
                    parts=[
                        Reading.of((
                            ValueToken.of("k"),
                            BrokenAway.close(),
                            ValueToken.of("i"),
                        )),
                        Joiner.hyphen(),
                        Reading.of_name("du"),
                    ],
                ),
                Word.of(unique_lemma=(WordId("u I"), ), parts=[Reading.of_name("u")]),
                Word.of(
                    unique_lemma=(WordId("bamātu I"), ),
                    parts=[
                        Reading.of_name("ba"),
                        Joiner.hyphen(),
                        Reading.of_name("ma"),
                        Joiner.hyphen(),
                        Reading.of((
                            ValueToken.of("t"),
                            BrokenAway.open(),
                            ValueToken.of("i"),
                        )),
                    ],
                ),
                Word.of([UnknownNumberOfSigns.of(), BrokenAway.close()]),
            ),
        ),
        # Text line 6: includes a word containing an in-word newline.
        TextLine.of_iterable(
            LineNumber(6, True),
            (
                Word.of([
                    BrokenAway.open(),
                    UnknownNumberOfSigns.of(),
                    BrokenAway.close(),
                ]),
                Word.of([UnclearSign.of([Flag.DAMAGE])]),
                Word.of(unique_lemma=(WordId("mu I"), ), parts=[Reading.of_name("mu")]),
                Word.of(
                    unique_lemma=(WordId("tamalāku I"), ),
                    parts=[
                        Reading.of_name("ta"),
                        Joiner.hyphen(),
                        Reading.of_name("ma"),
                        InWordNewline.of(),
                        Joiner.hyphen(),
                        Reading.of_name("tu", 2),
                    ],
                ),
            ),
        ),
        # Text line 7: a variant followed by a lemmatized normalized-Akkadian word.
        TextLine.of_iterable(
            LineNumber(7, True),
            (
                Word.of([
                    Variant.of(Reading.of_name("šu"), CompoundGrapheme.of(["BI×IS"]))
                ]),
                LanguageShift.normalized_akkadian(),
                AkkadianWord.of([ValueToken.of("kur")], unique_lemma=(WordId("normalized I"), )),
            ),
        ),
        # Dollar lines.
        StateDollarLine(
            atf.Qualification.AT_LEAST,
            1,
            ScopeContainer(atf.Surface.OBVERSE, ""),
            atf.State.MISSING,
            None,
        ),
        ImageDollarLine("1", None, "numbered diagram of triangle"),
        RulingDollarLine(atf.Ruling.SINGLE),
        LooseDollarLine("this is a loose line"),
        SealDollarLine(1),
        # At lines.
        SealAtLine(1),
        HeadingAtLine(1),
        ColumnAtLine(ColumnLabel([atf.Status.COLLATION], 1)),
        SurfaceAtLine(
            SurfaceLabel([atf.Status.COLLATION], atf.Surface.SURFACE, "stone wig")),
        ObjectAtLine(
            ObjectLabel([atf.Status.COLLATION], atf.Object.OBJECT, "stone wig")),
        DiscourseAtLine(atf.Discourse.DATE),
        DivisionAtLine("paragraph", 5),
        CompositeAtLine(atf.Composite.DIV, "part", 1),
        # Note line mixing plain, emphasized and transliterated parts.
        NoteLine((
            StringPart("a note "),
            EmphasisPart("italic"),
            LanguagePart.of_transliteration(
                Language.AKKADIAN, (Word.of([Reading.of_name("bu")]), )),
        )),
        # Parallel lines: composition, text and fragment.
        ParallelComposition(False, "my name", LineNumber(1)),
        ParallelText(
            True,
            TextId(CorpusGenre.LITERATURE, 1, 1),
            ChapterName(Stage.OLD_BABYLONIAN, "", "my name"),
            LineNumber(1),
            False,
        ),
        ParallelFragment(False, MuseumNumber.of("K.1"), True, Labels(), LineNumber(1), False),
    ))
class TransliteratedFragmentFactory(FragmentFactory):
    """Fragment factory with a fixed transliteration and derived data.

    Sets ``text`` to a literal with five text lines (numbered 1, 2, 3, 6 and
    7) followed by one example each of the dollar-, at-, note- and
    parallel-line types. No word in this text is given a ``unique_lemma``.
    Also fixes ``signs``, ``folios``, ``record`` and ``line_to_vec``.
    """

    text = Text((
        # Text line 1.
        TextLine.of_iterable(
            LineNumber(1, True),
            (
                Word.of([UnidentifiedSign.of()]),
                Word.of([
                    Logogram.of_name(
                        "BA",
                        surrogate=[
                            Reading.of_name("ku"),
                            Joiner.hyphen(),
                            Reading.of_name("u", 4),
                        ],
                    )
                ]),
                Column.of(),
                Tabulation.of(),
                Word.of([
                    BrokenAway.open(),
                    UnknownNumberOfSigns.of(),
                    Joiner.hyphen(),
                    Reading.of_name("ku"),
                    BrokenAway.close(),
                    Joiner.hyphen(),
                    Reading.of_name("nu"),
                    Joiner.hyphen(),
                    Reading.of_name("ši"),
                ]),
                Variant.of(Divider.of(":"), Reading.of_name("ku")),
                Word.of([
                    BrokenAway.open(),
                    UnknownNumberOfSigns.of(),
                    BrokenAway.close(),
                ]),
                Column.of(2),
                Divider.of(":", ("@v", ), (Flag.DAMAGE, )),
                CommentaryProtocol.of("!qt"),
                Word.of([Number.of_name("10", flags=[Flag.DAMAGE])]),
            ),
        ),
        # Text line 2: a broken-away marker opens inside the last reading
        # of the fourth word and closes in the final word.
        TextLine.of_iterable(
            LineNumber(2, True),
            (
                Word.of([
                    BrokenAway.open(),
                    UnknownNumberOfSigns.of(),
                    BrokenAway.close(),
                ]),
                Word.of([Logogram.of_name("GI", 6)]),
                Word.of([Reading.of_name("ana")]),
                Word.of([
                    Reading.of_name("u", 4),
                    Joiner.hyphen(),
                    Reading.of((
                        ValueToken.of("š"),
                        BrokenAway.open(),
                        ValueToken.of("u"),
                    )),
                ]),
                Word.of([UnknownNumberOfSigns.of(), BrokenAway.close()]),
            ),
        ),
        # Text line 3: broken-away markers sit inside two of the readings.
        TextLine.of_iterable(
            LineNumber(3, True),
            (
                Word.of([BrokenAway.open(), UnknownNumberOfSigns.of()]),
                Word.of([
                    Reading.of((
                        ValueToken.of("k"),
                        BrokenAway.close(),
                        ValueToken.of("i"),
                    )),
                    Joiner.hyphen(),
                    Reading.of_name("du"),
                ]),
                Word.of([Reading.of_name("u")]),
                Word.of([
                    Reading.of_name("ba"),
                    Joiner.hyphen(),
                    Reading.of_name("ma"),
                    Joiner.hyphen(),
                    Reading.of((
                        ValueToken.of("t"),
                        BrokenAway.open(),
                        ValueToken.of("i"),
                    )),
                ]),
                Word.of([UnknownNumberOfSigns.of(), BrokenAway.close()]),
            ),
        ),
        # Text line 6: includes a word containing an in-word newline.
        TextLine.of_iterable(
            LineNumber(6, True),
            (
                Word.of([
                    BrokenAway.open(),
                    UnknownNumberOfSigns.of(),
                    BrokenAway.close(),
                ]),
                Word.of([UnclearSign.of([Flag.DAMAGE])]),
                Word.of([Reading.of_name("mu")]),
                Word.of([
                    Reading.of_name("ta"),
                    Joiner.hyphen(),
                    Reading.of_name("ma"),
                    InWordNewline.of(),
                    Joiner.hyphen(),
                    Reading.of_name("tu", 2),
                ]),
            ),
        ),
        # Text line 7: a variant followed by a normalized-Akkadian word.
        TextLine.of_iterable(
            LineNumber(7, True),
            (
                Word.of([
                    Variant.of(Reading.of_name("šu"), CompoundGrapheme.of(["BI×IS"]))
                ]),
                LanguageShift.normalized_akkadian(),
                AkkadianWord.of([ValueToken.of("kur")]),
            ),
        ),
        # Dollar lines.
        StateDollarLine(
            atf.Qualification.AT_LEAST,
            1,
            ScopeContainer(atf.Surface.OBVERSE, ""),
            atf.State.MISSING,
            None,
        ),
        ImageDollarLine("1", None, "numbered diagram of triangle"),
        RulingDollarLine(atf.Ruling.SINGLE),
        LooseDollarLine("this is a loose line"),
        SealDollarLine(1),
        # At lines.
        SealAtLine(1),
        HeadingAtLine(1),
        ColumnAtLine(ColumnLabel([atf.Status.COLLATION], 1)),
        SurfaceAtLine(
            SurfaceLabel([atf.Status.COLLATION], atf.Surface.SURFACE, "stone wig")),
        ObjectAtLine(
            ObjectLabel([atf.Status.COLLATION], atf.Object.OBJECT, "stone wig")),
        DiscourseAtLine(atf.Discourse.DATE),
        DivisionAtLine("paragraph", 5),
        CompositeAtLine(atf.Composite.DIV, "part", 1),
        # Note line mixing plain, emphasized and transliterated parts.
        NoteLine((
            StringPart("a note "),
            EmphasisPart("italic"),
            LanguagePart.of_transliteration(
                Language.AKKADIAN, (Word.of([Reading.of_name("bu")]), )),
        )),
        # Parallel lines: composition, text and fragment.
        ParallelComposition(False, "my name", LineNumber(1)),
        ParallelText(
            True,
            TextId(CorpusGenre.LITERATURE, 1, 1),
            ChapterName(Stage.OLD_BABYLONIAN, "", "my name"),
            LineNumber(1),
            False,
        ),
        ParallelFragment(False, MuseumNumber.of("K.1"), True, Labels(), LineNumber(1), False),
    ))
    # Five newline-separated rows of sign names; presumably one row per text
    # line above - confirm against the code that derives signs from text.
    signs = (
        "X BA KU ABZ075 ABZ207a\\u002F207b\\u0020X ABZ377n1/KU ABZ377n1 ABZ411\n"
        "MI DIŠ UD ŠU\n"
        "KI DU ABZ411 BA MA TI\n"
        "X MU TA MA UD\n"
        "ŠU/|BI×IS|")
    folios = Folios((Folio("WGL", "3"), Folio("XXX", "3")))
    record = Record((RecordEntry("test", RecordType.TRANSLITERATION), ))
    # A single encoding sequence: five TEXT_LINE entries and one
    # SINGLE_RULING; presumably matching the five text lines and the single
    # ruling dollar line in ``text`` - confirm against the encoder.
    line_to_vec = ((
        LineToVecEncoding.TEXT_LINE,
        LineToVecEncoding.TEXT_LINE,
        LineToVecEncoding.TEXT_LINE,
        LineToVecEncoding.TEXT_LINE,
        LineToVecEncoding.TEXT_LINE,
        LineToVecEncoding.SINGLE_RULING,
    ), )
def ebl_atf_text_line__greek_shift(self, value):
    """Turn the matched value into a ``LanguageShift`` token."""
    shift_code = str(value)
    return LanguageShift.of(shift_code)
def ebl_atf_text_line__normalized_akkadian_shift(self, value):
    """Turn the matched value into a ``LanguageShift`` token."""
    shift_code = str(value)
    return LanguageShift.of(shift_code)
def test_reconstructed_line(text, expected) -> None:
    """Parsing ``%n <text>`` yields the normalized shift followed by ``expected``."""
    parsed = parse_reconstructed_line(f"%n {text}")

    assert parsed == (LanguageShift.normalized_akkadian(), *expected)
from ebl.transliteration.domain.sign_tokens import Divider, Reading from ebl.transliteration.domain.tokens import ( Column, CommentaryProtocol, ErasureState, Joiner, LanguageShift, Tabulation, UnknownNumberOfSigns, ValueToken, Variant, ) TOKENS = [ UnknownNumberOfSigns(frozenset({EnclosureType.BROKEN_AWAY}), ErasureState.NONE), LanguageShift.of("%sux"), DocumentOrientedGloss.open(), ] def test_value_token(): value = "value" token = ValueToken.of(value) equal = ValueToken.of(value) other = ValueToken.of("anothervalue") assert token.value == value assert token.clean_value == value assert token.get_key() == f"ValueToken⁝{value}" assert token.lemmatizable is False
from ebl.transliteration.domain.markup import EmphasisPart, LanguagePart, StringPart from ebl.transliteration.domain.note_line import NoteLine from ebl.transliteration.domain.sign_tokens import Reading from ebl.transliteration.domain.tokens import ( EnclosureType, ErasureState, LanguageShift, Token, UnknownNumberOfSigns, ValueToken, ) from ebl.transliteration.domain.word_tokens import Word TRANSLITERATION: Sequence[Token] = ( Word.of([Reading.of_name("bu")]), LanguageShift.of("%es"), Word.of([BrokenAway.open(), Reading.of_name("kur")]), UnknownNumberOfSigns.of(), BrokenAway.close(), ) EXPECTED_ATF = "bu %es [kur ...]" def expected_transliteration(language: Language) -> Sequence[Token]: return ( Word.of([Reading.of_name("bu")], language), LanguageShift.of("%es"), Word.of( [ BrokenAway.open(), Reading.of((ValueToken(
frozenset({EnclosureType.BROKEN_AWAY})), UnclearSign.of().set_enclosure_type( frozenset({EnclosureType.BROKEN_AWAY})), )), Word.of(( UnknownNumberOfSigns.of().set_enclosure_type( frozenset({EnclosureType.BROKEN_AWAY})), BrokenAway.close().set_enclosure_type( frozenset({EnclosureType.BROKEN_AWAY})), )).set_enclosure_type(frozenset({EnclosureType.BROKEN_AWAY})), ), )), ( "%n [...]", ( LanguageShift.normalized_akkadian(), BrokenAway.open(), UnknownNumberOfSigns(frozenset({EnclosureType.BROKEN_AWAY}), ErasureState.NONE), BrokenAway.close().set_enclosure_type( frozenset({EnclosureType.BROKEN_AWAY})), ), ), ( "%n (...)", ( LanguageShift.normalized_akkadian(), PerhapsBrokenAway.open(), UnknownNumberOfSigns(frozenset({EnclosureType.PERHAPS}), ErasureState.NONE), PerhapsBrokenAway.close().set_enclosure_type(
Joiner.hyphen(), UnknownNumberOfSigns.of(), Joiner.hyphen(), Reading.of_name("ad"), )), ), ) ], ), ( "1. %grc ΑαΒβΓγΔδΕεΖζΗηΘθΙιΚκΛλΜμΝνΞξΟοΠπΡρΣσςΤτΥυΦφΧχΨψΩω", [ TextLine.of_iterable( LineNumber(1), ( LanguageShift.of("%grc"), GreekWord.of([ GreekLetter.of(letter) for letter in ("ΑαΒβΓγΔδΕεΖζΗηΘθΙιΚκΛλΜμ" "ΝνΞξΟοΠπΡρΣσςΤτΥυΦφΧχΨψΩω") ]), ), ) ], ), ( "1. %akkgrc α", [ TextLine.of_iterable( LineNumber(1), (
def make_token(self, data, **kwargs):
    """Build a ``LanguageShift`` from the loaded ``data`` mapping.

    Reads the ``enclosure_type``, ``erasure`` and ``value`` entries; the
    enclosure types are converted to a frozenset. Extra keyword arguments
    are accepted and ignored.
    """
    enclosure_types = frozenset(data["enclosure_type"])
    erasure = data["erasure"]
    value = data["value"]
    return LanguageShift(enclosure_types, erasure, value)
def test_merging_lines(corpus, text_repository, bibliography, changelog, signs,
                       sign_repository, user, when) -> None:
    """Updating a line stores the new manuscript text merged into the old one."""
    reconstruction = (
        LanguageShift.normalized_akkadian(),
        AkkadianWord.of((ValueToken.of("buāru"),)),
    )
    is_second_line_of_parallelism = False
    is_beginning_of_section = False

    text_line = TextLine.of_iterable(
        LineNumber(1),
        (
            Word.of(
                [Reading.of_name("ku")],
                unique_lemma=(WordId("word1"),),
                alignment=0,
            ),
            Word.of(
                [Reading.of_name("nu")],
                unique_lemma=(WordId("word2"),),
                alignment=1,
            ),
        ),
    )
    manuscript_id = CHAPTER_WITHOUT_DOCUMENTS.manuscripts[0].id

    def variants_with(manuscript_text):
        # Single variant holding a single manuscript line with the given text.
        manuscript_line = ManuscriptLine(manuscript_id, tuple(), manuscript_text)
        return (LineVariant(reconstruction, None, (manuscript_line,)),)

    # The stored line uses the negated flags, so the update changes them too.
    line = Line(
        LineNumber(1),
        variants_with(text_line),
        not is_second_line_of_parallelism,
        not is_beginning_of_section,
    )

    new_text_line = TextLine.of_iterable(
        LineNumber(1),
        (Word.of([Reading.of_name("ku")]), Word.of([Reading.of_name("ba")])),
    )
    new_line = Line(
        LineNumber(1),
        variants_with(text_line.merge(new_text_line)),
        is_second_line_of_parallelism,
        is_beginning_of_section,
    )

    old_chapter = attr.evolve(CHAPTER_WITHOUT_DOCUMENTS, lines=(line,))
    updated_chapter = attr.evolve(
        CHAPTER,
        lines=(new_line,),
        signs=("KU BA\nKU\nABZ075",),
        parser_version=ATF_PARSER_VERSION,
    )
    expect_find_and_update_chapter(
        bibliography,
        changelog,
        old_chapter,
        updated_chapter,
        signs,
        sign_repository,
        text_repository,
        user,
        when,
    )

    # The update carries the unmerged new text line; merging with the
    # stored text is expected to happen inside update_lines.
    submitted_line = Line(
        LineNumber(1),
        variants_with(new_text_line),
        is_second_line_of_parallelism,
        is_beginning_of_section,
    )
    lines_update = LinesUpdate([], set(), {0: submitted_line})

    result = corpus.update_lines(CHAPTER.id_, lines_update, user)

    assert result == updated_chapter