Пример #1
0
 def test_unknown_token(self):
     """
     Checks that process leaves a token it cannot recognize untouched
     while still converting the signs around it.
     """
     converter = ATFConverter(two_three=True)
     expected = ["a₂", "☉", "be₃"]
     result = converter.process(["a2", "☉", "be3"])
     self.assertEqual(result, expected)
Пример #2
0
 def test_accents(self):
     """
     Checks the output of process when two_three is switched off.
     """
     converter = ATFConverter(two_three=False)
     raw_signs = ["a", "a2", "a3", "be2", "bad3", "buru14"]
     expected = ["a", "á", "à", "bé", "bàd", "buru₁₄"]
     self.assertEqual(converter.process(raw_signs), expected)
Пример #3
0
 def test_single_sign(self):
     """
     Checks the output of process when two_three is switched on.
     """
     converter = ATFConverter(two_three=True)
     raw_signs = [
         "a", "a1", "a2", "a3",
         "be2", "be3", "bad2", "bad3",
     ]
     expected = [
         "a", "a₁", "a₂", "a₃",
         "be₂", "be₃", "bad₂", "bad₃",
     ]
     result = converter.process(raw_signs)
     self.assertEqual(result, expected)
Пример #4
0
 def test_convert_num(self):
     """
     Checks _convert_num on each sign individually, using a converter
     built with default settings.
     """
     converter = ATFConverter()
     raw_signs = ["a2", "☉", "be3", 'bad3']
     expected = ["a₂", "☉", "be₃", 'bad₃']
     # pylint: disable=protected-access
     result = list(map(converter._convert_num, raw_signs))
     self.assertEqual(result, expected)
Пример #5
0
 def test_get_number_from_sign(self):
     """
     Checks the element at index 1 of what _get_number_from_sign returns
     for each sign against the expected number.
     """
     converter = ATFConverter()
     expected = [0, 1, 2, 3, 14]
     numbers = []
     for sign in ["a", "a1", "be2", "bad3", "buru14"]:
         parsed = converter._get_number_from_sign(sign)  # pylint: disable=protected-access
         numbers.append(parsed[1])
     self.assertEqual(numbers, expected)
Пример #6
0
 def test_convert_consonant(self):
     """
     Checks _convert_consonant on each sign individually, in both
     lower-case and upper-case spellings.
     """
     converter = ATFConverter()
     raw_signs = ['as,', 'S,ATU', 'tet,', 'T,et', 'sza', 'ASZ']
     expected = ['aṣ', 'ṢATU', 'teṭ', 'Ṭet', 'ša', 'AŠ']
     # pylint: disable=protected-access
     result = list(map(converter._convert_consonant, raw_signs))
     self.assertEqual(result, expected)
Пример #7
0
from collections import Counter
from Importer.file_importer import FileImport
from Importer.cdli_corpus import CDLICorpus
from ATFConverter.tokenizer import Tokenizer
from ATFConverter.atf_converter import ATFConverter


# Read the source text and hand its lines to the corpus parser.
fi = FileImport('texts/Akkadian.txt')
fi.read_file()
cc = CDLICorpus()
cc.parse_file(fi.file_lines)
tk = Tokenizer()
atf = ATFConverter()
# Tokens that are left out of the frequency count below.
stopwords = ['a-na', 'u3', 'sza', '[...]', 'i-na', '=',
             'ARM', '01,', 'lang', 'akk', 'um-ma', 'la',
             'u2-ul', 'mesz_', 'asz-szum', '0.1', 'broken',
             'isz-tu', '_lu2_', 'ki-a-am', '1(disz)', 'ki-ma',
             'x', 'sza-a-ti', 'the', '_lu2', '...]', 'lu-u2',
             'sza#', 'a-na#', '_u4', 'beginning', 'of', '2(disz)',
             '[a-na', 'szum-ma', 'hi-a_', 'ana', 'a-di']
# Membership is checked once per token, so look tokens up in a hashed copy
# instead of scanning the list each time. The list itself is kept unchanged.
stopword_lookup = frozenset(stopwords)

# Collect every non-stopword token of text P249253, converting each
# hyphen-separated sign with the ATF converter and re-joining with hyphens.
bag_of_words = []
for line in cc.catalog['P249253']['transliteration']:
    for word in tk.word_tokenizer(line):
        # Only the first element of each tokenizer result is used
        # (presumably the token string -- confirm against Tokenizer).
        token = word[0]
        if token not in stopword_lookup:
            bag_of_words.append('-'.join(atf.process(token.split('-'))))
# Report the 11 most frequent converted tokens with their counts.
frequency_analysis = Counter(bag_of_words).most_common(11)
print(frequency_analysis)