def test_strip_word_data_large_string(self):
    """
    Run strip_word_data over a large inline USFM sample (1 Chronicles
    header plus verses 1-10 with word-level markup) and compare the result
    with the contents of the ``uwapi_1ch.usfm`` resource file.

    :return:
    """
    marked_up = u'''\\id 1CH \\h PREMIER LIVRE DES CHRONIQUES \\toc1 PREMIER LIVRE DES CHRONIQUES \\toc2 1 Chroniques \\toc3 1 Ch \\mt1 LES LIVRES DES CHRONIQUES \\mt1 PREMIER LIVRE DES CHRONIQUES \\s5 \\c 1 \\p \\v 1 \\w Adam|strong="H121"\\w*, \\w Seth|strong="H8352"\\w*, \\w Énosch|strong="H583"\\w*, \\v 2 \\w Kénan|strong="H7018"\\w*, \\w Mahalaleel|strong="H4111"\\w*, \\w Jéred|strong="H3382"\\w*, \\v 3 \\w Hénoc|strong="H2585"\\w*, \\w Metuschélah|strong="H4968"\\w*, \\w Lémec|strong="H3929"\\w*, \\v 4 \\w Noé|strong="H5146"\\w*, \\w Sem|strong="H8035"\\w*, \\w Cham|strong="H2526"\\w* et \\w Japhet|strong="H3315"\\w*. \\s5 \\v 5 \\w Fils|strong="H1121"\\w* de \\w Japhet|strong="H3315"\\w*: \\w Gomer|strong="H1586"\\w*, \\w Magog|strong="H4031"\\w*, \\w Madaï|strong="H4074"\\w*, \\w Javan|strong="H3120"\\w*, \\w Tubal|strong="H8422"\\w*, \\w Méschec|strong="H4902"\\w* et \\w Tiras|strong="H8494"\\w*. - \\v 6 \\w Fils|strong="H1121"\\w* de \\w Gomer|strong="H1586"\\w*: \\w Aschkenaz|strong="H813"\\w*, \\w Diphat|strong="H7384"\\w* et \\w Togarma|strong="H8425"\\w*. - \\v 7 \\w Fils|strong="H1121"\\w* de \\w Javan|strong="H3120"\\w*: \\w Élischa|strong="H473"\\w*, \\w Tarsisa|strong="H8659"\\w*, \\w Kittim|strong="H3794"\\w* et \\w Rodanim|strong="H1721"\\w*. \\s5 \\v 8 \\w Fils|strong="H1121"\\w* de \\w Cham|strong="H2526"\\w*: \\w Cusch|strong="H3568"\\w*, \\w Mitsraïm|strong="H4714"\\w*, \\w Puth|strong="H6316"\\w* et \\w Canaan|strong="H3667"\\w*. - \\v 9 \\w Fils|strong="H1121"\\w* de \\w Cusch|strong="H3568"\\w*: \\w Saba|strong="H5434"\\w*, \\w Havila|strong="H2341"\\w*, \\w Sabta|strong="H5454"\\w*, \\w Raema|strong="H7484"\\w* et \\w Sabteca|strong="H5455"\\w*. -\\w Fils|strong="H1121"\\w* de \\w Raema|strong="H7484"\\w*: \\w Séba|strong="H7614"\\w* et \\w Dedan|strong="H1719"\\w*. 
\\v 10 \\w Cusch|strong="H3568"\\w* \\w engendra|strong="H3205" x-morph="strongMorph:TH8804"\\w* \\w Nimrod|strong="H5248"\\w*; c'est lui qui \\w commença|strong="H2490" x-morph="strongMorph:TH8689"\\w* à être \\w puissant|strong="H1368"\\w* sur la \\w terre|strong="H776"\\w*. - '''

    # The reference output lives next to the other test resources.
    fixture_path = os.path.join(self.resources_dir, 'uwapi_1ch.usfm')
    wanted = read_file(fixture_path)

    actual = strip_word_data(marked_up)
    self.assertEqual(wanted, actual)
def _process_usfm(self, format):
    """
    Download the USFM file referenced by ``format`` and clean it up.

    The file is saved in ``self.temp_dir`` under the md5 hex digest of its
    url, read back, and then passed through strip_word_data,
    convert_chunk_markers and remove_unknown_markers, in that order.

    :param format: mapping whose ``'url'`` entry points at the USFM file
    :return: the processed USFM text
    """
    source_url = format['url']
    local_path = os.path.join(self.temp_dir, md5(source_url).hexdigest())
    self.download_file(source_url, local_path)

    # Apply the clean-up steps one at a time, innermost call first.
    text = read_file(local_path)
    text = strip_word_data(text)
    text = convert_chunk_markers(text)
    return remove_unknown_markers(text)
def test_usfm3_file_to_usfm2(self):
    """
    Run strip_word_data over the ``fr_gen.usfm3`` resource and check that
    the result matches the ``fr_gen.usfm2`` resource.

    Guards the conversion of content for the uW api, which was not
    converted correctly in the past.

    :return:
    """
    resources = self.resources_dir
    usfm3_text = read_file(os.path.join(resources, 'fr_gen.usfm3'))
    usfm2_text = read_file(os.path.join(resources, 'fr_gen.usfm2'))

    self.assertEqual(usfm2_text, strip_word_data(usfm3_text))
def test_strip_word_data_from_file(self):
    """
    Run strip_word_data over the ``apiv3_1ch.usfm`` resource and check that
    the result matches the ``uwapi_1ch.usfm`` resource.

    Guards the conversion of content for the uW api, which was not
    converted correctly in the past.

    :return:
    """
    resources = self.resources_dir
    source_text = read_file(os.path.join(resources, 'apiv3_1ch.usfm'))
    wanted = read_file(os.path.join(resources, 'uwapi_1ch.usfm'))

    self.assertEqual(wanted, strip_word_data(source_text))
def build_usx(usfm_dir, usx_dir):
    """
    Pre-process the USFM files in place, then build the USX from them.

    Every entry listed in ``usfm_dir`` is read, passed through
    strip_word_data and convert_chunk_markers, and written back to the
    same path before UsfmTransform.buildUSX is run.

    :param usfm_dir: directory of USFM files; each file is overwritten
    :param usx_dir: directory handed to UsfmTransform.buildUSX with usfm_dir
    :return:
    """
    # Rewrite each file without its word data and with converted chunk markers.
    for entry in os.listdir(usfm_dir):
        path = os.path.join(usfm_dir, entry)
        without_word_data = strip_word_data(read_file(path))
        write_file(path, convert_chunk_markers(without_word_data))

    UsfmTransform.buildUSX(usfm_dir, usx_dir, '', True)
def build_usx(usfm_dir, usx_dir, logger=None):
    """
    Pre-process the USFM files in place, then build the USX from them.

    Every entry listed in ``usfm_dir`` is read, passed through
    strip_word_data, convert_chunk_markers and remove_unknown_markers (in
    that order), and written back to the same path before
    UsfmTransform.buildUSX is run.

    :param usfm_dir: directory of USFM files; each file is overwritten
    :param usx_dir: directory handed to UsfmTransform.buildUSX with usfm_dir
    :param logger: optional logger; when truthy, a debug message is emitted
                   just before the USX build starts
    :return:
    """
    for entry in os.listdir(usfm_dir):
        path = os.path.join(usfm_dir, entry)
        text = read_file(path)
        text = strip_word_data(text)
        text = convert_chunk_markers(text)
        text = remove_unknown_markers(text)
        write_file(path, text)

    if logger:
        logger.debug("Actual USX conversion into {}".format(usx_dir))

    UsfmTransform.buildUSX(usfm_dir, usx_dir, '', True)
def test_strip_usfm_mixed_word_data(self):
    """
    Check strip_word_data on a multi-verse sample, intended to cover input
    that contains spaces on otherwise "blank" lines.

    :return:
    """
    marked_up = u'''\\v 7 \\w Fils|strong="H1121"\\w* de \\w Javan|strong="H3120"\\w*: \\w Élischa|strong="H473"\\w*, \\w Tarsisa|strong="H8659"\\w*, \\w Kittim|strong="H3794"\\w* et \\w Rodanim|strong="H1721"\\w*. \\s5 \\v 8 \\w Fils|strong="H1121"\\w* de \\w Cham|strong="H2526"\\w*: \\w Cusch|strong="H3568"\\w*, \\w Mitsraïm|strong="H4714"\\w*, \\w Puth|strong="H6316"\\w* et \\w Canaan|strong="H3667"\\w*. - \\v 9 \\w Fils|strong="H1121"\\w* de \\w Cusch|strong="H3568"\\w*: \\w Saba|strong="H5434"\\w*, \\w Havila|strong="H2341"\\w*, \\w Sabta|strong="H5454"\\w*, \\w Raema|strong="H7484"\\w* et \\w Sabteca|strong="H5455"\\w*. -\\w Fils|strong="H1121"\\w* de \\w Raema|strong="H7484"\\w*: \\w Séba|strong="H7614"\\w* et \\w Dedan|strong="H1719"\\w*. \\v 10 \\w Cusch|strong="H3568"\\w* \\w engendra|strong="H3205" x-morph="strongMorph:TH8804"\\w* \\w Nimrod|strong="H5248"\\w*; c'est lui qui \\w commença|strong="H2490" x-morph="strongMorph:TH8689"\\w* à être \\w puissant|strong="H1368"\\w* sur la \\w terre|strong="H776"\\w*. -'''
    wanted = u'''\\v 7 Fils de Javan: Élischa, Tarsisa, Kittim et Rodanim. \\s5 \\v 8 Fils de Cham: Cusch, Mitsraïm, Puth et Canaan. - \\v 9 Fils de Cusch: Saba, Havila, Sabta, Raema et Sabteca. - Fils de Raema: Séba et Dedan. \\v 10 Cusch engendra Nimrod; c'est lui qui commença à être puissant sur la terre. -'''

    actual = strip_word_data(marked_up)
    self.assertEqual(wanted, actual)
def test_strip_usfm_word_data(self):
    """
    Check strip_word_data on a single verse whose word markers carry
    attributes followed by a space before the closing marker; only the
    plain words are expected to remain.

    :return:
    """
    marked_up = u'\\v 1 Ce \\w qui|strong="G3739" \\w* \\w était|strong="G2258" x-morph="strongMorph:TG5713" \\w* \\w dès|strong="G575" \\w*'
    wanted = u'\\v 1 Ce qui était dès'

    actual = strip_word_data(marked_up)
    self.assertEqual(wanted, actual)