Example #1
def test_map_dir(self):
    rc = factory.load(os.path.join(self.resources_dir, 'tw_rc'))
    out_dir = os.path.join(self.temp_dir, 'mapped_usfm')
    maptwtousfm3.mapDir(os.path.join(self.resources_dir, 'usfm'), rc,
                        out_dir)
    mapped_usfm = read_file(os.path.join(out_dir, '41-MAT.usfm'))
    expected_usfm = read_file(
        os.path.join(self.resources_dir, 'mapped_mat.usfm'))
    self.assertEqual(mapped_usfm, expected_usfm)
Example #2
def test_strip_word_data_from_file(self):
    """
    This ensures we are correctly converting content to be used in the
    uW api. This content wasn't getting converted correctly in the past.
    :return:
    """
    input = read_file(os.path.join(self.resources_dir, 'apiv3_1ch.usfm'))
    expected = read_file(os.path.join(self.resources_dir, 'uwapi_1ch.usfm'))
    output = strip_word_data(input)
    self.assertEqual(expected, output)
Example #3
def test_usfm3_file_to_usfm2(self):
    """
    This ensures we are correctly converting content to be used in the
    uW api. This content wasn't getting converted correctly in the past.
    :return:
    """
    input = read_file(os.path.join(self.resources_dir, 'fr_gen.usfm3'))
    expected = read_file(os.path.join(self.resources_dir, 'fr_gen.usfm2'))
    output = strip_word_data(input)
    self.assertEqual(expected, output)
Example #4
    def test_transform_usfm_with_word_data(self, mock_reporter):
        mockS3 = MockS3Handler()
        mockS3._load_path(os.path.join(self.resources_dir, 'usfm_sources'))
        usx_dir = tempfile.mkdtemp('-usx_output')
        build_usx(mockS3.temp_dir, usx_dir)
        expected_usx_file = os.path.join(self.resources_dir,
                                         'expected_usx/1JN.usx')
        out_file = os.path.join(usx_dir, '1JN.usx')

        expected_usx = read_file(expected_usx_file)
        output = read_file(out_file)
        self.assertEqual(expected_usx, output)
Example #5
def test_map_usfm_by_occurrence(self):
    usfm = read_file(os.path.join(self.resources_dir, 'usfm/41-MAT.usfm'))
    rc = factory.load(os.path.join(self.resources_dir, 'tw_rc'))
    words_index = maptwtousfm3.indexWordsLocation(rc)
    category_index = maptwtousfm3.indexWordsCategory(rc)
    mappedUSFM = maptwtousfm3.mapUSFMByOccurrence(
        usfm=usfm,
        words_rc=rc,
        words_index=words_index['occurrences'],
        words_category_index=category_index)
    expected_usfm = read_file(
        os.path.join(self.resources_dir, 'mapped_mat.usfm'))
    self.assertEqual(mappedUSFM, expected_usfm)
Example #6
def test_map_usfm_by_global_search(self):
    usfm = read_file(os.path.join(self.resources_dir, 'usfm/41-MAT.usfm'))
    rc = factory.load(os.path.join(self.resources_dir, 'tw_rc'))
    locations_index = maptwtousfm3.indexWordsLocation(rc)
    strongs_index = maptwtousfm3.indexWordByStrongs(rc)
    category_index = maptwtousfm3.indexWordsCategory(rc)
    mappedUSFM = maptwtousfm3.mapUSFMByGlobalSearch(
        usfm=usfm,
        words_strongs_index=strongs_index,
        words_false_positives_index=locations_index['false_positives'],
        words_category_index=category_index)
    expected_usfm = read_file(
        os.path.join(self.resources_dir, 'mapped_mat_global.usfm'))
    self.assertEqual(mappedUSFM, expected_usfm)
Example #7
    def test_processing_hbo(self):
        """
        Test downloading and processing some hebrew
        :return:
        """
        return
        rc_dir = download_rc('hbo', 'uhb',
                             'https://cdn.door43.org/hbo/uhb/v2.1.1/uhb.zip',
                             self.temp_dir)

        manifest = yaml.safe_load(read_file(os.path.join(rc_dir, 'manifest.yaml')))
        usx_dir = os.path.join(rc_dir, 'usx')
        for project in manifest['projects']:
            pid = project['identifier']

            # copy usfm project file
            usfm_dir = os.path.join(self.temp_dir, 'usfm')
            if not os.path.exists(usfm_dir):
                os.makedirs(usfm_dir)
            usfm_dest_file = os.path.normpath(
                os.path.join(usfm_dir, project['path']))
            usfm_src_file = os.path.normpath(
                os.path.join(rc_dir, project['path']))
            shutil.copyfile(usfm_src_file, usfm_dest_file)

            # transform usfm to usx
            build_usx(usfm_dir, usx_dir)

            # clean up converted usfm file
            remove(usfm_dest_file, True)

            # convert USX to JSON
            path = os.path.normpath(
                os.path.join(usx_dir, '{}.usx'.format(pid.upper())))
            source = build_json_source_from_usx(path, 'hbo', pid, '2019')
Example #8
def build_usx(usfm_dir, usx_dir):
    """
    Builds the usx from usfm after performing some custom processing
    :param usfm_dir:
    :param usx_dir:
    :return:
    """
    # strip word data
    files = os.listdir(usfm_dir)
    usfm2_dir = tempfile.mkdtemp(prefix='usfm2')
    try:
        for name in files:
            if name == '.DS_Store':
                continue
            f = os.path.join(usfm_dir, name)
            usfm3 = read_file(f)
            usfm2 = usfm3_to_usfm2(usfm3)
            out_f = os.path.join(usfm2_dir, name)
            write_file(out_f, usfm2)

        UsfmTransform.buildUSX(usfm2_dir, usx_dir, '', True)
    finally:
        shutil.rmtree(usfm2_dir)
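
A minimal usage sketch of this helper, mirroring how the tests above call it (the source path here is hypothetical):

import tempfile

# convert a directory of USFM3 sources into USX output
usx_dir = tempfile.mkdtemp('-usx_output')
build_usx('/path/to/usfm_sources', usx_dir)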
Example #9
def _process_usfm(self, format):
    url = format['url']
    usfm_file = os.path.join(self.temp_dir, md5(url).hexdigest())
    self.download_file(url, usfm_file)
    usfm = read_file(usfm_file)
    return remove_unknown_markers(
        convert_chunk_markers(strip_word_data(usfm)))
Example #10
    def test_strip_word_data_large_string(self):
        input = u'''\\id 1CH
\\h PREMIER LIVRE DES CHRONIQUES
\\toc1 PREMIER LIVRE DES CHRONIQUES
\\toc2 1 Chroniques
\\toc3 1 Ch
\\mt1 LES LIVRES DES CHRONIQUES
\\mt1 PREMIER LIVRE DES CHRONIQUES

\\s5
\\c 1
\\p
\\v 1  \\w Adam|strong="H121"\\w*, \\w Seth|strong="H8352"\\w*, \\w Énosch|strong="H583"\\w*,
\\v 2  \\w Kénan|strong="H7018"\\w*, \\w Mahalaleel|strong="H4111"\\w*, \\w Jéred|strong="H3382"\\w*,
\\v 3  \\w Hénoc|strong="H2585"\\w*, \\w Metuschélah|strong="H4968"\\w*, \\w Lémec|strong="H3929"\\w*,
\\v 4  \\w Noé|strong="H5146"\\w*, \\w Sem|strong="H8035"\\w*, \\w Cham|strong="H2526"\\w* et \\w Japhet|strong="H3315"\\w*.

\\s5
\\v 5  \\w Fils|strong="H1121"\\w* de \\w Japhet|strong="H3315"\\w*: \\w Gomer|strong="H1586"\\w*, \\w Magog|strong="H4031"\\w*, \\w Madaï|strong="H4074"\\w*, \\w Javan|strong="H3120"\\w*, \\w Tubal|strong="H8422"\\w*, \\w Méschec|strong="H4902"\\w* et \\w Tiras|strong="H8494"\\w*. -
\\v 6  \\w Fils|strong="H1121"\\w* de \\w Gomer|strong="H1586"\\w*: \\w Aschkenaz|strong="H813"\\w*, \\w Diphat|strong="H7384"\\w* et \\w Togarma|strong="H8425"\\w*. -
\\v 7  \\w Fils|strong="H1121"\\w* de \\w Javan|strong="H3120"\\w*: \\w Élischa|strong="H473"\\w*, \\w Tarsisa|strong="H8659"\\w*, \\w Kittim|strong="H3794"\\w* et \\w Rodanim|strong="H1721"\\w*.

\\s5
\\v 8  \\w Fils|strong="H1121"\\w* de \\w Cham|strong="H2526"\\w*: \\w Cusch|strong="H3568"\\w*, \\w Mitsraïm|strong="H4714"\\w*, \\w Puth|strong="H6316"\\w* et \\w Canaan|strong="H3667"\\w*. -
\\v 9  \\w Fils|strong="H1121"\\w* de \\w Cusch|strong="H3568"\\w*: \\w Saba|strong="H5434"\\w*, \\w Havila|strong="H2341"\\w*, \\w Sabta|strong="H5454"\\w*, \\w Raema|strong="H7484"\\w* et \\w Sabteca|strong="H5455"\\w*. -\\w Fils|strong="H1121"\\w* de \\w Raema|strong="H7484"\\w*: \\w Séba|strong="H7614"\\w* et \\w Dedan|strong="H1719"\\w*.
\\v 10  \\w Cusch|strong="H3568"\\w* \\w engendra|strong="H3205" x-morph="strongMorph:TH8804"\\w* \\w Nimrod|strong="H5248"\\w*; c'est lui qui \\w commença|strong="H2490" x-morph="strongMorph:TH8689"\\w* à être \\w puissant|strong="H1368"\\w* sur la \\w terre|strong="H776"\\w*. -
'''
        expected = read_file(os.path.join(self.resources_dir, 'uwapi_1ch.usfm'))
        output = strip_word_data(input)
        self.assertEqual(expected, output)
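
The implementation of strip_word_data is not shown on this page. As an illustration only, a rough sketch of the transformation being exercised here (collapsing USFM3 \w word|attributes\w* markup down to the bare word) might look like the following; this is an approximation, not the project's code:

import re

def strip_word_data_sketch(usfm):
    # illustrative approximation: '\w Adam|strong="H121"\w*' becomes 'Adam'
    return re.sub(r'\\w ([^|\\]+?)(?:\|[^\\]*?)?\\w\*', r'\1', usfm)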
Example #11
    def test_tw_phrase_print(self):
        phrase = tWPhrase(1)
        phrase.addLine(u'\w Ἰησοῦ|lemma="Ἰησοῦς" strong="G24240" x-morph="Gr,N,,,,,GMS," x-tw="rc://*/tw/dict/bible/kt/jesus" \w*')
        phrase.addLine(u'\w Χριστοῦ|lemma="χριστός" strong="G55470" x-morph="Gr,N,,,,,GMS," x-tw="rc://*/tw/dict/bible/kt/christ"  x-tw="rc://*/tw/dict/bible/kt/jesus" \w*,')

        expected = read_file(os.path.join(self.resources_dir, 'usfm_milestone.usfm'))
        self.assertEqual(unicode(expected), unicode(phrase))
Example #12
def test_convert_file(self):
    usfm = osistousfm3.convertFile(
        osis_file=os.path.join(self.resources_dir, 'osis/Hag.xml'),
        lexicon=self.lexicon)
    expected_usfm = read_file(
        os.path.join(self.resources_dir, 'usfm/37-HAG.usfm'))
    self.assertEqual(expected_usfm, usfm)
Example #13
def test_convert_osis_with_book_key_migration(self):
    usfm = osistousfm3.convertFile(
        osis_file=os.path.join(self.resources_dir, 'osis/2Sam.xml'),
        lexicon=self.lexicon)
    expected_usfm = read_file(
        os.path.join(self.resources_dir, 'usfm/10-2SA.usfm'))
    self.assertEqual(expected_usfm, usfm)
Example #14
    def test_index_tn_tsv_rc(self, mock_reporter):
        tmp = os.path.join(self.temp_dir, 'index_tn_rc')
        rc = os.path.join(self.resources_dir, 'en_tn_tsv')
        expected_file = os.path.join(self.resources_dir,
                                     'en_tn_tsv/expected_gen_notes.json')
        converted_file = '{}/gen/en/notes.json'.format(tmp)
        expected = {
            'en_*_gen_tn': {
                'key': 'gen/en/notes.json',
                'path': converted_file
            }
        }

        to_upload = index_tn_rc('en', tmp, rc)
        self.assertEqual(expected, to_upload)
        self.assertEqual(read_file(expected_file), read_file(converted_file))
Example #15
def test_titus_multiple_word_match(self):
    """
    Ensures we are correctly finding multiple word matches in Titus.
    :return:
    """
    usfm = read_file(os.path.join(self.resources_dir, 'usfm/57-TIT.usfm'))
    rc = factory.load(os.path.join(self.resources_dir, 'tw_rc'))
    words_index = maptwtousfm3.indexWordsLocation(rc)
    category_index = maptwtousfm3.indexWordsCategory(rc)
    mappedUSFM = maptwtousfm3.mapUSFMByOccurrence(
        usfm=usfm,
        words_rc=rc,
        words_index=words_index['occurrences'],
        words_category_index=category_index)
    expected_usfm = read_file(
        os.path.join(self.resources_dir, 'mapped_tit.usfm'))
    self.assertEqual(mappedUSFM, expected_usfm)
Example #16
def mapDir(usfm_dir,
           words_rc,
           output_dir,
           global_search=False,
           map_phrases=True):
    """
    Maps tW to words within each USFM file found in the directory.
    :param usfm_dir: a directory containing USFM files generated by `csvtousfm3`
    :param words_rc: the tW resource container
    :type words_rc: ResourceContainer.RC
    :param output_dir: a directory where the newly mapped usfm will be saved
    :param global_search: performs a global word-by-word search in addition to the searcy by occurrence
    :return:
    """
    usfm_files = []
    strongs_index = {}
    for root, dirs, files in os.walk(usfm_dir):
        usfm_files.extend(files)
        break

    print('Generating occurrences index')
    location_index = indexWordsLocation(words_rc)
    category_index = indexWordsCategory(words_rc)
    if map_phrases:
        print('Phrase mapping enabled.')
    if global_search:
        print('Global search enabled.')
        print('Generating strongs index.')
        strongs_index = indexWordByStrongs(words_rc)

    for file_name in usfm_files:
        if not file_name.endswith('.usfm'):
            continue

        file = os.path.join(usfm_dir, file_name)
        print('{}'.format(file_name))
        usfm = read_file(file)
        usfm = mapUSFMByOccurrence(usfm=usfm,
                                   words_rc=words_rc,
                                   words_index=location_index['occurrences'],
                                   words_category_index=category_index)
        if map_phrases:
            usfm = mapPhrases(usfm)
        if global_search:
            usfm = mapUSFMByGlobalSearch(
                usfm=usfm,
                words_strongs_index=strongs_index,
                words_false_positives_index=location_index['false_positives'],
                words_category_index=category_index)
            # NOTE: if we need to add phrase mapping to global search un-comment these lines
            # if map_phrases:
            #     usfm = mapPhrases(usfm)
        outfile = os.path.join(output_dir, os.path.basename(file))
        write_file(outfile, usfm)
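
Mirroring test_map_dir above, a typical invocation looks like this (paths hypothetical):

rc = factory.load('/path/to/tw_rc')
mapDir('/path/to/usfm', rc, '/path/to/mapped_usfm')

# or with the optional passes enabled
mapDir('/path/to/usfm', rc, '/path/to/mapped_usfm',
       global_search=True, map_phrases=True)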
Example #17
def _build_catalogs(self):
    """
    Builds the global catalogs
    :return:
    """
    catalogs_path = os.path.join(self.repo_dir, 'catalogs.json')
    package = read_file(catalogs_path)
    return {
        'repo_name': self.repo_name,
        'commit_id': self.commit_id,
        'timestamp': self.timestamp,
        'package': package,
        'dirty': False
    }
Example #18
    def test_convert_file(self):
        usfm = csvtousfm3.convert(lang='Gr',
                                  csv_file=os.path.join(
                                      self.resources_dir, 'input.csv'))

        self.assertIsInstance(usfm, list)
        self.assertEqual(2, len(usfm))
        for book in usfm:
            self.assertIsInstance(book['usfm'], unicode)
            expected_usfm = read_file(
                os.path.join(self.resources_dir,
                             '{}_output.usfm'.format(book['id'])))
            self.assertIsInstance(expected_usfm, unicode)
            self.assertMultiLineEqual(expected_usfm, book['usfm'])
Example #19
def build_usx(usfm_dir, usx_dir):
    """
    Builds the usx from usfm after performing some custom processing
    :param usfm_dir:
    :param usx_dir:
    :return:
    """
    # strip word data
    files = os.listdir(usfm_dir)
    for name in files:
        f = os.path.join(usfm_dir, name)
        usfm = read_file(f)
        write_file(f, convert_chunk_markers(strip_word_data(usfm)))

    UsfmTransform.buildUSX(usfm_dir, usx_dir, '', True)
Example #20
    def test_unsigned_external_content(self, mock_reporter):
        format = {
            'format': '',
            'modified': '',
            'size': '',
            'url': 'https://google.com',
            'signature': ''
        }
        row = json.loads(
            read_file(
                os.path.join(self.resources_dir,
                             'progress_db/no_sig_external_content-row.json')))

        checker = ConsistencyChecker('cdn.door43.org', 'api.door43.org')
        errors = checker.check_format(format, row)
        self.assertEqual([], errors)
Example #21
def build_usx(usfm_dir, usx_dir, logger=None):
    """
    Builds the usx from usfm after performing some custom processing
    :param usfm_dir:
    :param usx_dir:
    :return:
    """
    # strip word data
    files = os.listdir(usfm_dir)
    for name in files:
        f = os.path.join(usfm_dir, name)
        usfm = read_file(f)
        write_file(f, remove_unknown_markers(convert_chunk_markers(strip_word_data(usfm))))

    if logger:
        logger.debug("Actual USX conversion into {}".format(usx_dir))
    UsfmTransform.buildUSX(usfm_dir, usx_dir, '', True)
Example #22
    def test_unsigned_local_content(self, mock_reporter):
        format = {
            'format': '',
            'modified': '',
            'size': '',
            'url': 'https://api.door43.org',
            'signature': ''
        }
        row = json.loads(
            read_file(
                os.path.join(self.resources_dir,
                             'progress_db/no_sig_external_content-row.json')))

        checker = ConsistencyChecker('cdn.door43.org', 'api.door43.org')
        errors = checker.check_format(format, row)
        self.assertIn(
            "Consistency Check Failed: en_obs: url 'https://api.door43.org' has not been signed yet",
            errors)
Example #23
def _build_localization(self):
    """
    Builds the localization for various components in the catalog
    :return:
    """
    files = sorted(glob(os.path.join(self.repo_dir, '*.json')))
    localization = {}
    for f in files:
        self.logger.debug("Reading {0}...".format(f))
        language = os.path.splitext(os.path.basename(f))[0]
        try:
            localization[language] = json.loads(read_file(f))
        except Exception as e:
            raise Exception('Bad JSON: {0}'.format(e))
    return {
        'repo_name': self.repo_name,
        'commit_id': self.commit_id,
        'timestamp': self.timestamp,
        'package': json.dumps(localization, sort_keys=True),
        'dirty': False
    }
Example #24
def index_tn_rc(lid, temp_dir, rc_dir, reporter=None):
    """
    Converts a v3 tN into its v2 equivalent.
    This will write a bunch of files and return a list of files to be uploaded.

    Chunk definitions will be used to validate the note organization.

    :param lid: the language id of the notes
    :param temp_dir: the directory where all the files will be written
    :param rc_dir: the directory of the resource container
    :param reporter: a lambda handler used for reporting
    :type reporter: Handler
    :return: a list of note files to upload
    """
    manifest = yaml.safe_load(read_file(os.path.join(rc_dir, 'manifest.yaml')))
    content_format = manifest['dublin_core']['format']
    if content_format == 'text/markdown':
        return tn_md_to_json_file(lid, temp_dir, rc_dir, manifest, reporter)
    elif content_format == 'text/tsv':
        return tn_tsv_to_json_file(lid, temp_dir, rc_dir, manifest, reporter)
    else:
        message = "Unsupported content type '{}' found in {}".format(content_format, rc_dir)
        if reporter:
            reporter.report_error(message)
        raise Exception(message)
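
As test_index_tn_tsv_rc above demonstrates, the return value maps an upload key to the generated file (paths hypothetical):

to_upload = index_tn_rc('en', '/tmp/index_tn', '/path/to/en_tn_tsv')
# => {'en_*_gen_tn': {'key': 'gen/en/notes.json',
#                     'path': '/tmp/index_tn/gen/en/notes.json'}}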
Example #25
    def test_complex_usfm3_to_usfm2(self):
        usfm3 = read_file(os.path.join(self.resources_dir, 'usfm/57-TIT.usfm'))
        expected_usfm2 = read_file(os.path.join(self.resources_dir, 'complex_tit.usfm2'))

        usfm2 = usfm3_to_usfm2(usfm3)
        self.assertEqual(expected_usfm2, usfm2)
Example #26
    def test_usfm3_to_usfm2(self):
        usfm3 = read_file(os.path.join(self.resources_dir, 'usfm3_sample.usfm'))
        expected_usfm2 = read_file(os.path.join(self.resources_dir, 'usfm2_sample.usfm'))

        usfm2 = usfm3_to_usfm2(usfm3)
        self.assertEqual(expected_usfm2, usfm2)
Example #27
def _strip_print_script(file_to_sign):
    html = read_file(file_to_sign)
    html = html.replace('window.print()', '')
    write_file(file_to_sign, html)
Example #28
    def _build_versification(self):
        """
        DEPRECATED

        we are no longer processing versification.
        :return:
        """
        bible_dir = os.path.join(self.repo_dir, 'bible')
        versification_dirs = os.listdir(bible_dir)
        books = {}
        package = []
        uploads = []

        # group by project
        for vrs_dir in versification_dirs:
            vrs_id = os.path.basename(vrs_dir)
            book_files = sorted(
                glob(os.path.join(bible_dir, vrs_dir, 'chunks', '*.json')))
            for b in book_files:
                self.logger.debug('Reading "{}" versification for "{}"'.format(
                    vrs_id, b))
                b_id = os.path.splitext(os.path.basename(b))[0]
                try:
                    book_vrs = json.loads(read_file(b))
                except Exception as e:
                    raise Exception, 'Bad JSON: {0}'.format(e), sys.exc_info()[2]
                book = WebhookHandler.retrieve_or_make(
                    books, b_id, {
                        'identifier': b_id,
                        'chunks_url': '{}/bible/{}/{}/v{}/chunks.json'.format(
                            self.cdn_url, vrs_id, b_id, self.api_version),
                        'chunks': {}
                    })
                book['chunks'][vrs_id] = book_vrs
        temp_dir = os.path.join(self.temp_dir, 'versification')
        if not os.path.isdir(temp_dir):
            os.mkdir(temp_dir)
        for book in books:
            book = books[book]

            # write chunks
            chunk_file = os.path.join(temp_dir, book['identifier'] + '.json')
            write_file(chunk_file, json.dumps(book['chunks'], sort_keys=True))
            # for now we bypass signing and upload chunks directly
            upload_key = 'bible/{}/v{}/chunks.json'.format(
                book['identifier'], self.api_version)
            uploads.append({'key': upload_key, 'path': chunk_file})

            # build package
            del book['chunks']
            package.append(book)

        return {
            'repo_name': self.repo_name,
            'commit_id': self.commit_id,
            'timestamp': self.timestamp,
            'package': json.dumps(package, sort_keys=True),
            'uploads': uploads,
            'dirty': False
        }
Example #29
    def test_inprogress(self, mock_reporter):
        mockV3Api = MockAPI(os.path.join(self.resources_dir, 'v3_api'),
                            'https://api.door43.org/')
        mockV3Api.add_host(os.path.join(self.resources_dir, 'v3_cdn'),
                           'https://test-cdn.door43.org/')

        mockV2Api = MockAPI(os.path.join(self.resources_dir, 'ts_api'),
                            'https://test')
        mockS3 = MockS3Handler('ts_bucket')
        mockDb = MockDynamodbHandler()
        mockDb._load_db(
            os.path.join(TestTsV2Catalog.resources_dir,
                         'ready_inprogress_db.json'))

        mockLog = MockLogger()
        event = self.make_event()
        converter = TsV2CatalogHandler(
            event=event,
            context=None,
            logger=mockLog,
            s3_handler=mockS3,
            dynamodb_handler=mockDb,
            url_handler=mockV3Api.get_url,
            download_handler=mockV3Api.download_file,
            url_exists_handler=lambda url: True)
        converter.run()

        assert_s3_equals_api_json(self, mockS3, mockV2Api,
                                  'v2/ts/catalog.json')
        assert_s3_equals_api_json(self, mockS3, mockV2Api,
                                  'v2/ts/obs/languages.json')
        assert_s3_equals_api_json(self, mockS3, mockV2Api,
                                  'v2/ts/obs/en/resources.json')
        self.assertNotIn('v2/ts/obs/en/obs/source.json',
                         mockS3._recent_uploads)
        # assert_s3_equals_api_json(self, mockS3, mockV2Api, 'v2/ts/obs/en/obs/source.json')
        assert_s3_equals_api_json(self, mockS3, mockV2Api,
                                  'v2/ts/obs/en/notes.json')
        assert_s3_equals_api_json(self, mockS3, mockV2Api,
                                  'v2/ts/obs/en/questions.json')
        # we have frozen tw_cat
        self.assertNotIn('v2/ts/obs/en/tw_cat.json', mockS3._recent_uploads)

        assert_s3_equals_api_json(self, mockS3, mockV2Api,
                                  'v2/ts/1ch/languages.json')
        assert_s3_equals_api_json(self, mockS3, mockV2Api,
                                  'v2/ts/1ch/en/resources.json')
        assert_s3_equals_api_json(self, mockS3, mockV2Api,
                                  'v2/ts/1ch/en/ulb/v7/source.json')
        self.assertNotIn('v2/ts/1ch/en/notes.json', mockS3._recent_uploads)
        # assert_s3_equals_api_json(self, mockS3, mockV2Api, 'v2/ts/1ch/en/notes.json')
        assert_s3_equals_api_json(self, mockS3, mockV2Api,
                                  'v2/ts/1ch/en/questions.json')
        self.assertNotIn('v2/ts/1ch/en/tw_cat.json', mockS3._recent_uploads)
        # assert_s3_equals_api_json(self, mockS3, mockV2Api, 'v2/ts/1ch/en/tw_cat.json')

        assert_s3_equals_api_json(self, mockS3, mockV2Api,
                                  'v2/ts/bible/en/words.json')

        # validate urls in the generated catalogs match the generated output paths
        root_url = '{}/'.format(
            event['stage-variables']['cdn_url'].rstrip('/'))
        catalog = json.loads(
            read_file(mockS3._recent_uploads['v2/ts/catalog.json']))
        url_err_msg = 'url in catalog does not match upload path: {}'
        for project in catalog:
            lang_catalog_path = project['lang_catalog'].replace(
                root_url, '').split('?')[0]
            self.assertIn(lang_catalog_path, mockS3._recent_uploads,
                          url_err_msg.format(lang_catalog_path))
            lang_catalog = json.loads(
                read_file(mockS3._recent_uploads[lang_catalog_path]))
            for language in lang_catalog:
                res_catalog_path = language['res_catalog'].replace(
                    root_url, '').split('?')[0]
                self.assertIn(res_catalog_path, mockS3._recent_uploads,
                              url_err_msg.format(res_catalog_path))
                res_catalog = json.loads(
                    read_file(mockS3._recent_uploads[res_catalog_path]))
                for resource in res_catalog:
                    questions_path = resource['checking_questions'].replace(
                        root_url, '').split('?')[0]
                    # notes_path = resource['notes'].replace(root_url, '').split('?')[0]
                    # source_path = resource['source'].replace(root_url, '').split('?')[0]
                    terms_path = resource['terms'].replace(root_url,
                                                           '').split('?')[0]
                    # terms_map_path = resource['tw_cat'].replace(root_url, '').split('?')[0]

                    if questions_path:
                        self.assertIn(questions_path, mockS3._recent_uploads,
                                      url_err_msg.format(questions_path))
                    # if notes_path:
                    #     self.assertIn(notes_path, mockS3._uploads, url_err_msg.format(notes_path))
                    # if source_path:
                    #     self.assertIn(source_path, mockS3._uploads, url_err_msg.format(source_path))
                    if terms_path:
                        self.assertIn(terms_path, mockS3._recent_uploads,
                                      url_err_msg.format(terms_path))
Example #30
def tn_md_to_json_file(lid, temp_dir, rc_dir, manifest, reporter=None):
    """
    Converts a markdown tN to JSON.
    This will write a bunch of files and return a list of files to be uploaded.

    Chunk definitions will be used to validate the note organization.

    :param lid: the language id of the notes
    :param temp_dir: the directory where all the files will be written
    :param rc_dir: the directory of the resource container
    :param manifest: the rc manifest data
    :param reporter: a lambda handler used for reporting
    :type reporter: Handler
    :return: a list of note files to upload
    """
    dc = manifest['dublin_core']
    note_general_re = re.compile('^([^#]+)', re.UNICODE)
    note_re = re.compile('^#+([^#\n]+)#*([^#]*)', re.UNICODE | re.MULTILINE | re.DOTALL)
    tn_uploads = {}

    for project in manifest['projects']:
        pid = Handler.sanitize_identifier(project['identifier'])
        chunk_json = {}
        if pid != 'obs':
            try:
                chunk_json = index_chunks(download_chunks(pid))
            except Exception:
                if reporter:
                    reporter.report_error('Failed to retrieve chunk information for {}-{}'.format(lid, pid))
                continue

        note_dir = os.path.normpath(os.path.join(rc_dir, project['path']))
        note_json = []
        if not os.path.exists(note_dir):
            raise Exception('Could not find translationNotes directory at {}'.format(note_dir))
        chapters = os.listdir(note_dir)

        for chapter in chapters:
            if chapter in ['.', '..', 'front', '.DS_Store']:
                continue
            chapter_dir = os.path.join(note_dir, chapter)
            verses = os.listdir(chapter_dir)
            verses.sort()

            # zero pad chapter to match chunking scheme
            chapter = pad_to_match(chapter, chunk_json)
            chapter_chunk_json = chunk_json.get(chapter, {})

            # validate chapters
            if pid != 'obs' and chapter not in chunk_json:
                raise Exception('Missing chapter "{}" key in chunk json while reading chunks for {}. RC: {}' \
                    .format(chapter, pid, rc_dir))

            notes = []
            firstvs = None
            note_hashes = []
            for verse in verses:
                if verse in ['.', '..', 'intro.md', '.DS_Store']:
                    continue

                # notes = []
                verse_file = os.path.join(chapter_dir, verse)
                verse = verse.split('.')[0]
                try:
                    verse_body = read_file(verse_file)
                except Exception as e:
                    if reporter:
                        reporter.report_error('Failed to read file {}'.format(verse_file))
                    raise e

                verse_body = convert_rc_links(verse_body)
                general_notes = note_general_re.search(verse_body)

                # zero pad verse to match chunking scheme
                verse = pad_to_match(verse, chapter_chunk_json)

                # close chunk
                if firstvs is not None and (pid != 'obs' and not chapter_chunk_json):
                    if reporter:
                        reporter.report_error(
                            'Could not find chunk data for {} {} {}'.format(rc_dir, pid, chapter))

                if firstvs is not None and (pid == 'obs' or verse in chapter_chunk_json):
                    note_json.append({
                        'id': '{}-{}'.format(chapter, firstvs),
                        'tn': notes
                    })
                    firstvs = verse
                    notes = []
                elif firstvs is None:
                    firstvs = verse

                if general_notes:
                    verse_body = note_general_re.sub('', verse_body)
                    notes.append({
                        'ref': 'General Information',
                        'text': general_notes.group(0).strip()
                    })

                for note in note_re.findall(verse_body):
                    # TRICKY: do not include translation words in the list of notes
                    if note[0].strip().lower() != 'translationwords':
                        hasher = hashlib.md5()
                        hasher.update(note[0].strip().lower().encode('utf-8'))
                        note_hash = hasher.hexdigest()
                        if note_hash not in note_hashes:
                            note_hashes.append(note_hash)
                            notes.append({
                                'ref': note[0].strip(),
                                'text': note[1].strip()
                            })

            # close last chunk
            if firstvs is not None:
                note_json.append({
                    'id': '{}-{}'.format(chapter, firstvs),
                    'tn': notes
                })

        if note_json:
            tn_key = '_'.join([lid, '*', pid, 'tn'])
            note_json.append({'date_modified': str(dc['modified']).replace('-', '')})
            note_upload = prep_data_upload('{}/{}/notes.json'.format(pid, lid), note_json, temp_dir)
            tn_uploads[tn_key] = note_upload

    return tn_uploads
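
For orientation, each project's note_json assembled above is a list of per-chunk note entries followed by a date stamp. An illustrative (hypothetical) value:

note_json = [
    {'id': '01-01', 'tn': [
        {'ref': 'General Information', 'text': 'Notes covering the whole chunk.'},
        {'ref': 'a quoted phrase', 'text': 'The note body for that phrase.'}
    ]},
    {'date_modified': '20190401'}
]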