Пример #1
0
    def test_index_inconsistent_image_titles(self):
        """
        Indexes the ``en_obs.zip`` fixture served by the mock API and checks
        the chapter/frame structure returned by ``index_obs``.

        NOTE(review): judging by the test name, the fixture presumably
        contains inconsistent image titles - confirm against the resource.
        """
        api = MockAPI(self.resources_dir, 'https://example.com')
        book_format = {
            'format': 'type=book',
            'url': 'https://example.com/en_obs.zip'
        }
        indexed = index_obs('en', 'obs', book_format, self.temp_dir,
                            api.download_file)
        chapters = indexed['chapters']
        self.assertEqual(2, len(chapters))

        # first chapter: number, frame count and the ids of the leading frames
        first_chapter = chapters[0]
        self.assertEqual('01', first_chapter['number'])
        self.assertEqual(16, len(first_chapter['frames']))
        first_frames = first_chapter['frames']
        self.assertEqual('01-01', first_frames[0]['id'])
        self.assertEqual('01-02', first_frames[1]['id'])

        # second chapter: number and frame count only
        second_chapter = chapters[1]
        self.assertEqual('02', second_chapter['number'])
        self.assertEqual(12, len(second_chapter['frames']))
Пример #2
0
    def convert_v3_to_v2(self, v3_catalog, status):
        """
        Builds a v2 catalog for the uW api endpoint.
        This uses the v3 catalog as the source.

        Every project that exposes a usable source text (an obs book or a
        usfm file) becomes a table-of-contents entry of its resource. Source
        texts that are not yet listed in ``status['processed']`` are
        generated, uploaded and signed along the way, and ``status`` is
        persisted after each of them.

        :param v3_catalog: the v3 catalog
        :param status: the build status retrieved from AWS. Its ``processed``
            dict and ``timestamp`` are updated in place.
        :return: the complete v2 catalog
        """
        v2_catalog = {'obs': {}, 'bible': {}}
        title_map = {'bible': 'Bible', 'obs': 'Open Bible Stories'}
        last_modified = 0

        for lang in v3_catalog['languages']:
            lid = lang['identifier']
            if self.logger:
                self.logger.info('Processing {}'.format(lid))
            for res in lang['resources']:
                rid = res['identifier']
                # everything that is not obs is published in the bible catalog
                cat_key = 'obs' if rid == 'obs' else 'bible'

                mod = str_to_unix_time(res['modified'])
                last_modified = max(last_modified, int(mod))

                # TRICKY: we are not processing the resource formats
                toc = []
                for proj in res['projects']:
                    toc_item = self._build_v2_toc_item(lid, res, proj, mod,
                                                       status)
                    if toc_item is not None:
                        toc.append(toc_item)

                if not toc:
                    continue

                # TRICKY: not all manifests have a source text
                if 'source' in res and len(res['source']):
                    source = res['source'][0]
                else:
                    source = {'language': '', 'version': ''}

                comment = res.get('comment', '')

                # TRICKY: maintain legacy slug formatting for backwards compatibility
                legacy_slug = '{}-{}'.format(rid, lid)
                res_v2_id = rid
                if legacy_slug in self.legacy_slugs or rid == 'obs':
                    res_v2_id = legacy_slug

                res_v2 = {
                    'slug': res_v2_id,
                    'name': res['title'],
                    'mod': mod,
                    'status': {
                        'checking_entity':
                        '; '.join(res['checking']['checking_entity']),
                        'checking_level':
                        res['checking']['checking_level'],
                        'comments':
                        comment,
                        'contributors':
                        '; '.join(res['contributor']),
                        'publish_date':
                        res['issued'],
                        'source_text':
                        source['language'],
                        'source_text_version':
                        source['version'],
                        'version':
                        res['version']
                    },
                    'toc': toc
                }

                if lid not in v2_catalog[cat_key]:
                    # the language entry keeps the mod of the first resource
                    # that created it
                    v2_catalog[cat_key][lid] = {
                        'lc': lid,
                        'mod': mod,
                        'vers': []
                    }
                v2_catalog[cat_key][lid]['vers'].append(res_v2)

        # condense catalog
        catalog = {'cat': [], 'mod': last_modified}
        for cat_slug in v2_catalog:
            catalog['cat'].append({
                'slug': cat_slug,
                'title': title_map[cat_slug],
                'langs': list(v2_catalog[cat_slug].values())
            })
        return catalog

    def _build_v2_toc_item(self, lid, res, proj, mod, status):
        """
        Builds the v2 table-of-contents entry for a single v3 project.

        :param lid: the language identifier
        :param res: the v3 resource that owns the project
        :param proj: the v3 project
        :param mod: the modified time of the resource
        :param status: the build status; updated when a source is published
        :return: the toc entry, or None if the project has no formats or no
            source text among them
        """
        rid = res['identifier']
        pid = proj['identifier']
        if not ('formats' in proj and proj['formats']):
            return None

        source = None
        pdf = None
        media = {'audio': {'src_dict': {}}, 'video': {'src_dict': {}}}

        for fmt in proj['formats']:
            # skip media formats that do not match the source version
            if 'source_version' in fmt and fmt['source_version'] != res[
                    'version']:
                if self.logger:
                    self.logger.warning(
                        '{}_{}_{}: media format "{}" does not match source version "{}" and will be excluded.'
                        .format(lid, rid, pid, fmt['url'], res['version']))
                continue

            if rid == 'obs' and 'type=book' in fmt['format']:
                # TRICKY: obs must be converted to json
                process_id = '_'.join([lid, rid, pid])
                obs_key = '{}/{}/{}/{}/v{}/source.json'.format(
                    self.cdn_root_path, pid, lid, rid, res['version'])
                if process_id not in status['processed']:
                    obs_json = index_obs(lid, rid, fmt, self.temp_dir,
                                         self.download_file)
                    upload = self._prep_json_upload(obs_key, obs_json)
                    self._upload_signed_v2_source(upload, process_id, status)
                source = {
                    'url': '{}/{}'.format(self.cdn_url, obs_key),
                    'signature': '{}/{}.sig'.format(self.cdn_url, obs_key)
                }
            elif rid != 'obs' and fmt['format'] == 'text/usfm':
                # process bible
                process_id = '_'.join([lid, rid, pid])
                bible_key = '{0}/{1}/{2}/{3}/v{4}/{1}.usfm'.format(
                    self.cdn_root_path, pid, lid, rid, res['version'])
                if process_id not in status['processed']:
                    usfm = self._process_usfm(fmt)
                    upload = self._prep_text_upload(bible_key, usfm)
                    self._upload_signed_v2_source(upload, process_id, status)
                source = {
                    'url': '{}/{}'.format(self.cdn_url, bible_key),
                    'signature': '{}/{}.sig'.format(self.cdn_url, bible_key)
                }
            elif 'content=audio/mp3' in fmt[
                    'format'] or 'content=video/mp4' in fmt['format']:
                # process media
                self._merge_v2_media(media, fmt)
            elif 'application/pdf' == fmt['format']:
                pdf = {
                    'url': fmt['url'],
                    'source_version': fmt['source_version']
                }

        # build catalog
        if not source:
            if self.logger:
                self.logger.debug('No book text found in {}_{}_{}'.format(
                    lid, rid, pid))
            return None

        # Iterate over a snapshot of the keys: entries are deleted inside the
        # loop, which raises a RuntimeError on a live key view (python 3).
        for key in list(media):
            if media[key]['src_dict']:
                media[key]['src_list'] = list(media[key]['src_dict'].values())
                del media[key]['src_dict']
            else:
                del media[key]

        toc_item = {
            'desc': '',
            'media': media,
            'mod': mod,
            'slug': proj['identifier'],
            'src': source['url'],
            'src_sig': source['signature'],
            'title': proj['title'],
        }
        if rid == 'obs':
            del toc_item['slug']
        if pdf:
            toc_item['pdf'] = pdf['url']
        if not media:
            del toc_item['media']
        return toc_item

    def _upload_signed_v2_source(self, upload, process_id, status):
        """
        Uploads a prepared source file with its signature and records the
        process as finished in the build status.

        :param upload: dict with the local ``path`` and the cdn ``key``
        :param process_id: the id to store in ``status['processed']``
        :param status: the build status; updated in place and persisted
        """
        self.cdn_handler.upload_file(upload['path'], upload['key'])

        # a signature that cannot be verified is not published; the source
        # file itself stays uploaded and the process still counts as finished
        sig_file = self.signer.sign_file(upload['path'])
        try:
            self.signer.verify_signature(upload['path'], sig_file)
            self.cdn_handler.upload_file(sig_file,
                                         '{}.sig'.format(upload['key']))
        except RuntimeError:
            if self.logger:
                self.logger.warning(
                    'Could not verify signature {}'.format(sig_file))

        status['processed'].update({process_id: []})
        status['timestamp'] = time.strftime("%Y-%m-%dT%H:%M:%SZ")
        self.db_handler.update_item(
            {'api_version': UwV2CatalogHandler.api_version}, status)

    def _merge_v2_media(self, media, fmt):
        """
        Merges one audio or video format into the media containers of a
        project.

        :param media: dict holding the ``audio`` and ``video`` containers
        :param fmt: the v3 media format to merge
        """
        quality_value, quality_suffix = self.__parse_media_quality(
            fmt['quality'])
        if 'content=audio/mp3' in fmt['format']:
            media_container = media['audio']
            quality_key = 'bitrate'
            quality_short_key = 'br'
        else:
            media_container = media['video']
            quality_key = 'resolution'
            quality_short_key = 'res'

        # the concrete quality in the urls is replaced with this placeholder
        quality_placeholder = '{bitrate}' + quality_suffix

        # build chapter src
        src_dict = {}
        if 'chapters' in fmt:
            for chapter in fmt['chapters']:
                src_dict[chapter['identifier']] = {
                    quality_short_key: [{
                        quality_key:
                        int(quality_value),
                        'mod':
                        int(str_to_unix_time(chapter['modified'])),
                        'size':
                        chapter['size']
                    }],
                    'chap':
                    chapter['identifier'],
                    'length':
                    int(math.ceil(chapter['length'])),
                    'src':
                    chapter['url'].replace(fmt['quality'],
                                           quality_placeholder),
                    'src_sig':
                    chapter['signature'].replace(fmt['quality'],
                                                 quality_placeholder)
                }

        # NOTE(review): the separator is a literal backslash followed by "n",
        # not a newline - presumably what v2 consumers expect; confirm.
        merge_dict(
            media_container, {
                'contributors': ',\\n'.join(fmt['contributor']),
                'rev': fmt['version'],
                'txt_ver': fmt['source_version'],
                'src_dict': src_dict
            })
Пример #3
0
    def __execute(self):
        """
        Builds the tS v2 catalog from the latest v3 catalog and uploads it.

        The v3 catalog is walked language by language. Source texts, notes,
        questions and words that are not yet listed in
        ``self.status['processed']`` are indexed and uploaded, and the status
        is persisted once per format. The catalog nodes are then normalized
        into ``resources.json``, ``languages.json`` and ``catalog.json``
        files, which are uploaded before the build state is set to
        ``complete``.

        :return: True if the catalog was already generated, False if the
            build status or the v3 catalog could not be loaded
        :raises Exception: if no resource container format can be found for
            a project
        """
        # NOTE(review): the successful build path falls off the end and
        # returns None rather than True - confirm what callers expect.
        cat_keys = []
        cat_dict = {}
        supplemental_resources = []

        result = self._get_status()
        if not result:
            return False
        (self.status, source_status) = result

        # check if build is complete
        if self.status['state'] == 'complete':
            self.logger.debug('Catalog already generated')
            return True

        # retrieve the latest catalog
        catalog_url = source_status['catalog_url']
        self.logger.debug("Catalog url {0}".format(catalog_url))
        catalog_content = self.get_url(catalog_url, True)
        if not catalog_content:
            self.logger.error("{0} does not exist".format(catalog_url))
            return False
        try:
            self.latest_catalog = json.loads(catalog_content)
        except Exception as e:
            self.logger.error("Failed to load the catalog json: {0}".format(e))
            return False

        # walk v3 catalog
        for lang in self.latest_catalog['languages']:
            lid = TsV2CatalogHandler.sanitize_identifier(lang['identifier'],
                                                         lower=False)
            self.logger.info('Processing {}'.format(lid))
            for res in lang['resources']:
                rid = TsV2CatalogHandler.sanitize_identifier(res['identifier'])
                self.logger.debug('Processing {}_{}'.format(lid, rid))

                rc_format = None

                self.logger.debug('Temp directory {} contents {}'.format(
                    self.temp_dir, get_subdirs(self.temp_dir)))
                res_temp_dir = os.path.join(self.temp_dir, lid, rid)
                os.makedirs(res_temp_dir)

                if 'formats' in res:
                    for fmt in res['formats']:
                        finished_processes = {}
                        if not rc_format and get_rc_type(fmt):
                            # locate rc_format (for multi-project RCs)
                            rc_format = fmt
                        # res is resource, rid is resource id, lid is language id
                        process_id = '_'.join([lid, rid, 'usfm'])
                        if process_id not in self.status['processed']:
                            self._process_usfm(lid, rid, res, fmt,
                                               res_temp_dir)
                            finished_processes[process_id] = []

                        # TRICKY: bible notes and questions are in the resource
                        if rid != 'obs':
                            self._index_and_upload(
                                self._index_note_files, lid, rid, fmt,
                                '_'.join([lid, rid, 'notes']), res_temp_dir,
                                finished_processes, cat_keys, label='notes')
                            self._index_and_upload(
                                self._index_question_files, lid, rid, fmt,
                                '_'.join([lid, rid, 'questions']),
                                res_temp_dir, finished_processes, cat_keys,
                                label='questions')

                        # TRICKY: update the finished processes once per format to limit db hits
                        self._save_finished_processes(finished_processes)

                for project in res['projects']:
                    pid = TsV2CatalogHandler.sanitize_identifier(
                        project['identifier'])
                    self.logger.debug('Processing {}_{}_{}'.format(
                        lid, rid, pid))
                    if 'formats' in project:
                        for fmt in project['formats']:
                            finished_processes = {}
                            if not rc_format and get_rc_type(fmt):
                                # locate rc_format (for single-project RCs)
                                rc_format = fmt

                            # TRICKY: there should only be a single tW for each language
                            self._index_and_upload(
                                self._index_words_files, lid, rid, fmt,
                                '_'.join([lid, 'words']), res_temp_dir,
                                finished_processes, cat_keys)

                            if rid == 'obs':
                                process_id = '_'.join([lid, rid, pid])
                                if process_id not in self.status['processed']:
                                    self.logger.debug(
                                        'Processing {}'.format(process_id))
                                    obs_json = index_obs(
                                        lid, rid, fmt, res_temp_dir,
                                        self.download_file)
                                    upload = prep_data_upload(
                                        '{}/{}/{}/v{}/source.json'.format(
                                            pid, lid, rid, res['version']),
                                        obs_json, res_temp_dir)
                                    self._upload(upload)
                                    finished_processes[process_id] = []
                                else:
                                    cat_keys.extend(
                                        self.status['processed'][process_id])

                            # TRICKY: obs notes and questions are in the project
                            self._index_and_upload(
                                self._index_note_files, lid, rid, fmt,
                                '_'.join([lid, rid, pid, 'notes']),
                                res_temp_dir, finished_processes, cat_keys)
                            self._index_and_upload(
                                self._index_question_files, lid, rid, fmt,
                                '_'.join([lid, rid, pid, 'questions']),
                                res_temp_dir, finished_processes, cat_keys)

                            # TRICKY: update the finished processes once per format to limit db hits
                            self._save_finished_processes(finished_processes)

                    if not rc_format:
                        raise Exception(
                            'Could not find a format for {}_{}_{}'.format(
                                lid, rid, pid))

                    modified = make_legacy_date(rc_format['modified'])
                    rc_type = get_rc_type(rc_format)

                    self.logger.debug(
                        'Resource container type is {}'.format(rc_type))

                    if modified is None:
                        # fall back to today's date
                        modified = time.strftime('%Y%m%d')
                        self.logger.warning(
                            'Could not find date modified for {}_{}_{} from "{}"'
                            .format(lid, rid, pid, rc_format['modified']))

                    if rc_type == 'book' or rc_type == 'bundle':
                        self._build_catalog_node(cat_dict, lang, res, project,
                                                 modified)
                    else:
                        # store supplementary resources for processing after catalog nodes have been fully built
                        supplemental_resources.append({
                            'language': lang,
                            'resource': res,
                            'project': project,
                            'modified': modified,
                            'rc_type': rc_type
                        })

                # cleanup resource directory
                remove_tree(res_temp_dir)
            # cleanup language directory
            remove_tree(os.path.join(self.temp_dir, lid))

        # inject supplementary resources
        for s in supplemental_resources:
            self._add_supplement(cat_dict, s['language'], s['resource'],
                                 s['project'], s['modified'], s['rc_type'])

        api_uploads = []

        # build the lookup once; it is queried for every resource below
        available_keys = set(cat_keys)

        # normalize catalog nodes
        root_cat = []
        for pid in cat_dict:
            project = cat_dict[pid]
            lang_cat = []
            for lid in project['_langs']:
                lang = project['_langs'][lid]
                res_cat = []
                for rid in lang['_res']:
                    res = lang['_res'][rid]

                    # disable missing catalogs

                    # disable tN
                    if '_'.join([lid, '*', pid, 'tn']) not in available_keys:
                        res['notes'] = ''

                    # disable tQ
                    if '_'.join([lid, '*', pid, 'tq']) not in available_keys:
                        res['checking_questions'] = ''

                    # disable tW
                    if '_'.join([lid, '*', '*', 'tw']) not in available_keys:
                        res['terms'] = ''

                    res_cat.append(res)
                api_uploads.append(
                    prep_data_upload('{}/{}/resources.json'.format(pid, lid),
                                     res_cat, self.temp_dir))

                del lang['_res']
                if 'project' in lang:
                    # skip empty artifacts
                    lang_cat.append(lang)
                else:
                    self.logger.warning(
                        'Excluding empty language artifact in {}'.format(pid))
            api_uploads.append(
                prep_data_upload('{}/languages.json'.format(pid), lang_cat,
                                 self.temp_dir))

            del project['_langs']
            if lang_cat:
                root_cat.append(project)
        catalog_upload = prep_data_upload('catalog.json', root_cat,
                                          self.temp_dir)
        api_uploads.append(catalog_upload)
        # TRICKY: also upload to legacy path for backwards compatibility
        api_uploads.append({
            'key': '/ts/txt/2/catalog.json',
            'path': catalog_upload['path']
        })

        # upload files
        for upload in api_uploads:
            if not upload['key'].startswith('/'):
                # relative keys live below the cdn root path
                key = '{}/{}'.format(TsV2CatalogHandler.cdn_root_path,
                                     upload['key'])
            else:
                # absolute keys are used as they are, without the leading slash
                key = upload['key'].lstrip('/')
            self.cdn_handler.upload_file(upload['path'], key)

        self.status['state'] = 'complete'
        self.status['timestamp'] = time.strftime("%Y-%m-%dT%H:%M:%SZ")
        self.db_handler.update_item(
            {'api_version': TsV2CatalogHandler.api_version}, self.status)

    def _index_and_upload(self, indexer, lid, rid, fmt, process_id,
                          res_temp_dir, finished_processes, cat_keys,
                          label=None):
        """
        Indexes and uploads one supplementary resource unless it has already
        been processed, and collects the catalog keys it provides.

        :param indexer: callable invoked as
            ``indexer(lid, rid, fmt, process_id, res_temp_dir)``; a truthy
            result is passed to ``self._upload_all`` and its keys are used
            as the catalog keys
        :param lid: the language identifier
        :param rid: the resource identifier
        :param fmt: the v3 format being processed
        :param process_id: the id of this process in the build status
        :param res_temp_dir: the temporary directory of the resource
        :param finished_processes: dict of processes finished for the current
            format; updated in place
        :param cat_keys: list of the available catalog keys; extended in place
        :param label: if given, the name used to log the start of indexing
        """
        if process_id in self.status['processed']:
            # reuse the keys recorded by a previous run
            cat_keys.extend(self.status['processed'][process_id])
            return

        if label:
            self.logger.info('Processing {} {}_{}'.format(label, lid, rid))
        indexed = indexer(lid, rid, fmt, process_id, res_temp_dir)
        if indexed:
            self._upload_all(indexed)
            # Materialize the keys: on python 3 keys() is a view that can
            # neither be concatenated with a list nor stored as plain data.
            keys = list(indexed.keys())
            finished_processes[process_id] = keys
            cat_keys.extend(keys)

    def _save_finished_processes(self, finished_processes):
        """
        Records the finished processes in the build status and persists it.
        Does nothing if there are no finished processes.

        :param finished_processes: dict of process ids to their catalog keys
        """
        if not finished_processes:
            return
        self.status['processed'].update(finished_processes)
        self.status['timestamp'] = time.strftime("%Y-%m-%dT%H:%M:%SZ")
        self.db_handler.update_item(
            {'api_version': TsV2CatalogHandler.api_version}, self.status)