def test_index_inconsistent_image_titles(self):
    """Indexing OBS still produces correctly numbered chapters and frames."""
    api = MockAPI(self.resources_dir, 'https://example.com')
    book_format = {
        'format': 'type=book',
        'url': 'https://example.com/en_obs.zip'
    }
    result = index_obs('en', 'obs', book_format, self.temp_dir, api.download_file)

    chapters = result['chapters']
    self.assertEqual(2, len(chapters))

    first = chapters[0]
    self.assertEqual('01', first['number'])
    self.assertEqual(16, len(first['frames']))
    self.assertEqual('01-01', first['frames'][0]['id'])
    self.assertEqual('01-02', first['frames'][1]['id'])

    second = chapters[1]
    self.assertEqual('02', second['number'])
    self.assertEqual(12, len(second['frames']))
def convert_v3_to_v2(self, v3_catalog, status):
    """
    Builds a v2 catalog for the uW api endpoint.
    This uses the v3 catalog as the source.

    Walks every language/resource/project in the v3 catalog, uploading
    converted artifacts (OBS json, usfm) to the CDN as needed, and
    records finished work in `status['processed']` so reruns can skip it.

    :param v3_catalog: the v3 catalog
    :param status: the build status retrieved from AWS.
    :return: the complete v2 catalog
    """
    cat_keys = []
    # v2 catalog is partitioned into two top-level categories
    v2_catalog = {'obs': {}, 'bible': {}}
    title_map = {'bible': 'Bible', 'obs': 'Open Bible Stories'}
    # tracks the newest 'modified' timestamp seen; becomes catalog['mod']
    last_modified = 0
    for lang in v3_catalog['languages']:
        lid = lang['identifier']
        self.logger.info('Processing {}'.format(lid))
        for res in lang['resources']:
            rid = res['identifier']
            # everything that isn't obs is filed under 'bible'
            if rid == 'obs':
                cat_key = 'obs'
            else:
                cat_key = 'bible'
            mod = str_to_unix_time(res['modified'])
            if int(mod) > last_modified:
                last_modified = int(mod)
            # TRICKY: we are not processing the resource formats
            toc = []
            for proj in res['projects']:
                pid = proj['identifier']
                if 'formats' in proj and proj['formats']:
                    source = None  # set once a book text (obs json or usfm) is found
                    pdf = None
                    media = {
                        'audio': {
                            'src_dict': {}
                        },
                        'video': {
                            'src_dict': {}
                        }
                    }
                    for format in proj['formats']:
                        # skip media formats that do not match the source version
                        if 'source_version' in format and format['source_version'] != res['version']:
                            if self.logger:
                                self.logger.warning(
                                    '{}_{}_{}: media format "{}" does not match source version "{}" and will be excluded.'
                                    .format(lid, rid, pid, format['url'], res['version']))
                            continue
                        if rid == 'obs' and 'type=book' in format['format']:
                            # TRICKY: obs must be converted to json
                            process_id = '_'.join([lid, rid, pid])
                            obs_key = '{}/{}/{}/{}/v{}/source.json'.format(
                                self.cdn_root_path, pid, lid, rid, res['version'])
                            if process_id not in status['processed']:
                                obs_json = index_obs(lid, rid, format, self.temp_dir, self.download_file)
                                upload = self._prep_json_upload(obs_key, obs_json)
                                self.cdn_handler.upload_file(upload['path'], upload['key'])
                                # sign obs file.
                                # TRICKY: we only need to sign obs so we do so now.
                                sig_file = self.signer.sign_file(upload['path'])
                                try:
                                    self.signer.verify_signature(upload['path'], sig_file)
                                    self.cdn_handler.upload_file(sig_file, '{}.sig'.format(upload['key']))
                                except RuntimeError:
                                    # best-effort: a failed verification is logged but does not abort the build
                                    if self.logger:
                                        self.logger.warning('Could not verify signature {}'.format(sig_file))
                                # persist progress immediately so a crash doesn't redo this upload
                                status['processed'].update({process_id: []})
                                status['timestamp'] = time.strftime("%Y-%m-%dT%H:%M:%SZ")
                                self.db_handler.update_item(
                                    {
                                        'api_version': UwV2CatalogHandler.api_version
                                    }, status)
                            else:
                                cat_keys = cat_keys + status['processed'][process_id]
                            source = {
                                'url': '{}/{}'.format(self.cdn_url, obs_key),
                                'signature': '{}/{}.sig'.format(self.cdn_url, obs_key)
                            }
                        elif rid != 'obs' and format['format'] == 'text/usfm':
                            # process bible
                            process_id = '_'.join([lid, rid, pid])
                            bible_key = '{0}/{1}/{2}/{3}/v{4}/{1}.usfm'.format(
                                self.cdn_root_path, pid, lid, rid, res['version'])
                            if process_id not in status['processed']:
                                usfm = self._process_usfm(format)
                                upload = self._prep_text_upload(bible_key, usfm)
                                self.cdn_handler.upload_file(upload['path'], upload['key'])
                                # sign file
                                sig_file = self.signer.sign_file(upload['path'])
                                try:
                                    self.signer.verify_signature(upload['path'], sig_file)
                                    self.cdn_handler.upload_file(sig_file, '{}.sig'.format(upload['key']))
                                except RuntimeError:
                                    if self.logger:
                                        self.logger.warning('Could not verify signature {}'.format(sig_file))
                                # persist progress immediately so a crash doesn't redo this upload
                                status['processed'].update({process_id: []})
                                status['timestamp'] = time.strftime("%Y-%m-%dT%H:%M:%SZ")
                                self.db_handler.update_item(
                                    {
                                        'api_version': UwV2CatalogHandler.api_version
                                    }, status)
                            else:
                                cat_keys = cat_keys + status['processed'][process_id]
                            source = {
                                'url': '{}/{}'.format(self.cdn_url, bible_key),
                                'signature': '{}/{}.sig'.format(self.cdn_url, bible_key)
                            }
                        elif 'content=audio/mp3' in format['format'] or 'content=video/mp4' in format['format']:
                            # process media
                            quality_value, quality_suffix = self.__parse_media_quality(format['quality'])
                            if 'content=audio/mp3' in format['format']:
                                media_container = media['audio']
                                quality_key = 'bitrate'
                                quality_short_key = 'br'
                            else:
                                media_container = media['video']
                                quality_key = 'resolution'
                                quality_short_key = 'res'
                            # build chapter src
                            src_dict = {}
                            if 'chapters' in format:
                                for chapter in format['chapters']:
                                    src_dict[chapter['identifier']] = {
                                        quality_short_key: [{
                                            quality_key: int(quality_value),
                                            'mod': int(str_to_unix_time(chapter['modified'])),
                                            'size': chapter['size']
                                        }],
                                        'chap': chapter['identifier'],
                                        'length': int(math.ceil(chapter['length'])),
                                        # quality segment of the url is templated so clients can pick a bitrate
                                        'src': chapter['url'].replace(format['quality'], '{bitrate}' + quality_suffix),
                                        'src_sig': chapter['signature'].replace(format['quality'], '{bitrate}' + quality_suffix)
                                    }
                            # NOTE(review): ',\\n' is a literal backslash-n separator,
                            # not a newline — presumably a legacy v2 convention; confirm.
                            merge_dict(
                                media_container, {
                                    'contributors': ',\\n'.join(format['contributor']),
                                    'rev': format['version'],
                                    'txt_ver': format['source_version'],
                                    'src_dict': src_dict
                                })
                        elif 'application/pdf' == format['format']:
                            pdf = {
                                'url': format['url'],
                                'source_version': format['source_version']
                            }
                    # build catalog
                    if not source:
                        # no book text was produced above; project is excluded from the toc
                        if self.logger:
                            self.logger.debug('No book text found in {}_{}_{}'.format(lid, rid, pid))
                        continue
                    # NOTE(review): deletes entries while looping over keys() —
                    # safe only because Python 2 keys() returns a list copy;
                    # under Python 3 this would raise RuntimeError. Confirm runtime.
                    media_keys = media.keys()
                    for key in media_keys:
                        if media[key]['src_dict']:
                            # flatten the per-chapter dict into a list for the v2 shape
                            media[key]['src_list'] = [media[key]['src_dict'][k] for k in media[key]['src_dict']]
                            del media[key]['src_dict']
                        else:
                            del media[key]
                    toc_item = {
                        'desc': '',
                        'media': media,
                        'mod': mod,
                        'slug': proj['identifier'],
                        'src': source['url'],
                        'src_sig': source['signature'],
                        'title': proj['title'],
                    }
                    # obs toc entries carry no slug in the legacy format
                    if rid == 'obs':
                        del toc_item['slug']
                    if pdf:
                        toc_item['pdf'] = pdf['url']
                    if not media:
                        del toc_item['media']
                    toc.append(toc_item)
            if not toc:
                # resource produced no toc entries; skip it entirely
                continue
            # TRICKY: not all manifests have a source text
            if 'source' in res and len(res['source']):
                source = res['source'][0]
            else:
                source = {'language': '', 'version': ''}
            comment = ''
            if 'comment' in res:
                comment = res['comment']
            # TRICKY: maintain legacy slug formatting for backwards compatibility
            legacy_slug = '{}-{}'.format(rid, lid)
            res_v2_id = rid
            if legacy_slug in self.legacy_slugs or rid == 'obs':
                res_v2_id = legacy_slug
            res_v2 = {
                'slug': res_v2_id,
                'name': res['title'],
                'mod': mod,
                'status': {
                    'checking_entity': '; '.join(res['checking']['checking_entity']),
                    'checking_level': res['checking']['checking_level'],
                    'comments': comment,
                    'contributors': '; '.join(res['contributor']),
                    'publish_date': res['issued'],
                    'source_text': source['language'],
                    'source_text_version': source['version'],
                    'version': res['version']
                },
                'toc': toc
            }
            # lazily create the per-language bucket on first resource
            if not lid in v2_catalog[cat_key]:
                v2_catalog[cat_key][lid] = {
                    'lc': lid,
                    'mod': mod,
                    'vers': []
                }
            v2_catalog[cat_key][lid]['vers'].append(res_v2)
    # condense catalog: collapse the nested dicts into the final list shape
    catalog = {'cat': [], 'mod': last_modified}
    for cat_slug in v2_catalog:
        langs = []
        for lid in v2_catalog[cat_slug]:
            langs.append(v2_catalog[cat_slug][lid])
        catalog['cat'].append({
            'slug': cat_slug,
            'title': title_map[cat_slug],
            'langs': langs
        })
    return catalog
def __execute(self):
    """
    Runs the ts v2 catalog build.

    Fetches the latest v3 catalog, walks its languages/resources/projects,
    indexes and uploads derived artifacts (usfm, obs json, notes, questions,
    words), then normalizes and uploads the catalog json files. Progress is
    checkpointed in `self.status['processed']` so an interrupted build can
    resume without redoing uploads.

    :return: False on failure to fetch/parse the source catalog, True if the
        build was already complete; otherwise falls through after uploading.
    """
    cat_keys = []
    cat_dict = {}
    supplemental_resources = []
    result = self._get_status()
    if not result:
        return False
    else:
        (self.status, source_status) = result

    # check if build is complete
    if self.status['state'] == 'complete':
        self.logger.debug('Catalog already generated')
        return True

    # retrieve the latest catalog
    self.logger.debug("Catalog url {0}".format(source_status['catalog_url']))
    catalog_content = self.get_url(source_status['catalog_url'], True)
    if not catalog_content:
        self.logger.error("{0} does not exist".format(source_status['catalog_url']))
        return False
    try:
        self.latest_catalog = json.loads(catalog_content)
    except Exception as e:
        self.logger.error("Failed to load the catalog json: {0}".format(e))
        return False

    # walk v3 catalog
    for lang in self.latest_catalog['languages']:
        lid = TsV2CatalogHandler.sanitize_identifier(lang['identifier'], lower=False)
        self.logger.info('Processing {}'.format(lid))
        for res in lang['resources']:
            rid = TsV2CatalogHandler.sanitize_identifier(res['identifier'])
            self.logger.debug('Processing {}_{}'.format(lid, rid))
            rc_format = None  # the format used to determine the rc type below

            self.logger.debug('Temp directory {} contents {}'.format(self.temp_dir, get_subdirs(self.temp_dir)))
            # per-resource scratch dir; removed at the end of this iteration
            res_temp_dir = os.path.join(self.temp_dir, lid, rid)
            os.makedirs(res_temp_dir)

            if 'formats' in res:
                for format in res['formats']:
                    finished_processes = {}
                    if not rc_format and get_rc_type(format):
                        # locate rc_format (for multi-project RCs)
                        rc_format = format

                    # res is resource, rid is resource id, lid is language id
                    process_id = '_'.join([lid, rid, 'usfm'])
                    if process_id not in self.status['processed']:
                        self._process_usfm(lid, rid, res, format, res_temp_dir)
                        finished_processes[process_id] = []

                    # TRICKY: bible notes and questions are in the resource
                    if rid != 'obs':
                        process_id = '_'.join([lid, rid, 'notes'])
                        if process_id not in self.status['processed']:
                            self.logger.info('Processing notes {}_{}'.format(lid, rid))
                            tn = self._index_note_files(lid, rid, format, process_id, res_temp_dir)
                            if tn:
                                self._upload_all(tn)
                                finished_processes[process_id] = tn.keys()
                                cat_keys = cat_keys + tn.keys()
                        else:
                            # already processed in a previous run; reuse recorded keys
                            cat_keys = cat_keys + self.status['processed'][process_id]

                        process_id = '_'.join([lid, rid, 'questions'])
                        if process_id not in self.status['processed']:
                            self.logger.info('Processing questions {}_{}'.format(lid, rid))
                            tq = self._index_question_files(lid, rid, format, process_id, res_temp_dir)
                            if tq:
                                self._upload_all(tq)
                                finished_processes[process_id] = tq.keys()
                                cat_keys = cat_keys + tq.keys()
                        else:
                            cat_keys = cat_keys + self.status['processed'][process_id]

                    # TRICKY: update the finished processes once per format to limit db hits
                    if finished_processes:
                        self.status['processed'].update(finished_processes)
                        self.status['timestamp'] = time.strftime("%Y-%m-%dT%H:%M:%SZ")
                        self.db_handler.update_item(
                            {
                                'api_version': TsV2CatalogHandler.api_version
                            }, self.status)

            for project in res['projects']:
                pid = TsV2CatalogHandler.sanitize_identifier(project['identifier'])
                self.logger.debug('Processing {}_{}_{}'.format(lid, rid, pid))
                if 'formats' in project:
                    for format in project['formats']:
                        finished_processes = {}
                        if not rc_format and get_rc_type(format):
                            # locate rc_format (for single-project RCs)
                            rc_format = format

                        # TRICKY: there should only be a single tW for each language
                        process_id = '_'.join([lid, 'words'])
                        if process_id not in self.status['processed']:
                            tw = self._index_words_files(lid, rid, format, process_id, res_temp_dir)
                            if tw:
                                self._upload_all(tw)
                                finished_processes[process_id] = tw.keys()
                                cat_keys = cat_keys + tw.keys()
                        else:
                            cat_keys = cat_keys + self.status['processed'][process_id]

                        if rid == 'obs':
                            process_id = '_'.join([lid, rid, pid])
                            if process_id not in self.status['processed']:
                                self.logger.debug('Processing {}'.format(process_id))
                                # obs source must be converted to json before upload
                                obs_json = index_obs(lid, rid, format, res_temp_dir, self.download_file)
                                upload = prep_data_upload(
                                    '{}/{}/{}/v{}/source.json'.format(pid, lid, rid, res['version']),
                                    obs_json, res_temp_dir)
                                self._upload(upload)
                                finished_processes[process_id] = []
                            else:
                                cat_keys = cat_keys + self.status['processed'][process_id]

                            # TRICKY: obs notes and questions are in the project
                            process_id = '_'.join([lid, rid, pid, 'notes'])
                            if process_id not in self.status['processed']:
                                tn = self._index_note_files(lid, rid, format, process_id, res_temp_dir)
                                if tn:
                                    self._upload_all(tn)
                                    finished_processes[process_id] = tn.keys()
                                    cat_keys = cat_keys + tn.keys()
                            else:
                                cat_keys = cat_keys + self.status['processed'][process_id]

                            process_id = '_'.join([lid, rid, pid, 'questions'])
                            if process_id not in self.status['processed']:
                                tq = self._index_question_files(lid, rid, format, process_id, res_temp_dir)
                                if tq:
                                    self._upload_all(tq)
                                    finished_processes[process_id] = tq.keys()
                                    cat_keys = cat_keys + tq.keys()
                            else:
                                cat_keys = cat_keys + self.status['processed'][process_id]

                        # TRICKY: update the finished processes once per format to limit db hits
                        if finished_processes:
                            self.status['processed'].update(finished_processes)
                            self.status['timestamp'] = time.strftime("%Y-%m-%dT%H:%M:%SZ")
                            self.db_handler.update_item(
                                {
                                    'api_version': TsV2CatalogHandler.api_version
                                }, self.status)

                if not rc_format:
                    raise Exception('Could not find a format for {}_{}_{}'.format(lid, rid, pid))

                modified = make_legacy_date(rc_format['modified'])
                rc_type = get_rc_type(rc_format)
                self.logger.debug('Resource container type is {}'.format(rc_type))

                if modified is None:
                    # fall back to today's date when the manifest date is unparseable
                    modified = time.strftime('%Y%m%d')
                    self.logger.warning(
                        'Could not find date modified for {}_{}_{} from "{}"'
                        .format(lid, rid, pid, rc_format['modified']))

                if rc_type == 'book' or rc_type == 'bundle':
                    self._build_catalog_node(cat_dict, lang, res, project, modified)
                else:
                    # store supplementary resources for processing after catalog nodes have been fully built
                    supplemental_resources.append({
                        'language': lang,
                        'resource': res,
                        'project': project,
                        'modified': modified,
                        'rc_type': rc_type
                    })
            # cleanup resource directory
            remove_tree(res_temp_dir)
        # cleanup language directory
        remove_tree(os.path.join(self.temp_dir, lid))

    # inject supplementary resources
    for s in supplemental_resources:
        self._add_supplement(cat_dict, s['language'], s['resource'], s['project'], s['modified'], s['rc_type'])

    api_uploads = []

    # normalize catalog nodes
    root_cat = []
    for pid in cat_dict:
        project = cat_dict[pid]
        lang_cat = []
        for lid in project['_langs']:
            lang = project['_langs'][lid]
            res_cat = []
            for rid in lang['_res']:
                res = lang['_res'][rid]

                # disable missing catalogs

                # disable tN
                if '_'.join([lid, '*', pid, 'tn']) not in cat_keys:
                    res['notes'] = ''

                # disable tQ
                if '_'.join([lid, '*', pid, 'tq']) not in cat_keys:
                    res['checking_questions'] = ''

                # disable tW
                if '_'.join([lid, '*', '*', 'tw']) not in cat_keys:
                    res['terms'] = ''

                res_cat.append(res)
            api_uploads.append(prep_data_upload('{}/{}/resources.json'.format(pid, lid), res_cat, self.temp_dir))

            del lang['_res']
            if ('project' in lang):
                # skip empty artifacts
                lang_cat.append(lang)
            else:
                self.logger.warning('Excluding empty language artifact in {}'.format(pid))

        api_uploads.append(prep_data_upload('{}/languages.json'.format(pid), lang_cat, self.temp_dir))

        del project['_langs']
        if len(lang_cat) != 0:
            root_cat.append(project)
    catalog_upload = prep_data_upload('catalog.json', root_cat, self.temp_dir)
    api_uploads.append(catalog_upload)
    # TRICKY: also upload to legacy path for backwards compatibility
    api_uploads.append({
        'key': '/ts/txt/2/catalog.json',
        'path': catalog_upload['path']
    })

    # upload files
    for upload in api_uploads:
        if not upload['key'].startswith('/'):
            # relative keys are rooted under the handler's cdn path
            key = '{}/{}'.format(TsV2CatalogHandler.cdn_root_path, upload['key'])
        else:
            key = upload['key'].lstrip('/')
        self.cdn_handler.upload_file(upload['path'], key)

    # mark the build complete and checkpoint one final time
    self.status['state'] = 'complete'
    self.status['timestamp'] = time.strftime("%Y-%m-%dT%H:%M:%SZ")
    self.db_handler.update_item(
        {'api_version': TsV2CatalogHandler.api_version}, self.status)