def is_lambda_running(context, dbname, lambda_suffix=None, dynamodb_handler=None):
    """
    Retrieves the last recorded process information for this lambda.
    This is used to determine if the lambda is already running.
    :param context:
    :param string dbname: the database holding a list of lambda run times
    :param string lambda_suffix: name of the lambda handler
    :return:
    """
    if not context:
        return False
    if dynamodb_handler:
        db = dynamodb_handler(dbname)
    else:
        db = DynamoDBHandler(dbname)
    lambda_name = context.function_name
    if lambda_suffix:
        lambda_name = '{}.{}'.format(lambda_name, lambda_suffix)
    request = db.get_item({"lambda": lambda_name})
    if request:
        last_time = arrow.get(request['started_at']).to('local')
        # TRICKY: we use this lambda's expires time instead of the recorded value
        # to avoid delays in applying changes to expiration times.
        timeout = arrow.now().shift(minutes=-lambda_min_remaining(context))
        return last_time > timeout
    else:
        return False

def __init__(self, event, context, **kwargs):
    super(ForkHandler, self).__init__(event, context)
    self.stage_vars = self.retrieve(self.event, 'stage-variables', 'payload')
    gogs_token = self.retrieve(self.stage_vars, 'gogs_token', 'Environment Vars')
    self.gogs_url = self.retrieve(self.stage_vars, 'gogs_url', 'Environment Vars')
    self.gogs_org = self.retrieve(self.stage_vars, 'gogs_org', 'Environment Vars')
    self.from_email = self.retrieve(self.stage_vars, 'from_email', 'Environment Vars')
    self.to_email = self.retrieve(self.stage_vars, 'to_email', 'Environment Vars')
    if 'dynamodb_handler' in kwargs:
        self.progress_table = kwargs['dynamodb_handler']
    else:
        self.progress_table = DynamoDBHandler(
            '{}d43-catalog-in-progress'.format(self.stage_prefix()))  # pragma: no cover
    if 'gitea_client' in kwargs:
        self.gitea_client = kwargs['gitea_client']
    else:
        self.gitea_client = GiteaClient  # pragma: no cover
    if 'boto_handler' in kwargs:
        self.boto = kwargs['boto_handler']
    else:
        self.boto = boto3  # pragma: no cover
    if 'logger' in kwargs:
        self.logger = kwargs['logger']
    self.gogs_api = self.gitea_client.GiteaApi(self.gogs_url)
    self.gogs_auth = self.gitea_client.Token(gogs_token)

def is_lambda_running(context, dbname, lambda_suffix=None, dynamodb_handler=None):
    """
    Retrieves the last recorded process information for this lambda.
    This is used to determine if the lambda is already running.
    :param context:
    :param string dbname: the database holding a list of lambda run times
    :param string lambda_suffix: name of the lambda handler
    :return:
    """
    if not context:
        return False
    if dynamodb_handler:
        db = dynamodb_handler(dbname)
    else:
        db = DynamoDBHandler(dbname)
    lambda_name = context.function_name
    if lambda_suffix:
        lambda_name = '{}.{}'.format(lambda_name, lambda_suffix)
    request = db.get_item({"lambda": lambda_name})
    if request:
        start_time = arrow.get(request['started_at']).to('local')
        # TRICKY: 'expires' is in milliseconds
        expires_time = start_time.shift(minutes=int(request['expires']) / 60000.0)
        # the lambda is still running while the current time is within its window
        return arrow.now() < expires_time
    else:
        return False

def lambda_sec_remaining(context, dbname, lambda_suffix=None, dynamodb_handler=None):
    """
    Returns the time remaining before the lambda times out, as a timedelta
    :param context:
    :param string dbname: the database holding a list of lambda run times
    :param string lambda_suffix: name of the lambda handler
    :return:
    """
    if not context:
        return timedelta(seconds=0)
    if dynamodb_handler:
        db = dynamodb_handler(dbname)
    else:
        db = DynamoDBHandler(dbname)
    lambda_name = context.function_name
    if lambda_suffix:
        lambda_name = '{}.{}'.format(lambda_name, lambda_suffix)
    request = db.get_item({"lambda": lambda_name})
    if request:
        start_time = arrow.get(request['started_at']).to('local')
        # TRICKY: expires is in milliseconds
        expires_time = start_time.shift(microseconds=(int(request['expires']) * 1000))
        return expires_time - arrow.now()
    else:
        return timedelta(seconds=0)

def set_lambda_running(context, dbname, lambda_suffix=None, dynamodb_handler=None):
    """
    Sets the process information for this lambda.
    This is used to indicate the lambda is currently running.
    :param context:
    :param string dbname: the database holding a list of lambda run times.
    :param string lambda_suffix: name of the lambda handler
    :return:
    """
    if not context:
        return
    if dynamodb_handler:
        db = dynamodb_handler(dbname)
    else:
        db = DynamoDBHandler(dbname)
    lambda_name = context.function_name
    if lambda_suffix:
        lambda_name = '{}.{}'.format(lambda_name, lambda_suffix)
    db.insert_item({
        "lambda": lambda_name,
        "request_id": context.aws_request_id,
        "started_at": arrow.utcnow().isoformat(),
        "expires": context.get_remaining_time_in_millis()
    })

def _clear_report(self):
    """
    Removes the error report from the db
    :return:
    """
    db = DynamoDBHandler(self.__table_name)
    db.delete_item({'lambda': self.__reporter})
    self._report = None

def _record_report(self):
    """
    Stores the error report in the database
    :return:
    """
    if self._report:
        db = DynamoDBHandler(self.__table_name)
        db.update_item({'lambda': self.__reporter}, self._report)

def __init__(self, event, context, **kwargs):
    super(CatalogHandler, self).__init__(event, context)

    env_vars = self.retrieve(event, 'stage-variables', 'payload')
    self.cdn_url = self.retrieve(env_vars, 'cdn_url').rstrip('/')
    self.cdn_bucket = self.retrieve(env_vars, 'cdn_bucket')
    self.api_bucket = self.retrieve(env_vars, 'api_bucket')
    self.api_url = self.retrieve(env_vars, 'api_url').rstrip('/')
    self.to_email = self.retrieve(env_vars, 'to_email')
    self.from_email = self.retrieve(env_vars, 'from_email')
    self.api_version = self.retrieve(env_vars, 'version')
    in_progress_db = self.retrieve_with_default(
        env_vars, 'in_progress_db',
        '{}d43-catalog-in-progress'.format(self.stage_prefix()))
    status_db = self.retrieve_with_default(
        env_vars, 'status_db',
        '{}d43-catalog-status'.format(self.stage_prefix()))
    errors_db = self.retrieve_with_default(
        env_vars, 'errors_db',
        '{}d43-catalog-errors'.format(self.stage_prefix()))

    if 'dynamodb_handler' in kwargs:
        db_handler = kwargs['dynamodb_handler']
        self.progress_table = db_handler(in_progress_db)
        self.status_table = db_handler(status_db)
        self.errors_table = db_handler(errors_db)
    else:
        self.progress_table = DynamoDBHandler(in_progress_db)  # pragma: no cover
        self.status_table = DynamoDBHandler(status_db)  # pragma: no cover
        self.errors_table = DynamoDBHandler(errors_db)  # pragma: no cover

    self.catalog = {"languages": []}

    if 's3_handler' in kwargs:
        self.api_handler = kwargs['s3_handler'](self.api_bucket)
    else:
        self.api_handler = S3Handler(self.api_bucket)  # pragma: no cover
    if 'ses_handler' in kwargs:
        self.ses_handler = kwargs['ses_handler']()
    else:
        self.ses_handler = SESHandler()  # pragma: no cover
    if 'consistency_checker' in kwargs:
        self.checker = kwargs['consistency_checker']()
    else:
        self.checker = ConsistencyChecker(self.cdn_bucket, self.api_bucket)  # pragma: no cover
    if 'get_url_handler' in kwargs:
        self.get_url = kwargs['get_url_handler']
    else:
        self.get_url = get_url  # pragma: no cover
    if 'url_exists_handler' in kwargs:
        self.url_exists = kwargs['url_exists_handler']
    else:
        self.url_exists = url_exists  # pragma: no cover

def __init__(self, event, context, logger, **kwargs):
    super(WebhookHandler, self).__init__(event, context)

    env_vars = self.retrieve(event, 'stage-variables', 'payload')
    self.gogs_url = self.retrieve(env_vars, 'gogs_url', 'Environment Vars')
    self.gogs_token = self.retrieve(env_vars, 'gogs_token', 'Environment Vars')
    self.gogs_org = self.retrieve(env_vars, 'gogs_org', 'Environment Vars')
    self.cdn_bucket = self.retrieve(env_vars, 'cdn_bucket', 'Environment Vars')
    self.cdn_url = self.retrieve(env_vars, 'cdn_url', 'Environment Vars')
    self.from_email = self.retrieve(env_vars, 'from_email', 'Environment Vars')
    self.to_email = self.retrieve(env_vars, 'to_email', 'Environment Vars')
    self.api_url = self.retrieve(env_vars, 'api_url', 'Environment Vars')
    self.repo_commit = self.retrieve(event, 'body-json', 'payload')
    self.api_version = self.retrieve(env_vars, 'version')

    # NOTE: it would be better to use the header X-GitHub-Event to determine the type of event.
    if 'pull_request' in self.repo_commit:
        # TODO: this is deprecated
        self.__parse_pull_request(self.repo_commit)
    elif 'forkee' in self.repo_commit or ('action' in self.repo_commit and self.repo_commit['action'] == 'created'):
        # handles fork and create events
        self.__parse_fork(self.repo_commit)
    elif 'pusher' in self.repo_commit:
        self.__parse_push(self.repo_commit)
    else:
        raise Exception('Unsupported webhook request received ' + self.repo_commit['repository']['name'] + ' ' + json.dumps(self.repo_commit))

    self.resource_id = None  # set in self._build
    self.logger = logger  # type: logging._loggerClass

    if 'dynamodb_handler' in kwargs:
        self.db_handler = kwargs['dynamodb_handler']
    else:
        self.db_handler = DynamoDBHandler(
            '{}d43-catalog-in-progress'.format(self.stage_prefix()))  # pragma: no cover
    if 's3_handler' in kwargs:
        self.s3_handler = kwargs['s3_handler']
    else:
        self.s3_handler = S3Handler(self.cdn_bucket)  # pragma: no cover
    if 'download_handler' in kwargs:
        self.download_file = kwargs['download_handler']
    else:
        self.download_file = download_file  # pragma: no cover

def add_error(self, message):
    """
    Adds an error to the report
    :param string|list message:
    :return:
    """
    if isinstance(message, list):
        self.logger.info('Reporting Error: {}'.format(json.dumps(message)), exc_info=1)
    elif isinstance(message, (str, unicode)):
        self.logger.info('Reporting Error: {}'.format(message), exc_info=1)
    else:
        self.logger.warning('Unable to report error. Invalid type "{}"'.format(type(message)), exc_info=1)
        return

    db = DynamoDBHandler(self.__table_name)

    # load report
    if not self._report:
        item = db.get_item({'lambda': self.__reporter})
        if not item:
            item = {}
        self._report = {
            'errors': [],
            'lambda': self.__reporter,
            'reporters': []
        }
        self._report.update(item)

    # start new report
    if self.__request_id not in self._report['reporters']:
        self._report['errors'] = []
        self._report['reporters'].append(self.__request_id)

    # append errors to report
    if isinstance(message, list):
        timestamp = arrow.utcnow().isoformat()
        for m in message:
            self._report['errors'].append({
                'message': m.decode('utf-8'),
                'timestamp': timestamp
            })
    else:
        self._report['errors'].append({
            'message': message.decode('utf-8'),
            'timestamp': arrow.utcnow().isoformat()
        })

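# A minimal sketch of the reporting lifecycle these methods assume: add_error
# buffers errors on self._report (keyed by the current request id),
# _record_report persists the buffer to DynamoDB, and _clear_report removes it
# once delivered. The _finish_run and _notify_admins names are hypothetical:
def _finish_run(self):
    self._record_report()        # persist any errors gathered via add_error
    if self._report:
        self._notify_admins()    # hypothetical follow-up, e.g. email the report
        self._clear_report()     # drop the report once it has been handled
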
def __init__(self, event, context, logger, **kwargs):
    super(WebhookHandler, self).__init__(event, context)

    env_vars = self.retrieve(event, 'stage-variables', 'payload')
    self.gogs_url = self.retrieve(env_vars, 'gogs_url', 'Environment Vars')
    self.gogs_org = self.retrieve(env_vars, 'gogs_org', 'Environment Vars')
    self.cdn_bucket = self.retrieve(env_vars, 'cdn_bucket', 'Environment Vars')
    self.cdn_url = self.retrieve(env_vars, 'cdn_url', 'Environment Vars')
    self.from_email = self.retrieve(env_vars, 'from_email', 'Environment Vars')
    self.to_email = self.retrieve(env_vars, 'to_email', 'Environment Vars')
    self.api_url = self.retrieve(env_vars, 'api_url', 'Environment Vars')
    self.repo_commit = self.retrieve(event, 'body-json', 'payload')
    self.api_version = self.retrieve(env_vars, 'version')
    self.in_progress_db = self.retrieve_with_default(
        env_vars, 'in_progress_db',
        '{}d43-catalog-in-progress'.format(self.stage_prefix()))

    if 'pull_request' in self.repo_commit:
        self.__parse_pull_request(self.repo_commit)
    else:
        self.__parse_push(self.repo_commit)

    self.resource_id = None  # set in self._build
    self.logger = logger  # type: logging._loggerClass

    if 'dynamodb_handler' in kwargs:
        self.db_handler = kwargs['dynamodb_handler']
    else:
        self.logger.debug("Creating Dynamodb handler pointing to {}".format(self.in_progress_db))
        self.db_handler = DynamoDBHandler(self.in_progress_db)  # pragma: no cover
    if 's3_handler' in kwargs:
        self.s3_handler = kwargs['s3_handler']
    else:
        self.s3_handler = S3Handler(self.cdn_bucket)  # pragma: no cover
    if 'download_handler' in kwargs:
        self.download_file = kwargs['download_handler']
    else:
        self.download_file = download_file  # pragma: no cover

def __init__(self, event, context, logger, signer, **kwargs):
    super(SigningHandler, self).__init__(event, context)

    env_vars = self.retrieve(event, 'stage-variables', 'payload')
    self.cdn_bucket = self.retrieve(env_vars, 'cdn_bucket', 'Environment Vars')
    self.cdn_url = self.retrieve(env_vars, 'cdn_url', 'Environment Vars')
    self.from_email = self.retrieve(env_vars, 'from_email', 'Environment Vars')
    self.to_email = self.retrieve(env_vars, 'to_email', 'Environment Vars')
    self.api_version = self.retrieve(env_vars, 'version', 'Environment Vars')
    self.api_bucket = self.retrieve(env_vars, 'api_bucket', 'Environment Vars')
    self.logger = logger  # type: logging._loggerClass
    self.signer = signer
    self.in_progress_db = self.retrieve_with_default(
        env_vars, 'in_progress_db',
        '{}d43-catalog-in-progress'.format(self.stage_prefix()))

    if 's3_handler' in kwargs:
        self.cdn_handler = kwargs['s3_handler']
    else:
        self.cdn_handler = S3Handler(self.cdn_bucket)  # pragma: no cover
    self.temp_dir = tempfile.mkdtemp(prefix='signing_')
    if 'dynamodb_handler' in kwargs:
        self.db_handler = kwargs['dynamodb_handler']
    else:
        self.db_handler = DynamoDBHandler(self.in_progress_db)  # pragma: no cover
    if 'download_handler' in kwargs:
        self.download_file = kwargs['download_handler']
    else:
        self.download_file = download_file  # pragma: no cover
    if 'url_exists_handler' in kwargs:
        self.url_exists = kwargs['url_exists_handler']
    else:
        self.url_exists = url_exists  # pragma: no cover
    if 'url_headers_handler' in kwargs:
        self.url_headers = kwargs['url_headers_handler']
    else:
        self.url_headers = url_headers  # pragma: no cover

def __init__(self, event, context, logger, **kwargs):
    super(TsV2CatalogHandler, self).__init__(event, context)

    env_vars = self.retrieve(event, 'stage-variables', 'payload')
    self.cdn_bucket = self.retrieve(env_vars, 'cdn_bucket', 'Environment Vars')
    self.cdn_url = self.retrieve(env_vars, 'cdn_url', 'Environment Vars').rstrip('/')
    self.from_email = self.retrieve(env_vars, 'from_email', 'Environment Vars')
    self.to_email = self.retrieve(env_vars, 'to_email', 'Environment Vars')
    self.max_usfm_size = int(self.retrieve_with_default(env_vars, 'max_usfm_size', '2000000'))
    self.status_db = self.retrieve_with_default(
        env_vars, 'status_db',
        '{}d43-catalog-status'.format(self.stage_prefix()))
    self.logger = logger  # type: logging._loggerClass

    if 's3_handler' in kwargs:
        self.cdn_handler = kwargs['s3_handler']
    else:
        self.cdn_handler = S3Handler(self.cdn_bucket)  # pragma: no cover
    if 'dynamodb_handler' in kwargs:
        self.db_handler = kwargs['dynamodb_handler']
    else:
        self.db_handler = DynamoDBHandler(self.status_db)  # pragma: no cover
    if self.db_handler.logger:
        self.db_handler.logger.setLevel(logger.level)
    if 'url_handler' in kwargs:
        self.get_url = kwargs['url_handler']
    else:
        self.get_url = get_url  # pragma: no cover
    if 'download_handler' in kwargs:
        self.download_file = kwargs['download_handler']
    else:
        self.download_file = download_file  # pragma: no cover
    if 'url_exists_handler' in kwargs:
        self.url_exists = kwargs['url_exists_handler']
    else:
        self.url_exists = url_exists  # pragma: no cover

    self.temp_dir = tempfile.mkdtemp('', 'tsv2', None)

def __init__(self, event, context, logger, **kwargs):
    super(UwV2CatalogHandler, self).__init__(event, context)

    env_vars = self.retrieve(event, 'stage-variables', 'payload')
    self.cdn_bucket = self.retrieve(env_vars, 'cdn_bucket', 'Environment Vars')
    self.cdn_url = self.retrieve(env_vars, 'cdn_url', 'Environment Vars').rstrip('/')
    self.from_email = self.retrieve(env_vars, 'from_email', 'Environment Vars')
    self.to_email = self.retrieve(env_vars, 'to_email', 'Environment Vars')
    self.logger = logger  # type: logging._loggerClass
    self.temp_dir = tempfile.mkdtemp('', 'uw_v2', None)
    self.status_db = self.retrieve_with_default(
        env_vars, 'status_db',
        '{}d43-catalog-status'.format(self.stage_prefix()))

    if 's3_handler' in kwargs:
        self.cdn_handler = kwargs['s3_handler']
    else:
        self.cdn_handler = S3Handler(self.cdn_bucket)  # pragma: no cover
    if 'dynamodb_handler' in kwargs:
        self.db_handler = kwargs['dynamodb_handler']
    else:
        self.db_handler = DynamoDBHandler(self.status_db)  # pragma: no cover
    if 'url_handler' in kwargs:
        self.get_url = kwargs['url_handler']
    else:
        self.get_url = get_url  # pragma: no cover
    if 'download_handler' in kwargs:
        self.download_file = kwargs['download_handler']
    else:
        self.download_file = download_file  # pragma: no cover
    if 'signing_handler' in kwargs:
        self.signer = kwargs['signing_handler']
    else:
        self.signer = Signer(ENC_PRIV_PEM_PATH)  # pragma: no cover

def clear_lambda_running(context, dbname, lambda_suffix=None, dynamodb_handler=None):
    """
    This is a convenience method to clear a lambda from the list of running lambdas
    :param context:
    :param string dbname:
    :param string lambda_suffix: the name of the lambda handler
    :param dynamodb_handler:
    :return:
    """
    if dynamodb_handler:
        db = dynamodb_handler(dbname)
    else:
        db = DynamoDBHandler(dbname)
    lambda_name = context.function_name
    if lambda_suffix:
        lambda_name = '{}.{}'.format(lambda_name, lambda_suffix)
    db.delete_item({"lambda": lambda_name})

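# A minimal sketch of how the run-state helpers above combine into a soft lock
# for a lambda entry point. The names `handle`, `process`, and RUN_DB are
# assumptions for illustration, not part of the codebase:
RUN_DB = 'd43-catalog-running'  # hypothetical table of lambda run times

def handle(event, context):
    if is_lambda_running(context, RUN_DB):
        return  # a previous invocation is still inside its timeout window
    set_lambda_running(context, RUN_DB)
    try:
        process(event)  # hypothetical worker
    finally:
        clear_lambda_running(context, RUN_DB)
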
class UwV2CatalogHandler(InstanceHandler):

    cdn_root_path = 'v2/uw'
    api_version = 'uw.2'

    def __init__(self, event, context, logger, **kwargs):
        super(UwV2CatalogHandler, self).__init__(event, context)

        env_vars = self.retrieve(event, 'stage-variables', 'payload')
        self.cdn_bucket = self.retrieve(env_vars, 'cdn_bucket', 'Environment Vars')
        self.cdn_url = self.retrieve(env_vars, 'cdn_url', 'Environment Vars').rstrip('/')
        self.from_email = self.retrieve(env_vars, 'from_email', 'Environment Vars')
        self.to_email = self.retrieve(env_vars, 'to_email', 'Environment Vars')
        self.logger = logger  # type: logging._loggerClass
        self.temp_dir = tempfile.mkdtemp('', 'uw_v2', None)

        if 's3_handler' in kwargs:
            self.cdn_handler = kwargs['s3_handler']
        else:
            self.cdn_handler = S3Handler(self.cdn_bucket)  # pragma: no cover
        if 'dynamodb_handler' in kwargs:
            self.db_handler = kwargs['dynamodb_handler']
        else:
            self.db_handler = DynamoDBHandler(
                '{}d43-catalog-status'.format(self.stage_prefix()))  # pragma: no cover
        if 'url_handler' in kwargs:
            self.get_url = kwargs['url_handler']
        else:
            self.get_url = get_url  # pragma: no cover
        if 'download_handler' in kwargs:
            self.download_file = kwargs['download_handler']
        else:
            self.download_file = download_file  # pragma: no cover
        if 'signing_handler' in kwargs:
            self.signer = kwargs['signing_handler']
        else:
            self.signer = Signer(ENC_PRIV_PEM_PATH)  # pragma: no cover

    def __del__(self):
        try:
            shutil.rmtree(self.temp_dir)
        finally:
            pass

    def _run(self):
        """
        Generates the v2 catalog
        :return:
        """
        try:
            return self.__execute()
        except Exception as e:
            self.report_error(e.message)
            raise Exception, Exception(e), sys.exc_info()[2]

    def __execute(self):
        """
        We wrap this in a separate function to more easily handle errors
        :return:
        """
        uploads = []

        result = self._get_status()
        if not result:
            return False
        else:
            (status, source_status) = result

        # check if build is complete
        if status['state'] == 'complete':
            if self.logger:
                self.logger.debug('Catalog already generated')
            return True

        # retrieve the latest catalog
        catalog_content = self.get_url(source_status['catalog_url'], True)
        if not catalog_content:
            if self.logger:
                self.logger.error("{0} does not exist".format(source_status['catalog_url']))
            return False
        try:
            self.latest_catalog = json.loads(catalog_content)
        except Exception as e:
            if self.logger:
                self.logger.error("Failed to load the catalog json: {0}".format(e))
            return False

        catalog = self.convert_v3_to_v2(self.latest_catalog, status)

        catalog_upload = self._prep_json_upload('catalog.json', catalog)
        uploads.append(catalog_upload)
        # TRICKY: also upload to legacy path for backwards compatibility
        uploads.append({
            'key': '/uw/txt/2/catalog.json',
            'path': catalog_upload['path']
        })

        # upload files
        for upload in uploads:
            if not upload['key'].startswith('/'):
                key = '{}/{}'.format(UwV2CatalogHandler.cdn_root_path, upload['key'])
            else:
                key = upload['key'].lstrip('/')
            self.cdn_handler.upload_file(upload['path'], key)

        status['timestamp'] = time.strftime("%Y-%m-%dT%H:%M:%SZ")
        status['state'] = 'complete'
        self.db_handler.update_item(
            {'api_version': UwV2CatalogHandler.api_version},
            status)

    def convert_v3_to_v2(self, v3_catalog, status):
        """
        Builds a v2 catalog for the uW api endpoint.
        This uses the v3 catalog as the source
        :param v3_catalog: the v3 catalog
        :param status: the build status retrieved from AWS.
        :return: the complete v2 catalog
        """
        cat_keys = []
        v2_catalog = {
            'obs': {},
            'bible': {}
        }

        title_map = {
            'bible': 'Bible',
            'obs': 'Open Bible Stories'
        }

        last_modified = 0

        for lang in v3_catalog['languages']:
            lid = lang['identifier']
            self.logger.info('Processing {}'.format(lid))
            for res in lang['resources']:
                rid = res['identifier']
                if rid == 'obs':
                    cat_key = 'obs'
                else:
                    cat_key = 'bible'

                mod = str_to_unix_time(res['modified'])

                if int(mod) > last_modified:
                    last_modified = int(mod)

                # TRICKY: we are not processing the resource formats

                toc = []
                for proj in res['projects']:
                    pid = proj['identifier']
                    if 'formats' in proj and proj['formats']:
                        source = None
                        pdf = None
                        media = {
                            'audio': {'src_dict': {}},
                            'video': {'src_dict': {}}
                        }
                        for format in proj['formats']:
                            # skip media formats that do not match the source version
                            if 'source_version' in format and format['source_version'] != res['version']:
                                if self.logger:
                                    self.logger.warning(
                                        '{}_{}_{}: media format "{}" does not match source version "{}" and will be excluded.'.format(
                                            lid, rid, pid, format['url'], res['version']))
                                continue

                            if rid == 'obs' and 'type=book' in format['format']:
                                # TRICKY: obs must be converted to json
                                process_id = '_'.join([lid, rid, pid])
                                obs_key = '{}/{}/{}/{}/v{}/source.json'.format(
                                    self.cdn_root_path, pid, lid, rid, res['version'])
                                if process_id not in status['processed']:
                                    obs_json = index_obs(lid, rid, format, self.temp_dir, self.download_file)
                                    upload = self._prep_json_upload(obs_key, obs_json)
                                    self.cdn_handler.upload_file(upload['path'], upload['key'])

                                    # sign obs file.
                                    # TRICKY: we only need to sign obs so we do so now.
                                    sig_file = self.signer.sign_file(upload['path'])
                                    try:
                                        self.signer.verify_signature(upload['path'], sig_file)
                                        self.cdn_handler.upload_file(sig_file, '{}.sig'.format(upload['key']))
                                    except RuntimeError:
                                        if self.logger:
                                            self.logger.warning('Could not verify signature {}'.format(sig_file))

                                    status['processed'].update({process_id: []})
                                    status['timestamp'] = time.strftime("%Y-%m-%dT%H:%M:%SZ")
                                    self.db_handler.update_item(
                                        {'api_version': UwV2CatalogHandler.api_version},
                                        status)
                                else:
                                    cat_keys = cat_keys + status['processed'][process_id]

                                source = {
                                    'url': '{}/{}'.format(self.cdn_url, obs_key),
                                    'signature': '{}/{}.sig'.format(self.cdn_url, obs_key)
                                }
                            elif rid != 'obs' and format['format'] == 'text/usfm':
                                # process bible
                                process_id = '_'.join([lid, rid, pid])
                                bible_key = '{0}/{1}/{2}/{3}/v{4}/{1}.usfm'.format(
                                    self.cdn_root_path, pid, lid, rid, res['version'])
                                if process_id not in status['processed']:
                                    usfm = self._process_usfm(format)
                                    upload = self._prep_text_upload(bible_key, usfm)
                                    self.cdn_handler.upload_file(upload['path'], upload['key'])

                                    # sign file
                                    sig_file = self.signer.sign_file(upload['path'])
                                    try:
                                        self.signer.verify_signature(upload['path'], sig_file)
                                        self.cdn_handler.upload_file(sig_file, '{}.sig'.format(upload['key']))
                                    except RuntimeError:
                                        if self.logger:
                                            self.logger.warning('Could not verify signature {}'.format(sig_file))

                                    status['processed'].update({process_id: []})
                                    status['timestamp'] = time.strftime("%Y-%m-%dT%H:%M:%SZ")
                                    self.db_handler.update_item(
                                        {'api_version': UwV2CatalogHandler.api_version},
                                        status)
                                else:
                                    cat_keys = cat_keys + status['processed'][process_id]

                                source = {
                                    'url': '{}/{}'.format(self.cdn_url, bible_key),
                                    'signature': '{}/{}.sig'.format(self.cdn_url, bible_key)
                                }
                            elif 'content=audio/mp3' in format['format'] or 'content=video/mp4' in format['format']:
                                # process media
                                quality_value, quality_suffix = self.__parse_media_quality(format['quality'])
                                if 'content=audio/mp3' in format['format']:
                                    media_container = media['audio']
                                    quality_key = 'bitrate'
                                    quality_short_key = 'br'
                                else:
                                    media_container = media['video']
                                    quality_key = 'resolution'
                                    quality_short_key = 'res'

                                # build chapter src
                                src_dict = {}
                                if 'chapters' in format:
                                    for chapter in format['chapters']:
                                        src_dict[chapter['identifier']] = {
                                            quality_short_key: [{
                                                quality_key: int(quality_value),
                                                'mod': int(str_to_unix_time(chapter['modified'])),
                                                'size': chapter['size']
                                            }],
                                            'chap': chapter['identifier'],
                                            'length': int(math.ceil(chapter['length'])),
                                            'src': chapter['url'].replace(format['quality'], '{bitrate}' + quality_suffix),
                                            'src_sig': chapter['signature'].replace(format['quality'], '{bitrate}' + quality_suffix)
                                        }

                                merge_dict(media_container, {
                                    'contributors': ',\\n'.join(format['contributor']),
                                    'rev': format['version'],
                                    'txt_ver': format['source_version'],
                                    'src_dict': src_dict
                                })
                            elif 'application/pdf' == format['format']:
                                pdf = {
                                    'url': format['url'],
                                    'source_version': format['source_version']
                                }

                        # build catalog
                        if not source:
                            if self.logger:
                                self.logger.debug('No book text found in {}_{}_{}'.format(lid, rid, pid))
                            continue

                        media_keys = media.keys()
                        for key in media_keys:
                            if media[key]['src_dict']:
                                media[key]['src_list'] = [media[key]['src_dict'][k] for k in media[key]['src_dict']]
                                del media[key]['src_dict']
                            else:
                                del media[key]

                        toc_item = {
                            'desc': '',
                            'media': media,
                            'mod': mod,
                            'slug': proj['identifier'],
                            'src': source['url'],
                            'src_sig': source['signature'],
                            'title': proj['title'],
                        }

                        if rid == 'obs':
                            del toc_item['slug']

                        if pdf:
                            toc_item['pdf'] = pdf['url']

                        if not media:
                            del toc_item['media']
                        toc.append(toc_item)

                if not toc:
                    continue

                # TRICKY: not all manifests have a source text
                if 'source' in res and len(res['source']):
                    source = res['source'][0]
                else:
                    source = {'language': '', 'version': ''}

                comment = ''
                if 'comment' in res:
                    comment = res['comment']

                # TRICKY: maintain legacy slug formatting for backwards compatibility
                legacy_slug = '{}-{}'.format(rid, lid)
                res_v2_id = rid
                if legacy_slug in self.legacy_slugs or rid == 'obs':
                    res_v2_id = legacy_slug

                res_v2 = {
                    'slug': res_v2_id,
                    'name': res['title'],
                    'mod': mod,
                    'status': {
                        'checking_entity': '; '.join(res['checking']['checking_entity']),
                        'checking_level': res['checking']['checking_level'],
                        'comments': comment,
                        'contributors': '; '.join(res['contributor']),
                        'publish_date': res['issued'],
                        'source_text': source['language'],
                        'source_text_version': source['version'],
                        'version': res['version']
                    },
                    'toc': toc
                }
                if not lid in v2_catalog[cat_key]:
                    v2_catalog[cat_key][lid] = {
                        'lc': lid,
                        'mod': mod,
                        'vers': []
                    }
                v2_catalog[cat_key][lid]['vers'].append(res_v2)

        # condense catalog
        catalog = {
            'cat': [],
            'mod': last_modified
        }
        for cat_slug in v2_catalog:
            langs = []
            for lid in v2_catalog[cat_slug]:
                langs.append(v2_catalog[cat_slug][lid])
            catalog['cat'].append({
                'slug': cat_slug,
                'title': title_map[cat_slug],
                'langs': langs
            })
        return catalog

    def _process_usfm(self, format):
        url = format['url']
        usfm_file = os.path.join(self.temp_dir, md5(url).hexdigest())
        self.download_file(url, usfm_file)
        usfm = read_file(usfm_file)
        return convert_chunk_markers(strip_word_data(usfm))

    def _get_status(self):
        """
        Retrieves the catalog status from AWS.
        :return: A tuple containing the status object of the target and source catalogs, or False if the source is not ready
        """
        status_results = self.db_handler.query_items({
            'api_version': {
                'condition': 'is_in',
                'value': ['3', UwV2CatalogHandler.api_version]
            }
        })
        source_status = None
        status = None
        for s in status_results:
            if s['api_version'] == '3':
                source_status = s
            elif s['api_version'] == UwV2CatalogHandler.api_version:
                status = s
        if not source_status:
            if self.logger:
                self.logger.debug('Source catalog status not found')
            return False
        if source_status['state'] != 'complete':
            if self.logger:
                self.logger.debug('Source catalog is not ready for use')
            return False
        if not status or status['source_timestamp'] != source_status['timestamp']:
            # begin or restart process
            status = {
                'api_version': UwV2CatalogHandler.api_version,
                'catalog_url': '{}/uw/txt/2/catalog.json'.format(self.cdn_url),
                'source_api': source_status['api_version'],
                'source_timestamp': source_status['timestamp'],
                'state': 'in-progress',
                'processed': {}
            }

        return (status, source_status)

    def _prep_json_upload(self, key, data):
        """
        Prepares some data for upload to s3
        :param key:
        :param data:
        :return:
        """
        temp_file = os.path.join(self.temp_dir, key)
        write_file(temp_file, json.dumps(data, sort_keys=True))
        return {
            'key': key,
            'path': temp_file
        }

    def _prep_text_upload(self, key, data):
        """
        Prepares some data for upload to s3
        :param key:
        :param data:
        :return:
        """
        temp_file = os.path.join(self.temp_dir, key)
        write_file(temp_file, data)
        return {
            'key': key,
            'path': temp_file
        }

    def __parse_media_quality(self, quality):
        """
        Returns the value and suffix from the quality
        :param quality:
        :return:
        """
        abc = 'abcdefghijklmnopqrstuvwxyz'
        value = quality.rstrip('{}{}'.format(abc, abc.upper()))
        suffix = quality[len(value):]
        return value, suffix

    # 'legacy_slugs' contains a list of legacy slugs for resources 'vers'.
    # Legacy slugs are formatted as `res-lang`
    legacy_slugs = [
        "ulb-ceb", "udb-ceb",
        "ulb-ee",
        "ulb-en", "udb-en",
        "ulb-hna",
        "ulb-ilo",
        "ulb-kbp",
        "ulb-kpo",
        "ulb-las",
        "ulb-lpx"
    ]

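# For reference, __parse_media_quality splits a quality string at its trailing
# letters (the sample inputs below are assumptions, not from the code above):
#   __parse_media_quality('64kbps') -> ('64', 'kbps')
#   __parse_media_quality('720p')   -> ('720', 'p')
# The numeric part feeds the 'bitrate'/'resolution' fields, and the suffix is
# reused to build the '{bitrate}' url templates in convert_v3_to_v2.
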
class SigningHandler(InstanceHandler):

    max_file_size = 400000000  # 400mb

    def __init__(self, event, context, logger, signer, **kwargs):
        super(SigningHandler, self).__init__(event, context)

        env_vars = self.retrieve(event, 'stage-variables', 'payload')
        self.cdn_bucket = self.retrieve(env_vars, 'cdn_bucket', 'Environment Vars')
        self.cdn_url = self.retrieve(env_vars, 'cdn_url', 'Environment Vars')
        self.from_email = self.retrieve(env_vars, 'from_email', 'Environment Vars')
        self.to_email = self.retrieve(env_vars, 'to_email', 'Environment Vars')
        self.api_version = self.retrieve(env_vars, 'version', 'Environment Vars')
        self.api_bucket = self.retrieve(env_vars, 'api_bucket', 'Environment Vars')
        self.logger = logger  # type: logging._loggerClass
        self.signer = signer
        self.in_progress_db = self.retrieve_with_default(
            env_vars, 'in_progress_db',
            '{}d43-catalog-in-progress'.format(self.stage_prefix()))

        if 's3_handler' in kwargs:
            self.cdn_handler = kwargs['s3_handler']
        else:
            self.cdn_handler = S3Handler(self.cdn_bucket)  # pragma: no cover
        self.temp_dir = tempfile.mkdtemp(prefix='signing_')
        if 'dynamodb_handler' in kwargs:
            self.db_handler = kwargs['dynamodb_handler']
        else:
            self.db_handler = DynamoDBHandler(self.in_progress_db)  # pragma: no cover
        if 'download_handler' in kwargs:
            self.download_file = kwargs['download_handler']
        else:
            self.download_file = download_file  # pragma: no cover
        if 'url_exists_handler' in kwargs:
            self.url_exists = kwargs['url_exists_handler']
        else:
            self.url_exists = url_exists  # pragma: no cover
        if 'url_headers_handler' in kwargs:
            self.url_headers = kwargs['url_headers_handler']
        else:
            self.url_headers = url_headers  # pragma: no cover

    def __del__(self):
        shutil.rmtree(self.temp_dir, ignore_errors=True)

    def _safe_url_exists(self, url):
        """
        Safely checks if a url exists.
        :param url:
        :return:
        """
        try:
            return self.url_exists(url)
        except Exception as e:
            self.report_error('Failed to read url "{}": {}'.format(url, e.message))
            return False

    def _run(self):
        items = self.db_handler.query_items({'signed': False})
        try:
            for item in items:
                repo_name = item['repo_name']
                try:
                    package = json.loads(item['package'])
                except Exception as e:
                    self.report_error('Skipping {}. Bad Manifest: {}'.format(repo_name, e))
                    continue

                if repo_name != "catalogs" and repo_name != 'localization' and repo_name != 'versification':
                    self.process_db_item(item, package)

            found_items = len(items) > 0
            if not found_items and self.logger:
                self.logger.info('No items found for signing')
            return found_items
        except Exception as e:
            self.report_error('Failed processing an item: {}'.format(e.message))
            raise Exception, Exception(e), sys.exc_info()[2]
        finally:
            if os.path.isdir(self.temp_dir):
                shutil.rmtree(self.temp_dir, ignore_errors=True)

    def process_db_item(self, item, package):
        was_signed = False
        fully_signed = True
        self.logger.info('Processing {}'.format(item['repo_name']))

        if 'formats' in package:
            for format in package['formats']:
                # process resource formats
                (already_signed, newly_signed) = self.process_format(item, package['dublin_core'], None, format)
                if newly_signed:
                    was_signed = True
                if not (already_signed or newly_signed):
                    fully_signed = False

        for project in package['projects']:
            if 'formats' in project:
                for format in project['formats']:
                    # process project formats
                    (already_signed, newly_signed) = self.process_format(item, package['dublin_core'], project, format)
                    if newly_signed:
                        was_signed = True
                    if not (already_signed or newly_signed):
                        fully_signed = False

                    # process format chapters
                    if 'chapters' in format:
                        sanitized_chapters = []
                        for chapter in format['chapters']:
                            # TRICKY: only process/keep chapters that actually have a valid url
                            if 'url' not in chapter or not self._safe_url_exists(chapter['url']):
                                if 'url' not in chapter:
                                    missing_url = 'empty url'
                                else:
                                    missing_url = chapter['url']
                                self.logger.warning('Skipping chapter {}:{} missing url {}'.format(
                                    project['identifier'], chapter['identifier'], missing_url))
                                continue

                            (already_signed, newly_signed) = self.process_format(item, package['dublin_core'], project, chapter)
                            sanitized_chapters.append(chapter)
                            if newly_signed:
                                was_signed = True
                            if not (already_signed or newly_signed):
                                fully_signed = False

                        format['chapters'] = sanitized_chapters
                        # update format
                        if sanitized_chapters and not 'content=' in format['format'] and format['url'].endswith('zip'):
                            if format['chapters'][0]['url'].endswith('.mp3'):
                                format['format'] = 'application/zip; content=audio/mp3'
                            if format['chapters'][0]['url'].endswith('.mp4'):
                                format['format'] = 'application/zip; content=video/mp4'

        if was_signed or fully_signed:
            self.logger.debug('recording signatures')
            record_keys = {'repo_name': item['repo_name']}
            self.db_handler.update_item(record_keys, {
                'package': json.dumps(package, sort_keys=True),
                'signed': fully_signed
            })

    def process_format(self, item, dublin_core, project, format):
        """
        Performs the signing on the format object.
        Files outside of the cdn will not be signed
        :param item:
        :param dublin_core:
        :param project: this may be None.
        :param format:
        :return: (already_signed, newly_signed)
        """
        if 'signature' in format and format['signature']:
            return (True, False)
        else:
            self.logger.debug('Signing {}'.format(format['url']))

        base_name = os.path.basename(format['url'])
        file_to_sign = os.path.join(self.temp_dir, base_name)

        # extract cdn key from url
        url_info = urlparse.urlparse(format['url'])
        src_key = url_info.path.lstrip('/')
        sig_key = '{}.sig'.format(src_key)

        build_rules = get_build_rules(format, 'signing')

        # TRICKY: allow dev environments to download from prod environment
        # RS: I added the s3 bucket here because it isn't yet accessible via urls
        valid_hosts = [
            self.cdn_bucket,
            self.cdn_bucket + ".s3.us-east-2.amazonaws.com"
        ]
        if self.stage_prefix():
            if not self.cdn_bucket.startswith(self.stage_prefix()):
                self.logger.warning('Expected `cdn_bucket` to begin with the stage prefix ({}) but found {}'.format(
                    self.stage_prefix(), self.cdn_bucket))
            prod_cdn_bucket = self.cdn_bucket.lstrip(self.stage_prefix())
            valid_hosts.append(prod_cdn_bucket)
            # TRICKY: force dev environments to handle prod content as external files
            # if format['url'].startswith(prod_cdn_url):
            #     build_rules.append('sign_given_url')

        # TRICKY: some html content is on the api
        if 'html_format' in build_rules:
            valid_hosts.append(self.api_bucket)
            prod_api_bucket = self.api_bucket.lstrip(self.stage_prefix())
            valid_hosts.append(prod_api_bucket)

        # verify url is on the cdn
        if not url_info.hostname in valid_hosts:
            # TODO: external media should be imported if it's not too big
            # This allows media to be hosted on third party servers
            format['signature'] = ''  # '{}.sig'.format(format['url'])
            self.logger.warning('cannot sign files outside of the cdn: {}'.format(format['url']))
            self.logger.warning('valid hosts are: {}'.format(", ".join(valid_hosts)))
            return (True, True)

        try:
            headers = self.url_headers(format['url'])
        except Exception as e:
            self.report_error('Could not read headers from {}: {}'.format(format['url'], e))
            return (False, False)

        # skip files that are too large
        size = int(headers.get('content-length', 0))
        if size > SigningHandler.max_file_size:
            sig_url = '{}.sig'.format(format['url'])
            if not self._safe_url_exists(sig_url):
                # wait for signature to be manually uploaded
                self.report_error('File is too large to sign {}'.format(format['url']))
                return (False, False)

            # finish with manually uploaded signature
            format['size'] = size
            if not format['modified']:
                format['modified'] = str_to_timestamp(datetime.datetime.now().isoformat())
            format['signature'] = sig_url
            return (False, True)

        # download file
        try:
            if 'sign_given_url' in build_rules or 'html_format' in build_rules:
                # report error if response is 400+
                if headers.status >= 400:
                    self.report_error('Resource not available at {}'.format(format['url']))
                    return (False, False)
                self.download_file(format['url'], file_to_sign)
            else:
                # TRICKY: most files to be signed are stored in a temp directory
                src_temp_key = 'temp/{}/{}/{}'.format(item['repo_name'], item['commit_id'], src_key)
                self.cdn_handler.download_file(src_temp_key, file_to_sign)
        except Exception as e:
            self.report_error('The file "{}" could not be downloaded: {}'.format(base_name, e))
            return (False, False)

        # strip print script from html
        if 'html_format' in build_rules:
            self.logger.debug('Removing print script from {} html'.format(item['repo_name']))
            self._strip_print_script(file_to_sign)

        # sign file
        sig_file = self.signer.sign_file(file_to_sign)
        try:
            self.signer.verify_signature(file_to_sign, sig_file)
        except RuntimeError:
            if self.logger:
                self.logger.warning('The signature was not successfully verified.')
            return (False, False)

        # TRICKY: re-format html urls
        if 'html_format' in build_rules:
            html_name = dublin_core['identifier']
            if project:
                html_name = project['identifier']
            src_key = '{}/{}/v{}/media/html/{}.html'.format(
                dublin_core['language']['identifier'],
                dublin_core['identifier'],
                self.api_version,
                html_name)
            sig_key = '{}.sig'.format(src_key)
            format['url'] = '{}/{}'.format(self.cdn_url, src_key)

        # upload files
        if 'sign_given_url' not in build_rules or 'html_format' in build_rules:
            # TRICKY: upload temp files to production
            self.cdn_handler.upload_file(file_to_sign, src_key)
        self.cdn_handler.upload_file(sig_file, sig_key)

        # add the url of the sig file to the format
        format['signature'] = '{}.sig'.format(format['url'])

        # read modified date from file
        stats = os.stat(file_to_sign)
        if not format['modified']:
            modified = headers.get('last-modified')
            if modified:
                # TRICKY: http header gives an odd date format
                date = datetime.datetime.strptime(modified, '%a, %d %b %Y %H:%M:%S %Z')
                modified = str_to_timestamp(date.isoformat())
            else:
                modified = unix_to_timestamp(stats.st_mtime)
            format['modified'] = modified
        format['size'] = stats.st_size

        # retrieve playback time from multimedia files
        _, ext = os.path.splitext(file_to_sign)
        if ext == '.mp3':
            audio = MP3(file_to_sign)
            format['length'] = audio.info.length
        elif ext == '.mp4':
            video = MP4(file_to_sign)
            format['length'] = video.info.length

        # add file format if missing
        if not 'format' in format or not format['format']:
            try:
                mime = ext_to_mime(ext)
                format['format'] = mime
            except Exception as e:
                if self.logger:
                    self.logger.error(e.message)

        # clean up disk space
        os.remove(file_to_sign)

        return (False, True)

    @staticmethod
    def _strip_print_script(file_to_sign):
        html = read_file(file_to_sign)
        html = html.replace('window.print()', '')
        write_file(file_to_sign, html)

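# process_format reports its outcome as an (already_signed, newly_signed) pair.
# A minimal sketch of the caller-side bookkeeping, mirroring process_db_item
# (the local names here are assumptions):
#
#     already_signed, newly_signed = self.process_format(item, dc, project, format)
#     if newly_signed:
#         was_signed = True                    # something changed; re-save the package
#     if not (already_signed or newly_signed):
#         fully_signed = False                 # signing failed; the item stays queued
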
class CatalogHandler(InstanceHandler):

    def __init__(self, event, context, **kwargs):
        super(CatalogHandler, self).__init__(event, context)

        env_vars = self.retrieve(event, 'stage-variables', 'payload')
        self.cdn_url = self.retrieve(env_vars, 'cdn_url').rstrip('/')
        self.cdn_bucket = self.retrieve(env_vars, 'cdn_bucket')
        self.api_bucket = self.retrieve(env_vars, 'api_bucket')
        self.api_url = self.retrieve(env_vars, 'api_url').rstrip('/')
        self.to_email = self.retrieve(env_vars, 'to_email')
        self.from_email = self.retrieve(env_vars, 'from_email')
        self.api_version = self.retrieve(env_vars, 'version')

        if 'dynamodb_handler' in kwargs:
            db_handler = kwargs['dynamodb_handler']
            self.progress_table = db_handler('{}d43-catalog-in-progress'.format(self.stage_prefix()))
            self.status_table = db_handler('{}d43-catalog-status'.format(self.stage_prefix()))
            self.errors_table = db_handler('{}d43-catalog-errors'.format(self.stage_prefix()))
        else:
            self.progress_table = DynamoDBHandler('{}d43-catalog-in-progress'.format(self.stage_prefix()))  # pragma: no cover
            self.status_table = DynamoDBHandler('{}d43-catalog-status'.format(self.stage_prefix()))  # pragma: no cover
            self.errors_table = DynamoDBHandler('{}d43-catalog-errors'.format(self.stage_prefix()))  # pragma: no cover

        self.catalog = {
            "languages": []
        }

        if 's3_handler' in kwargs:
            self.api_handler = kwargs['s3_handler'](self.api_bucket)
        else:
            self.api_handler = S3Handler(self.api_bucket)  # pragma: no cover
        if 'ses_handler' in kwargs:
            self.ses_handler = kwargs['ses_handler']()
        else:
            self.ses_handler = SESHandler()  # pragma: no cover
        if 'consistency_checker' in kwargs:
            self.checker = kwargs['consistency_checker']()
        else:
            self.checker = ConsistencyChecker(self.cdn_bucket, self.api_bucket)  # pragma: no cover
        if 'get_url_handler' in kwargs:
            self.get_url = kwargs['get_url_handler']
        else:
            self.get_url = get_url  # pragma: no cover
        if 'url_exists_handler' in kwargs:
            self.url_exists = kwargs['url_exists_handler']
        else:
            self.url_exists = url_exists  # pragma: no cover

    def get_language(self, language):
        """
        Gets the existing language or creates a new one
        :param language:
        :return:
        """
        found_lang = None
        for lang in self.catalog['languages']:
            if lang['identifier'] == language['identifier']:
                found_lang = lang
                break
        if not found_lang:
            self.catalog['languages'].append(language)
        else:
            language = found_lang
        if 'resources' not in language:
            language['resources'] = []
        return language

    def _run(self):
        completed_items = 0
        items = self.progress_table.query_items()

        for item in items:
            repo_name = item['repo_name']
            self.logger.info('Processing {}'.format(repo_name))
            try:
                package = json.loads(item['package'])
            except Exception as e:
                self.report_error('Skipping {}. Bad Manifest: {}'.format(repo_name, e))
                continue

            if repo_name == "catalogs":
                self.catalog['catalogs'] = package
            elif repo_name == 'localization':
                self._build_localization(package)
            elif repo_name == 'versification':
                # TODO: we have not yet determined what to do with versification
                pass
            else:
                if self._build_rc(item, package, self.checker):
                    completed_items += 1

        # remove empty languages
        condensed_languages = []
        for lang in self.catalog['languages']:
            if 'resources' in lang and len(lang['resources']) > 0:
                condensed_languages.append(lang)
        self.catalog['languages'] = condensed_languages

        response = {
            'success': False,
            'incomplete': len(self.checker.all_errors) > 0,
            'message': None,
            'catalog': self.catalog
        }

        if completed_items > 0:
            status = self._read_status()
            if status and status['state'] == 'complete' and not self._catalog_has_changed(self.catalog):
                response['success'] = True
                response['message'] = 'No changes detected. Catalog not deployed'
            else:
                cat_str = json.dumps(self.catalog, sort_keys=True, separators=(',', ':'))
                try:
                    catalog_path = os.path.join(tempfile.gettempdir(), 'catalog.json')
                    write_file(catalog_path, cat_str)
                    c_stats = os.stat(catalog_path)
                    self.logger.info('New catalog built: {} Kilobytes'.format(c_stats.st_size * 0.001))

                    self.api_handler.upload_file(catalog_path, 'v{0}/catalog.json'.format(self.api_version), cache_time=0)
                    # TRICKY: only mark as complete when there are no errors
                    if len(self.checker.all_errors):
                        self._publish_status('incomplete')
                    else:
                        self._publish_status()

                    response['success'] = True
                    response['message'] = 'Uploaded new catalog to {0}/v{1}/catalog.json'.format(self.api_url, self.api_version)
                except Exception as e:
                    self.checker.log_error('Unable to save catalog: {0}'.format(e))  # pragma: no cover

        if len(self.checker.all_errors) > 0:
            self.report_error(self.checker.all_errors)

        if completed_items == 0:
            self.checker.log_error('There were no formats to process')

        if not response['success']:
            response['catalog'] = None
            response['message'] = '{0}'.format(self.checker.all_errors)

        if response['success']:
            self.logger.info(response['message'])
        else:
            self.logger.error('Catalog was not published due to errors')

        return response

    def _read_status(self):
        """
        Retrieves the recorded status of the catalog
        :return:
        """
        results = self.status_table.query_items({'api_version': self.api_version})
        if not results:
            return None
        else:
            return results[0]

    def _publish_status(self, state='complete'):
        """
        Updates the catalog status
        :param state: the state of completion the catalog is in
        :return:
        """
        self.logger.debug('Recording catalog status: "{}"'.format(state))
        self.status_table.update_item(
            {'api_version': self.api_version},
            {
                'state': state,
                'timestamp': time.strftime("%Y-%m-%dT%H:%M:%SZ"),
                'catalog_url': '{0}/v{1}/catalog.json'.format(self.api_url, self.api_version)
            }
        )

    def _build_rc(self, item, manifest, checker):
        """
        Builds a RC entry in the catalog.
        :param item:
        :param manifest:
        :param checker:
        :return: True if the entry was successfully added otherwise False
        """
        errors = checker.check(item)
        if errors:
            return False
        dc = manifest['dublin_core']
        language = dc['language']
        language = self.get_language(language)  # gets the existing language container or creates a new one

        formats = []
        for fmt in manifest['formats']:
            errors = checker.check_format(fmt, item)
            if not errors:
                self._strip_build_rules(fmt)
                formats.append(fmt)

        if len(formats) > 0:
            resource = copy.deepcopy(dc)
            resource['projects'] = []
            del resource['conformsto']
            del resource['format']
            del resource['language']
            del resource['type']
            resource['checking'] = copy.deepcopy(manifest['checking'])
            if not resource['relation']:
                resource['relation'] = []

            # store projects
            for project in manifest['projects']:
                if 'formats' in project:
                    for fmt in project['formats']:
                        self._strip_build_rules(fmt)
                        checker.check_format(fmt, item)
                if not project['categories']:
                    project['categories'] = []
                del project['path']
                resource['projects'].append(project)

            # store formats
            # TRICKY: Bible usfm bundles should always be at the resource level
            is_bible = dc['identifier'] == 'ulb' or dc['identifier'] == 'udb'
            if len(manifest['projects']) == 1 and not (is_bible and self.has_usfm_bundle(formats)):
                # single-project RCs store formats in projects for backwards compatibility.
                if 'formats' in resource['projects'][0]:
                    formats = formats + resource['projects'][0]['formats']
                resource['projects'][0]['formats'] = formats
            else:
                # multi-project RCs store formats in resource
                resource['formats'] = formats

            if 'comment' not in resource:
                resource['comment'] = ''

            language['resources'].append(resource)
            return True

        return False

    def _strip_build_rules(self, obj):
        """
        Recursively removes 'build_rules' from an object
        :param obj:
        :return:
        """
        if 'build_rules' in obj:
            del obj['build_rules']
        if 'projects' in obj:
            for project in obj['projects']:
                self._strip_build_rules(project)
        if 'formats' in obj:
            for format in obj['formats']:
                self._strip_build_rules(format)
        if 'chapters' in obj:
            for chapter in obj['chapters']:
                self._strip_build_rules(chapter)

    def has_usfm_bundle(self, formats):
        """
        Checks if an array of formats contains a format that is a usfm bundle
        :param formats:
        :return:
        """
        for format in formats:
            if 'text/usfm' in format['format'] and 'type=bundle' in format['format']:
                return True
        return False

    def _build_versification(self, package, checker):
        """
        DEPRECATED

        Adds versification chunks to projects in the catalog.
        Note: this may not do anything if no languages have been generated yet.
        self._build_rc will pick up the slack in that case.
        :param package:
        :return: False if errors were encountered
        """
        projects = {}
        for project in package:
            projects[project['identifier']] = project
            if not self.url_exists(project['chunks_url']):
                checker.log_error('{} does not exist'.format(project['chunks_url']))
                # for performance's sake we'll fail on a single error
                return False

        # inject into existing projects
        for lang in self.catalog['languages']:
            if 'resources' not in lang:
                continue
            for res in lang['resources']:
                if 'projects' not in res:
                    continue
                for proj in res['projects']:
                    if proj['identifier'] in projects and proj['versification']:
                        proj.update(projects[proj['identifier']])

        return True

    def _build_localization(self, package):
        """
        Adds localization to the catalog
        :param package:
        :return:
        """
        for lang in package:
            localization = package[lang]
            language = localization['language']
            del localization['language']
            language = self.get_language(language)  # gets the existing language container or creates a new one
            language.update(localization)

    def _catalog_has_changed(self, catalog):
        """
        Checks if the given catalog differs from the one currently published
        :param catalog:
        :return:
        """
        try:
            catalog_url = '{0}/v{1}/catalog.json'.format(self.api_url, self.api_version)
            self.logger.debug('Comparing new catalog against old ({})'.format(catalog_url))
            old_catalog_str = self.get_url(catalog_url, True)
            new_catalog_str = json.dumps(catalog, sort_keys=True, separators=(',', ':'))
            old_hash = hashlib.md5(old_catalog_str.encode('utf-8')).hexdigest()
            new_hash = hashlib.md5(new_catalog_str.encode('utf-8')).hexdigest()
            self.logger.debug('Old catalog hash: {}'.format(old_hash))
            self.logger.debug('New catalog hash: {}'.format(new_hash))
            return old_hash != new_hash
        except Exception:
            return True

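# The kwargs accepted by __init__ exist so tests can inject fakes in place of
# the AWS-backed services (the real branches are marked `# pragma: no cover`).
# A minimal wiring sketch; MockService and the lambdas are hypothetical test
# doubles, not part of the codebase:
class MockService(object):
    def __init__(self, *args):
        self.args = args  # e.g. the table or bucket name

    def query_items(self, query=None):
        return []  # pretend the tables are empty

def make_test_catalog_handler(event, context):
    return CatalogHandler(
        event, context,
        dynamodb_handler=MockService,   # called once per table name
        s3_handler=MockService,         # called with the api bucket
        ses_handler=MockService,
        consistency_checker=MockService,
        get_url_handler=lambda url, catch_exception=False: '',
        url_exists_handler=lambda url: True)
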
class WebhookHandler(Handler): def __init__(self, event, context, logger, **kwargs): super(WebhookHandler, self).__init__(event, context) env_vars = self.retrieve(event, 'stage-variables', 'payload') self.gogs_url = self.retrieve(env_vars, 'gogs_url', 'Environment Vars') self.gogs_org = self.retrieve(env_vars, 'gogs_org', 'Environment Vars') self.cdn_bucket = self.retrieve(env_vars, 'cdn_bucket', 'Environment Vars') self.cdn_url = self.retrieve(env_vars, 'cdn_url', 'Environment Vars') self.from_email = self.retrieve(env_vars, 'from_email', 'Environment Vars') self.to_email = self.retrieve(env_vars, 'to_email', 'Environment Vars') self.api_url = self.retrieve(env_vars, 'api_url', 'Environment Vars') self.repo_commit = self.retrieve(event, 'body-json', 'payload') self.api_version = self.retrieve(env_vars, 'version') self.in_progress_db = self.retrieve_with_default( env_vars, 'in_progress_db', '{}d43-catalog-in-progress'.format(self.stage_prefix())) if 'pull_request' in self.repo_commit: self.__parse_pull_request(self.repo_commit) else: self.__parse_push(self.repo_commit) self.resource_id = None # set in self._build self.logger = logger # type: logging._loggerClass if 'dynamodb_handler' in kwargs: self.db_handler = kwargs['dynamodb_handler'] else: self.logger.debug( "Creating Dynamodb handler pointing to {}".format( self.in_progress_db)) self.db_handler = DynamoDBHandler( self.in_progress_db) # pragma: no cover if 's3_handler' in kwargs: self.s3_handler = kwargs['s3_handler'] else: self.s3_handler = S3Handler(self.cdn_bucket) # pragma: no cover if 'download_handler' in kwargs: self.download_file = kwargs['download_handler'] else: self.download_file = download_file # pragma: no cover def __parse_pull_request(self, payload): """ Parses a pull request :param payload: :return: True if the pull request should be processed """ pull_request = self.retrieve(payload, 'pull_request', 'payload') self.repo_owner = payload['repository']['owner']['username'] self.repo_name = payload['repository']['name'] self.temp_dir = tempfile.mkdtemp('', self.repo_name, None) self.repo_file = os.path.join(self.temp_dir, self.repo_name + '.zip') # TRICKY: gogs gives a lower case name to the folder in the zip archive self.repo_dir = os.path.join(self.temp_dir, self.repo_name.lower()) commit_sha = self.retrieve(pull_request, 'merge_commit_sha', 'pull_request') self.timestamp = str_to_timestamp( self.retrieve(pull_request, 'merged_at', 'pull_request')) repository = self.retrieve(payload, 'repository', 'payload') url = self.retrieve(repository, 'html_url', 'repository').rstrip('/') self.commit_url = '{}/commit/{}'.format(url, commit_sha) if commit_sha: self.commit_id = commit_sha[:10] else: self.commit_id = None def __parse_push(self, payload): """ Parses a regular push commit :param payload: :return: """ self.repo_owner = payload['repository']['owner']['username'] self.repo_name = payload['repository']['name'] self.temp_dir = tempfile.mkdtemp('', self.repo_name, None) self.repo_file = os.path.join(self.temp_dir, self.repo_name + '.zip') # TRICKY: gogs gives a lower case name to the folder in the zip archive self.repo_dir = os.path.join(self.temp_dir, self.repo_name.lower()) self.commit_id = payload['after'] commit = None for commit in payload['commits']: if commit['id'] == self.commit_id: break self.commit_url = commit['url'] self.timestamp = str_to_timestamp(commit['timestamp']) self.commit_id = self.commit_id[:10] def _run(self): if not self.commit_url.startswith(self.gogs_url): raise Exception( 'Only accepting webhooks 
from {0} but found {1}'.format( self.gogs_url, self.commit_url)) # pragma: no cover if self.repo_owner.lower() != self.gogs_org.lower(): raise Exception( "Only accepting repos from the {0} organization. Organization sent is {1}" .format(self.gogs_org, self.repo_owner)) # pragma: no cover # skip un-merged pull requests if 'pull_request' in self.repo_commit: pr = self.repo_commit['pull_request'] if not pr['merged']: raise Exception('Skipping un-merged pull request') try: # build catalog entry data = self._build() if data: # upload data if 'uploads' in data: self.logger.debug('Uploading files for "{}"'.format( self.repo_name)) for upload in data['uploads']: self.logger.debug('^...{}'.format(upload['key'])) self.logger.debug("Uploading to {0} {1}".format( upload["path"], upload["key"])) self.s3_handler.upload_file(upload['path'], upload['key']) del data['uploads'] else: self.logger.debug( 'No upload-able content found in "{}"'.format( self.repo_name)) self.db_handler.insert_item(data) else: self.logger.debug('No data found in {}'.format(self.repo_name)) except Exception as e: self.report_error(e.message) raise Exception, Exception(e), sys.exc_info()[2] finally: # clean if self.temp_dir and os.path.isdir(self.temp_dir): shutil.rmtree(self.temp_dir, ignore_errors=True) return { "success": True, "message": "Successfully added {0} ({1}) to the catalog".format( self.repo_name, self.commit_id) } def _build(self): """ Constructs a new catalog entry from the repository :return: the constructed object """ self.download_repo(self.commit_url, self.repo_file) self.unzip_repo_file(self.repo_file, self.temp_dir) if not os.path.isdir(self.repo_dir): raise Exception('Was not able to find {0}'.format( self.repo_dir)) # pragma: no cover self.logger.info('Processing repository "{}"'.format(self.repo_name)) data = {} if self.repo_name == 'localization': data = self._build_localization() elif self.repo_name == 'catalogs': data = self._build_catalogs() elif self.repo_name == 'versification': # TODO: we do not yet know what to do with versification return None else: data = self._build_rc() return data def _build_rc(self): """ Builds a Resource Container following the RC0.2 spec :return: """ manifest_path = os.path.join(self.repo_dir, 'manifest.yaml') if not os.path.isfile(manifest_path): raise Exception( 'Repository {0} does not have a manifest.yaml file'.format( self.repo_name)) try: manifest = WebhookHandler.load_yaml_file(manifest_path) except Exception as e: raise Exception('Bad Manifest: {0}'.format(e)) try: ConsistencyChecker.check_manifest(manifest) except Exception as e: raise Exception('Bad Manifest: {0}'.format(e)) # identifiers must be lowercase manifest['dublin_core']['identifier'] = self.sanitize_identifier( manifest['dublin_core']['identifier']) # resource version must be string manifest['dublin_core']['version'] = '{}'.format( manifest['dublin_core']['version']) # build media formats media_path = os.path.join(self.repo_dir, 'media.yaml') resource_formats = [] project_formats = {} if os.path.isfile(media_path): try: media = WebhookHandler.load_yaml_file(media_path) except Exception as e: raise Exception('Bad Media: {0}'.format(e)) project_chapters = self._listChapters(self.repo_dir, manifest) try: resource_formats, project_formats = parse_media( media=media, content_version=manifest['dublin_core']['version'], project_chapters=project_chapters) except Exception as e: self.report_error('Failed to parse media in {}. 
{}'.format( self.repo_name, e.message)) stats = os.stat(self.repo_file) # normalize dates try: manifest['dublin_core']['modified'] = str_to_timestamp( manifest['dublin_core']['modified']) except Exception as e: self.logger.warning('Invalid datetime detected: {}'.format( e.message)) try: manifest['dublin_core']['issued'] = str_to_timestamp( manifest['dublin_core']['issued']) except Exception as e: self.logger.warning('Invalid datetime detected: {}'.format( e.message)) # TRICKY: single-project RCs get named after the project to avoid conflicts with multi-project RCs. if len(manifest['projects']) == 1: zip_name = manifest['projects'][0]['identifier'].lower() else: zip_name = manifest['dublin_core']['identifier'] resource_key = '{}/{}/v{}/{}.zip'.format( manifest['dublin_core']['language']['identifier'], manifest['dublin_core']['identifier'].split('-')[-1], manifest['dublin_core']['version'], zip_name) url = '{}/{}'.format(self.cdn_url, resource_key) file_info = { 'size': stats.st_size, 'modified': self.timestamp, 'format': 'application/zip; type={0} content={1} conformsto={2}'.format( manifest['dublin_core']['type'], manifest['dublin_core']['format'], manifest['dublin_core']['conformsto']), 'url': url, 'signature': "" } manifest['formats'] = [file_info] uploads = [{ 'key': self.make_upload_key(resource_key), 'path': self.repo_file }] # split usfm bundles if manifest['dublin_core']['type'] == 'bundle' and manifest[ 'dublin_core']['format'] == 'text/usfm': for project in manifest['projects']: pid = self.sanitize_identifier(project['identifier']) if 'formats' not in project: project['formats'] = [] resource_id = manifest['dublin_core']['identifier'].split( '-')[-1] project_key = '{}/{}/v{}/{}.usfm'.format( manifest['dublin_core']['language']['identifier'], resource_id, manifest['dublin_core']['version'], pid) project_url = '{}/{}'.format(self.cdn_url, project_key) p_file_path = os.path.join(self.repo_dir, project['path'].lstrip('\.\/')) p_stats = os.stat(p_file_path) try: resource_mtime = str_to_timestamp( manifest['dublin_core']['modified']) except Exception as e: self.logger.warning('Invalid datetime detected: {}'.format( e.message)) resource_mtime = manifest['dublin_core']['modified'] project['formats'].append({ 'format': 'text/usfm', 'modified': resource_mtime, 'signature': '', 'size': p_stats.st_size, 'url': project_url }) uploads.append({ 'key': self.make_upload_key(project_key), 'path': p_file_path }) # add media to projects for project in manifest['projects']: pid = self.sanitize_identifier(project['identifier']) if pid in project_formats: if 'formats' not in project: project['formats'] = [] project['formats'] = project['formats'] + project_formats[pid] # add media to resource manifest['formats'] = manifest['formats'] + resource_formats # add html format # TRICKY: these URLS are only available in prod # for project in manifest['projects']: # pid = self.sanitize_identifier(project['identifier']) # html_url = '' # if manifest['dublin_core']['identifier'] == 'obs': # # obs html # html_url = 'https://api.door43.org/tx/print?id={}/{}/{}'.format(self.gogs_org, self.repo_name, self.commit_id) # elif manifest['dublin_core']['identifier'] == 'ta': # # ta html # sort_slug = '{}'.format(int(project['sort']) + 1).zfill(2) # html_url = 'https://cdn.door43.org/u/Door43-Catalog/{}/{}/{}-{}.html'.format(self.repo_name, self.commit_id, sort_slug, pid) # elif manifest['dublin_core']['identifier'] not in ['tq', 'tn', 'tw', 'obs-tn', 'obs-tq']: # # we also have html for Bible resources # name, _ = 
os.path.splitext(os.path.basename(project['path'])) # html_url = 'https://cdn.door43.org/u/Door43-Catalog/{}/{}/{}.html'.format(self.repo_name, self.commit_id, name) # # if html_url and url_exists(html_url): # self.logger.info('Injecting {} html url: {}'.format(manifest['dublin_core']['identifier'], html_url)) # if 'formats' not in project: project['formats'] = [] # project['formats'].append({ # 'format': 'text/html', # 'modified': '', # 'signature': '', # 'size': '', # 'url': html_url, # 'build_rules': [ # 'signing.html_format' # ] # }) # else: # self.logger.warning('Missing html format for {}_{} at {}'.format(self.repo_name, pid, html_url)) return { 'repo_name': self.repo_name, 'commit_id': self.commit_id, 'language': manifest['dublin_core']['language']['identifier'], 'timestamp': self.timestamp, 'added_at': arrow.utcnow().isoformat(), 'package': json.dumps(manifest, sort_keys=True), 'signed': False, 'dirty': False, 'uploads': uploads } def _listChapters(self, rc_dir, manifest): """ Builds a dictionary of chapter ids for each project :param rc_dir: :param manifest: :return: """ chapters = {} if manifest['dublin_core']['type'] == 'book': for project in manifest['projects']: pid = self.sanitize_identifier(project['identifier']) project_path = os.path.normpath( os.path.join(rc_dir, project['path'])) files = os.listdir(project_path) for chapter in files: if chapter in [ '.', '..', 'toc.yaml', 'config.yaml', 'back', 'front' ]: continue chapter = chapter.split('.')[0] if pid not in chapters: chapters[pid] = [] chapters[pid].append(chapter) else: id = '_'.join([ manifest['dublin_core']['language']['identifier'], manifest['dublin_core']['identifier'], manifest['dublin_core']['type'] ]) self.logger.warning( 'Failed to generate media chapters. Only book RCs are currently supported. {}' .format(id)) return chapters def _build_versification(self): """ DEPRECATED we are no longer processing versification. 
:return: """ bible_dir = os.path.join(self.repo_dir, 'bible') versification_dirs = os.listdir(bible_dir) books = {} package = [] uploads = [] # group by project for vrs_dir in versification_dirs: vrs_id = os.path.basename(vrs_dir) book_files = sorted( glob(os.path.join(bible_dir, vrs_dir, 'chunks', '*.json'))) for b in book_files: self.logger.debug('Reading "{}" versification for "{}"'.format( vrs_id, b)) b_id = os.path.splitext(os.path.basename(b))[0] try: book_vrs = json.loads(read_file(b)) except Exception as e: raise Exception, Exception( 'Bad JSON: {0}'.format(e)), sys.exc_info()[2] book = WebhookHandler.retrieve_or_make( books, b_id, { 'identifier': b_id, 'chunks_url': '{0}/bible/{}/{}/v{}/chunks.json'.format( self.cdn_url, vrs_id, b_id, self.api_version), 'chunks': {} }) book['chunks'][vrs_id] = book_vrs temp_dir = os.path.join(self.temp_dir, 'versification') if not os.path.isdir: os.mkdir(temp_dir) for book in books: book = books[book] # write chunks chunk_file = os.path.join(temp_dir, book['identifier'] + '.json') write_file(chunk_file, json.dumps(book['chunks'], sort_keys=True)) # for now we bypass signing and upload chunks directly upload_key = 'bible/{}/v{}/chunks.json'.format( book['identifier'], self.api_version) uploads.append({'key': upload_key, 'path': chunk_file}) # build package del book['chunks'] package.append(book) return { 'repo_name': self.repo_name, 'commit_id': self.commit_id, 'timestamp': self.timestamp, 'package': json.dumps(package, sort_keys=True), 'uploads': uploads, 'dirty': False } def _build_localization(self): """ Builds the localization for various components in the catalog :return: """ files = sorted(glob(os.path.join(self.repo_dir, '*.json'))) localization = {} for f in files: self.logger.debug("Reading {0}...".format(f)) language = os.path.splitext(os.path.basename(f))[0] try: localization[language] = json.loads(read_file(f)) except Exception as e: raise Exception('Bad JSON: {0}'.format(e)) return { 'repo_name': self.repo_name, 'commit_id': self.commit_id, 'timestamp': self.timestamp, 'package': json.dumps(localization, sort_keys=True), 'dirty': False } def _build_catalogs(self): """ Builds the global catalogs :return: """ catalogs_path = os.path.join(self.repo_dir, 'catalogs.json') package = read_file(catalogs_path) return { 'repo_name': self.repo_name, 'commit_id': self.commit_id, 'timestamp': self.timestamp, 'package': package, 'dirty': False } def make_upload_key(self, path): """ Generates an upload key that conforms to the format `temp/<repo_name>/<commit>/<path>`. This allows further processing to associate files with an entry in dynamoDB. :param path: :return: """ return 'temp/{0}/{1}/{2}'.format(self.repo_name, self.commit_id, path) @staticmethod def retrieve_or_make(dictionary, key, default=None): """ Retrieves a value from a dictionary. 
If the key does not exist it will be created with the default value :param dict dictionary: :param any key: :param default: :return: """ if key not in dictionary: dictionary[key] = default return dictionary[key] @staticmethod def load_yaml_file(file_name, default=None): """ Deserialized <file_name> into a Python object :param str|unicode file_name: The name of the file to read :param default: The value to return if the file is not found """ if not os.path.isfile(file_name): return default # use utf-8-sig in case the file has a Byte Order Mark with codecs.open(file_name, 'r', 'utf-8-sig') as stream: return yaml.load(stream) def get_url(self, url): return get_url(url) def download_repo(self, commit_url, repo_file): repo_zip_url = commit_url.replace('commit', 'archive') + '.zip' try: self.logger.debug('Downloading {0}...'.format(repo_zip_url)) if not os.path.isfile(repo_file): self.download_file(repo_zip_url, repo_file) finally: pass def unzip_repo_file(self, repo_file, repo_dir): try: self.logger.debug('Unzipping {0}...'.format(repo_file)) unzip(repo_file, repo_dir) finally: pass
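
# A minimal doctest-style sketch (not part of the original module; the repo
# name and commit id are hypothetical) of how the upload keys and
# `retrieve_or_make` behave:
#
#   >>> handler.repo_name, handler.commit_id = 'en_obs', 'abc1234'
#   >>> handler.make_upload_key('en/obs/v4/obs.zip')
#   'temp/en_obs/abc1234/en/obs/v4/obs.zip'
#   >>> books = {}
#   >>> WebhookHandler.retrieve_or_make(books, 'gen', {'chunks': {}})
#   {'chunks': {}}
#   >>> books['gen'] is WebhookHandler.retrieve_or_make(books, 'gen', {})
#   True
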
class ForkHandler(InstanceHandler):
    """
    Triggers the webhook lambda if new repositories are found.
    """

    def __init__(self, event, context, **kwargs):
        super(ForkHandler, self).__init__(event, context)
        self.stage_vars = self.retrieve(self.event, 'stage-variables', 'payload')
        gogs_token = self.retrieve(self.stage_vars, 'gogs_token', 'Environment Vars')
        self.gogs_url = self.retrieve(self.stage_vars, 'gogs_url', 'Environment Vars')
        self.gogs_org = self.retrieve(self.stage_vars, 'gogs_org', 'Environment Vars')
        self.from_email = self.retrieve(self.stage_vars, 'from_email', 'Environment Vars')
        self.to_email = self.retrieve(self.stage_vars, 'to_email', 'Environment Vars')
        self.stage = self.retrieve(self.stage_vars, 'stage', 'Environment Vars')
        in_progress_db = self.retrieve_with_default(
            self.stage_vars, 'in_progress_db',
            '{}d43-catalog-in-progress'.format(self.stage_prefix()))
        self.catalog_webhook = self.retrieve_with_default(
            self.stage_vars, 'catalog_webhook_lambda',
            '{}d43-catalog_webhook'.format(self.stage))

        if 'dynamodb_handler' in kwargs:
            self.progress_table = kwargs['dynamodb_handler']
        else:
            self.progress_table = DynamoDBHandler(
                in_progress_db)  # pragma: no cover
        if 'gogs_client' in kwargs:
            self.gogs_client = kwargs['gogs_client']
        else:
            self.gogs_client = GogsClient  # pragma: no cover
        if 'boto_handler' in kwargs:
            self.boto = kwargs['boto_handler']
        else:
            self.boto = boto3  # pragma: no cover
        if 'logger' in kwargs:
            self.logger = kwargs['logger']

        self.gogs_api = self.gogs_client.GogsApi(self.gogs_url)
        self.gogs_auth = self.gogs_client.Token(gogs_token)

    def _run(self, **kwargs):
        """
        :param kwargs:
        :return:
        """
        client = self.boto.client("lambda")  # pragma: no cover
        repos = self.get_new_repos()  # pragma: no cover
        self._trigger_webhook(client, repos)  # pragma: no cover
        return True

    def _trigger_webhook(self, client, repos):
        """
        Triggers the webhook for each repo in the list
        :param client boto3.client('lambda'): the lambda client
        :param repos list: an array of repos
        :return:
        """
        if not repos:
            self.logger.info('No new repositories found')
            return

        for repo in repos:
            try:
                payload = self.make_hook_payload(repo)
            except Exception as e:
                self.logger.error(
                    'Failed to retrieve master branch for {0}: {1}'.format(
                        repo.full_name, e))
                continue
            try:
                self.logger.info('Simulating Webhook for {}'.format(
                    repo.full_name))
                client.invoke(FunctionName=self.catalog_webhook,
                              InvocationType='Event',
                              Payload=json.dumps(payload))
                time.sleep(.5)
            except Exception as e:
                self.logger.error('Failed to trigger webhook {0}: {1}'.format(
                    repo.full_name, e))
                continue

    def make_hook_payload(self, repo):
        """
        Generates a webhook payload for the repo
        :param repo:
        :return:
        """
        branch = self.gogs_api.get_branch(self.gogs_auth, self.gogs_org,
                                          repo.name, repo.default_branch)
        return {
            "stage-variables": self.event['stage-variables'],
            "context": self.event['context'],
            "body-json": {
                "after": branch.commit.id,
                "commits": [{
                    "id": branch.commit.id,
                    "message": branch.commit.message,
                    "timestamp": branch.commit.timestamp,
                    "url": '{0}/{1}/{2}/commit/{3}'.format(
                        self.gogs_url, self.gogs_org, repo.name,
                        branch.commit.id)  # branch.commit.url <-- not implemented yet
                }],
                "repository": {
                    "owner": {
                        "username": self.gogs_org
                    },
                    "name": repo.name
                }
            },
        }

    def get_new_repos(self):
        """
        Compares the organization repos with what's in progress
        and returns those that are new or updated.
        :return:
        """
        org_repos = self.gogs_api.get_user_repos(None, self.gogs_org)
        items = self.progress_table.query_items()

        new_repos = []
        for repo in org_repos:
            repo_name = repo.full_name.split("/")[-1]
            matching_item = self.__get_obj_in_array('repo_name', repo_name, items)
            if not matching_item or ('dirty' in matching_item and matching_item['dirty']):
                new_repos.append(repo)
            else:
                # check if changed
                # TODO: the branch API is currently broken so this code won't run
                try:
                    branch = self.gogs_api.get_branch(None, self.gogs_org,
                                                      repo_name, 'master')
                    if branch:
                        commit_id = branch.commit.id[:10]
                        for item in items:
                            if item['repo_name'] == repo_name and item['commit_id'] != commit_id:
                                new_repos.append(repo)
                except Exception as e:
                    # TRICKY: with the api broken this would create a lot of noise
                    # print('WARNING: failed to detect changes: {}'.format(e))
                    pass  # pragma: no cover
        return new_repos

    def __get_obj_in_array(self, key, value, array):
        """
        Retrieves the first item in the array whose value for `key` matches `value`
        :param key:
        :param value:
        :param array:
        :return:
        """
        for item in array:
            if item[key] == value:
                return item
        return None
class TsV2CatalogHandler(InstanceHandler): cdn_root_path = 'v2/ts' api_version = 'ts.2' def __init__(self, event, context, logger, **kwargs): super(TsV2CatalogHandler, self).__init__(event, context) env_vars = self.retrieve(event, 'stage-variables', 'payload') self.cdn_bucket = self.retrieve(env_vars, 'cdn_bucket', 'Environment Vars') self.cdn_url = self.retrieve(env_vars, 'cdn_url', 'Environment Vars').rstrip('/') self.from_email = self.retrieve(env_vars, 'from_email', 'Environment Vars') self.to_email = self.retrieve(env_vars, 'to_email', 'Environment Vars') self.max_usfm_size = int( self.retrieve_with_default(env_vars, 'max_usfm_size', '2000000')) self.status_db = self.retrieve_with_default( env_vars, 'status_db', '{}d43-catalog-status'.format(self.stage_prefix())) self.logger = logger # type: logging._loggerClass if 's3_handler' in kwargs: self.cdn_handler = kwargs['s3_handler'] else: self.cdn_handler = S3Handler(self.cdn_bucket) # pragma: no cover if 'dynamodb_handler' in kwargs: self.db_handler = kwargs['dynamodb_handler'] else: self.db_handler = DynamoDBHandler( self.status_db) # pragma: no cover if self.db_handler.logger: self.db_handler.logger.setLevel(logger.level) if 'url_handler' in kwargs: self.get_url = kwargs['url_handler'] else: self.get_url = get_url # pragma: no cover if 'download_handler' in kwargs: self.download_file = kwargs['download_handler'] else: self.download_file = download_file # pragma: no cover if 'url_exists_handler' in kwargs: self.url_exists = kwargs['url_exists_handler'] else: self.url_exists = url_exists # pragma: no cover self.temp_dir = tempfile.mkdtemp('', 'tsv2', None) def __del__(self): try: shutil.rmtree(self.temp_dir) finally: pass def _run(self): """ Generates the v2 catalog :return: """ try: self.logger.debug('Temp directory {} contents {}'.format( '/tmp', get_subdirs('/tmp/'))) return self.__execute() except Exception as e: self.report_error(e.message) raise Exception, Exception(e), sys.exc_info()[2] def __execute(self): cat_keys = [] cat_dict = {} supplemental_resources = [] result = self._get_status() if not result: return False else: (self.status, source_status) = result # check if build is complete if self.status['state'] == 'complete': self.logger.debug('Catalog already generated') return True # retrieve the latest catalog self.logger.debug("Catalog url {0}".format( source_status['catalog_url'])) catalog_content = self.get_url(source_status['catalog_url'], True) if not catalog_content: self.logger.error("{0} does not exist".format( source_status['catalog_url'])) return False try: self.latest_catalog = json.loads(catalog_content) except Exception as e: self.logger.error("Failed to load the catalog json: {0}".format(e)) return False # walk v3 catalog for lang in self.latest_catalog['languages']: lid = TsV2CatalogHandler.sanitize_identifier(lang['identifier'], lower=False) self.logger.info('Processing {}'.format(lid)) for res in lang['resources']: rid = TsV2CatalogHandler.sanitize_identifier(res['identifier']) self.logger.debug('Processing {}_{}'.format(lid, rid)) rc_format = None self.logger.debug('Temp directory {} contents {}'.format( self.temp_dir, get_subdirs(self.temp_dir))) res_temp_dir = os.path.join(self.temp_dir, lid, rid) os.makedirs(res_temp_dir) if 'formats' in res: for format in res['formats']: finished_processes = {} if not rc_format and get_rc_type(format): # locate rc_format (for multi-project RCs) rc_format = format #res is resource, rid is resource id, lid is language id process_id = '_'.join([lid, rid, 'usfm']) if 
process_id not in self.status['processed']: self._process_usfm(lid, rid, res, format, res_temp_dir) finished_processes[process_id] = [] # TRICKY: bible notes and questions are in the resource if rid != 'obs': process_id = '_'.join([lid, rid, 'notes']) if process_id not in self.status['processed']: self.logger.info( 'Processing notes {}_{}'.format(lid, rid)) tn = self._index_note_files( lid, rid, format, process_id, res_temp_dir) if tn: self._upload_all(tn) finished_processes[process_id] = tn.keys() cat_keys = cat_keys + tn.keys() else: cat_keys = cat_keys + self.status['processed'][ process_id] process_id = '_'.join([lid, rid, 'questions']) if process_id not in self.status['processed']: self.logger.info( 'Processing questions {}_{}'.format( lid, rid)) tq = self._index_question_files( lid, rid, format, process_id, res_temp_dir) if tq: self._upload_all(tq) finished_processes[process_id] = tq.keys() cat_keys = cat_keys + tq.keys() else: cat_keys = cat_keys + self.status['processed'][ process_id] # TRICKY: update the finished processes once per format to limit db hits if finished_processes: self.status['processed'].update(finished_processes) self.status['timestamp'] = time.strftime( "%Y-%m-%dT%H:%M:%SZ") self.db_handler.update_item( { 'api_version': TsV2CatalogHandler.api_version }, self.status) for project in res['projects']: pid = TsV2CatalogHandler.sanitize_identifier( project['identifier']) self.logger.debug('Processing {}_{}_{}'.format( lid, rid, pid)) if 'formats' in project: for format in project['formats']: finished_processes = {} if not rc_format and get_rc_type(format): # locate rc_format (for single-project RCs) rc_format = format # TRICKY: there should only be a single tW for each language process_id = '_'.join([lid, 'words']) if process_id not in self.status['processed']: tw = self._index_words_files( lid, rid, format, process_id, res_temp_dir) if tw: self._upload_all(tw) finished_processes[process_id] = tw.keys() cat_keys = cat_keys + tw.keys() else: cat_keys = cat_keys + self.status['processed'][ process_id] if rid == 'obs': process_id = '_'.join([lid, rid, pid]) if process_id not in self.status['processed']: self.logger.debug( 'Processing {}'.format(process_id)) obs_json = index_obs( lid, rid, format, res_temp_dir, self.download_file) upload = prep_data_upload( '{}/{}/{}/v{}/source.json'.format( pid, lid, rid, res['version']), obs_json, res_temp_dir) self._upload(upload) finished_processes[process_id] = [] else: cat_keys = cat_keys + self.status[ 'processed'][process_id] # TRICKY: obs notes and questions are in the project process_id = '_'.join([lid, rid, pid, 'notes']) if process_id not in self.status['processed']: tn = self._index_note_files( lid, rid, format, process_id, res_temp_dir) if tn: self._upload_all(tn) finished_processes[process_id] = tn.keys() cat_keys = cat_keys + tn.keys() else: cat_keys = cat_keys + self.status['processed'][ process_id] process_id = '_'.join([lid, rid, pid, 'questions']) if process_id not in self.status['processed']: tq = self._index_question_files( lid, rid, format, process_id, res_temp_dir) if tq: self._upload_all(tq) finished_processes[process_id] = tq.keys() cat_keys = cat_keys + tq.keys() else: cat_keys = cat_keys + self.status['processed'][ process_id] # TRICKY: update the finished processes once per format to limit db hits if finished_processes: self.status['processed'].update( finished_processes) self.status['timestamp'] = time.strftime( "%Y-%m-%dT%H:%M:%SZ") self.db_handler.update_item( { 'api_version': TsV2CatalogHandler.api_version 
}, self.status) if not rc_format: raise Exception( 'Could not find a format for {}_{}_{}'.format( lid, rid, pid)) modified = make_legacy_date(rc_format['modified']) rc_type = get_rc_type(rc_format) self.logger.debug( 'Resource container type is {}'.format(rc_type)) if modified is None: modified = time.strftime('%Y%m%d') self.logger.warning( 'Could not find date modified for {}_{}_{} from "{}"' .format(lid, rid, pid, rc_format['modified'])) if rc_type == 'book' or rc_type == 'bundle': self._build_catalog_node(cat_dict, lang, res, project, modified) else: # store supplementary resources for processing after catalog nodes have been fully built supplemental_resources.append({ 'language': lang, 'resource': res, 'project': project, 'modified': modified, 'rc_type': rc_type }) # cleanup resource directory remove_tree(res_temp_dir) # cleanup language directory remove_tree(os.path.join(self.temp_dir, lid)) # inject supplementary resources for s in supplemental_resources: self._add_supplement(cat_dict, s['language'], s['resource'], s['project'], s['modified'], s['rc_type']) api_uploads = [] # normalize catalog nodes root_cat = [] for pid in cat_dict: project = cat_dict[pid] lang_cat = [] for lid in project['_langs']: lang = project['_langs'][lid] res_cat = [] for rid in lang['_res']: res = lang['_res'][rid] # disable missing catalogs # disable tN if '_'.join([lid, '*', pid, 'tn']) not in cat_keys: res['notes'] = '' # disable tQ if '_'.join([lid, '*', pid, 'tq']) not in cat_keys: res['checking_questions'] = '' # disable tW if '_'.join([lid, '*', '*', 'tw']) not in cat_keys: res['terms'] = '' res_cat.append(res) api_uploads.append( prep_data_upload('{}/{}/resources.json'.format(pid, lid), res_cat, self.temp_dir)) del lang['_res'] if ('project' in lang): # skip empty artifacts lang_cat.append(lang) else: self.logger.warning( 'Excluding empty language artifact in {}'.format(pid)) api_uploads.append( prep_data_upload('{}/languages.json'.format(pid), lang_cat, self.temp_dir)) del project['_langs'] if len(lang_cat) != 0: root_cat.append(project) catalog_upload = prep_data_upload('catalog.json', root_cat, self.temp_dir) api_uploads.append(catalog_upload) # TRICKY: also upload to legacy path for backwards compatibility api_uploads.append({ 'key': '/ts/txt/2/catalog.json', 'path': catalog_upload['path'] }) # upload files for upload in api_uploads: if not upload['key'].startswith('/'): key = '{}/{}'.format(TsV2CatalogHandler.cdn_root_path, upload['key']) else: key = upload['key'].lstrip('/') self.cdn_handler.upload_file(upload['path'], key) self.status['state'] = 'complete' self.status['timestamp'] = time.strftime("%Y-%m-%dT%H:%M:%SZ") self.db_handler.update_item( {'api_version': TsV2CatalogHandler.api_version}, self.status) def _get_status(self): """ Retrieves the catalog status from AWS or generates a new status object :return: A tuple containing the status object of the target and source catalogs, or False if the source is not ready """ status_results = self.db_handler.query_items({ 'api_version': { 'condition': 'is_in', 'value': ['3', TsV2CatalogHandler.api_version] } }) source_status = None status = None for s in status_results: if s['api_version'] == '3': source_status = s elif s['api_version'] == TsV2CatalogHandler.api_version: status = s if not source_status: self.logger.warning('Source catalog status not found') return False if source_status['state'] != 'complete': self.logger.debug('Source catalog is not ready for use') return False if not status or status['source_timestamp'] != source_status[ 
'timestamp']: # begin or restart process status = { 'api_version': TsV2CatalogHandler.api_version, 'catalog_url': '{}/ts/txt/2/catalog.json'.format(self.cdn_url), 'source_api': source_status['api_version'], 'source_timestamp': source_status['timestamp'], 'state': 'in-progress', 'processed': {} } return (status, source_status) def _index_note_files(self, lid, rid, format, process_id, temp_dir): """ :param lid: :param rid: :param format: :return: a dictionary of notes to upload """ tn_uploads = {} format_str = format['format'] if (rid == 'obs-tn' or rid == 'tn') and 'type=help' in format_str: self.logger.debug('Processing {}'.format(process_id)) rc_dir = download_rc(lid, rid, format['url'], temp_dir, self.download_file) if not rc_dir: return {} tn_uploads = index_tn_rc(lid=lid, temp_dir=temp_dir, rc_dir=rc_dir) remove_tree(rc_dir, True) return tn_uploads def _index_question_files(self, lid, rid, format, process_id, temp_dir): question_re = re.compile('^#+([^#\n]+)#*([^#]*)', re.UNICODE | re.MULTILINE | re.DOTALL) tq_uploads = {} format_str = format['format'] if (rid == 'obs-tq' or rid == 'tq') and 'type=help' in format_str: self.logger.debug('Processing {}'.format(process_id)) rc_dir = download_rc(lid, rid, format['url'], temp_dir, self.download_file) if not rc_dir: return {} manifest = yaml.load( read_file(os.path.join(rc_dir, 'manifest.yaml'))) dc = manifest['dublin_core'] for project in manifest['projects']: pid = TsV2CatalogHandler.sanitize_identifier( project['identifier']) question_dir = os.path.normpath( os.path.join(rc_dir, project['path'])) question_json = [] if not os.path.isdir(question_dir): self.logger.warning( 'Missing directory at {}. Is the manifest out of date?' .format(question_dir)) continue chapters = os.listdir(question_dir) for chapter in chapters: if chapter in ['.', '..']: continue unique_questions = {} chapter_dir = os.path.join(question_dir, chapter) chunks = os.listdir(chapter_dir) for chunk in chunks: if chunk in ['.', '..']: continue chunk_file = os.path.join(chapter_dir, chunk) chunk = chunk.split('.')[0] chunk_body = read_file(chunk_file) for question in question_re.findall(chunk_body): hasher = hashlib.md5() hasher.update(question[1].strip().encode('utf-8')) question_hash = hasher.hexdigest() if question_hash not in unique_questions: # insert unique question unique_questions[question_hash] = { 'q': question[0].strip(), 'a': question[1].strip(), 'ref': [u'{}-{}'.format(chapter, chunk)] } else: # append new reference unique_questions[question_hash]['ref'].append( '{}-{}'.format(chapter, chunk)) question_array = [] for hash in unique_questions: question_array.append(unique_questions[hash]) if question_array: question_json.append({ 'id': chapter, 'cq': question_array }) if question_json: tq_key = '_'.join([lid, '*', pid, 'tq']) question_json.append( {'date_modified': dc['modified'].replace('-', '')}) upload = prep_data_upload( '{}/{}/questions.json'.format(pid, lid), question_json, temp_dir) tq_uploads[tq_key] = upload remove_tree(rc_dir, True) return tq_uploads def _index_words_files(self, lid, rid, format, process_id, temp_dir): """ Returns an array of markdown files found in a tW dictionary :param lid: :param rid: :param format: :return: """ word_title_re = re.compile('^#([^#\n]*)#*', re.UNICODE) h2_re = re.compile('^##([^#\n]*)#*', re.UNICODE) obs_example_re = re.compile('\_*\[([^\[\]]+)\]\(([^\(\)]+)\)_*(.*)', re.UNICODE | re.IGNORECASE) block_re = re.compile('^##', re.MULTILINE | re.UNICODE) word_links_re = re.compile( 
'\[([^\[\]]+)\]\(\.\.\/(kt|other)\/([^\(\)]+)\.md\)', re.UNICODE | re.IGNORECASE) ta_html_re = re.compile( '(<a\s+href="(:[a-z-_0-9]+:ta:vol\d:[a-z-\_]+:[a-z-\_]+)"\s*>([^<]+)<\/a>)', re.UNICODE | re.IGNORECASE) words = [] format_str = format['format'] if rid == 'tw' and 'type=dict' in format_str: self.logger.debug('Processing {}'.format(process_id)) rc_dir = download_rc(lid, rid, format['url'], temp_dir, self.download_file) if not rc_dir: return {} manifest = yaml.load( read_file(os.path.join(rc_dir, 'manifest.yaml'))) dc = manifest['dublin_core'] # TRICKY: there should only be one project for project in manifest['projects']: pid = TsV2CatalogHandler.sanitize_identifier( project['identifier']) content_dir = os.path.normpath( os.path.join(rc_dir, project['path'])) categories = os.listdir(content_dir) for cat in categories: if cat in ['.', '..']: continue cat_dir = os.path.join(content_dir, cat) if not os.path.isdir(cat_dir): continue word_files = os.listdir(cat_dir) for word in word_files: if word in ['.', '..', '.DS_Store']: continue word_path = os.path.join(cat_dir, word) word_id = word.split('.md')[0] try: word_content = read_file(word_path) except Exception as e: self.report_error( 'Failed to read file {}: {}'.format( word_path, e.message)) raise # TRICKY: the title is always at the top title_match = word_title_re.match(word_content) if title_match: title = title_match.group(1) else: self.report_error( 'missing title in {}'.format(word_path)) continue word_content = word_title_re.sub('', word_content).strip() # TRICKY: the definition title is always after the title def_title = '' def_title_match = h2_re.match(word_content) if def_title_match: def_title = def_title_match.group(1).strip() word_content = h2_re.sub('', word_content).strip() else: self.report_error( 'missing definition title in {}'.format( word_path)) # find obs examples blocks = block_re.split(word_content) cleaned_blocks = [] examples = [] for block in blocks: if 'examples from the bible stories' in block.lower( ): for link in obs_example_re.findall(block): if 'obs' not in link[1]: self.logger.error( 'non-obs link found in passage examples: {}' .format(link[1])) else: examples.append({ 'ref': link[0].replace(':', '-'), 'text': markdown.markdown(link[2].strip()) }) else: cleaned_blocks.append(block) word_content = '##'.join(cleaned_blocks) # find all tW links and use them in related words related_words = [ w[2] for w in word_links_re.findall(word_content) ] # convert links to legacy form. TODO: we should convert links after converting to html so we don't have to do it twice. word_content = convert_rc_links(word_content) word_content = markdown.markdown(word_content) # convert html links back to dokuwiki links # TRICKY: we converted the ta urls, but now we need to format them as dokuwiki links # e.g. 
[[en:ta:vol1:translate:translate_unknown | How to Translate Unknowns]] for ta_link in ta_html_re.findall(word_content): new_link = u'[[{} | {}]]'.format( ta_link[1], ta_link[2]) word_content = word_content.replace( ta_link[0], new_link) words.append({ 'aliases': [ a.strip() for a in title.split(',') if a.strip() != word_id and a.strip() != title.strip() ], 'cf': related_words, 'def': word_content, 'def_title': def_title.rstrip(':'), 'ex': examples, 'id': word_id, 'sub': '', 'term': title.strip() }) remove_tree(rc_dir, True) if words: words.append({ 'date_modified': dc['modified'].replace('-', '').split('T')[0] }) upload = prep_data_upload('bible/{}/words.json'.format(lid), words, temp_dir) return {'_'.join([lid, '*', '*', 'tw']): upload} return {} def _process_usfm(self, lid, rid, resource, format, temp_dir): """ Converts a USFM bundle into usx, loads the data into json and uploads it. Returns an array of usx file paths. :param lid: :param rid: :param format: :return: an array of json blobs """ format_str = format['format'] if 'application/zip' in format_str and 'usfm' in format_str: self.logger.debug('Downloading {}'.format(format['url'])) rc_dir = download_rc(lid, rid, format['url'], temp_dir, self.download_file) if not rc_dir: return manifest = yaml.load( read_file(os.path.join(rc_dir, 'manifest.yaml'))) usx_dir = os.path.join(rc_dir, 'usx') for project in manifest['projects']: pid = TsV2CatalogHandler.sanitize_identifier( project['identifier']) # pid is project identifier, lid is language id, rid is resourceid process_id = '_'.join([lid, rid, pid]) if process_id not in self.status['processed']: self.logger.debug( 'Processing usfm for {}'.format(process_id)) # copy usfm project file usfm_dir = os.path.join(temp_dir, '{}_usfm'.format(process_id)) if not os.path.exists(usfm_dir): os.makedirs(usfm_dir) usfm_dest_file = os.path.normpath( os.path.join(usfm_dir, project['path'])) usfm_src_file = os.path.normpath( os.path.join(rc_dir, project['path'])) if os.path.getsize(usfm_src_file) < self.max_usfm_size: shutil.copyfile(usfm_src_file, usfm_dest_file) # transform usfm to usx build_usx(usfm_dir, usx_dir, self.logger) # convert USX to JSON path = os.path.normpath( os.path.join(usx_dir, '{}.usx'.format(pid.upper()))) source = build_json_source_from_usx( path, format['modified'], self) upload = prep_data_upload( '{}/{}/{}/v{}/source.json'.format( pid, lid, rid, resource['version']), source['source'], temp_dir) self.logger.debug('Uploading {}/{}/{}'.format( self.cdn_bucket, TsV2CatalogHandler.cdn_root_path, upload['key'])) self.cdn_handler.upload_file( upload['path'], '{}/{}'.format(TsV2CatalogHandler.cdn_root_path, upload['key'])) self.status['processed'][process_id] = [] else: self.logger.warn( "Skipping {} because it is too big".format( process_id)) self.status['processed'][process_id] = ['skipped'] self.status['timestamp'] = time.strftime( "%Y-%m-%dT%H:%M:%SZ") self.db_handler.update_item( {'api_version': TsV2CatalogHandler.api_version}, self.status) else: self.logger.debug( 'USFM for {} has already been processed'.format( process_id)) # clean up download remove_tree(rc_dir, True) def _upload_all(self, uploads): """ Uploads an array or object of uploads :param uploads: :return: """ for upload in uploads: if isinstance(upload, dict): self._upload(upload) elif upload in uploads and isinstance(uploads[upload], dict): self._upload(uploads[upload]) else: raise Exception('invalid upload object') def _upload(self, upload): """ Uploads an upload :param upload: :return: """ path = upload['path'] key = 
'{}/{}'.format(TsV2CatalogHandler.cdn_root_path, upload['key']) self.logger.debug('Uploading {}/{}'.format(path, key)) self.cdn_handler.upload_file(path, key) def _add_supplement(self, catalog, language, resource, project, modified, rc_type): """ Adds supplementary helps to the catalog nodes :param catalog: :param language: :param resource: :param project: :param modified: :param rc_type: :return: """ lid = TsV2CatalogHandler.sanitize_identifier(language['identifier'], lower=False) if rc_type == 'help': pid = TsV2CatalogHandler.sanitize_identifier(project['identifier']) # tricky some languages may only have supplementary resources and no books # so no catalog node will have been built. Therefore we init them here. TsV2CatalogHandler._init_catalog_node(catalog, pid, lid) for rid in catalog[pid]['_langs'][lid]['_res']: res = catalog[pid]['_langs'][lid]['_res'][rid] if 'tn' in TsV2CatalogHandler.sanitize_identifier( resource['identifier']): res.update({ 'notes': '{}/{}/{}/{}/notes.json?date_modified={}'.format( self.cdn_url, TsV2CatalogHandler.cdn_root_path, pid, lid, modified) }) elif 'tq' in self.sanitize_identifier(resource['identifier']): res.update({ 'checking_questions': '{}/{}/{}/{}/questions.json?date_modified={}'.format( self.cdn_url, TsV2CatalogHandler.cdn_root_path, pid, lid, modified) }) elif rc_type == 'dict': for pid in catalog: # tricky some languages may only have supplementary resources and no books # so no catalog node will have been built. Therefore we init them here. TsV2CatalogHandler._init_catalog_node(catalog, pid, lid) for rid in catalog[pid]['_langs'][lid]['_res']: res = catalog[pid]['_langs'][lid]['_res'][rid] # TRICKY: obs and Bible now use the same words res.update({ 'terms': '{}/{}/bible/{}/words.json?date_modified={}'.format( self.cdn_url, TsV2CatalogHandler.cdn_root_path, lid, modified) }) @staticmethod def _init_catalog_node(catalog, pid, lid=None, rid=None): """ Initializes a node in the catalog. 
:param catalog: the v2 catalog dictionary :param pid: the project id to include in the catalog :param lid: the language id to include in the catalog :param rid: the resource id to include in the catalog :return: """ if pid not in catalog: catalog[pid] = {'_langs': {}} if lid is not None: if lid not in catalog[pid]['_langs']: catalog[pid]['_langs'][lid] = {'_res': {}, 'language': {}} if lid is not None and rid is not None: if rid not in catalog[pid]['_langs'][lid]['_res']: catalog[pid]['_langs'][lid]['_res'][rid] = {} def _build_catalog_node(self, catalog, language, resource, project, modified): """ Creates/updates a node in the catalog :param catalog: the v2 catalog dictionary :param language: the v3 language catalog object :param resource: the v3 resource catalog object :param project: the v3 project catalog object :param modified: :return: """ lid = TsV2CatalogHandler.sanitize_identifier(language['identifier'], lower=False) rid = TsV2CatalogHandler.sanitize_identifier(resource['identifier']) pid = TsV2CatalogHandler.sanitize_identifier(project['identifier']) # TRICKY: v2 api sorted obs with 1 if pid == 'obs': project['sort'] = 1 TsV2CatalogHandler._init_catalog_node(catalog, pid, lid, rid) # TRICKY: we must process the modified date in the order of resource, language, project to propagate dates correctly # resource res = catalog[pid]['_langs'][lid]['_res'][rid] r_modified = max_modified_date( res, modified) # TRICKY: dates bubble up from project comments = '' # TRICKY: comments are not officially supported in RCs but we use them if available if 'comment' in resource: comments = resource['comment'] # add chunks to non-obs projects chunks_url = '' if rid != 'obs': chunks_url = 'https://api.unfoldingword.org/bible/txt/1/{}/chunks.json'.format( pid) # if not self.url_exists(chunks_url) and 'chunks_url' in project: # Use the v3 api chunks url if the legacy version cannot be found # chunks_url = project['chunks_url'] source_url = '{}/{}/{}/{}/{}/v{}/source.json?date_modified={}'.format( self.cdn_url, TsV2CatalogHandler.cdn_root_path, pid, lid, rid, resource['version'], r_modified) source_text = '' source_text_version = '' if resource['source']: # TRICKY: some resources don't have a source source_text = resource['source'][0]['language'] source_text_version = resource['source'][0]['version'] else: self.report_error('Missing source translation in {} {}'.format( lid, rid)) res.update({ 'date_modified': r_modified, 'name': resource['title'], 'notes': '', 'slug': rid, 'status': { 'checking_entity': ', '.join(resource['checking']['checking_entity']), 'checking_level': resource['checking']['checking_level'], 'comments': comments, 'contributors': '; '.join(resource['contributor']), 'publish_date': resource['issued'], 'source_text': source_text, # v2 can only handle one source 'source_text_version': source_text_version, # v2 can only handle one source 'version': resource['version'] }, 'checking_questions': '', 'chunks': chunks_url, 'source': source_url, 'terms': '', 'tw_cat': '' }) res.update({ 'tw_cat': '{}/{}/{}/{}/tw_cat.json?date_modified={}'.format( self.cdn_url, TsV2CatalogHandler.cdn_root_path, pid, lid, r_modified) }) # bible projects have usfm if pid != 'obs': if 'formats' in project: for format in project['formats']: if 'text/usfm' == format['format']: res.update({ 'usfm': '{}?date_modified={}'.format( format['url'], r_modified) }) break # language lang = catalog[pid]['_langs'][lid] l_modified = max_modified_date( lang['language'], r_modified) # TRICKY: dates bubble up from resource 
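        # A small worked example of the date bubbling (hypothetical legacy
        # dates; max_modified_date is assumed to keep the later of the two):
        #   resource entry had 20180401, commit modified 20180501 -> r_modified = 20180501
        #   language entry had 20180301                           -> l_modified = 20180501
        #   project entry had  20180601 (see below)               -> p_modified = 20180601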
description = '' if rid == 'obs': description = resource['description'] project_meta = list(project['categories']) # default to category ids if 'category_labels' in language: project_meta = [] for cat_id in project['categories']: if cat_id in language['category_labels']: project_meta.append(language['category_labels'][cat_id]) else: project_meta.append(cat_id) cat_lang = { 'language': { 'date_modified': l_modified, 'direction': language['direction'], 'name': language['title'], 'slug': lid }, 'project': { 'desc': description, 'meta': project_meta, 'name': project['title'] }, 'res_catalog': '{}/{}/{}/{}/resources.json?date_modified={}'.format( self.cdn_url, TsV2CatalogHandler.cdn_root_path, pid, lid, l_modified) } if 'ulb' == rid or 'udb' == rid: cat_lang['project']['sort'] = '{}'.format(project['sort']) lang.update(cat_lang) # project p_modified = max_modified_date(catalog[pid], l_modified) catalog[pid].update({ 'date_modified': p_modified, 'lang_catalog': '{}/{}/{}/languages.json?date_modified={}'.format( self.cdn_url, TsV2CatalogHandler.cdn_root_path, pid, p_modified), 'meta': project['categories'], 'slug': pid, 'sort': '{}'.format(project['sort']).zfill(2) })
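
# A minimal doctest-style sketch (not part of the original module) of the
# nested node structure `_init_catalog_node` creates, with illustrative ids
# (dict key order may vary under Python 2):
#
#   >>> catalog = {}
#   >>> TsV2CatalogHandler._init_catalog_node(catalog, 'gen', 'en', 'ulb')
#   >>> catalog
#   {'gen': {'_langs': {'en': {'_res': {'ulb': {}}, 'language': {}}}}}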