Example #1
    def assertElasticsearchResultsEqual(self, first, second):
        """
        The ordering of list items in our Elasticsearch responses typically doesn't matter.
        The comparison done by this method is insensitive to ordering differences in lists.

        For details, see the docstrings of sort_frozen() and freeze().
        """
        self.assertEqual(sort_frozen(freeze(first)), sort_frozen(freeze(second)))
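A minimal usage sketch of the order-insensitive comparison; the import path for freeze() and sort_frozen() is an assumption, and the values are hypothetical:

# Sketch only; the import path is an assumption.
from azul.json_freeze import freeze, sort_frozen

a = {'hits': [1, 2, 3]}
b = {'hits': [3, 1, 2]}

assert a != b  # plain equality is sensitive to list ordering
assert sort_frozen(freeze(a)) == sort_frozen(freeze(b))  # ordering ignored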
Example #2
def manage_subscriptions(plugin: RepositoryPlugin,
                         dss_client: DSSClient,
                         subscribe=True):
    response = dss_client.get_subscriptions(replica='aws',
                                            subscription_type='elasticsearch')
    current_subscriptions = freeze(response['subscriptions'])

    key, key_id = deployment.aws.get_hmac_key_and_id()

    if subscribe:
        base_url = config.indexer_endpoint()
        prefix = config.dss_query_prefix
        new_subscriptions = [
            freeze(dict(replica='aws',
                        es_query=query(prefix),
                        callback_url=furl(url=base_url,
                                          path=(config.default_catalog, action)),
                        hmac_key_id=key_id))
            for query, action in [
                (plugin.dss_subscription_query, 'add'),
                (plugin.dss_deletion_subscription_query, 'delete')
            ]
        ]
    else:
        new_subscriptions = []

    for subscription in current_subscriptions:
        # Note the use of <= to allow for the fact that DSS returns
        # subscriptions with additional attributes beyond those originally
        # supplied. If the subscription returned by DSS is a superset of the
        # subscription we want to create, we can skip the update.
        matching_subscription = next(
            (new_subscription for new_subscription in new_subscriptions
             if new_subscription.items() <= subscription.items()), None)
        if matching_subscription:
            logger.info('Already subscribed: %r', thaw(subscription))
            new_subscriptions.remove(matching_subscription)
        else:
            subscription = thaw(subscription)
            logger.info('Removing stale subscription: %r', subscription)
            dss_client.delete_subscription(uuid=subscription['uuid'],
                                           replica=subscription['replica'],
                                           subscription_type='elasticsearch')

    for subscription in new_subscriptions:
        subscription = thaw(subscription)
        response = dss_client.put_subscription(**subscription,
                                               hmac_secret_key=key)
        subscription['uuid'] = response['uuid']
        logger.info('Registered subscription %r.', subscription)
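The items() comparison above works because dictionary view objects are set-like: <= tests whether one dict's items are a subset of another's, which requires hashable values and is one reason the subscriptions are frozen first. A standalone sketch with hypothetical data:

# Hypothetical subscription data illustrating the subset test.
wanted = {'replica': 'aws', 'callback_url': 'https://indexer.example/add'}
stored = {'replica': 'aws', 'callback_url': 'https://indexer.example/add',
          'uuid': 'deadbeef', 'owner': 'someone'}  # extra attributes from DSS

assert wanted.items() <= stored.items()  # stored is a superset, skip update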
Example #3
    def _derive_manifest_key(self,
                             format_: ManifestFormat,
                             catalog: CatalogName,
                             filters: Filters,
                             content_hash: int
                             ) -> str:
        """
        Return a manifest key deterministically derived from the arguments and
        the current commit hash. The same arguments will always produce the same
        return value in one revision of this code. Different arguments should,
        with a very high probability, produce different return values.
        """
        git_commit = config.lambda_git_status['commit']
        manifest_namespace = uuid.UUID('ca1df635-b42c-4671-9322-b0a7209f0235')
        filter_string = repr(sort_frozen(freeze(filters)))
        content_hash = str(content_hash)
        disable_multipart = str(config.disable_multipart_manifests)
        manifest_key_params = (
            git_commit,
            catalog,
            format_.value,
            content_hash,
            disable_multipart,
            filter_string
        )
        assert not any(',' in param for param in manifest_key_params[:-1])
        return str(uuid.uuid5(manifest_namespace, ','.join(manifest_key_params)))
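A sketch of the derivation with hypothetical parameter values, showing why the key is deterministic and why only the last parameter may contain commas:

import uuid

# Hypothetical parameters; the namespace UUID is copied from above.
namespace = uuid.UUID('ca1df635-b42c-4671-9322-b0a7209f0235')
params = ('1c0f0d2', 'dcp2', 'compact', '12345', 'False', "{'organ': 'brain'}")

key1 = str(uuid.uuid5(namespace, ','.join(params)))
key2 = str(uuid.uuid5(namespace, ','.join(params)))
assert key1 == key2  # uuid5 is a pure function of namespace and name

# A comma in any parameter but the last would make the join ambiguous:
# ('a,b', 'c') and ('a', 'b,c') would both serialize to 'a,b,c'.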
Example #4
def threaded_manifests():
    def full_run(filters):
        url = request_manifest(thaw(filters))
        resp = requests.get(url)
        return resp.status_code

    filterss = set(freeze(generate_filters()) for _ in range(140))
    with ThreadPoolExecutor(10) as main_tpe:
        return set(main_tpe.map(full_run, filterss))
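freeze() matters here because plain dicts are unhashable and cannot be put into a set; freezing the generated filters lets the set collapse duplicates before the manifests are requested. A sketch assuming freeze() returns hashable, immutable values:

filters = [{'organ': 'brain'}, {'organ': 'brain'}, {'organ': 'heart'}]

try:
    set(filters)
except TypeError as e:
    print(e)  # unhashable type: 'dict'

unique = set(freeze(f) for f in filters)
assert len(unique) == 2  # duplicate filters collapse into one entry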
Example #5
class Config:
    """
    See `environment` for documentation of these settings.
    """
    @property
    def environ(self):
        return ChainMap(os.environ, self._outsourced_environ)

    @property
    def owner(self):
        return self.environ['AZUL_OWNER']

    def _boolean(self, value: str) -> bool:
        if value == "0":
            return False
        elif value == "1":
            return True
        else:
            raise ValueError('Expected "0" or "1"', value)

    @property
    def debug(self) -> int:
        debug = int(self.environ['AZUL_DEBUG'])
        self._validate_debug(debug)
        return debug

    @debug.setter
    def debug(self, debug: int):
        self._validate_debug(debug)
        self.environ['AZUL_DEBUG'] = str(debug)

    def _validate_debug(self, debug):
        require(debug in (0, 1, 2), "AZUL_DEBUG must be either 0, 1 or 2")

    _es_endpoint_env_name = 'AZUL_ES_ENDPOINT'

    @property
    def es_endpoint(self) -> Optional[Netloc]:
        try:
            es_endpoint = self.environ[self._es_endpoint_env_name]
        except KeyError:
            return None
        else:
            host, _, port = es_endpoint.partition(':')
            return host, int(port)

    def es_endpoint_env(
            self, es_endpoint: Union[Netloc, str],
            es_instance_count: Union[int, str]) -> Mapping[str, str]:
        if isinstance(es_endpoint, tuple):
            host, port = es_endpoint
            assert isinstance(host, str), host
            assert isinstance(port, int), port
            es_endpoint = f'{host}:{port}'
        elif isinstance(es_endpoint, str):
            pass
        else:
            assert False, es_endpoint
        return {
            self._es_endpoint_env_name: es_endpoint,
            self._es_instance_count_env_name: str(es_instance_count)
        }

    @property
    def aws_account_id(self) -> str:
        return self.environ['AZUL_AWS_ACCOUNT_ID']

    @property
    def project_root(self) -> str:
        return self.environ['project_root']

    @property
    def es_domain(self) -> str:
        return self.environ['AZUL_ES_DOMAIN']

    @property
    def share_es_domain(self) -> bool:
        return self._boolean(self.environ['AZUL_SHARE_ES_DOMAIN'])

    @property
    def s3_bucket(self) -> str:
        return self.environ['AZUL_S3_BUCKET']

    @property
    def manifest_expiration(self) -> int:
        """
        Number of days before a manifest will be deleted from the storage bucket
        """
        return 7

    @property
    def manifest_expiration_margin(self) -> float:
        """
        Minimum duration (in seconds) before a manifest in the storage bucket
        is considered too close to expiration for use
        """
        return 60 * 15

    @property
    def url_redirect_full_domain_name(self) -> str:
        return self.environ['AZUL_URL_REDIRECT_FULL_DOMAIN_NAME']

    @property
    def url_redirect_base_domain_name(self) -> str:
        return self.environ['AZUL_URL_REDIRECT_BASE_DOMAIN_NAME']

    @property
    def es_timeout(self) -> int:
        return int(self.environ['AZUL_ES_TIMEOUT'])

    @property
    def data_browser_domain(self):
        dcp_domain = 'data.humancellatlas.org'
        if self.deployment_stage == 'prod':
            return dcp_domain
        elif self.deployment_stage in ('integration', 'staging'):
            return f'{self.deployment_stage}.{dcp_domain}'
        else:
            return f'dev.{dcp_domain}'

    @property
    def data_browser_name(self):
        return f'{self.resource_prefix}-data-browser-{self.deployment_stage}'

    @property
    def data_portal_name(self):
        return f'{self.resource_prefix}-data-portal-{self.deployment_stage}'

    @property
    def dss_endpoint(self) -> Optional[str]:
        if self.dss_source is None:
            return None
        else:
            from azul.indexer import SimpleSourceSpec
            return SimpleSourceSpec.parse(self.dss_source).name

    @property
    def dss_source(self) -> Optional[str]:
        return self.environ.get('AZUL_DSS_SOURCE')

    def sources(self, catalog: CatalogName) -> AbstractSet[str]:
        return config.catalogs[catalog].sources

    @property
    def tdr_allowed_source_locations(self) -> AbstractSet[str]:
        # FIXME: Eliminate local import
        #        https://github.com/DataBiosphere/azul/issues/3133
        import json
        return frozenset(
            json.loads(self.environ['AZUL_TDR_ALLOWED_SOURCE_LOCATIONS']))

    @property
    def tdr_source_location(self) -> str:
        location = self.environ['AZUL_TDR_SOURCE_LOCATION']
        allowed_locations = self.tdr_allowed_source_locations
        require(location in allowed_locations,
                f'{location!r} is not one of {allowed_locations!r}')
        return location

    @property
    def tdr_service_url(self) -> str:
        return self.environ['AZUL_TDR_SERVICE_URL']

    @property
    def sam_service_url(self):
        return self.environ['AZUL_SAM_SERVICE_URL']

    @property
    def dss_query_prefix(self) -> str:
        return self.environ.get('AZUL_DSS_QUERY_PREFIX', '')

    @property
    def partition_prefix_length(self) -> int:
        return int(self.environ['AZUL_PARTITION_PREFIX_LENGTH'])

    # Remove once https://github.com/HumanCellAtlas/data-store/issues/1837 is resolved

    def dss_deployment_stage(self, dss_endpoint: str) -> str:
        """
        >>> config.dss_deployment_stage('https://dss.staging.data.humancellatlas.org/v1')
        'staging'
        >>> config.dss_deployment_stage('https://dss.data.humancellatlas.org/v1')
        'prod'
        """
        from urllib.parse import urlparse
        user, _, domain = urlparse(dss_endpoint).netloc.rpartition('@')
        domain = domain.split('.')
        require(domain[-3:] == ['data', 'humancellatlas', 'org'])
        require(domain[0] == 'dss')
        stage = domain[1:-3]
        assert len(stage) < 2
        return 'prod' if stage == [] else stage[0]

    @property
    def dss_direct_access(self) -> bool:
        return self._boolean(self.environ['AZUL_DSS_DIRECT_ACCESS'])

    def dss_direct_access_role(self,
                               lambda_name: str,
                               stage: Optional[str] = None) -> Optional[str]:
        key = 'AZUL_DSS_DIRECT_ACCESS_ROLE'
        try:
            role_arn = self.environ[key]
        except KeyError:
            return None
        else:
            arn, partition, service, region, account_id, resource = \
                role_arn.split(':')
            require(arn == 'arn')
            require(partition == 'aws')
            require(service == 'iam')
            require(region == '')
            require(account_id)
            resource_type, resource_id = resource.split('/')
            require(resource_type == 'role')
            try:
                lambda_name_template, default_stage = self.unqualified_resource_name(
                    resource_id)
                require(lambda_name_template == '*')
                if stage is None:
                    stage = default_stage
                role_name = self.qualified_resource_name(lambda_name,
                                                         stage=stage)
                return f'arn:aws:iam::{account_id}:role/{role_name}'
            except RequirementError:
                # If we fail to parse the role name, we can't parameterize it
                # and must return the ARN verbatim.
                return role_arn

    @property
    def num_dss_workers(self) -> int:
        return int(self.environ['AZUL_DSS_WORKERS'])

    @property
    def num_tdr_workers(self) -> int:
        return int(self.environ['AZUL_TDR_WORKERS'])

    @property
    def external_lambda_role_assumptors(
            self) -> MutableMapping[str, List[str]]:
        try:
            accounts = self.environ['AZUL_EXTERNAL_LAMBDA_ROLE_ASSUMPTORS']
        except KeyError:
            return {}
        else:
            return self._parse_principals(accounts)

    def _parse_principals(self, accounts) -> MutableMapping[str, List[str]]:
        # noinspection PyProtectedMember
        """
        >>> from azul import config  # Without this import, these doctests fail
        ...                          # in Pycharm since the fully qualified
        ...                          # class name of the exception would be
        ...                          # azul.RequirementError

        >>> config._parse_principals('123,foo*')
        {'123': ['foo*']}

        >>> config._parse_principals('123, foo*: 456,bar ,fubaz')
        {'123': ['foo*'], '456': ['bar', 'fubaz']}

        >>> config._parse_principals('')
        Traceback (most recent call last):
        ...
        azul.RequirementError: ('An account ID and at least one role must be specified', '')

        >>> config._parse_principals(' ')
        Traceback (most recent call last):
        ...
        azul.RequirementError: ('An account ID and at least one role must be specified', ' ')

        >>> config._parse_principals(':')
        Traceback (most recent call last):
        ...
        azul.RequirementError: ('An account ID and at least one role must be specified', '')

        >>> config._parse_principals(',')
        Traceback (most recent call last):
        ...
        azul.RequirementError: ('An account ID and at least one role must be specified', ',')

        >>> config._parse_principals(',:')
        Traceback (most recent call last):
        ...
        azul.RequirementError: ('An account ID and at least one role must be specified', ',')

        >>> config._parse_principals('123')
        Traceback (most recent call last):
        ...
        azul.RequirementError: ('An account ID and at least one role must be specified', '123')

        >>> config._parse_principals('123:')
        Traceback (most recent call last):
        ...
        azul.RequirementError: ('An account ID and at least one role must be specified', '123')

        >>> config._parse_principals('123 ,:')
        Traceback (most recent call last):
        ...
        azul.RequirementError: ('An account ID and at least one role must be specified', '123 ,')
        """
        result = {}
        for account in accounts.split(':'):
            account_id, *roles = map(str.strip, account.split(','))
            require(account_id and roles and all(roles),
                    'An account ID and at least one role must be specified',
                    account)
            result[account_id] = roles
        return result

    @property
    def resource_prefix(self):
        prefix = self.environ['AZUL_RESOURCE_PREFIX']
        self.validate_prefix(prefix)
        return prefix

    def qualified_resource_name(self, resource_name, suffix='', stage=None):
        self._validate_term(resource_name)
        if stage is None:
            stage = self.deployment_stage
        return f"{self.resource_prefix}-{resource_name}-{stage}{suffix}"

    def unqualified_resource_name(self,
                                  qualified_resource_name: str,
                                  suffix: str = '') -> Tuple[str, str]:
        """
        >>> config.unqualified_resource_name('azul-foo-dev')
        ('foo', 'dev')

        >>> config.unqualified_resource_name('azul-foo')
        Traceback (most recent call last):
        ...
        azul.RequirementError: ['azul', 'foo']

        >>> config.unqualified_resource_name('azul-object_versions-dev')
        ('object_versions', 'dev')

        >>> config.unqualified_resource_name('azul-object-versions-dev')
        Traceback (most recent call last):
        ...
        azul.RequirementError: ['azul', 'object', 'versions', 'dev']
        """
        require(qualified_resource_name.endswith(suffix))
        if suffix:
            qualified_resource_name = qualified_resource_name[:-len(suffix)]
        # FIXME: Eliminate hardcoded separator
        #        https://github.com/databiosphere/azul/issues/2964
        components = qualified_resource_name.split('-')
        require(len(components) == 3, components)
        prefix, resource_name, deployment_stage = components
        require(prefix == self.resource_prefix)
        return resource_name, deployment_stage

    def subdomain(self, lambda_name):
        return self.environ['AZUL_SUBDOMAIN_TEMPLATE'].replace(
            '*', lambda_name)

    def api_lambda_domain(self, lambda_name: str) -> str:
        return self.subdomain(lambda_name) + "." + self.domain_name

    @property
    def drs_domain(self):
        return self.environ['AZUL_DRS_DOMAIN_NAME']

    def api_lambda_domain_aliases(self, lambda_name):
        """
        Additional alias domain names for the given API lambda
        """
        if lambda_name == 'service' and self.drs_domain:
            return [self.drs_domain]
        else:
            return []

    def lambda_endpoint(self, lambda_name: str) -> str:
        return "https://" + self.api_lambda_domain(lambda_name)

    def indexer_endpoint(self) -> str:
        return self.lambda_endpoint('indexer')

    def service_endpoint(self) -> str:
        return self.lambda_endpoint('service')

    def drs_endpoint(self):
        if self.drs_domain:
            return "https://" + self.drs_domain
        else:
            return self.service_endpoint()

    def lambda_names(self) -> List[str]:
        return ['indexer', 'service']

    @property
    def indexer_name(self) -> str:
        return self.indexer_function_name()

    @property
    def service_name(self) -> str:
        return self.service_function_name()

    def indexer_function_name(self, handler_name: Optional[str] = None):
        return self._function_name('indexer', handler_name)

    def service_function_name(self, handler_name: Optional[str] = None):
        return self._function_name('service', handler_name)

    def _function_name(self, lambda_name: str, handler_name: Optional[str]):
        if handler_name is None:
            return self.qualified_resource_name(lambda_name)
        else:
            # FIXME: Eliminate hardcoded separator
            #        https://github.com/databiosphere/azul/issues/2964
            return self.qualified_resource_name(lambda_name,
                                                suffix='-' + handler_name)

    deployment_name_re = re.compile(r'[a-z][a-z0-9]{1,16}')

    @classmethod
    def validate_prefix(cls, prefix):
        reject(
            cls.deployment_name_re.fullmatch(prefix) is None,
            f"Prefix '{prefix}' is to short, too long or contains invalid characters."
        )

    @classmethod
    def validate_deployment_name(cls, deployment_name):
        reject(
            cls.deployment_name_re.fullmatch(deployment_name) is None,
            f"Deployment name '{deployment_name}' is to short, too long or contains invalid characters."
        )

    @property
    def deployment_stage(self) -> str:
        deployment_name = self.environ['AZUL_DEPLOYMENT_STAGE']
        self.validate_deployment_name(deployment_name)
        return deployment_name

    @property
    def region(self) -> str:
        return self.environ['AWS_DEFAULT_REGION']

    @property
    def terraform_backend_bucket(self) -> str:
        return self.versioned_bucket

    @property
    def versioned_bucket(self):
        return self.environ['AZUL_VERSIONED_BUCKET']

    @property
    def enable_monitoring(self) -> bool:
        return self._boolean(self.environ['AZUL_ENABLE_MONITORING'])

    @property
    def disable_monitoring(self) -> bool:
        return not self.enable_monitoring

    @property
    def es_instance_type(self) -> str:
        return self.environ['AZUL_ES_INSTANCE_TYPE']

    _es_instance_count_env_name = 'AZUL_ES_INSTANCE_COUNT'

    @property
    def es_instance_count(self) -> int:
        return int(self.environ[self._es_instance_count_env_name])

    @property
    def es_volume_size(self) -> int:
        return int(self.environ['AZUL_ES_VOLUME_SIZE'])

    @property
    def _index_prefix(self) -> str:
        return self._term_from_env('AZUL_INDEX_PREFIX')

    # Because this property is relatively expensive to produce and frequently
    # used we are applying aggressive caching here, knowing very well that
    # this eliminates the option to reconfigure the running process by
    # manipulating os.environ['AZUL_CATALOGS'].
    #
    # It also means that mocking/patching would need to be done on this property
    # and that the mocked property would be inconsistent with the environment
    # variable. We feel that the performance gain is worth these concessions.

    @attr.s(frozen=True, kw_only=True, auto_attribs=True)
    class Catalog:
        """
        >>> plugins = dict(metadata=dict(name='hca'), repository=dict(name='tdr'))
        >>> kwargs = dict(atlas='', plugins=plugins, sources='')
        >>> c = Config.Catalog

        >>> c(name='dcp', internal=False, **kwargs) # doctest: +NORMALIZE_WHITESPACE
        Config.Catalog(name='dcp',
                       atlas='',
                       internal=False,
                       plugins={'metadata': {'name': 'hca'},
                                'repository': {'name': 'tdr'}},
                       sources='')

        >>> c(name='dcp-it', internal=True, **kwargs).is_integration_test_catalog
        True

        >>> c(name='foo-bar', internal=False, **kwargs).name
        'foo-bar'

        >>> c(name='foo-bar-it', internal=True, **kwargs).name
        'foo-bar-it'

        >>> c(name='a' * 61 + '-it', internal=True, **kwargs).is_integration_test_catalog
        True

        >>> c(name='a' * 62 + '-it', internal=True, **kwargs) # doctest: +NORMALIZE_WHITESPACE
        Traceback (most recent call last):
        ...
        azul.RequirementError: ('Catalog name is invalid',
                                'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa-it')
        """
        @attr.s(frozen=True, kw_only=True, auto_attribs=True)
        class Plugin:
            name: str

        name: str
        atlas: str
        internal: bool
        plugins: Mapping[str, Plugin]
        sources: AbstractSet[str]

        _catalog_re = r'([a-z][a-z0-9]*(-[a-z0-9]+)*)'
        _catalog_re = r'(?=.{1,64}$)' + _catalog_re
        _it_catalog_suffix: ClassVar[str] = '-it'
        _it_catalog_re: ClassVar[re.Pattern] = re.compile(
            _catalog_re + rf'(?<={re.escape(_it_catalog_suffix)})')
        _catalog_re: ClassVar[re.Pattern] = re.compile(_catalog_re)

        def __attrs_post_init__(self):
            self.validate_name(self.name)
            # Import locally to avoid cyclical import
            from azul.plugins import Plugin
            all_types = set(p.type_name() for p in Plugin.types())
            configured_types = self.plugins.keys()
            require(all_types == configured_types,
                    'Missing or extra plugin types',
                    all_types.symmetric_difference(configured_types))
            if self.internal:
                assert self.is_integration_test_catalog is True, self

        @cached_property
        def is_integration_test_catalog(self) -> bool:
            if self._it_catalog_re.match(self.name) is None:
                return False
            else:
                require(self.internal, 'IT catalogs must be internal', self)
                return True

        @cached_property
        def it_catalog(self) -> CatalogName:
            if self.is_integration_test_catalog:
                return self.name
            else:
                name = self.name + self._it_catalog_suffix
                assert self._it_catalog_re.match(name), name
                return name

        @classmethod
        def from_json(cls, name: str, spec: JSON) -> 'Config.Catalog':
            plugins = {
                plugin_type: cls.Plugin(**plugin_spec)
                for plugin_type, plugin_spec in spec['plugins'].items()
            }
            return cls(name=name,
                       atlas=spec['atlas'],
                       internal=spec['internal'],
                       plugins=plugins,
                       sources=set(spec['sources']))

        @classmethod
        def validate_name(cls, catalog, **kwargs):
            reject(
                cls._catalog_re.fullmatch(catalog) is None,
                'Catalog name is invalid', catalog, **kwargs)

    @cached_property
    def catalogs(self) -> Mapping[CatalogName, Catalog]:
        """
        A mapping from catalog name to a mapping from plugin type to plugin
        package name.
        """
        # FIXME: Eliminate local import
        #        https://github.com/DataBiosphere/azul/issues/3133
        import json
        catalogs = json.loads(self.environ['AZUL_CATALOGS'])
        require(bool(catalogs), 'No catalogs configured')
        return {
            name: self.Catalog.from_json(name, catalog)
            for name, catalog in catalogs.items()
        }

    @property
    def default_catalog(self) -> CatalogName:
        return first(self.catalogs)

    def it_catalog_for(self, catalog: CatalogName) -> Optional[CatalogName]:
        it_catalog = self.catalogs[catalog].it_catalog
        assert it_catalog in self.integration_test_catalogs, it_catalog
        return it_catalog

    def is_dss_enabled(self, catalog: Optional[str] = None) -> bool:
        return self._is_plugin_enabled('dss', catalog)

    def is_tdr_enabled(self, catalog: Optional[str] = None) -> bool:
        return self._is_plugin_enabled('tdr', catalog)

    def _is_plugin_enabled(self, plugin: str, catalog: Optional[str]) -> bool:
        def predicate(catalog):
            return catalog.plugins['repository'].name == plugin

        if catalog is None:
            return any(map(predicate, self.catalogs.values()))
        else:
            return predicate(self.catalogs[catalog])

    @cached_property
    def integration_test_catalogs(self) -> Mapping[CatalogName, Catalog]:
        return {
            name: catalog
            for name, catalog in self.catalogs.items()
            if catalog.is_integration_test_catalog
        }

    def es_index_name(self, catalog: CatalogName, entity_type: str,
                      aggregate: bool) -> str:
        return str(
            IndexName(prefix=self._index_prefix,
                      version=2,
                      deployment=self.deployment_stage,
                      catalog=catalog,
                      entity_type=entity_type,
                      aggregate=aggregate))

    def parse_es_index_name(self, index_name: str) -> 'IndexName':
        """
        Parse the name of an index in the current deployment.
        """
        index_name = IndexName.parse(index_name)
        assert index_name.prefix == self._index_prefix
        assert index_name.deployment == self.deployment_stage
        return index_name

    def parse_foreign_es_index_name(self, index_name) -> 'IndexName':
        """
        Parse the name of an index in any deployment and from any version of
        Azul provided that the deployment doesn't override the default index
        name prefix (AZUL_INDEX_PREFIX).
        """
        return IndexName.parse(index_name, expected_prefix=self._index_prefix)

    @property
    def domain_name(self) -> str:
        return self.environ['AZUL_DOMAIN_NAME']

    main_deployments_by_branch: Mapping[str, Sequence[str]] = freeze({
        'develop': ['dev'],
        'integration': ['integration'],
        'staging': ['staging'],
        'prod': ['prod', 'prod2']
    })

    main_branches_by_deployment: Mapping[str, str] = freeze({
        deployment: branch
        for branch, deployments in main_deployments_by_branch.items()
        for deployment in deployments
    })

    def is_main_deployment(self, stage: str = None) -> bool:
        if stage is None:
            stage = self.deployment_stage
        return stage in self.main_branches_by_deployment

    def is_stable_deployment(self, stage=None) -> bool:
        if stage is None:
            stage = self.deployment_stage
        return stage in ('staging', 'prod')

    @property
    def _git_status(self) -> Mapping[str, str]:
        import git
        repo = git.Repo(config.project_root)
        return {
            'azul_git_commit': repo.head.object.hexsha,
            'azul_git_dirty': str(repo.is_dirty()),
        }

    @property
    def lambda_git_status(self) -> Mapping[str, str]:
        return {
            'commit': self.environ['azul_git_commit'],
            'dirty': str_to_bool(self.environ['azul_git_dirty'])
        }

    @property
    def lambda_env(self) -> Dict[str, str]:
        """
        A dictionary with the environment variables to be used by a deployed AWS
        Lambda function or `chalice local`. Only includes those variables that
        don't need to be outsourced.
        """
        return {**self._lambda_env(outsource=False), **self._git_status}

    @property
    def lambda_env_for_outsourcing(self) -> Dict[str, str]:
        """
        Same as :meth:`lambda_env` but only for variables that need to be
        outsourced.
        """
        return self._lambda_env(outsource=True)

    def _lambda_env(self, *, outsource: bool) -> Dict[str, str]:
        return {
            k: v
            for k, v in os.environ.items()
            if k.startswith('AZUL_') and (len(v) > 128) == outsource
        }

    @cached_property
    def _outsourced_environ(self) -> Dict[str, str]:
        # FIXME: Eliminate local import
        #        https://github.com/DataBiosphere/azul/issues/3133
        import json
        try:
            with open_resource('environ.json') as f:
                return json.load(f)
        except NotInLambdaContextException:
            # An outsourced environment is only defined in a Lambda context,
            # outside of one the real environment still contains all variables
            # that would be outsourced in a Lambda context.
            return {}

    def contribution_lambda_timeout(self, *, retry: bool) -> int:
        return (15 if retry else 5) * 60

    def aggregation_lambda_timeout(self, *, retry: bool) -> int:
        return (10 if retry else 1) * 60

    # For the period from 05/31/2020 to 06/06/2020, the max was 18s and the
    # average + 3 standard deviations was 1.8s in the `dev` deployment.
    #
    health_lambda_timeout = 10

    service_lambda_timeout = 15 * 60

    api_gateway_timeout = 29

    # The number of seconds to extend the timeout of a Lambda fronted by API Gateway so that API Gateway times out
    # before the Lambda. We pad the Lambda timeout so we get consistent behaviour. Without this padding we'd have a
    # race between the Lambda being killed and API Gateway timing out.
    #
    api_gateway_timeout_padding = 2

    @property
    def api_gateway_lambda_timeout(self) -> int:
        return self.api_gateway_timeout + self.api_gateway_timeout_padding

    # This is set dynamically at runtime
    lambda_context: Optional[LambdaContext] = None

    @property
    def terra_client_timeout(self) -> float:
        value = os.environ['AZUL_TERRA_TIMEOUT']
        short_timeout, long_timeout = map(float, value.split(':'))
        require(short_timeout <= long_timeout, short_timeout, long_timeout)
        if self.lambda_context is None:
            return long_timeout
        else:
            # Convert milliseconds to seconds for comparison with the timeout
            remaining = self.lambda_context.get_remaining_time_in_millis() / 1000
            return (short_timeout
                    if remaining <= self.api_gateway_lambda_timeout else
                    long_timeout)

    term_re = re.compile("[a-z][a-z0-9_]{1,28}[a-z0-9]")

    def _term_from_env(self, env_var_name: str, optional=False) -> str:
        value = self.environ.get(env_var_name, default='')
        if value == '' and optional:
            return value
        else:
            self._validate_term(value, name=env_var_name)
            return value

    @classmethod
    def _validate_term(cls, term: str, name: str = 'Term') -> None:
        require(
            cls.term_re.fullmatch(term) is not None,
            f"{name} is either too short, too long or contains invalid characters: '{term}'"
        )

    @classmethod
    def validate_entity_type(cls, entity_type: str) -> None:
        cls._validate_term(entity_type, name='entity_type')

    def secrets_manager_secret_name(self, *args):
        return '/'.join(['dcp', 'azul', self.deployment_stage, *args])

    def enable_gcp(self):
        return 'GOOGLE_PROJECT' in self.environ

    class ServiceAccount(Enum):
        indexer = ''
        public = '_public'
        unregistered = '_unregistered'

        @property
        def id(self) -> str:
            return os.environ['AZUL_GOOGLE_SERVICE_ACCOUNT' +
                              self.value.upper()]

        @property
        def secret_name(self) -> str:
            return 'google_service_account' + self.value

    def state_machine_name(self, lambda_name):
        return config.qualified_resource_name(lambda_name)

    def _concurrency(self, value: str, retry: bool) -> int:
        """
        >>> config._concurrency('123', False)
        123
        >>> config._concurrency('123', True)
        123
        >>> config._concurrency('123/456', False)
        123
        >>> config._concurrency('123/456', True)
        456
        >>> config._concurrency('foo', False)
        Traceback (most recent call last):
        ...
        ValueError: invalid literal for int() with base 10: 'foo'
        >>> config._concurrency('123/foo', False)
        Traceback (most recent call last):
        ...
        ValueError: invalid literal for int() with base 10: 'foo'
        >>> config._concurrency('123/', False)
        Traceback (most recent call last):
        ...
        ValueError: invalid literal for int() with base 10: ''
        >>> config._concurrency('123/456/789', False)
        Traceback (most recent call last):
        ...
        ValueError: invalid literal for int() with base 10: '456/789'
        """
        value, sep, retry_value = value.partition('/')
        value = int(value)
        if sep:
            retry_value = int(retry_value)
        else:
            assert not retry_value
            retry_value = value
        return retry_value if retry else value

    def contribution_concurrency(self, *, retry: bool) -> int:
        return self._concurrency(self.environ['AZUL_CONTRIBUTION_CONCURRENCY'],
                                 retry)

    def aggregation_concurrency(self, *, retry: bool) -> int:
        return self._concurrency(self.environ['AZUL_AGGREGATION_CONCURRENCY'],
                                 retry)

    @property
    def bigquery_reserved_slots(self) -> int:
        """
        The number of BigQuery slots to reserve when reindexing a catalog from a
        repository that stores its data in Google BigQuery.
        """
        # Slots must be purchased in intervals of 100
        min_slots = 100
        concurrency = self.contribution_concurrency(retry=False)
        return max(1, round(concurrency / min_slots)) * min_slots

    @property
    def bigquery_batch_mode(self) -> bool:
        return self._boolean(os.environ['AZUL_BIGQUERY_BATCH_MODE'])

    def notifications_queue_name(self, *, retry=False, fail=False) -> str:
        name = self.unqual_notifications_queue_name(retry=retry, fail=fail)
        return self.qualified_resource_name(name)

    def unqual_notifications_queue_name(self, *, retry=False, fail=False):
        return self._unqual_queue_name('notifications', retry, fail)

    def tallies_queue_name(self, *, retry=False, fail=False) -> str:
        name = self.unqual_tallies_queue_name(retry=retry, fail=fail)
        return config.qualified_resource_name(name, suffix='.fifo')

    def unqual_tallies_queue_name(self, *, retry=False, fail=False):
        return self._unqual_queue_name('tallies', retry, fail)

    def _unqual_queue_name(self, basename: str, retry: bool,
                           fail: bool) -> str:
        parts = [basename]
        if fail:
            assert not retry
            parts.append('fail')
        elif retry:
            parts.append('retry')
        return '_'.join(parts)

    @property
    def all_queue_names(self) -> List[str]:
        return self.work_queue_names + self.fail_queue_names

    @property
    def fail_queue_names(self) -> List[str]:
        return [
            self.tallies_queue_name(fail=True),
            self.notifications_queue_name(fail=True)
        ]

    @property
    def work_queue_names(self) -> List[str]:
        return [
            queue_name(retry=retry)
            for queue_name in (self.notifications_queue_name,
                               self.tallies_queue_name)
            for retry in (False, True)
        ]

    url_shortener_whitelist = [
        r'([^.]+\.)*humancellatlas\.org',
        r'([^.]+\.)*singlecell\.gi\.ucsc\.edu'
    ]

    @property
    def es_refresh_interval(self) -> int:
        """
        Integral number of seconds between index refreshes in Elasticsearch
        """
        return 1

    # FIXME: Should depend on ES instance size
    #        https://github.com/DataBiosphere/azul/issues/2903
    #        https://docs.aws.amazon.com/elasticsearch-service/latest/developerguide/aes-limits.html#network-limits
    max_chunk_size = 10 * 1024 * 1024

    #: The maximum number of contributions to read in a single request during
    #: aggregation. A value that's too large could cause the response to be
    #: truncated by AWS, resulting in a SerializationError. A value that's too
    #: small will result in poor performance due to latency accruing from an
    #: excessive number of requests being made.
    #:
    contribution_page_size = 100

    @property
    def terraform_component(self):
        return self._term_from_env('azul_terraform_component', optional=True)

    permissions_boundary_name = 'azul-boundary'

    @property
    def github_project(self) -> str:
        return self.environ['azul_github_project']

    @property
    def github_access_token(self) -> str:
        return self.environ['azul_github_access_token']

    @property
    def portal_db_bucket(self) -> str:
        return self.versioned_bucket

    def portal_db_object_key(self, catalog_source: str) -> str:
        return f'azul/{self.deployment_stage}/portals/{catalog_source}-db.json'

    @property
    def lambda_layer_bucket(self) -> str:
        return self.versioned_bucket

    @property
    def lambda_layer_key(self) -> str:
        return 'lambda_layers'

    @property
    def dynamo_object_version_table_name(self) -> str:
        return self.qualified_resource_name('object_versions')

    @property
    def dynamo_sources_cache_table_name(self) -> str:
        return self.qualified_resource_name('sources_cache_by_auth')

    @property
    def reindex_sources(self) -> List[str]:
        sources = shlex.split(self.environ.get('azul_reindex_sources', '*'))
        require(sources, 'Sources cannot be empty', sources)
        return sources

    terms_aggregation_size = 99999

    precision_threshold = 40000

    minimum_compression_size = 0

    @property
    def google_oauth2_client_id(self) -> Optional[str]:
        return os.environ.get('AZUL_GOOGLE_OAUTH2_CLIENT_ID')
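Most of the properties above are thin, validated wrappers around environment variables. A sketch of how a few of them behave, with hypothetical variable values (outside a Lambda context, environ falls back to os.environ, as noted in _outsourced_environ):

import os

# Hypothetical values for illustration only.
os.environ['AZUL_ES_ENDPOINT'] = 'search.example.test:443'
os.environ['AZUL_DEBUG'] = '2'
os.environ['AZUL_ENABLE_MONITORING'] = '1'

config = Config()
assert config.es_endpoint == ('search.example.test', 443)
assert config.debug == 2
assert config.enable_monitoring is True
assert config.disable_monitoring is False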
Example #6
    def accumulate(self, value) -> bool:
        return super().accumulate(freeze(value))
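A sketch of the surrounding accumulator, assuming a set-based base class whose accumulate() reports whether the value was new; freezing makes unhashable inputs such as dicts acceptable as set members. The base class here is hypothetical:

class SetAccumulator:

    def __init__(self):
        self.values = set()

    def accumulate(self, value) -> bool:
        """Add the value, returning True if it was not already present."""
        before = len(self.values)
        self.values.add(value)
        return len(self.values) > before


class FrozenSetAccumulator(SetAccumulator):

    def accumulate(self, value) -> bool:
        return super().accumulate(freeze(value))


accumulator = FrozenSetAccumulator()
assert accumulator.accumulate({'organ': 'brain'}) is True
assert accumulator.accumulate({'organ': 'brain'}) is False  # duplicate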