Example #1
    def __init__(self,
                 should_skip_logging=None,
                 elasticsearch_config=None,
                 producer=None,
                 **kwargs):
        self._should_skip_logging = should_skip_logging
        self._logs_producer = LogProducerProxy()
        self._es_client = configure_es(**elasticsearch_config)

        if producer == "kafka":
            kafka_config = kwargs["kafka_config"]
            self._logs_producer.initialize(KafkaLogsProducer(**kafka_config))
        elif producer == "elasticsearch":
            self._logs_producer.initialize(ElasticsearchLogsProducer())
        elif producer == "kinesis_stream":
            kinesis_stream_config = kwargs["kinesis_stream_config"]
            self._logs_producer.initialize(
                KinesisStreamLogsProducer(**kinesis_stream_config))
        else:
            raise Exception("Invalid log producer: %s" % producer)
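
A minimal construction sketch for this constructor (it belongs to the DocumentLogsModel class shown in full in Example #2). This is hedged: the configuration keys below are illustrative placeholders, not a documented schema; only the producer/**kwargs dispatch mirrors the code above.

# Hypothetical config dicts; configure_es() and KafkaLogsProducer() unpack
# whatever keys the actual deployment uses.
es_config = {"host": "localhost", "port": 9200}
kafka_config = {"bootstrap_servers": ["localhost:9092"], "topic": "logentry"}

logs_model = DocumentLogsModel(
    should_skip_logging=None,
    elasticsearch_config=es_config,
    producer="kafka",
    kafka_config=kafka_config,  # consumed from **kwargs when producer == "kafka"
)
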
Example #2
class DocumentLogsModel(SharedModel, ActionLogsDataInterface,
                        ElasticsearchLogsModelInterface):
    """
    DocumentLogsModel implements the data model for the logs API backed by an elasticsearch service.
    """
    def __init__(self,
                 should_skip_logging=None,
                 elasticsearch_config=None,
                 producer=None,
                 **kwargs):
        self._should_skip_logging = should_skip_logging
        self._logs_producer = LogProducerProxy()
        self._es_client = configure_es(**elasticsearch_config)

        if producer == "kafka":
            kafka_config = kwargs["kafka_config"]
            self._logs_producer.initialize(KafkaLogsProducer(**kafka_config))
        elif producer == "elasticsearch":
            self._logs_producer.initialize(ElasticsearchLogsProducer())
        elif producer == "kinesis_stream":
            kinesis_stream_config = kwargs["kinesis_stream_config"]
            self._logs_producer.initialize(
                KinesisStreamLogsProducer(**kinesis_stream_config))
        else:
            raise Exception("Invalid log producer: %s" % producer)

    @staticmethod
    def _get_ids_by_names(repository_name, namespace_name, performer_name):
        """
        Retrieve repository/namespace/performer ids based on their names.

        Raises DataModelException when namespace_name does not match any user or
        organization in the database.
        Returns a (repository_id, account_id, performer_id) tuple; each ID is None
        when the corresponding name was not given or could not be resolved.
        """
        repository_id = None
        account_id = None
        performer_id = None

        if repository_name and namespace_name:
            repository = model.repository.get_repository(
                namespace_name, repository_name)
            if repository:
                repository_id = repository.id
                account_id = repository.namespace_user.id

        if namespace_name and account_id is None:
            account = model.user.get_user_or_org(namespace_name)
            if account is None:
                raise DataModelException("Invalid namespace requested")

            account_id = account.id

        if performer_name:
            performer = model.user.get_user(performer_name)
            if performer:
                performer_id = performer.id

        return repository_id, account_id, performer_id

    def _base_query(self,
                    performer_id=None,
                    repository_id=None,
                    account_id=None,
                    filter_kinds=None,
                    index=None):
        if filter_kinds is not None:
            assert all(
                isinstance(kind_name, str) for kind_name in filter_kinds)

        if index is not None:
            search = LogEntry.search(index=index)
        else:
            search = LogEntry.search()

        if performer_id is not None:
            assert isinstance(performer_id, int)
            search = search.filter("term", performer_id=performer_id)

        if repository_id is not None:
            assert isinstance(repository_id, int)
            search = search.filter("term", repository_id=repository_id)

        if account_id is not None and repository_id is None:
            assert isinstance(account_id, int)
            search = search.filter("term", account_id=account_id)

        if filter_kinds is not None:
            kind_map = model.log.get_log_entry_kinds()
            ignore_ids = [kind_map[kind_name] for kind_name in filter_kinds]
            search = search.exclude("terms", kind_id=ignore_ids)

        return search

    def _base_query_date_range(
        self,
        start_datetime,
        end_datetime,
        performer_id,
        repository_id,
        account_id,
        filter_kinds,
        index=None,
    ):
        skip_datetime_check = False
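        # If the whole range falls inside a single daily index and that index exists,
        # the per-document datetime range filter below can be skipped.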
        if _date_range_in_single_index(start_datetime, end_datetime):
            index = self._es_client.index_name(start_datetime)
            skip_datetime_check = self._es_client.index_exists(index)

        if index and (skip_datetime_check
                      or self._es_client.index_exists(index)):
            search = self._base_query(performer_id,
                                      repository_id,
                                      account_id,
                                      filter_kinds,
                                      index=index)
        else:
            search = self._base_query(performer_id, repository_id, account_id,
                                      filter_kinds)

        if not skip_datetime_check:
            search = search.query("range",
                                  datetime={
                                      "gte": start_datetime,
                                      "lt": end_datetime
                                  })

        return search

    def _load_logs_for_day(
        self,
        logs_date,
        performer_id,
        repository_id,
        account_id,
        filter_kinds,
        after_datetime=None,
        after_random_id=None,
        size=PAGE_SIZE,
    ):
        index = self._es_client.index_name(logs_date)
        if not self._es_client.index_exists(index):
            return []

        search = self._base_query(performer_id,
                                  repository_id,
                                  account_id,
                                  filter_kinds,
                                  index=index)
        search = search.sort({"datetime": "desc"},
                             {"random_id.keyword": "desc"})
        search = search.extra(size=size)

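        # Resume after the previous page's last (datetime, random_id) pair via
        # Elasticsearch search_after pagination.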
        if after_datetime is not None and after_random_id is not None:
            after_datetime_epoch_ms = epoch_ms(after_datetime)
            search = search.extra(
                search_after=[after_datetime_epoch_ms, after_random_id])

        return search.execute()

    def _load_latest_logs(self, performer_id, repository_id, account_id,
                          filter_kinds, size):
        """
        Return the latest logs from Elasticsearch.

        Look at indices up to the configured logrotateworker threshold, or up to 30 days if not defined.
        """
        # Set the last index to check to be the logrotateworker threshold, or 30 days
        end_datetime = datetime.now()
        start_datetime = end_datetime - timedelta(days=DATE_RANGE_LIMIT)

        latest_logs = []
        for day in _date_range_descending(start_datetime,
                                          end_datetime,
                                          includes_end_datetime=True):
            try:
                logs = self._load_logs_for_day(day,
                                               performer_id,
                                               repository_id,
                                               account_id,
                                               filter_kinds,
                                               size=size)
                latest_logs.extend(logs)
            except NotFoundError:
                continue

            if len(latest_logs) >= size:
                break

        return _for_elasticsearch_logs(latest_logs[:size], repository_id,
                                       account_id)

    def lookup_logs(
        self,
        start_datetime,
        end_datetime,
        performer_name=None,
        repository_name=None,
        namespace_name=None,
        filter_kinds=None,
        page_token=None,
        max_page_count=None,
    ):
        assert start_datetime is not None and end_datetime is not None

        # Check for a valid combined model token when migrating online from a combined model
        if page_token is not None and page_token.get(
                "readwrite_page_token") is not None:
            page_token = page_token.get("readwrite_page_token")

        if page_token is not None and max_page_count is not None:
            page_number = page_token.get("page_number")
            if page_number is not None and page_number + 1 > max_page_count:
                return LogEntriesPage([], None)

        repository_id, account_id, performer_id = DocumentLogsModel._get_ids_by_names(
            repository_name, namespace_name, performer_name)

        after_datetime = None
        after_random_id = None
        if page_token is not None:
            after_datetime = parse_datetime(page_token["datetime"])
            after_random_id = page_token["random_id"]

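        # Clamp the upper bound to the page cursor so days newer than the last
        # returned log are not scanned again.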
        if after_datetime is not None:
            end_datetime = min(end_datetime, after_datetime)

        all_logs = []

        with CloseForLongOperation(config.app_config):
            for current_date in _date_range_descending(start_datetime,
                                                       end_datetime):
                try:
                    logs = self._load_logs_for_day(
                        current_date,
                        performer_id,
                        repository_id,
                        account_id,
                        filter_kinds,
                        after_datetime,
                        after_random_id,
                        size=PAGE_SIZE + 1,
                    )

                    all_logs.extend(logs)
                except NotFoundError:
                    continue

                if len(all_logs) > PAGE_SIZE:
                    break

        next_page_token = None
        all_logs = all_logs[0:PAGE_SIZE + 1]

        if len(all_logs) == PAGE_SIZE + 1:
            # The extra element at the end is only used to check whether more results exist.
            # The second-to-last element (the last log actually returned on this page) becomes
            # the pagination token: search_after excludes the exact match, so the next page
            # starts with the extra element that was fetched but not returned here.
            # This keeps the behavior the same as table_logs_model: whenever the pagination
            # token is non-empty, there is at least one more log to retrieve.
            next_page_token = {
                "datetime": all_logs[-2].datetime.isoformat(),
                "random_id": all_logs[-2].random_id,
                "page_number":
                page_token["page_number"] + 1 if page_token else 1,
            }

        return LogEntriesPage(
            _for_elasticsearch_logs(all_logs[:PAGE_SIZE], repository_id,
                                    account_id),
            next_page_token,
        )

    def lookup_latest_logs(
        self,
        performer_name=None,
        repository_name=None,
        namespace_name=None,
        filter_kinds=None,
        size=20,
    ):
        repository_id, account_id, performer_id = DocumentLogsModel._get_ids_by_names(
            repository_name, namespace_name, performer_name)

        with CloseForLongOperation(config.app_config):
            latest_logs = self._load_latest_logs(performer_id, repository_id,
                                                 account_id, filter_kinds,
                                                 size)

        return latest_logs

    def get_aggregated_log_counts(
        self,
        start_datetime,
        end_datetime,
        performer_name=None,
        repository_name=None,
        namespace_name=None,
        filter_kinds=None,
    ):
        if end_datetime - start_datetime >= timedelta(days=DATE_RANGE_LIMIT):
            raise Exception(
                "Cannot lookup aggregated logs over a period longer than a month"
            )

        repository_id, account_id, performer_id = DocumentLogsModel._get_ids_by_names(
            repository_name, namespace_name, performer_name)

        with CloseForLongOperation(config.app_config):
            search = self._base_query_date_range(start_datetime, end_datetime,
                                                 performer_id, repository_id,
                                                 account_id, filter_kinds)
            search.aggs.bucket("by_id", "terms",
                               field="kind_id").bucket("by_date",
                                                       "date_histogram",
                                                       field="datetime",
                                                       interval="day")
            # size=0 suppresses hit documents; Elasticsearch still returns the aggregations
            search = search.extra(size=0)
            resp = search.execute()

        if not resp.aggregations:
            return []

        counts = []
        by_id = resp.aggregations["by_id"]

        for id_bucket in by_id.buckets:
            for date_bucket in id_bucket.by_date.buckets:
                if date_bucket.doc_count > 0:
                    counts.append(
                        AggregatedLogCount(id_bucket.key,
                                           date_bucket.doc_count,
                                           date_bucket.key))

        return counts

    def count_repository_actions(self, repository, day):
        index = self._es_client.index_name(day)
        search = self._base_query_date_range(day,
                                             day + timedelta(days=1),
                                             None,
                                             repository.id,
                                             None,
                                             None,
                                             index=index)
        search = search.params(request_timeout=COUNT_REPOSITORY_ACTION_TIMEOUT)

        try:
            return search.count()
        except NotFoundError:
            return 0

    def log_action(
        self,
        kind_name,
        namespace_name=None,
        performer=None,
        ip=None,
        metadata=None,
        repository=None,
        repository_name=None,
        timestamp=None,
        is_free_namespace=False,
    ):
        if self._should_skip_logging and self._should_skip_logging(
                kind_name, namespace_name, is_free_namespace):
            return

        if repository_name is not None:
            assert repository is None
            assert namespace_name is not None
            repository = model.repository.get_repository(
                namespace_name, repository_name)

        if timestamp is None:
            timestamp = datetime.today()

        account_id = None
        performer_id = None
        repository_id = None

        if namespace_name is not None:
            account_id = model.user.get_namespace_user(namespace_name).id

        if performer is not None:
            performer_id = performer.id

        if repository is not None:
            repository_id = repository.id

        metadata_json = json.dumps(metadata or {}, default=_json_serialize)
        kind_id = model.log._get_log_entry_kind(kind_name)
        log = LogEntry(
            random_id=_random_id(),
            kind_id=kind_id,
            account_id=account_id,
            performer_id=performer_id,
            ip=ip,
            metadata_json=metadata_json,
            repository_id=repository_id,
            datetime=timestamp,
        )

        try:
            self._logs_producer.send(log)
        except LogSendException as lse:
            strict_logging_disabled = config.app_config.get(
                "ALLOW_PULLS_WITHOUT_STRICT_LOGGING")
            logger.exception("log_action failed",
                             extra={"exception": lse, **log.to_dict()})
            if not (strict_logging_disabled
                    and kind_name in ACTIONS_ALLOWED_WITHOUT_AUDIT_LOGGING):
                raise

    def yield_logs_for_export(
        self,
        start_datetime,
        end_datetime,
        repository_id=None,
        namespace_id=None,
        max_query_time=None,
    ):
        max_query_time = (max_query_time.total_seconds()
                          if max_query_time is not None else 300)
        search = self._base_query_date_range(start_datetime, end_datetime,
                                             None, repository_id, namespace_id,
                                             None)

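        # Abort the export if any single scroll batch takes longer than
        # max_query_time to arrive.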
        def raise_on_timeout(batch_generator):
            start = time()
            for batch in batch_generator:
                elapsed = time() - start
                if elapsed > max_query_time:
                    logger.error(
                        "Retrieval of logs `%s/%s` timed out with time of `%s`",
                        namespace_id,
                        repository_id,
                        elapsed,
                    )
                    raise LogsIterationTimeout()

                yield batch
                start = time()

        def read_batch(scroll):
            batch = []
            for log in scroll:
                batch.append(log)
                if len(batch) == DEFAULT_RESULT_WINDOW:
                    yield _for_elasticsearch_logs(batch,
                                                  repository_id=repository_id,
                                                  namespace_id=namespace_id)
                    batch = []

            if batch:
                yield _for_elasticsearch_logs(batch,
                                              repository_id=repository_id,
                                              namespace_id=namespace_id)

        search = search.params(size=DEFAULT_RESULT_WINDOW,
                               request_timeout=max_query_time)

        try:
            with CloseForLongOperation(config.app_config):
                for batch in raise_on_timeout(read_batch(search.scan())):
                    yield batch
        except ConnectionTimeout:
            raise LogsIterationTimeout()

    def can_delete_index(self, index, cutoff_date):
        return self._es_client.can_delete_index(index, cutoff_date)

    def list_indices(self):
        return self._es_client.list_indices()

    def yield_log_rotation_context(self, cutoff_date, min_logs_per_rotation):
        """
        Yield a context manager for a group of outdated logs.
        """
        all_indices = self.list_indices()
        for index in all_indices:
            if not self.can_delete_index(index, cutoff_date):
                continue

            context = ElasticsearchLogRotationContext(index,
                                                      min_logs_per_rotation,
                                                      self._es_client)
            yield context
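
A hedged usage sketch for paginating through lookup_logs from Example #2, using the logs_model instance from the earlier sketch. It assumes LogEntriesPage exposes logs and next_page_token fields (as suggested by how it is constructed above); the namespace name and time window are illustrative.

from datetime import datetime, timedelta

end = datetime.now()
start = end - timedelta(days=7)

page_token = None
while True:
    page = logs_model.lookup_logs(
        start,
        end,
        namespace_name="example-namespace",  # placeholder namespace
        page_token=page_token,
    )
    for log in page.logs:
        ...  # process each returned log entry
    if page.next_page_token is None:
        break
    page_token = page.next_page_token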