def test_dynamic_block_batch_size_adjuster():
    """Walk the adjuster through growth, upper clamping, shrinking and underflow."""
    limits = BlockBatchSizeConfig(
        min=BlockNumber(5),
        warn_threshold=BlockNumber(50),
        initial=BlockNumber(1000),
        max=BlockNumber(100_000),
    )
    batch_adjuster = BlockBatchSizeAdjuster(limits, base=2, step_size=1)

    # A freshly created adjuster reports the configured initial size.
    assert batch_adjuster.batch_size == 1000

    # A single step up is expected to double the size with base=2, step_size=1.
    batch_adjuster.increase()
    assert batch_adjuster.batch_size == 2000

    # Six further steps must be enough to hit the configured maximum.
    for _step in range(6):
        batch_adjuster.increase()
    assert batch_adjuster.batch_size == limits.max

    # Another step up must not exceed the maximum.
    batch_adjuster.increase()
    assert batch_adjuster.batch_size == limits.max

    # Fifteen steps down must land on the configured minimum.
    for _step in range(15):
        batch_adjuster.decrease()
    assert batch_adjuster.batch_size == limits.min

    # Once at the minimum, a further decrease has to fail loudly.
    with pytest.raises(BlockBatchSizeTooSmall):
        batch_adjuster.decrease()
# Example no. 2
# 0
class BlockchainEvents:
    def __init__(
        self,
        web3: Web3,
        chain_id: ChainID,
        contract_manager: ContractManager,
        last_fetched_block: BlockNumber,
        event_filter: RaidenContractFilter,
        block_batch_size_config: BlockBatchSizeConfig,
        node_address: Address,
    ) -> None:
        """Set up the polling state for the smart contracts described by ``event_filter``.

        Args:
            web3: Client used to talk to the Ethereum node.
            chain_id: Chain identifier, handed to the event decoder.
            contract_manager: Used to build the web3 filters and to look up
                the ABI of each tracked contract address.
            last_fetched_block: Last block whose events were already fetched;
                ``fetch_logs_in_batch`` continues with the block after it.
            event_filter: Initial set of contracts/events to poll.
            block_batch_size_config: Limits for the dynamic block batch size.
            node_address: Address of this node, used for the filters and in
                log messages.
        """
        # Collaborators and identity of this node.
        self.web3 = web3
        self.chain_id = chain_id
        self.node_address = node_address
        self.contract_manager = contract_manager

        # Polling progress and what is being polled.
        self.last_fetched_block = last_fetched_block
        self.event_filter = event_filter
        self.block_batch_size_adjuster = BlockBatchSizeAdjuster(block_batch_size_config)

        # The lock below serves several purposes:
        #
        # - It serializes the addition of a new smart contract to the set of
        #   polled smart contracts. The key optimization of this class is to
        #   query all smart contracts with a single request, which requires
        #   `fromBlock` and `toBlock` to be identical for all of them. While
        #   the logs of a new smart contract are queried to catch up, and
        #   until it is added to the tracked smart contracts, new requests are
        #   held back by the lock.
        #
        # - It guarantees that events are processed only once. Because of this
        #   the `target_block_number` must always be a confirmed block.
        #
        # - User facing APIs with on-chain side-effects used to force poll the
        #   blockchain in order to update the node's state and give the user a
        #   consistent view, e.g. a channel open call waits for the
        #   transaction to be mined and then force polls the event. This
        #   pattern raced with the alarm task, since events are returned only
        #   once per filter. The lock protects against these races (introduced
        #   by the commit 3686b3275ff7c0b669a6d5e2b34109c3bdf1921d).
        self._filters_lock = Semaphore()

        # ABI of every contract address covered by the initial filter.
        initial_abis = event_filter.abi_of_contract_address(contract_manager)
        self._address_to_abi: Dict[Address, ABI] = initial_abis

    def fetch_logs_in_batch(self, target_block_number: BlockNumber) -> Optional[PollResult]:
        """Poll the smart contract events for a limited number of blocks to
        avoid read timeouts (issue #3558).

        The block ``target_block_number`` will not be reached if it is more than
        ``self.block_batch_size_adjuster.batch_size`` blocks away. To ensure the
        target is reached keep calling ``fetch_logs_in_batch`` until
        ``PollResult.polled_block_number`` is the same as ``target_block_number``.

        This function will make sure that the block range for the queries is
        not too big, this is necessary because it may take a long time for an
        Ethereum node to process the request, which will result in read
        timeouts (issue #3558).

        The block batch size is adjusted dynamically based on the request
        processing duration (see ``_query_and_track()``, issue #5538).
        If the request times out the batch size is decreased and ``None``
        is returned.
        If the batch size falls below the lower threshold an exception is raised
        by the ``BlockBatchSizeAdjuster``.

        This will also group the queries as an optimization for a healthy node
        (issue #4872). This is enforced by the design of the datastructures,
        this will always fetch all the events for all the registered addresses.
        """
        # The target block has been reached already, raise an exception since
        # the caller is breaking the contract of the API
        if target_block_number <= self.last_fetched_block:
            raise ValueError(
                f"target {target_block_number} is in the past, the block has "
                f"been fetched already. Current {self.last_fetched_block}"
            )

        # As of Geth 1.9.5 there is no relational database nor an index of
        # blooms. Geth always does a linear search proportional to the number
        # of blocks in the query.
        #
        # As of Parity 2.5.8 the client has no relational database. The
        # blockchain events are indexed through a hierarchy of bloom filters
        # three levels deep, each level has it's own `.dbd` file.
        #
        # The Bottom layer is comprised of every block logs bloom, as defined
        # in the yellow paper, where each entry position matches the
        # originating block number. The top and mid layers are just an
        # optimization, in these layers each entry is composed of 16 blooms
        # filters from the layer below.
        #
        # Each pair (`address`, `topic`) of a query is used to create one bloom
        # filter, these blooms are then used find candidate blocks through the
        # bloom index, then these blocks are loaded and their logs filtered.
        #
        # Based on the `fromBlock` the index files are seeked to the correct
        # position. The search always start at the top level, if the query
        # bloom is not contained in the index then the search goes to next
        # entry at the top level and skips all the mid and lower indexes. The
        # same procedure is done for the mid level. If there is a match at the
        # lower level, then we may have a hit. Because the bloom index is the
        # same as the block number, this information is used to recover the
        # block hash.
        #
        # Each of the blocks that correspond to the hashes from the previous
        # step are then loaded, including the receipts with the logs. The
        # matching logs are then returned as results to the query.
        #
        # Additional notes for Parity :
        #
        # - Every operation to the bloom database uses an exclusive lock.
        # Therefore concurrent requests are not extremely useful.
        # - The path explained above is only used if the queries are done using
        # block numbers. Queries for block hashes will not use the index, this
        # seems necessary because there is only one index for the canonical
        # chain, and queries with block hashes seems to support uncle
        # blocks/reorgs.
        # - When an address is being queried for all the logs, it is better to
        # not specify any topics. Specially when multiple addresses are being
        # queried.
        # - The batching interface doesn't do any internal optimizations, so in
        # effect it is the same thing as sending multiple requests, one after
        # the other. The only benefit here would be to save the requests
        # round-trip time.

        with self._filters_lock:
            # Skip the last fetched block, since the ranges are inclusive the
            # same block will be fetched twice which could result in duplicate
            # events.
            from_block = BlockNumber(self.last_fetched_block + 1)

            # Limit the range of blocks fetched, this limits the size of
            # the scan done by the target node. The batch size is adjusted
            # below depending on the response time of the node.
            to_block = BlockNumber(
                min(from_block + self.block_batch_size_adjuster.batch_size, target_block_number)
            )

            # Sending a single request for all the smart contract addresses
            # is the core optimization here. Because both Geth and Parity
            # will do a linear search per request, in some shape or form,
            # sending only one request will result in only one linear
            # search.
            #
            # This optimization has a few benefits:
            #
            # - There will be only one request for all the smart contracts,
            # reducing trafic from Raiden to the Ethereum client, this is
            # important if the client is remote or a hosted service like
            # Infura.
            # - The request will be faster for large ranges (This is an
            # implementation detail that happen to be true for both
            # clients, the rationale is to reduce the number of loops that
            # go through lots of elements).

            try:
                decoded_result, max_request_duration = self._query_and_track(from_block, to_block)
            except EthGetLogsTimeout:
                # The request timed out - this typically means the node wasn't able to process
                # the requested batch size fast enough.
                # Decrease the batch size and let the higher layer retry.
                log.debug("Timeout while fetching blocks, decreasing batch size")
                self.block_batch_size_adjuster.decrease()
                return None

            can_use_bigger_batches = (
                target_block_number - from_block > self.block_batch_size_adjuster.batch_size
            )
            # Adjust block batch size depending on request duration.
            # To reduce oscillating the batch size is kept constant for request durations
            # between ``ETH_GET_LOGS_THRESHOLD_FAST`` and ``ETH_GET_LOGS_THRESHOLD_SLOW``.
            if max_request_duration < ETH_GET_LOGS_THRESHOLD_FAST:
                # The request was fast, increase batch size
                if can_use_bigger_batches:
                    # But only if we actually need bigger batches. This prevents the batch
                    # size from ballooning towards the maximum after the initial sync is done
                    # since then typically only one block is fetched at a time which is usually
                    # fast.
                    self.block_batch_size_adjuster.increase()
            elif max_request_duration > ETH_GET_LOGS_THRESHOLD_SLOW:
                # The request is taking longer than the 'slow' threshold - decrease
                # the batch size
                self.block_batch_size_adjuster.decrease()

            latest_confirmed_block = self.web3.eth.getBlock(to_block)

            self.last_fetched_block = to_block

            return PollResult(
                polled_block_number=to_block,
                polled_block_hash=BlockHash(bytes(latest_confirmed_block["hash"])),
                polled_block_gas_limit=BlockGasLimit(latest_confirmed_block["gasLimit"]),
                events=decoded_result,
            )

    def _query_and_track(
        self, from_block: BlockNumber, to_block: BlockNumber
    ) -> Tuple[List[DecodedEvent], float]:
        """Query the blockchain up to `to_block` and create the filters for the
        smart contracts deployed during the current batch.

        Because of how polling is optimized, filters for smart contracts
        deployed in the current batch must be created, queried, and be merged
        into the same batch. This is necessary to avoid race conditions on
        restarts that could lead to loss of events. Example:

                   last confirmed block
                   |
                   v    v end of current batch / new confirmed block
                   4    9
        Batch  ####------
        TNR    ####--*---
        TN           --*-
                     ^ ^
                     | new channel opened
                     |
                     new token network registered

        For this example, the current batch is fetching the range `[4, 9]`. In
        this range a new token is registered at block 6, at block 8 a new
        channel is opened in the new network.

        If the events of the new TN are *not* queried, block 9 will be
        confirmed after processing the batch which adds the TN, and iff the
        node crashes right after processing this batch, on the next restart
        *all* filters will start from 9, thus missing the event for the new
        channel on block 8.
        """
        max_request_duration: float = 0
        result: List[DecodedEvent] = []
        event_filter: Optional[RaidenContractFilter] = self.event_filter

        # While there are new smart contracts to follow, this will query them
        # and add to the existing filters.
        #
        # The batch itself may have an event for a newly deployed smart
        # contract, e.g. a new token network. The new smart contract needs a
        # filter, and then the filter has to be queried before for the same
        # batch before it is dispatched. This is necessary to guarantee safety
        # of restarts.
        i = 0
        while event_filter:
            i += 1
            blockchain_events: List[LogReceipt] = []

            for filter_params in event_filter.to_web3_filters(
                self.contract_manager, from_block, to_block, self.node_address
            ):
                log.debug(
                    "Querying new blockchain events",
                    from_block=from_block,
                    to_block=to_block,
                    event_filter=event_filter,
                    filter_params=filter_params,
                    i=i,
                    node=to_checksum_address(self.node_address),
                )
                filter_name = filter_params.pop("_name")  # type: ignore

                try:
                    start = time.monotonic()
                    # Using web3 because:
                    # - It sets an unique request identifier, not strictly necessary.
                    # - To avoid another abstraction to query the Ethereum client.
                    new_events: List[LogReceipt] = self.web3.manager.request_blocking(
                        RPCEndpoint("eth_getLogs"), [filter_params]
                    )
                    request_duration = time.monotonic() - start
                    max_request_duration = max(max_request_duration, request_duration)
                except ReadTimeout as ex:
                    # The request timed out while waiting for a response (as opposed to a
                    # ConnectTimeout).
                    # This will usually be caused by overloading of the target
                    # eth node but can also happen due to network conditions.
                    raise EthGetLogsTimeout() from ex

                log.debug(
                    "Fetched new blockchain events",
                    from_block=filter_params["fromBlock"],
                    to_block=filter_params["toBlock"],
                    addresses=filter_params["address"],
                    filter_name=filter_name,
                    new_events=new_events,
                    request_duration=request_duration,
                    i=i,
                    node=to_checksum_address(self.node_address),
                )
                blockchain_events.extend(new_events)

            if blockchain_events:
                # If this should ever decode events from non-controlled contracts, we need
                # to make sure no unrecoverable error is thrown. If this was an unrecoverable
                # it would open a surface for attacks.
                decoded_events = [
                    decode_raiden_event_to_internal(
                        self._address_to_abi[to_canonical_address(event["address"])],
                        self.chain_id,
                        event,
                    )
                    for event in blockchain_events
                ]
                sort_events(decoded_events)

                from dataclasses import asdict

                log.debug(
                    "Decoded new blockchain events",
                    decoded_events=[asdict(e) for e in decoded_events],
                    node=to_checksum_address(self.node_address),
                )
                result.extend(decoded_events)

                # Go through the results and create the child filters, if necessary.
                event_filter = new_filters_from_events(decoded_events)

                # Register the new filters, so that they will be fetched on the next iteration
                self.event_filter = self.event_filter.union(event_filter)
                self._address_to_abi.update(
                    event_filter.abi_of_contract_address(self.contract_manager)
                )
            else:
                event_filter = None

        return result, max_request_duration

    def uninstall_all_event_listeners(self) -> None:
        with self._filters_lock:
            self._address_to_abi = {}