Example #1
  def __init__(self, L):
    ''' Initialise the `SubLater` with its parent `Later`.

        TODO: accept `discard=False` param to suppress the queue and
        associated checks.
    '''
    self._later = L
    self._later.open()
    self._lock = Lock()
    self._deferred = 0
    self._queued = 0
    self._queue = IterableQueue()
    self.closed = False
Example #2
 def startup(self):
   ''' Connect to the server and log in.
   '''
   self._sock = self.conn_spec.connect()
   self.recvf = self._sock.makefile('r', encoding='iso8859-1')
   self.sendf = self._sock.makefile('w', encoding='ascii')
   self.client_begin()
   self.client_auth(self.conn_spec.user, self.conn_spec.password)
   self._result_queue = IterableQueue()
   self._client_worker = bg_thread(
       self._client_response_worker, args=(self._result_queue,)
   )
   return self
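A hedged sketch of the matching teardown for this pattern (an assumption, not this project's actual code): closing the `IterableQueue` of results lets the worker thread finish its iteration and be joined.

    def shutdown(self):
        ''' Hypothetical counterpart to `startup`: close the result queue so
            the worker exits, then close the connection streams.
        '''
        self._result_queue.close()  # ends iteration in _client_response_worker
        self._client_worker.join()  # wait for the worker thread to finish
        self.sendf.close()
        self.recvf.close()
        self._sock.close()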
Example #3
    def pushto(self, dstS, *, capacity=64, progress=None):
        ''' Allocate a Queue for Blocks to push from this Store to another Store `dstS`.
        Return `(Q,T)` where `Q` is the new Queue and `T` is the
        Thread processing the Queue.

        Parameters:
        * `dstS`: the secondary Store to receive Blocks.
        * `capacity`: the Queue capacity, arbitrary default `64`.
        * `progress`: an optional `Progress` counting submitted and completed data bytes.

        Once called, the caller can then `.put` Blocks onto the Queue.
        When finished, call `Q.close()` to indicate the end of the Blocks and
        `T.join()` to wait for processing to complete.
        '''
        sem = Semaphore(capacity)
        ##sem = Semaphore(1)
        name = "%s.pushto(%s)" % (self.name, dstS.name)
        with Pfx(name):
            Q = IterableQueue(capacity=capacity, name=name)
            srcS = self
            srcS.open()
            dstS.open()
            T = bg_thread(lambda: (
                self.push_blocks(name, Q, srcS, dstS, sem, progress),
                srcS.close(),
                dstS.close(),
            ))
            return Q, T
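A usage sketch of the `(Q, T)` protocol described in the docstring above; `src_store`, `dst_store` and `blocks_to_copy` are assumed names for illustration.

    Q, T = src_store.pushto(dst_store, capacity=64)
    for block in blocks_to_copy:  # an assumed iterable of Blocks
        Q.put(block)
    Q.close()  # indicate the end of the Blocks
    T.join()   # wait for the push worker Thread to finish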
Example #4
def report_offsets(bfr, run_parser):
  ''' Dispatch a parser in a separate Thread, return an IterableQueue yielding offsets.

      Parameters:
      * `bfr`: a `CornuCopyBuffer` providing data to parse
      * `run_parser`: a callable which runs the parser; it should accept a
        `CornuCopyBuffer` as its sole argument.

      This function allocates an `IterableQueue` to receive the parser's offset
      reports and sets the `CornuCopyBuffer`'s `copy_offsets` hook so that
      `report_offset` copies offsets to the queue.
      It is the task of the parser to call `bfr.report_offset` as
      necessary to indicate suitable offsets.
  '''
  with Pfx("report_offsets(bfr,run_parser=%s)", run_parser):
    offsetQ = IterableQueue()
    if bfr.copy_offsets is not None:
      warning("bfr %s already has copy_offsets, replacing", bfr)
    bfr.copy_offsets = offsetQ.put

    def thread_body():
      with Pfx("parser-thread"):
        try:
          run_parser(bfr)
        except Exception as e:
          exception("exception: %s", e)
          raise
        finally:
          offsetQ.close()

    T = PfxThread(target=thread_body)
    T.start()
    return offsetQ
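A hypothetical usage sketch (the toy parser and sample data are illustrative only): the parser calls `bfr.report_offset` as it consumes the buffer, and the offsets come back via the returned `IterableQueue`.

    def toy_parser(bfr):
        # report an offset at the end of every chunk consumed
        for _ in bfr:
            bfr.report_offset(bfr.offset)

    bfr = CornuCopyBuffer([b'abc', b'defg'])
    for offset in report_offsets(bfr, toy_parser):
        print(offset)  # expected: 3, then 7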
Example #5
    def keys(self):
        ''' Generator yielding the distinct keys from all the read Stores.

            Each read Store's keys are fetched in a separate Thread;
            each Thread puts a `None` sentinel when done so that the
            consumer knows when to close the queue.
        '''
        seen = set()
        Q = IterableQueue()

        def keys_from(S):
            # copy S's keys to Q, then a None sentinel marking completion
            for h in S.keys():
                Q.put(h)
            Q.put(None)

        busy = 0
        for S in self.read:
            bg_thread(partial(keys_from, S))
            busy += 1
        for h in Q:
            if h is None:
                busy -= 1
                if not busy:
                    Q.close()
            elif h not in seen:
                yield h
                seen.add(h)
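A hedged note on the sentinel protocol above: if `self.read` is empty, no sentinel ever arrives, the queue is never closed and the `for h in Q` loop would block. A sketch of the same fan-in with a guard for that case (context assumed):

    busy = 0
    for S in self.read:
        bg_thread(partial(keys_from, S))
        busy += 1
    if not busy:
        Q.close()  # no producers: close now so the iteration below terminates
    for h in Q:
        ...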
Example #6
def pipeline(later, actions, inputs=None, outQ=None, name=None):
    ''' Construct a function pipeline to be mediated by this Later queue.
      Return: `input, output`
      where `input` is a closeable queue on which more data items can be put
      and `output` is an iterable from which results can be collected.

      Parameters:
      * `actions`: an iterable of filter functions accepting
        single items from the iterable `inputs`, returning an
        iterable output.
      * `inputs`: the initial iterable inputs; this may be None.
        If missing or None, it is expected that the caller will
        be supplying input items via `input.put()`.
      * `outQ`: the optional output queue; if None, an IterableQueue() will be
        allocated.
      * `name`: name for the PushQueue implementing this pipeline.

      If `inputs` is None, the returned `input` requires
      a call to `input.close()` when no further inputs are to be supplied.

      Example use with presupplied Later `L`:

          input, output = L.pipeline(
                  [
                    ls,
                    filter_ls,
                    ( FUNC_MANY_TO_MANY, lambda items: sorted(list(items)) ),
                  ],
                  ('.', '..', '../..'),
                 )
          for item in output:
            print(item)
    '''
    filter_funcs = list(actions)
    if not filter_funcs:
        raise ValueError("no actions")
    if outQ is None:
        outQ = IterableQueue(name="pipelineIQ")
    if name is None:
        name = "pipelinePQ"
    pipeline = Pipeline(name, later, filter_funcs, outQ)
    inQ = pipeline.inQ
    if inputs is not None:
        later.defer_iterable(inputs, inQ)
    else:
        debug(
            "%s._pipeline: no inputs, NOT setting up _defer_iterable( inputs, inQ=%r)",
            later, inQ)
    return pipeline
Example #7
  def dispatch(self, func, retq=None, deliver=None, pfx=None, daemon=None):
    ''' Dispatch the callable `func` in a separate thread.

        On completion the result is the sequence
        `func_result, None, None, None`.
        On an exception the result is the sequence
        `None, exc_type, exc_value, exc_traceback`.

        If `retq` is not None, the result is `.put()` on `retq`.
        If `deliver` is not None, `deliver(result)` is called.
        If the parameter `pfx` is not None, submit `pfx.partial(func)`;
        see the `cs.logutils.Pfx.partial` method for details.
        If `daemon` is not None, set the `.daemon` attribute of the Thread to `daemon`.

        TODO: high water mark for idle Threads.
    '''
    if self.closed:
      raise ValueError("%s: closed, but dispatch() called" % (self,))
    if pfx is not None:
      func = pfx.partial(func)
    if daemon is None:
      daemon = current_thread().daemon
    idle = self.idle_daemon if daemon else self.idle_fg
    with self._lock:
      debug("dispatch: idle = %s", idle)
      if idle:
        # use an idle thread
        entry = idle.pop()
        debug("dispatch: reuse %s", entry)
      else:
        debug("dispatch: need new thread")
        # no available threads - make one
        Targs = []
        T = Thread(
            target=self._handler,
            args=Targs,
            name=("%s:worker" % (self.name,))
        )
        T.daemon = daemon
        Q = IterableQueue(name="%s:IQ%d" % (self.name, seq()))
        entry = WTPoolEntry(T, Q)
        self.all.add(entry)
        Targs.append(entry)
        debug("%s: start new worker thread (daemon=%s)", self, T.daemon)
        T.start()
      entry.queue.put((func, retq, deliver))
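For orientation, a hypothetical shape of the `_handler` worker loop the code above dispatches to (an assumption, not the actual implementation): it drains the per-thread `IterableQueue` of `(func, retq, deliver)` tuples and reports results in the sequences described in the docstring.

    def _handler(self, entry):
        # hypothetical sketch; assumes `import sys` at module level
        for func, retq, deliver in entry.queue:
            try:
                result = func(), None, None, None
            except Exception:
                result = (None, *sys.exc_info())
            if retq is not None:
                retq.put(result)
            if deliver is not None:
                deliver(result)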
Example #8
def s3scrape(bucket_pool,
             srcurl,
             doit=False,
             do_delete=False,
             do_upload=False):
    ''' Sync website to S3 directory tree.
    '''
    global UPD
    ok = True
    L = Later(4, name="s3scrape(%r, %r)" % (bucket_pool.bucket_name, srcurl))
    with L:
        if do_upload:
            Q = IterableQueue()

            def dispatch():
                for LF in s3scrape_async(L,
                                         bucket_pool,
                                         srcurl,
                                         doit=doit,
                                         do_delete=do_delete):
                    Q.put(LF)
                Q.close()

            Thread(target=dispatch).start()
            for LF in Q:
                diff, ctype, srcU, dstpath, e, error_msg = LF()
                with Pfx(srcU):
                    if e:
                        error(error_msg)
                        ok = False
                    else:
                        line = "%s %-25s %s" % (diff.summary(), ctype, dstpath)
                        if diff.unchanged:
                            UPD.out(line)
                            ##UPD.nl(line)
                        else:
                            if diff.changed_fields() == ['time']:
                                # be quiet about time changes
                                UPD.out(line)
                            else:
                                UPD.nl(line)
                                ##UPD.nl("  %r", diff.metadata)
        if do_delete:
            # now process deletions
            with bucket_pool.instance() as B:
                ##if dstdir:
                ##  dstdir_prefix = dstdir + RSEP
                ##else:
                ##  dstdir_prefix = ''
                dstdir_prefix = RSEP
                with Pfx("S3.filter(Prefix=%r)", dstdir_prefix):
                    dstdelpaths = []
                    for s3obj in B.objects.filter(Prefix=dstdir_prefix):
                        dstpath = s3obj.key
                        with Pfx(dstpath):
                            if not dstpath.startswith(dstdir_prefix):
                                error("unexpected dstpath, not in subdir")
                                continue
                            dstrpath = dstpath[len(dstdir_prefix):]
                            if dstrpath.startswith(RSEP):
                                error("unexpected dstpath, extra %r", RSEP)
                                continue
                            raise RuntimeError("DELETION UNIMPLEMENTED")
                            srcpath = joinpath(srcdir,
                                               s32path(dstrpath, unpercent))
                            if os.path.exists(srcpath):
                                ##info("src exists, not deleting (src=%r)", srcpath)
                                continue
                            ## uncomment if new %hh omissions surface
                            ##UPD.nl("MISSING local %r", srcpath)
                            if dstrpath.endswith(RSEP):
                                # a folder
                                UPD.nl("d DEL %s", dstpath)
                            else:
                                UPD.nl("* DEL %s", dstpath)
                            dstdelpaths.append(dstpath)
                    if dstdelpaths:
                        dstdelpaths = sorted(dstdelpaths, reverse=True)
                        while dstdelpaths:
                            delpaths = dstdelpaths[:S3_MAX_DELETE_OBJECTS]
                            if doit:
                                result = B.delete_objects(
                                    Delete={
                                        'Objects': [{
                                            'Key': dstpath
                                        } for dstpath in delpaths]
                                    })
                                errs = result.get('Errors')
                                if errs:
                                    ok = False
                                    for err in errs:
                                        error("delete: %s: %r", err['Message'],
                                              err['Key'])
                            dstdelpaths[:len(delpaths)] = []
    L.wait()
    return ok
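The upload phase above is an instance of a more general pattern: a dispatcher Thread feeds an `IterableQueue` with pending `LateFunction`s and the main loop collects each result as it completes. A minimal sketch with assumed names:

    Q = IterableQueue()

    def dispatch():
        for LF in submit_work_async():  # assumed: yields LateFunctions
            Q.put(LF)
        Q.close()  # no more work: ends the iteration below

    Thread(target=dispatch).start()
    for LF in Q:
        result = LF()  # block for this LateFunction's result
        ...            # process the result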
Example #9
 def __init__(self):
     self.Q = IterableQueue(1024)
     self.bfr = CornuCopyBuffer(self.Q)
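A hedged sketch of how such a pair is typically used: a producer puts byte chunks onto the `IterableQueue` and closes it at end of input, while a consumer reads from the `CornuCopyBuffer` as a byte stream (`takev` is used as in the later examples).

    Q = IterableQueue(1024)
    bfr = CornuCopyBuffer(Q)
    Q.put(b'abcd')
    Q.put(b'efgh')
    Q.close()                      # end of input
    data = b''.join(bfr.takev(6))  # take exactly 6 bytes: b'abcdef'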
Example #10
def blocked_chunks_of2(
    chunks,
    *,
    scanner=None,
    min_block=None,
    max_block=None,
):
    ''' Generator which connects to a scanner of a chunk stream in
      order to emit low level edge aligned data chunks.

      Parameters:
      * `chunks`: a source iterable of data chunks, handed to `scanner`
      * `scanner`: optional callable accepting a `CornuCopyBuffer` and
        returning an iterable of `int`s, such as a generator. `scanner`
        may be `None`, in which case only the rolling hash is used
        to locate boundaries.
      * `min_block`: the smallest amount of data that will be used
        to create a Block, default from `MIN_BLOCKSIZE` (`{MIN_BLOCKSIZE}`)
      * `max_block`: the largest amount of data that will be used to
        create a Block, default from `MAX_BLOCKSIZE` (`{MAX_BLOCKSIZE}`)

      The iterable returned from `scanner(chunks)` yields `int`s which are
      considered desirable block boundaries.
    '''
    if min_block is None:
        min_block = MIN_BLOCKSIZE
    elif min_block < 8:
        raise ValueError("rejecting min_block < 8: %s" % (min_block, ))
    if max_block is None:
        max_block = MAX_BLOCKSIZE
    elif max_block >= 1024 * 1024:
        raise ValueError("rejecting max_block >= 1024*1024: %s" %
                         (max_block, ))
    if min_block >= max_block:
        raise ValueError("rejecting min_block:%d >= max_block:%d" %
                         (min_block, max_block))
    # source data for aligned chunk construction
    dataQ = IterableQueue()
    # queue of offsets from the parser
    offsetQ = IterableQueue()
    # copy chunks to the parser and also to the post-parser chunk assembler
    tee_chunks = tee(chunks, dataQ)
    parse_bfr = CornuCopyBuffer(tee_chunks)

    runstate = defaults.runstate

    def run_parser(runstate, bfr, min_block, max_block, offsetQ):
        ''' Thread body to scan the input data for boundary offsets.
        The source chunks are copied to `dataQ` by the `tee`;
        the boundary offsets are put onto `offsetQ`.

        If there is a scanner we scan the input data with it first.
        When it terminates (including from some exception), we scan
        the remaining data with `scan` (the rolling hash scanner).

        The main generator consumes `dataQ` and `offsetQ`
        to assemble aligned chunks of data.
        '''
        try:
            offset = 0
            if scanner:
                # Consume the chunks and offsets via a queue.
                # The scanner puts offsets onto the queue.
                # When the scanner fetches from the chunks, those chunks are copied to the queue.
                # Accordingly, chunks _should_ arrive before offsets within them.
                # pylint: disable=broad-except
                try:
                    for offset in scanner(bfr):
                        if runstate.cancelled:
                            break
                        # the scanner should yield only offsets, not chunks and offsets
                        if not isinstance(offset, int):
                            warning("discarding non-int from scanner %s: %s",
                                    scanner, offset)
                        else:
                            offsetQ.put(offset)
                except Exception as e:
                    warning("exception from scanner %s: %s", scanner, e)
            # Consume the remainder of the input; the tee() will copy it to dataQ.
            # This is important to ensure that no chunk is missed.
            # We run the remaining data through scan() to find boundary offsets.
            cso = bfr.offset  # offset after all the chunks so far
            assert offset <= cso
            sofar = cso - offset
            if sofar >= max_block:
                offsetQ.put(cso)
                sofar = 0
            for offset in scan(bfr,
                               sofar=sofar,
                               min_block=min_block,
                               max_block=max_block):
                if runstate.cancelled:
                    break
                offsetQ.put(cso + offset)
        finally:
            # end of offsets and chunks
            offsetQ.close()
            dataQ.close()

    # dispatch the parser
    bg_thread(run_parser,
              args=(runstate, parse_bfr, min_block, max_block, offsetQ),
              daemon=True)

    # data source for assembling aligned chunks
    data_bfr = CornuCopyBuffer(dataQ)
    sofar = 0
    offset = None
    for offset in offsetQ:
        assert offset >= sofar
        block_size = offset - sofar
        assert block_size >= 0, ("block_size:%d < 0 -- sofar=%d, offset=%d" %
                                 (block_size, sofar, offset))
        if block_size < min_block:
            # skip over small edges
            assert scanner is not None, (
                "scanner=None but still got an overly near offset"
                " (sofar=%d, offset=%d => block_size=%d < min_block:%d)" %
                (sofar, offset, block_size, min_block))
            continue
        subchunks = data_bfr.takev(block_size)
        assert sum(map(len, subchunks)) == block_size
        if block_size > max_block:
            # break up overly long blocks without a parser
            assert scanner is not None, (
                "scanner=None but still got an overly distant offset"
                " (sofar=%d, offset=%d => block_size=%d > max_block:%d)" %
                (sofar, offset, block_size, max_block))
            yield from blocked_chunks_of2(subchunks,
                                          min_block=min_block,
                                          max_block=max_block)
        else:
            yield b''.join(subchunks)
        sofar += block_size
    bs = b''.join(data_bfr)
    if bs:
        assert len(bs) <= max_block
        yield bs
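A usage sketch with an assumed input file: feed raw read chunks in and receive edge-aligned blocks out (this assumes the module's `defaults` provides a `runstate`, as used above).

    def file_chunks(path, size=1024 * 1024):
        # assumed helper: yield the file's data in fixed size read chunks
        with open(path, 'rb') as f:
            while True:
                chunk = f.read(size)
                if not chunk:
                    break
                yield chunk

    for block in blocked_chunks_of2(file_chunks('data.bin')):
        print(len(block))  # edge aligned blocks, at most max_block bytes each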
Example #11
def blocked_chunks_of(
    chunks,
    *,
    scanner=None,
    min_block=None,
    max_block=None,
    histogram=None,
):
    ''' Generator which connects to a scanner of a chunk stream in
      order to emit low level edge aligned data chunks.

      *OBSOLETE*: we now use the simpler and faster `blocked_chunks_of2`.

      Parameters:
      * `chunks`: a source iterable of data chunks, handed to `scanner`
      * `scanner`: optional callable accepting a `CornuCopyBuffer` and
        returning an iterable of `int`s, such as a generator. `scanner`
        may be `None`, in which case only the rolling hash is used
        to locate boundaries.
      * `min_block`: the smallest amount of data that will be used
        to create a Block, default from `MIN_BLOCKSIZE` (`{MIN_BLOCKSIZE}`)
      * `max_block`: the largest amount of data that will be used to
        create a Block, default from `MAX_BLOCKSIZE` (`{MAX_BLOCKSIZE}`)
      * `histogram`: if not `None`, a `defaultdict(int)` to collate counts.
        Integer indices count block sizes and string indices are used
        for `'bytes_total'` and `'bytes_hash_scanned'`.

      The iterable returned from `scanner(chunks)` yields `int`s which are
      considered desirable block boundaries.
    '''
    # pylint: disable=too-many-nested-blocks,too-many-statements
    # pylint: disable=too-many-branches,too-many-locals
    with Pfx("blocked_chunks_of"):
        if min_block is None:
            min_block = MIN_BLOCKSIZE
        elif min_block < 8:
            raise ValueError("rejecting min_block < 8: %s" % (min_block, ))
        if max_block is None:
            max_block = MAX_BLOCKSIZE
        elif max_block >= 1024 * 1024:
            raise ValueError("rejecting max_block >= 1024*1024: %s" %
                             (max_block, ))
        if min_block >= max_block:
            raise ValueError("rejecting min_block:%d >= max_block:%d" %
                             (min_block, max_block))
        # obtain iterator of chunks; this avoids accidentally reusing the chunks
        # if for example chunks is a sequence
        chunk_iter = iter(chunks)
        # Set up parseQ, an iterable yielding a mix of source data and
        # offsets representing desirable block boundaries.
        # If there is no scanner, this is just chunk_iter.
        # If there is a scanner we dispatch the scanner in a separate
        # Thread and feed it a tee() of chunk_iter, which copies chunks
        # to the parseQ when chunks are obtained by the scanner. The
        # Thread runs the scanner and copies its output offsets to the
        # parseQ.
        # The tee() arranges that chunks arrive before any offsets within them.
        if scanner is None:
            # No scanner, consume the chunks directly.
            parseQ = chunk_iter
        else:
            # Consume the chunks and offsets via a queue.
            # The scanner puts offsets onto the queue.
            # When the scanner fetches from the chunks, those chunks are copied to the queue.
            # When the scanner terminates, any remaining chunks are also copied to the queue.
            parseQ = IterableQueue()
            chunk_iter = tee(chunk_iter, parseQ)

            def run_parser():
                ''' Thread body to run the supplied scanner against the input data.
                '''
                bfr = CornuCopyBuffer(chunk_iter)
                # pylint: disable=broad-except
                try:
                    for offset in scanner(bfr):
                        # the scanner should yield only offsets, not chunks and offsets
                        if not isinstance(offset, int):
                            warning("discarding non-int from scanner %s: %s",
                                    scanner, offset)
                        else:
                            parseQ.put(offset)
                except Exception as e:
                    exception("exception from scanner %s: %s", scanner, e)
                # Consume the remainder of chunk_iter; the tee() will copy it to parseQ.
                for _ in chunk_iter:
                    pass
                # end of offsets and chunks
                parseQ.close()

            bg_thread(run_parser)
        # inbound chunks and offsets
        in_offsets = []  # heap of unprocessed edge offsets
        # prime `available_chunk` with the first data chunk, ready for get_next_chunk
        try:
            available_chunk = next(parseQ)
        except StopIteration:
            # no data! just return
            return

        def get_next_chunk():
            ''' Fetch and return the next data chunk from the `parseQ`.
          Return None at end of input.
          Also gather all the following offsets from the queue before return.
          Because this inherently means collecting the chunk beyond
          these offsets, we keep that in `available_chunk` for the
          next call.
          Sets parseQ to None if the end of the iterable is reached.
            '''
            nonlocal parseQ, in_offsets, hash_value, available_chunk
            if parseQ is None:
                assert available_chunk is None
                return None
            next_chunk = available_chunk
            available_chunk = None
            assert not isinstance(next_chunk, int)
            # scan the new chunk and load potential edges into the offset heap
            hash_value, chunk_scan_offsets = scanbuf(hash_value, next_chunk)
            for cso in chunk_scan_offsets:
                heappush(in_offsets, offset + cso)
            # gather items from the parseQ until the following chunk
            # or end of input
            while True:
                try:
                    item = next(parseQ)
                except StopIteration:
                    parseQ = None
                    break
                else:
                    if isinstance(item, int):
                        heappush(in_offsets, item)
                    else:
                        available_chunk = item
                        break
            return next_chunk

        last_offset = None
        first_possible_point = None
        max_possible_point = None

        def recompute_offsets():
            ''' Recompute relevant offsets from the block parameters.
          The first_possible_point is last_offset+min_block,
            the earliest point at which we will accept a block boundary.
          The max_possible_point is last_offset+max_block,
            the latest point at which we will accept a block boundary;
            we will choose this if no next_offset or hash offset
            is found earlier.
            '''
            nonlocal last_offset, first_possible_point, max_possible_point
            first_possible_point = last_offset + min_block
            max_possible_point = last_offset + max_block

        # prepare initial state
        last_offset = 0  # latest released boundary
        # compute first_possible_point and max_possible_point
        recompute_offsets()
        hash_value = 0
        offset = 0
        chunk0 = None
        offset0 = None
        # unblocked outbound data
        pending = _PendingBuffer(max_block)
        # Read data chunks and locate desired boundaries.
        while True:
            chunk = get_next_chunk()
            if chunk is None:
                break
            # verify current chunk start offset against end of previous chunk
            assert chunk0 is None or offset == offset0 + len(chunk0), \
                "offset0=%s, len(chunk0)=%d: sum(%d) != current offset %d" \
                % (offset0, len(chunk0), offset0 + len(chunk0), offset)
            chunk0 = chunk
            offset0 = offset
            chunk = memoryview(chunk)
            chunk_end_offset = offset + len(chunk)
            # process current chunk
            advance_by = 0  # how much data to add to the pending buffer
            release = False  # whether we hit a boundary ==> flush the buffer
            while chunk:
                if advance_by > 0:
                    # advance through this chunk
                    # buffer the advance
                    # release ==> flush the buffer and update last_offset
                    assert advance_by is not None
                    assert advance_by >= 0
                    assert advance_by <= len(chunk)
                    # save the advance bytes and yield any overflow
                    for out_chunk in pending.append(chunk[:advance_by]):
                        yield out_chunk
                        if histogram is not None:
                            out_chunk_size = len(out_chunk)
                            histogram['bytes_total'] += out_chunk_size
                            histogram[out_chunk_size] += 1
                            histogram['buffer_overflow_chunks'] += 1
                    offset += advance_by
                    chunk = chunk[advance_by:]
                    if last_offset != pending.offset:
                        # if the flush discarded a full buffer we need to adjust our boundaries
                        last_offset = pending.offset
                        recompute_offsets()
                    if release:
                        release = False  # reset; we flush the pending buffer below
                        # yield the current pending data
                        for out_chunk in pending.flush():
                            yield out_chunk
                            if histogram is not None:
                                out_chunk_size = len(out_chunk)
                                histogram['bytes_total'] += out_chunk_size
                                histogram[out_chunk_size] += 1
                        last_offset = pending.offset
                        recompute_offsets()
                    if not chunk:
                        # consumed the end of the chunk, need a new one
                        break
                advance_by = None
                # fetch the next available edge, None if nothing available or suitable
                while True:
                    if in_offsets:
                        next_offset = heappop(in_offsets)
                        if next_offset > offset and next_offset >= first_possible_point:
                            break
                    else:
                        next_offset = None
                        break
                if next_offset is None or next_offset > chunk_end_offset:
                    # no suitable edge: consume the chunk and advance
                    take_to = chunk_end_offset
                else:
                    # edge before end of chunk: use it
                    take_to = next_offset
                    release = True
                advance_by = take_to - offset
                assert advance_by > 0
        # yield any left over data
        for out_chunk in pending.flush():
            yield out_chunk
            if histogram is not None:
                out_chunk_size = len(out_chunk)
                histogram['bytes_total'] += out_chunk_size
                histogram[out_chunk_size] += 1
Example #12
def greedy(g=None, queue_depth=0):
    ''' A decorator or function for greedy computation of iterables.

      If `g` is omitted or callable
      this is a decorator for a generator function
      causing it to compute greedily,
      capacity limited by `queue_depth`.

      If `g` is iterable
      this function dispatches it in a `Thread` to compute greedily,
      capacity limited by `queue_depth`.

      Example with an iterable:

          for packet in greedy(parse_data_stream(stream)):
              ... process packet ...

      which does some readahead of the stream.

      Example as a function decorator:

          @greedy
          def g(n):
              for item in range(n):
                  yield item

      This can also be used directly on an existing iterable:

          for item in greedy(range(n)):
              yield item

      Normally a generator runs on demand.
      This function dispatches a `Thread` to run the iterable
      (typically a generator)
      putting yielded values to a queue
      and returns a new generator yielding from the queue.

      The `queue_depth` parameter specifies the depth of the queue
      and therefore how many values the original generator can compute
      before blocking at the queue's capacity.

      The default `queue_depth` is `0` which creates a `Channel`
      as the queue - a zero storage buffer - which lets the generator
      compute only a single value ahead of time.

      A larger `queue_depth` allocates a `Queue` with that much storage
      allowing the generator to compute as many as `queue_depth+1` values
      ahead of time.

      Here's a comparison of the behaviour:

      Example without `@greedy`
      where the "yield 1" step does not occur until after the "got 0":

          >>> from time import sleep
          >>> def g():
          ...   for i in range(2):
          ...     print("yield", i)
          ...     yield i
          ...   print("g done")
          ...
          >>> G = g(); sleep(0.1)
          >>> for i in G:
          ...   print("got", i)
          ...   sleep(0.1)
          ...
          yield 0
          got 0
          yield 1
          got 1
          g done

      Example with `@greedy`
      where the "yield 1" step computes before the "got 0":

          >>> from time import sleep
          >>> @greedy
          ... def g():
          ...   for i in range(2):
          ...     print("yield", i)
          ...     yield i
          ...   print("g done")
          ...
          >>> G = g(); sleep(0.1)
          yield 0
          >>> for i in G:
          ...   print("got", repr(i))
          ...   sleep(0.1)
          ...
          yield 1
          got 0
          g done
          got 1

      Example with `@greedy(queue_depth=1)`
      where the "yield 1" step computes before the "got 0":

          >>> from cs.x import X
          >>> from time import sleep
          >>> @greedy(queue_depth=1)
          ... def g():
          ...   for i in range(3):
          ...     X("Y")
          ...     print("yield", i)
          ...     yield i
          ...   print("g done")
          ...
          >>> G = g(); sleep(2)
          yield 0
          yield 1
          >>> for i in G:
          ...   print("got", repr(i))
          ...   sleep(0.1)
          ...
          yield 2
          got 0
          got 1
          g done
          got 2

    '''
    assert queue_depth >= 0

    if g is None:
        # the parameterised @greedy(queue_depth=n) form
        # pylint: disable=no-value-for-parameter
        return _greedy_decorator(queue_depth=queue_depth)

    if callable(g):
        # the direct @greedy form
        return _greedy_decorator(g, queue_depth=queue_depth)

    # presumably an iterator - dispatch it in a Thread
    try:
        it = iter(g)
    except TypeError as e:
        # pylint: disable=raise-missing-from
        raise TypeError("g=%r: neither callable nor iterable: %s" % (g, e))

    # pylint: disable=import-outside-toplevel
    from cs.queues import Channel, IterableQueue
    if queue_depth == 0:
        q = Channel()
    else:
        q = IterableQueue(queue_depth)

    def run_generator():
        ''' Thread body for greedy generator.
        '''
        try:
            for item in it:
                q.put(item)
        finally:
            q.close()

    Thread(target=run_generator).start()
    return iter(q)
Example #13
 def startup_shutdown(self):
   ''' Start up and shut down the `FilesDir`: take locks, start worker threads etc.
   '''
   self.initdir()
   self._rfds = {}
   self._unindexed = {}
   self._filemap = SqliteFilemap(self, self.statefilepath)
   hashname = self.hashname
   self.index = self.indexclass(
       self.pathto(self.INDEX_FILENAME_BASE_FORMAT.format(hashname=hashname))
   )
   self.index.open()
   self.runstate.start()
   # cache of open DataFiles
   self._cache = LRU_Cache(
       maxsize=4, on_remove=lambda k, datafile: datafile.close()
   )
   # Set up data queue.
   # The .add() method adds the data to self._unindexed, puts the
   # data onto the data queue, and returns.
   # The data queue worker saves the data to backing files and
   # updates the indices.
   self._data_progress = Progress(
       name=str(self) + " data queue ",
       total=0,
       units_scale=BINARY_BYTES_SCALE,
   )
   if defaults.show_progress:
     proxy_cmgr = upd_state.upd.insert(1)
   else:
     proxy_cmgr = nullcontext()
   with proxy_cmgr as data_proxy:
     self._data_proxy = data_proxy
     self._dataQ = IterableQueue(65536)
     self._data_Thread = bg_thread(
         self._data_queue,
         name="%s._data_queue" % (self,),
     )
     self._monitor_Thread = bg_thread(
         self._monitor_datafiles,
         name="%s-datafile-monitor" % (self,),
     )
     yield
     self.runstate.cancel()
     self.flush()
     # shut down the monitor Thread
     mon_thread = self._monitor_Thread
     if mon_thread is not None:
       mon_thread.join()
       self._monitor_Thread = None
     # drain the data queue
     self._dataQ.close()
     self._data_Thread.join()
     self._dataQ = None
     self._data_Thread = None
   # update state to substrate
   self._cache = None
   self._filemap.close()
   self._filemap = None
   self.index.close()
   # close the read file descriptors
   for rfd in self._rfds.values():
     with Pfx("os.close(rfd:%d)", rfd):
       os.close(rfd)
   del self._rfds
   self.runstate.stop()
Example #14
class FilesDir(SingletonMixin, HashCodeUtilsMixin, MultiOpenMixin,
               RunStateMixin, FlaggedMixin, Mapping):
  ''' Base class indexing locally stored data in files for a specific hashclass.

      There are two main subclasses of this at present:
      * `DataDir`: the data are kept in a subdirectory of UUID-named files,
        supporting easy merging and updating.
      * `PlatonicDataDir`: the data are present in a normal file tree,
        such as a preexisting media server directory or the like.
  '''

  STATE_FILENAME_FORMAT = 'index-{hashname}-state.sqlite'
  INDEX_FILENAME_BASE_FORMAT = 'index-{hashname}'
  DATA_ROLLOVER = DEFAULT_ROLLOVER

  _FD_Singleton_Key_Tuple = namedtuple(
      'FilesDir_FD_Singleton_Key_Tuple',
      'cls realdirpath hashclass indexclass rollover flags_id'
  )

  @classmethod
  def _resolve(cls, *, hashclass, indexclass, rollover, flags, flags_prefix):
    ''' Resolve the `__init__()` arguments,
        shared by `__init__` and `_singleton_key`.
    '''
    if indexclass is None:
      indexclass = choose_indexclass(
          cls.INDEX_FILENAME_BASE_FORMAT.format(hashname=hashclass.HASHNAME)
      )
    if rollover is None:
      rollover = cls.DATA_ROLLOVER
    elif rollover < 1024:
      raise ValueError(
          "rollover < 1024"
          " (a more normal size would be in megabytes or gigabytes): %r" %
          (rollover,)
      )
    if flags is None:
      if flags_prefix is None:
        flags = DummyFlags()
        flags_prefix = 'DUMMY'
    else:
      if flags_prefix is None:
        raise ValueError("flags provided but no flags_prefix")
    return SimpleNamespace(
        hashclass=hashclass,
        indexclass=indexclass,
        rollover=rollover,
        flags=flags,
        flags_prefix=flags_prefix
    )

  @classmethod
  def _singleton_key(
      cls,
      topdirpath,
      *,
      hashclass,
      indexclass=None,
      rollover=None,
      flags=None,
      flags_prefix=None,
      **_,
  ):
    resolved = cls._resolve(
        hashclass=hashclass,
        indexclass=indexclass,
        rollover=rollover,
        flags=flags,
        flags_prefix=flags_prefix
    )
    return cls._FD_Singleton_Key_Tuple(
        cls=cls,
        realdirpath=realpath(topdirpath),
        hashclass=resolved.hashclass,
        indexclass=resolved.indexclass,
        rollover=resolved.rollover,
        flags_id=id(resolved.flags)
    )

  @require(lambda topdirpath: isinstance(topdirpath, str))
  @require(lambda hashclass: issubclass(hashclass, HashCode))
  def __init__(
      self,
      topdirpath,
      *,
      hashclass,
      indexclass=None,
      rollover=None,
      flags=None,
      flags_prefix=None,
  ):
    ''' Initialise the `DataDir` with `topdirpath`.

        Parameters:
        * `topdirpath`: a directory containing state information about the
          `DataFile`s; this contains the index-state.csv file and the
          associated index dbm-ish files.
        * `hashclass`: the hashclass used for indexing
        * `indexclass`: the `IndexClass` providing the index to chunks in the
          `DataFile`s. If not specified, a supported index class with an
          existing index file will be chosen, otherwise the most favoured
          indexclass available will be chosen.
        * `rollover`: data file roll over size; if a data file grows beyond
          this a new datafile is commenced for new blocks.
          Default: `self.DATA_ROLLOVER`.
        * `flags`: optional `Flags` object for control; if specified then
          `flags_prefix` is also required.
        * `flags_prefix`: prefix for control flag names.

        Note that `__init__` only saves the settings such as the `indexclass`
        and ensures that requisite directories exist.
        The monitor thread and runtime state are set up by the `startup` method
        and closed down by the `shutdown` method.
    '''
    if hasattr(self, '_filemap'):
      return
    resolved = self._resolve(
        hashclass=hashclass,
        indexclass=indexclass,
        rollover=rollover,
        flags=flags,
        flags_prefix=flags_prefix
    )
    RunStateMixin.__init__(self)
    MultiOpenMixin.__init__(self)
    FlaggedMixin.__init__(
        self, flags=resolved.flags, prefix=resolved.flags_prefix
    )
    self.indexclass = resolved.indexclass
    self.rollover = resolved.rollover
    self.hashclass = hashclass
    self.hashname = hashclass.HASHNAME
    self.topdirpath = topdirpath
    self.statefilepath = joinpath(
        topdirpath, self.STATE_FILENAME_FORMAT.format(hashname=self.hashname)
    )
    self.index = None
    self._filemap = None
    self._unindexed = None
    self._cache = None
    self._data_proxy = None
    self._dataQ = None
    self._data_progress = None
    self._monitor_Thread = None
    self._WDFstate = None
    self._lock = RLock()

  def __str__(self):
    return '%s(%s)' % (self.__class__.__name__, shortpath(self.topdirpath))

  def __repr__(self):
    return (
        '%s(topdirpath=%r,indexclass=%s)' %
        (self.__class__.__name__, self.topdirpath, self.indexclass)
    )

  def initdir(self):
    ''' Init a directory and its "data" subdirectory.
    '''
    topdirpath = self.topdirpath
    if not isdirpath(topdirpath):
      info("mkdir %r", topdirpath)
      with Pfx("mkdir(%r)", topdirpath):
        os.mkdir(topdirpath)
    datasubdirpath = joinpath(topdirpath, 'data')
    if not isdirpath(datasubdirpath):
      info("mkdir %r", datasubdirpath)
      with Pfx("mkdir(%r)", datasubdirpath):
        os.mkdir(datasubdirpath)

  @contextmanager
  def startup_shutdown(self):
    ''' Start up and shut down the `FilesDir`: take locks, start worker threads etc.
    '''
    self.initdir()
    self._rfds = {}
    self._unindexed = {}
    self._filemap = SqliteFilemap(self, self.statefilepath)
    hashname = self.hashname
    self.index = self.indexclass(
        self.pathto(self.INDEX_FILENAME_BASE_FORMAT.format(hashname=hashname))
    )
    self.index.open()
    self.runstate.start()
    # cache of open DataFiles
    self._cache = LRU_Cache(
        maxsize=4, on_remove=lambda k, datafile: datafile.close()
    )
    # Set up data queue.
    # The .add() method adds the data to self._unindexed, puts the
    # data onto the data queue, and returns.
    # The data queue worker saves the data to backing files and
    # updates the indices.
    self._data_progress = Progress(
        name=str(self) + " data queue ",
        total=0,
        units_scale=BINARY_BYTES_SCALE,
    )
    if defaults.show_progress:
      proxy_cmgr = upd_state.upd.insert(1)
    else:
      proxy_cmgr = nullcontext()
    with proxy_cmgr as data_proxy:
      self._data_proxy = data_proxy
      self._dataQ = IterableQueue(65536)
      self._data_Thread = bg_thread(
          self._data_queue,
          name="%s._data_queue" % (self,),
      )
      self._monitor_Thread = bg_thread(
          self._monitor_datafiles,
          name="%s-datafile-monitor" % (self,),
      )
      yield
      self.runstate.cancel()
      self.flush()
      # shut down the monitor Thread
      mon_thread = self._monitor_Thread
      if mon_thread is not None:
        mon_thread.join()
        self._monitor_Thread = None
      # drain the data queue
      self._dataQ.close()
      self._data_Thread.join()
      self._dataQ = None
      self._data_Thread = None
    # update state to substrate
    self._cache = None
    self._filemap.close()
    self._filemap = None
    self.index.close()
    # close the read file descriptors
    for rfd in self._rfds.values():
      with Pfx("os.close(rfd:%d)", rfd):
        os.close(rfd)
    del self._rfds
    self.runstate.stop()

  def pathto(self, rpath):
    ''' Return the path to `rpath`, which is relative to the `topdirpath`.
    '''
    return joinpath(self.topdirpath, rpath)

  def datapathto(self, rpath):
    ''' Return the path to `rpath`, which is relative to the `datadirpath`.
    '''
    return self.pathto(joinpath('data', rpath))

  @typechecked
  def new_datafile(self) -> DataFileState:
    ''' Create a new datafile.
        Return its `DataFileState`.
    '''
    while True:
      filename = str(uuid4()) + self.DATA_DOT_EXT
      pathname = self.datapathto(filename)
      if existspath(pathname):
        error("new datafile path already exists, retrying: %r", pathname)
        continue
      with Pfx(pathname):
        try:
          createpath(pathname)
        except OSError as e:
          if e.errno == errno.EEXIST:
            error("new datafile path already exists")
            continue
          raise
      break
    return self._filemap.add_path(filename)

  def add(self, data):
    ''' Add `data` to the cache, queue data for indexing, return hashcode.
    '''
    hashcode = self.hashclass.from_chunk(data)
    if hashcode not in self._unindexed:
      self._unindexed[hashcode] = data
      self._data_progress.total += len(data)
      self._dataQ.put(data)
    return hashcode

  def _data_queue(self):
    wf = None
    DFstate = None
    filenum = None
    index = self.index
    unindexed = self._unindexed
    dataQ = self._dataQ
    progress = self._data_progress
    hashchunk = self.hashclass.from_chunk
    batch_size = 128

    def data_batches(dataQ, batch_size):
      for data in dataQ:
        # assemble up to batch_size chunks at a time
        data_batch = [data]
        while not dataQ.empty() and len(data_batch) < batch_size:
          data_batch.append(next(dataQ))
        yield data_batch
        data_batch = None

    batches = data_batches(dataQ, batch_size)
    if defaults.show_progress:
      batches = progress.iterbar(
          batches,
          itemlenfunc=lambda batch: sum(map(len, batch)),
          proxy=self._data_proxy
      )
    for data_batch in batches:
      batch_length = len(data_batch)
      ##print("data batch of", batch_length)
      # FileDataIndexEntry by hashcode for batch update of index after flush
      entry_bs_by_hashcode = {}
      for data in data_batch:
        hashcode = hashchunk(data)
        if hashcode not in index:
          # new data, save to a datafile and update the index
          # pretranscribe the in-file data record
          # save the data record to the current file
          if wf is None:
            DFstate = self.new_datafile()
            filenum = DFstate.filenum
            wf = open(DFstate.pathname, 'ab')
            self._WDFstate = DFstate
          bs, data_offset, data_length, flags = self.data_save_information(
              data
          )
          offset = wf.tell()
          wf.write(bs)
          length = len(bs)
          post_offset = offset + length
          # make a record for this chunk
          entry_bs_by_hashcode[hashcode] = bytes(
              FileDataIndexEntry(
                  filenum=filenum,
                  data_offset=offset + data_offset,
                  data_length=data_length,
                  flags=flags,
              )
          )
      # after the batch, flush and roll over if beyond the high water mark
      if wf is not None:
        wf.flush()
        with self._lock:
          for hashcode, entry_bs in entry_bs_by_hashcode.items():
            index[hashcode] = entry_bs
            try:
              del unindexed[hashcode]
            except KeyError:
              # this can happen when the same key is indexed twice
              # entirely plausible if a new datafile is added to the datadir
              pass
        # note that the index is up to post_offset
        DFstate.indexed_to = post_offset
        rollover = self.rollover
        if rollover is not None and wf.tell() >= rollover:
          # file now full, close it so as to start a new one on next write
          wf.close()
          wf = None
          self._filemap.set_indexed_to(DFstate.filenum, DFstate.indexed_to)
          DFstate = None
      if batch_length < batch_size:
        sleep(0.2)
    if wf is not None:
      wf.close()
      wf = None
    if DFstate is not None:
      self._filemap.set_indexed_to(DFstate.filenum, DFstate.indexed_to)

  def get_Archive(self, name=None, **kw):
    ''' Return the Archive named `name`.

        If `name` is omitted or `None`
        the Archive path is the `topdirpath`
        plus the extension `'.vt'`.
        Otherwise it is the `topdirpath` plus a dash plus the `name`
        plus the extension `'.vt'`.
        The `name` may not be empty or contain a dot or a slash (`'/'`).
    '''
    with Pfx("%s.get_Archive", self):
      if name is None or not name:
        archivepath = self.topdirpath + '.vt'
      else:
        if '.' in name or '/' in name:
          raise ValueError("invalid name: %r" % (name,))
        archivepath = self.topdirpath + '-' + name + '.vt'
      return Archive(archivepath, **kw)

  @locked
  def flush(self):
    ''' Flush all the components.
    '''
    self._cache.flush()
    self.index.flush()

  def __setitem__(self, hashcode, data):
    h = self.add(data)
    if hashcode != h:
      raise ValueError(
          'supplied hashcode %s does not match data, data added under %s instead'
          % (hashcode, h)
      )

  def __len__(self):
    return len(self.index)

  @pfx_method
  def hashcodes_from(self, *, start_hashcode=None):
    ''' Generator yielding the hashcodes from the database in order
        starting with optional `start_hashcode`.

        Parameters:
        * `start_hashcode`: the first hashcode; if missing or `None`,
          iteration starts with the first key in the index
    '''
    # important: consult this BEFORE self.index.keys otherwise items might
    # flow from unindexed to the index unseen
    with self._lock:
      unindexed = list(self._unindexed)
    if start_hashcode is not None and unindexed:
      unindexed = filter(lambda h: h >= start_hashcode, unindexed)
    hs = map(
        self.hashclass,
        self.index.sorted_keys(start_hashcode=start_hashcode),
    )
    unindexed = set(unindexed)
    if unindexed:
      hs = filter(lambda h: h not in unindexed, hs)
    return imerge(hs, sorted(unindexed))

  def __iter__(self):
    return self.hashcodes_from()

  # without this "in" tries to iterate over the mapping with int indices
  def __contains__(self, hashcode):
    return hashcode in self._unindexed or hashcode in self.index

  def __getitem__(self, hashcode):
    ''' Return the decompressed data associated with the supplied `hashcode`.
    '''
    unindexed = self._unindexed
    try:
      return unindexed[hashcode]
    except KeyError:
      index = self.index
      try:
        with self._lock:
          entry_bs = index[hashcode]
      except KeyError:
        raise KeyError("%s[%s]: hash not in index" % (self, hashcode))
      entry = FileDataIndexEntry.from_bytes(entry_bs)
      filenum = entry.filenum
      try:
        try:
          rfd = self._rfds[filenum]
        except KeyError:
          # TODO: shove this sideways to self.open_datafile
          # which releases an existing datafile if too many are open
          DFstate = self._filemap[filenum]
          rfd = self._rfds[filenum] = openfd_read(DFstate.pathname)
        return entry.fetch_fd(rfd)
      except Exception as e:
        exception("%s[%s]:%s not available: %s", self, hashcode, entry, e)
        raise KeyError(str(hashcode)) from e
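The `_data_queue` worker above also shows a reusable idiom: drain an `IterableQueue` in opportunistic batches by combining iteration with `.empty()` and `next()`. The same idiom in isolation (a sketch, names assumed):

    def batches_of(Q, batch_size=128):
        # yield lists of up to batch_size items, grouping whatever is
        # immediately available on the IterableQueue Q
        for item in Q:
            batch = [item]
            while not Q.empty() and len(batch) < batch_size:
                batch.append(next(Q))
            yield batch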
Example #15
 def _monitor_datafiles(self):
   ''' Thread body to poll the ideal tree for new or changed files.
   '''
   proxy = upd_state.proxy
   proxy.prefix = str(self) + " monitor "
   meta_store = self.meta_store
   filemap = self._filemap
   datadirpath = self.pathto('data')
   if meta_store is not None:
     topdir = self.topdir
   else:
     warning("%s: no meta_store!", self)
   updated = False
   disabled = False
   while not self.cancelled:
     sleep(self.DELAY_INTERSCAN)
     if self.flag_scan_disable:
       if not disabled:
         info("scan %r DISABLED", shortpath(datadirpath))
         disabled = True
       continue
     if disabled:
       info("scan %r ENABLED", shortpath(datadirpath))
       disabled = False
     # scan for new datafiles
     with Pfx("%r", datadirpath):
       seen = set()
       info("scan tree...")
       with proxy.extend_prefix(" scan"):
         for dirpath, dirnames, filenames in os.walk(datadirpath,
                                                     followlinks=True):
           dirnames[:] = sorted(dirnames)
           filenames = sorted(filenames)
           sleep(self.DELAY_INTRASCAN)
           if self.cancelled or self.flag_scan_disable:
             break
           rdirpath = relpath(dirpath, datadirpath)
           with Pfx(rdirpath):
             with (proxy.extend_prefix(" " + rdirpath)
                   if filenames else nullcontext()):
               # this will be the subdirectories into which to recurse
               pruned_dirnames = []
               for dname in dirnames:
                 if self.exclude_dir(joinpath(rdirpath, dname)):
                   # unwanted
                   continue
                 subdirpath = joinpath(dirpath, dname)
                 try:
                   S = os.stat(subdirpath)
                 except OSError as e:
                    # inaccessible
                   warning("stat(%r): %s, skipping", subdirpath, e)
                   continue
                 ino = S.st_dev, S.st_ino
                 if ino in seen:
                   # we have seen this subdir before, probably via a symlink
                   # TODO: preserve symlinks? attach alter ego directly as a Dir?
                   debug(
                       "seen %r (dev=%s,ino=%s), skipping", subdirpath,
                       ino[0], ino[1]
                   )
                   continue
                 seen.add(ino)
                 pruned_dirnames.append(dname)
               dirnames[:] = pruned_dirnames
               if meta_store is None:
                 warning("no meta_store")
                 D = None
               else:
                 with meta_store:
                   D = topdir.makedirs(rdirpath, force=True)
                   # prune removed names
                   names = list(D.keys())
                   for name in names:
                     if name not in dirnames and name not in filenames:
                       info("del %r", name)
                       del D[name]
               for filename in filenames:
                 with Pfx(filename):
                   if self.cancelled or self.flag_scan_disable:
                     break
                   rfilepath = joinpath(rdirpath, filename)
                   if self.exclude_file(rfilepath):
                     continue
                   filepath = joinpath(dirpath, filename)
                   if not isfilepath(filepath):
                     continue
                   # look up this file in our file state index
                   DFstate = filemap.get(rfilepath)
                   if (DFstate is not None and D is not None
                       and filename not in D):
                     # in filemap, but not in dir: start again
                     warning("in filemap but not in Dir, rescanning")
                     filemap.del_path(rfilepath)
                     DFstate = None
                   if DFstate is None:
                     DFstate = filemap.add_path(rfilepath)
                   try:
                     new_size = DFstate.stat_size(self.follow_symlinks)
                   except OSError as e:
                     if e.errno == errno.ENOENT:
                       warning("forgetting missing file")
                       self._del_datafilestate(DFstate)
                     else:
                       warning("stat: %s", e)
                     continue
                   if new_size is None:
                     # skip non files
                     debug("SKIP non-file")
                     continue
                   if meta_store:
                     try:
                       E = D[filename]
                     except KeyError:
                       E = FileDirent(filename)
                       D[filename] = E
                     else:
                       if not E.isfile:
                         info(
                             "new FileDirent replacing previous nonfile: %s",
                             E
                         )
                         E = FileDirent(filename)
                         D[filename] = E
                   if new_size > DFstate.scanned_to:
                     with proxy.extend_prefix(
                         " scan %s[%d:%d]" %
                         (filename, DFstate.scanned_to, new_size)):
                       if DFstate.scanned_to > 0:
                         info("scan from %d", DFstate.scanned_to)
                       if meta_store is not None:
                         blockQ = IterableQueue()
                         R = meta_store._defer(
                             lambda B, Q: top_block_for(spliced_blocks(B, Q)),
                             E.block, blockQ
                         )
                       scan_from = DFstate.scanned_to
                       scan_start = time()
                       scanner = DFstate.scanfrom(offset=DFstate.scanned_to)
                       if defaults.show_progress:
                         scanner = progressbar(
                              scanner,
                             "scan " + rfilepath,
                             position=DFstate.scanned_to,
                             total=new_size,
                             units_scale=BINARY_BYTES_SCALE,
                             itemlenfunc=lambda t3: t3[2] - t3[0],
                             update_frequency=128,
                         )
                       for pre_offset, data, post_offset in scanner:
                         hashcode = self.hashclass.from_chunk(data)
                         entry = FileDataIndexEntry(
                             filenum=DFstate.filenum,
                             data_offset=pre_offset,
                             data_length=len(data),
                             flags=0,
                         )
                         entry_bs = bytes(entry)
                         with self._lock:
                           index[hashcode] = entry_bs
                         if meta_store is not None:
                           B = Block(data=data, hashcode=hashcode, added=True)
                           blockQ.put((pre_offset, B))
                         DFstate.scanned_to = post_offset
                         if self.cancelled or self.flag_scan_disable:
                           break
                     if meta_store is not None:
                       blockQ.close()
                       try:
                         top_block = R()
                       except MissingHashcodeError as e:
                         error("missing data, forcing rescan: %s", e)
                         DFstate.scanned_to = 0
                       else:
                         E.block = top_block
                         D.changed = True
                         updated = True
                     elapsed = time() - scan_start
                     scanned = DFstate.scanned_to - scan_from
                     if elapsed > 0:
                       scan_rate = scanned / elapsed
                     else:
                       scan_rate = None
                     if scan_rate is None:
                       info(
                           "scanned to %d: %s", DFstate.scanned_to,
                           transcribe_bytes_geek(scanned)
                       )
                     else:
                       info(
                           "scanned to %d: %s at %s/s", DFstate.scanned_to,
                           transcribe_bytes_geek(scanned),
                           transcribe_bytes_geek(scan_rate)
                       )
                     # stall after a file scan, briefly, to limit impact
                     if elapsed > 0:
                       sleep(min(elapsed, self.DELAY_INTRASCAN))
           # update the archive after updating from a directory
           if updated and meta_store is not None:
             self.sync_meta()
             updated = False
     self.flush()
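The block-splicing step above follows a recurring IterableQueue pattern: a consumer is dispatched first, items are put onto the queue as they are produced, and the queue is closed to signal completion before the consumer's result is collected. A minimal standalone sketch of that pattern, assuming `IterableQueue` and `bg_thread` are importable as in these examples:

# Sketch only: feed a queue from a producer, close it, collect the result.
def consume(Q, results):
  # Drain the queue until it is closed, recording the total payload size.
  total = 0
  for offset, data in Q:
    total += len(data)
  results.append(total)

results = []
Q = IterableQueue()
T = bg_thread(lambda: consume(Q, results))
for offset, data in ((0, b'abc'), (3, b'defg')):
  Q.put((offset, data))
Q.close()       # no more items: the consumer's iteration ends
T.join()        # wait for the consumer to finish
print(results)  # [7]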
Example #16
    def __init__(self,
                 recv,
                 send,
                 request_handler=None,
                 name=None,
                 packet_grace=None,
                 tick=None):
        ''' Initialise the PacketConnection.

        Parameters:
        * `recv`: inbound binary stream.
          If this is an `int` it is taken to be an OS file descriptor,
          otherwise it should be a `cs.buffer.CornuCopyBuffer`
          or a file like object with a `read1` or `read` method.
        * `send`: outbound binary stream.
          If this is an `int` it is taken to be an OS file descriptor,
          otherwise it should be a file like object with `.write(bytes)`
          and `.flush()` methods.
          For a file descriptor sending is done via an os.dup() of
          the supplied descriptor, so the caller remains responsible
          for closing the original descriptor.
        * `packet_grace`:
          default pause in the packet sending worker
          to allow another packet to be queued
          before flushing the output stream.
          Default: `DEFAULT_PACKET_GRACE` seconds.
          A value of `0` will flush immediately if the queue is empty.
        * `request_handler`: an optional callable accepting
          (`rq_type`, `flags`, `payload`).
          The request_handler may return one of 5 values on success:
          * `None`: response will be 0 flags and an empty payload.
          * `int`: flags only. Response will be the flags and an empty payload.
          * `bytes`: payload only. Response will be 0 flags and the payload.
          * `str`: payload only. Response will be 0 flags and the str
                  encoded as bytes using UTF-8.
          * `(int, bytes)`: Specify flags and payload for response.
          An unsuccessful request should raise an exception, which
          will cause a failure response packet.
        * `tick`: optional tick parameter, default `None`.
          If `None`, do nothing.
          If a Boolean, call `tick_fd_2` if true, otherwise do nothing.
          Otherwise `tick` should be a callable accepting a byteslike value.
    '''
        if name is None:
            name = str(seq())
        self.name = name
        if isinstance(recv, int):
            self._recv = CornuCopyBuffer.from_fd(recv)
        elif isinstance(recv, CornuCopyBuffer):
            self._recv = recv
        else:
            self._recv = CornuCopyBuffer.from_file(recv)
        if isinstance(send, int):
            self._send = os.fdopen(os.dup(send), 'wb')
        else:
            self._send = send
        if packet_grace is None:
            packet_grace = DEFAULT_PACKET_GRACE
        if tick is None:
            tick = lambda bs: None
        elif isinstance(tick, bool):
            if tick:
                tick = tick_fd_2
            else:
                tick = lambda bs: None
        self.packet_grace = packet_grace
        self.request_handler = request_handler
        self.tick = tick
        # tags of requests in play against the local system
        self._channel_request_tags = {0: set()}
        self.notify_recv_eof = set()
        self.notify_send_eof = set()
        # LateFunctions for the requests we are performing for the remote system
        self._running = set()
        # requests we have outstanding against the remote system
        self._pending = {0: {}}
        # sequence of tag numbers
        # TODO: later, reuse old tags to prevent monotonic growth of tag field
        self._tag_seq = Seq(1)
        # work queue for local requests
        self._later = Later(4, name="%s:Later" % (self, ))
        self._later.open()
        # dispatch queue of Packets to send
        self._sendQ = IterableQueue(16)
        self._lock = Lock()
        self.closed = False
        # debugging: check for reuse of (channel,tag) etc
        self.__sent = set()
        self.__send_queued = set()
        # dispatch Thread to process received packets
        self._recv_thread = bg_thread(self._receive_loop,
                                      name="%s[_receive_loop]" % (self.name, ))
        # dispatch Thread to send data
        # primary purpose is to bundle output by deferring flushes
        self._send_thread = bg_thread(self._send_loop,
                                      name="%s[_send]" % (self.name, ))
Example #17
class FileDataMappingProxy(MultiOpenMixin, RunStateMixin):
    ''' Mapping-like class to cache data chunks, bypassing gdbm indices and the like.
      Data are saved immediately into an in-memory cache; an asynchronous
      worker then copies new data into a cache file and also to the backend
      storage.
  '''
    @pfx_method
    def __init__(
        self,
        backend,
        *,
        dirpath=None,
        max_cachefile_size=None,
        max_cachefiles=None,
        runstate=None,
    ):
        ''' Initialise the cache.

        Parameters:
        * `backend`: mapping underlying us
        * `dirpath`: directory to store cache files
        * `max_cachefile_size`: maximum cache file size; a new cache
          file is created if this is exceeded; default:
          DEFAULT_CACHEFILE_HIGHWATER
        * `max_cachefiles`: number of cache files to keep around; no
          more than this many cache files are kept at a time; default:
          DEFAULT_MAX_CACHEFILES
    '''
        RunStateMixin.__init__(self, runstate=runstate)
        if max_cachefile_size is None:
            max_cachefile_size = DEFAULT_CACHEFILE_HIGHWATER
        if max_cachefiles is None:
            max_cachefiles = DEFAULT_MAX_CACHEFILES
        self.backend = backend
        if not isdirpath(dirpath):
            raise ValueError("dirpath=%r: not a directory" % (dirpath, ))
        self.dirpath = dirpath
        self.max_cachefile_size = max_cachefile_size
        self.max_cachefiles = max_cachefiles
        self.cached = {}  # map h => data
        self.saved = {}  # map h => CachedData(cachefile, offset, length)
        self._lock = Lock()
        self.cachefiles = []
        self._add_cachefile()
        self._workQ = None
        self._worker = None
        self.runstate.notify_cancel.add(lambda rs: self.close())

    def startup(self):
        ''' Startup the proxy.
    '''
        self._workQ = IterableQueue()
        self._worker = Thread(name="%s WORKER" % (self, ), target=self._work)
        self._worker.start()

    @pfx_method
    def shutdown(self):
        ''' Shut down the cache.
        Stop the worker, close the file cache.
    '''
        self._workQ.close()
        self._worker.join()
        if self.cached:
            error("blocks still in memory cache: %r", self.cached)
        for cachefile in self.cachefiles:
            cachefile.close()

    def _add_cachefile(self):
        cachefile = RWFileBlockCache(dirpath=self.dirpath)
        self.cachefiles.insert(0, cachefile)
        if len(self.cachefiles) > self.max_cachefiles:
            old_cachefile = self.cachefiles.pop()
            old_cachefile.close()

    def _getref(self, h):
        ''' Fetch a cache reference from self.saved, return None if missing.
        Automatically prune stale saved entries if the cachefile is closed.
    '''
        saved = self.saved
        ref = saved.get(h)
        if ref is not None:
            if ref.cachefile.closed:
                ref = None
                del saved[h]
        return ref

    def __contains__(self, h):
        ''' Mapping method supporting "in".
    '''
        with self._lock:
            if h in self.cached:
                return True
            if self._getref(h) is not None:
                return True
        backend = self.backend
        if backend:
            return h in backend
        return False

    def keys(self):
        ''' Mapping method for .keys.
    '''
        seen = set()
        for h in list(self.cached.keys()):
            yield h
            seen.add(h)
        saved = self.saved
        with self._lock:
            saved_keys = list(saved.keys())
        for h in saved_keys:
            if h not in seen and self._getref(h):
                yield h
                seen.add(h)
        backend = self.backend
        if backend:
            for h in backend.keys():
                if h not in seen:
                    yield h

    def __getitem__(self, h):
        ''' Fetch the data with key `h`. Raise KeyError if missing.
    '''
        with self._lock:
            # fetch from memory
            try:
                data = self.cached[h]
            except KeyError:
                # fetch from file
                ref = self._getref(h)
                if ref is not None:
                    return ref.fetch()
            else:
                # straight from memory cache
                return data
        # not in memory or file cache: fetch from backend, queue store into cache
        backend = self.backend
        if not backend:
            raise KeyError('no backend: h=%s' % (h, ))
        data = backend[h]
        with self._lock:
            self.cached[h] = data
        self._workQ.put((h, data, True))  # fetched from the backend, already stored there
        return data

    def __setitem__(self, h, data):
        ''' Store `data` against key `h`.
    '''
        with self._lock:
            if h in self.cached:
                # in memory cache, do not save
                return
            if self._getref(h):
                # in file cache, do not save
                return
            # save in memory cache
            self.cached[h] = data
        # queue for file cache and backend
        self._workQ.put((h, data, False))  # new data, not yet in the backend

    def _work(self):
        for h, data, in_backend in self._workQ:
            with self._lock:
                if self._getref(h):
                    # already in file cache, therefore already sent to backend
                    continue
            cachefile = self.cachefiles[0]
            offset = cachefile.put(data)
            with self._lock:
                self.saved[h] = CachedData(cachefile, offset, len(data))
                # release memory cache entry
                try:
                    del self.cached[h]
                except KeyError:
                    pass
                if offset + len(data) >= self.max_cachefile_size:
                    # roll over to new cache file
                    self._add_cachefile()
            # store into the backend
            if not in_backend:
                backend = self.backend
                if backend:
                    self.backend[h] = data
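A minimal usage sketch for the proxy above, assuming the class and its dependencies are importable; a plain dict stands in for the backend and a temporary directory holds the cache files (both are illustrative choices):

# Illustrative only: dict backend, temporary cache directory.
from tempfile import TemporaryDirectory

backend = {}
with TemporaryDirectory() as tmpdir:
    cache = FileDataMappingProxy(backend, dirpath=tmpdir)
    cache.startup()                    # start the worker Thread
    try:
        cache[b'h1'] = b'some data'    # memory cache now, file and backend soon
        assert cache[b'h1'] == b'some data'
    finally:
        cache.shutdown()               # close the work queue, join the worker
assert backend[b'h1'] == b'some data'  # copied to the backend by the worker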
Example #18
class POP3(MultiOpenMixin):
  ''' Simple POP3 class with support for streaming use.
  '''

  def __init__(self, conn_spec):
    if isinstance(conn_spec, str):
      conn_spec = ConnectionSpec.from_spec(conn_spec)
    self.conn_spec = conn_spec
    self._result_queue = None
    self._client_worker = None
    self._sock = None
    self.recvf = None
    self.sendf = None
    self._lock = RLock()

  @pfx
  def startup(self):
    ''' Connect to the server and log in.
    '''
    self._sock = self.conn_spec.connect()
    self.recvf = self._sock.makefile('r', encoding='iso8859-1')
    self.sendf = self._sock.makefile('w', encoding='ascii')
    self.client_begin()
    self.client_auth(self.conn_spec.user, self.conn_spec.password)
    self._result_queue = IterableQueue()
    self._client_worker = bg_thread(
        self._client_response_worker, args=(self._result_queue,)
    )
    return self

  @pfx
  def shutdown(self):
    ''' Quit and disconnect.
    '''
    logmsg = debug
    logmsg("send client QUIT")
    try:
      quitR = self.client_quit_bg()
      logmsg("flush QUIT")
      self.flush()
      logmsg("join QUIT")
      quitR.join()
    except Exception as e:
      exception("client quit: %s", e)
      logmsg = warning
    if self._result_queue:
      logmsg("close result queue")
      self._result_queue.close()
      self._result_queue = None
    if self._client_worker:
      logmsg("join client worker")
      self._client_worker.join()
      self._client_worker = None
    logmsg("close sendf")
    self.sendf.close()
    self.sendf = None
    logmsg("check for uncollected server responses")
    bs = self.recvf.read()
    if bs:
      warning("received %d bytes from the server at shutdown", len(bs))
    logmsg("close recvf")
    self.recvf.close()
    self.recvf = None
    logmsg("close socket")
    self._sock.close()
    self._sock = None
    logmsg("shutdown complete")

  def readline(self):
    ''' Read a CRLF terminated line from `self.recvf`.
        Return the text preceding the CRLF.
        Return `None` at EOF.
    '''
    line0 = self.recvf.readline()
    if not line0:
      return None
    line = cutsuffix(line0, '\n')
    assert line is not line0, "missing LF: %r" % (line0,)
    line = cutsuffix(line, '\r')
    return line

  def readlines(self):
    ''' Generator yielding lines from `self.recvf`.
    '''
    while True:
      line = self.readline()
      if line is None:
        break
      yield line

  def get_response(self):
    ''' Read a server response.
        Return `(ok,status,etc)`
        where `ok` is true if `status` is `'+OK'`, false otherwise;
        `status` is the status word
        and `etc` is the following text.
        Return `(None,None,None)` on EOF from the receive stream.
    '''
    line = self.readline()
    if line is None:
      return None, None, None
    try:
      status, etc = line.split(None, 1)
    except ValueError:
      status = line
      etc = ''
    return status == '+OK', status, etc

  def get_ok(self):
    ''' Read server response, require it to be `'+OK'`.
        Returns the `etc` part.
    '''
    ok, status, etc = self.get_response()
    if not ok:
      raise ValueError("no ok from server: %r %r" % (status, etc))
    return etc

  def get_multiline(self):
    ''' Generator yielding unstuffed lines from a multiline response.
    '''
    for line in self.readlines():
      if line == '.':
        break
      if line.startswith('.'):
        line = line[1:]
      yield line

  def flush(self):
    ''' Flush the send stream.
    '''
    self.sendf.flush()

  def sendline(self, line, do_flush=False):
    ''' Send a line (excluding its terminating CRLF).
        If `do_flush` is true (default `False`)
        also flush the sending stream.
    '''
    assert '\r' not in line and '\n' not in line
    self.sendf.write(line)
    self.sendf.write('\r\n')
    if do_flush:
      self.flush()

  def _client_response_worker(self, result_queue):
    ''' Worker to process queued request responses.
        Each completed response assigns `[etc,lines]` to the `Result`
        where `etc` is the additional text from the server's ok response
        and `lines` is a list of the lines of the multiline part of the
        response, or `None` if the response is not multiline.
    '''
    for R, is_multiline in result_queue:
      try:
        etc = self.get_ok()
        if is_multiline:
          lines = list(self.get_multiline())
        else:
          lines = None
      except Exception as e:  # pylint: disable=broad-except
        warning("%s: %s", R, e)
        R.exc_info = sys.exc_info()
      else:
        # save a list so that we can erase it in a handler to release memory
        R.result = [etc, lines]

  def client_begin(self):
    ''' Read the opening server response.
    '''
    etc = self.get_ok()
    print(etc)

  def client_auth(self, user, password):
    ''' Perform a client authentication.
    '''
    self.sendline(f'USER {user}', do_flush=True)
    print('USER', user, self.get_ok())
    self.sendline(f'PASS {password}', do_flush=True)
    print('PASS', '****', self.get_ok())

  def client_uidl(self):
    ''' Return a mapping of message number to message UID string.
    '''
    self.sendline('UIDL', do_flush=True)
    self.get_ok()
    for line in self.get_multiline():
      n, msg_uid = line.split(None, 1)
      n = int(n)
      yield n, msg_uid

  def client_bg(self, rq_line, is_multiline=False, notify=None):
    ''' Dispatch a request `rq_line` in the background.
        Return a `Result` to collect the request result.

        Parameters:
        * `rq_line`: POP3 request text, without any terminating CRLF
        * `is_multiline`: true if a multiline response is expected,
          default `False`
        * `notify`: an optional handler for `Result.notify`,
          applied if not `None`

        *Note*: DOES NOT flush the send stream.
        Call `self.flush()` when a batch of requests has been submitted,
        before trying to collect the `Result`s.

        The `Result` will receive `[etc,lines]` on success
        where:
        * `etc` is the trailing portion of an ok response line
        * `lines` is a list of unstuffed text lines from the response
          if `is_multiline` is true, `None` otherwise
        The `Result` gets a list instead of a tuple
        so that a handler may clear it in order to release memory.

        Example:

            R = self.client_bg(f'RETR {msg_n}', is_multiline=True, notify=notify)
    '''
    with self._lock:
      self.sendline(rq_line)
      R = Result(rq_line)
      self._result_queue.put((R, is_multiline))
    R.extra.update(rq_line=rq_line)
    if notify is not None:
      R.notify(notify)
    return R

  def client_dele_bg(self, msg_n):
    ''' Queue a delete request for message `msg_n`,
        return a `Result` for collection.
    '''
    R = self.client_bg(f'DELE {msg_n}')
    R.extra.update(msg_n=msg_n)
    return R

  def client_quit_bg(self):
    ''' Queue a QUIT request.
        return a `Result` for collection.
    '''
    R = self.client_bg('QUIT')
    return R

  def client_retr_bg(self, msg_n, notify=None):
    ''' Queue a retrieve request for message `msg_n`,
        return a `Result` for collection.

        If `notify` is not `None`, apply it to the `Result`.
    '''
    R = self.client_bg(f'RETR {msg_n}', is_multiline=True, notify=notify)
    R.extra.update(msg_n=msg_n)
    return R

  def dl_bg(self, msg_n, maildir, deleRs):
    ''' Download message `msg_n` to Maildir `maildir`.
        Return the `Result` for the `RETR` request.

        After a successful save,
        queue a `DELE` for the message
        and add its `Result` to `deleRs`.
    '''

    def dl_bg_save_result(R):
      _, lines = R.result
      R.result[1] = None  # release lines
      msg_bs = b''.join(
          map(lambda line: line.encode('iso8859-1') + b'\r\n', lines)
      )
      msg = BytesParser().parsebytes(msg_bs)
      with self._lock:
        Mkey = maildir.add(msg)
        deleRs.add(self.client_dele_bg(msg_n))
      print(f'msg {msg_n}: {len(msg_bs)} octets, saved as {Mkey}, deleted.')

    R = self.client_retr_bg(msg_n, notify=dl_bg_save_result)
    return R
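A hypothetical batch-download session following the pattern described in `client_bg` and `dl_bg`; the connection spec string and the Maildir path are illustrative guesses, not a documented format:

# Illustrative only: spec string and paths are assumptions.
from mailbox import Maildir

pop3 = POP3('user:password@pop.example.com').startup()
try:
  maildir = Maildir('/tmp/demo-maildir')    # hypothetical destination
  msg_uids = dict(pop3.client_uidl())       # message number -> UID
  deleRs = set()                            # DELE Results added by dl_bg
  retrRs = [pop3.dl_bg(n, maildir, deleRs) for n in msg_uids]
  pop3.flush()               # client_bg does not flush; flush the whole batch
  for R in retrRs:
    R.join()                 # wait for each RETR and its save handler
  pop3.flush()               # flush the DELEs queued by the save handlers
  for R in list(deleRs):
    R.join()
finally:
  pop3.shutdown()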
Example #19
class SubLater(object):
  ''' A class for managing a group of deferred tasks using an existing `Later`.
  '''

  def __init__(self, L):
    ''' Initialise the `SubLater` with its parent `Later`.

        TODO: accept `discard=False` param to suppress the queue and
        associated checks.
    '''
    self._later = L
    self._later.open()
    self._lock = Lock()
    self._deferred = 0
    self._queued = 0
    self._queue = IterableQueue()
    self.closed = False

  def __str__(self):
    return "%s(%s%s,deferred=%d,completed=%d)" % (
        type(self),
        self._later,
        "[CLOSED]" if self.closed else "",
        self._deferred,
        self._queued,
    )

  def __iter__(self):
    ''' Iteration over the `SubLater`
        iterates over the queue of completed `LateFunction`s.
    '''
    return iter(self._queue)

  def close(self):
    ''' Close the SubLater.

        This prevents further deferrals.
    '''
    with self._lock:
      closed = self.closed
      if closed:
        self._later.warning("repeated close of %s", self)
      else:
        self.closed = True
        self._queue.close()
        self._later.close()

  def defer(self, func, *a, **kw):
    ''' Defer a function, return its `LateFunction`.

        The resulting `LateFunction` will queue itself for collection
        on completion.
    '''
    with self._lock:
      LF = self._later.defer(func, *a, **kw)
      self._deferred += 1

      def on_complete(R):
        with self._lock:
          self._queue.put(R)
          self._queued += 1
          if self.closed and self._queued >= self._deferred:
            self._queue.close()

    LF.notify(on_complete)
    return LF

  def reaper(self, handler=None):
    ''' Dispatch a `Thread` to collect completed `LateFunction`s.
        Return the `Thread`.

        `handler`: an optional callable to be passed each `LateFunction`
        as it completes.
    '''

    @logexc
    def reap(Q):
      for LF in Q:
        if handler:
          try:
            handler(LF)
          except Exception as e:  # pylint: disable=broad-except
            exception("%s: reap %s: %s", self, LF, e)

    T = Thread(name="reaper(%s)" % (self,), target=reap, args=(self._queue,))
    T.start()
    return T
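A minimal usage sketch, assuming `Later` is importable as in the other examples; the deferred work is illustrative:

# Illustrative only: square some numbers via a SubLater.
L = Later(4, name="demo")
L.open()
sub = SubLater(L)
LFs = [sub.defer(lambda i=i: i * i) for i in range(4)]
for LF in LFs:
  print(LF())    # calling a LateFunction waits for and returns its result
sub.close()      # no more deferrals
L.close()
# Alternatively, sub.reaper(handler) dispatches a Thread which passes each
# completed LateFunction to handler as it arrives.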
Example #20
    def startup(self):
        ''' Startup the proxy.
    '''
        self._workQ = IterableQueue()
        self._worker = Thread(name="%s WORKER" % (self, ), target=self._work)
        self._worker.start()
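This fragment repeats the proxy startup from Example #17; the matching shutdown closes the queue so the worker's iteration ends. A generic sketch of that pairing, assuming only `IterableQueue` from these examples:

# Generic worker-queue lifecycle, a sketch of the pattern used above.
from threading import Thread

class QueueWorker:
    def startup(self):
        self._workQ = IterableQueue()
        self._worker = Thread(target=self._work)
        self._worker.start()

    def shutdown(self):
        self._workQ.close()    # ends the worker's iteration
        self._worker.join()    # wait for remaining items to be processed

    def _work(self):
        for item in self._workQ:
            pass               # process item here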
Example #21
class PacketConnection(object):
    ''' A bidirectional binary connection for exchanging requests and responses.
  '''

    # special packet indicating end of stream
    EOF_Packet = Packet(is_request=True,
                        channel=0,
                        tag=0,
                        flags=0,
                        rq_type=0,
                        payload=b'')

    # pylint: disable=too-many-arguments
    def __init__(self,
                 recv,
                 send,
                 request_handler=None,
                 name=None,
                 packet_grace=None,
                 tick=None):
        ''' Initialise the PacketConnection.

        Parameters:
        * `recv`: inbound binary stream.
          If this is an `int` it is taken to be an OS file descriptor,
          otherwise it should be a `cs.buffer.CornuCopyBuffer`
          or a file like object with a `read1` or `read` method.
        * `send`: outbound binary stream.
          If this is an `int` it is taken to be an OS file descriptor,
          otherwise it should be a file like object with `.write(bytes)`
          and `.flush()` methods.
          For a file descriptor sending is done via an os.dup() of
          the supplied descriptor, so the caller remains responsible
          for closing the original descriptor.
        * `packet_grace`:
          default pause in the packet sending worker
          to allow another packet to be queued
          before flushing the output stream.
          Default: `DEFAULT_PACKET_GRACE` seconds.
          A value of `0` will flush immediately if the queue is empty.
        * `request_handler`: an optional callable accepting
          (`rq_type`, `flags`, `payload`).
          The request_handler may return one of 5 values on success:
          * `None`: response will be 0 flags and an empty payload.
          * `int`: flags only. Response will be the flags and an empty payload.
          * `bytes`: payload only. Response will be 0 flags and the payload.
          * `str`: payload only. Response will be 0 flags and the str
                  encoded as bytes using UTF-8.
          * `(int, bytes)`: Specify flags and payload for response.
          An unsuccessful request should raise an exception, which
          will cause a failure response packet.
        * `tick`: optional tick parameter, default `None`.
          If `None`, do nothing.
          If a Boolean, call `tick_fd_2` if true, otherwise do nothing.
          Otherwise `tick` should be a callable accepting a byteslike value.
    '''
        if name is None:
            name = str(seq())
        self.name = name
        if isinstance(recv, int):
            self._recv = CornuCopyBuffer.from_fd(recv)
        elif isinstance(recv, CornuCopyBuffer):
            self._recv = recv
        else:
            self._recv = CornuCopyBuffer.from_file(recv)
        if isinstance(send, int):
            self._send = os.fdopen(os.dup(send), 'wb')
        else:
            self._send = send
        if packet_grace is None:
            packet_grace = DEFAULT_PACKET_GRACE
        if tick is None:
            tick = lambda bs: None
        elif isinstance(tick, bool):
            if tick:
                tick = tick_fd_2
            else:
                tick = lambda bs: None
        self.packet_grace = packet_grace
        self.request_handler = request_handler
        self.tick = tick
        # tags of requests in play against the local system
        self._channel_request_tags = {0: set()}
        self.notify_recv_eof = set()
        self.notify_send_eof = set()
        # LateFunctions for the requests we are performing for the remote system
        self._running = set()
        # requests we have outstanding against the remote system
        self._pending = {0: {}}
        # sequence of tag numbers
        # TODO: later, reuse old tags to prevent monotonic growth of tag field
        self._tag_seq = Seq(1)
        # work queue for local requests
        self._later = Later(4, name="%s:Later" % (self, ))
        self._later.open()
        # dispatch queue of Packets to send
        self._sendQ = IterableQueue(16)
        self._lock = Lock()
        self.closed = False
        # debugging: check for reuse of (channel,tag) etc
        self.__sent = set()
        self.__send_queued = set()
        # dispatch Thread to process received packets
        self._recv_thread = bg_thread(self._receive_loop,
                                      name="%s[_receive_loop]" % (self.name, ))
        # dispatch Thread to send data
        # primary purpose is to bundle output by deferring flushes
        self._send_thread = bg_thread(self._send_loop,
                                      name="%s[_send]" % (self.name, ))

    def __str__(self):
        return "PacketConnection[%s]" % (self.name, )

    @pfx_method
    def shutdown(self, block=False):
        ''' Shut down the PacketConnection, optionally blocking for outstanding requests.

        Parameters:
        * `block`: block for outstanding requests, default `False`.
    '''
        with self._lock:
            if self.closed:
                # shutdown already called from another thread
                return
            # prevent further request submission either local or remote
            self.closed = True
        ps = self._pending_states()
        if ps:
            warning("PENDING STATES AT SHUTDOWN: %r", ps)
        # wait for completion of requests we're performing
        for LF in list(self._running):
            LF.join()
        # shut down sender, should trigger shutdown of remote receiver
        self._sendQ.close(enforce_final_close=True)
        self._send_thread.join()
        # we do not wait for the receiver - anyone hanging on outstanding
        # requests will get them as they come in, and in theory a network
        # disconnect might leave the receiver hanging anyway
        self._later.close()
        if block:
            self._later.wait()

    def join(self):
        ''' Wait for the receive side of the connection to terminate.
    '''
        self._recv_thread.join()

    def _new_tag(self):
        return next(self._tag_seq)

    def _pending_states(self):
        ''' Return a list of ( (channel, tag), Request_State ) for the currently pending requests.
    '''
        states = []
        pending = self._pending
        for channel, channel_states in sorted(pending.items()):
            for tag, channel_state in sorted(channel_states.items()):
                states.append(((channel, tag), channel_state))
        return states

    @locked
    def _pending_add(self, channel, tag, state):
        ''' Record some state against a (channel, tag).
    '''
        pending = self._pending
        if channel not in pending:
            raise ValueError("unknown channel %d" % (channel, ))
        channel_info = pending[channel]
        if tag in channel_info:
            raise ValueError("tag %d already pending in channel %d" %
                             (tag, channel))
        self._pending[channel][tag] = state

    @locked
    def _pending_pop(self, channel, tag):
        ''' Retrieve and remove the state associated with (channel, tag).
    '''
        pending = self._pending
        if channel not in pending:
            raise ValueError("unknown channel %d" % (channel, ))
        channel_info = pending[channel]
        if tag not in channel_info:
            raise ValueError("tag %d unknown in channel %d" % (tag, channel))
        # debugging hook, deliberately disabled; flip the False to exercise
        # the failure path for tag 15
        if False and tag == 15:
            raise RuntimeError("BANG")
        return channel_info.pop(tag)

    def _pending_cancel(self):
        ''' Cancel all the pending requests.
    '''
        for chtag, _ in self._pending_states():
            channel, tag = chtag
            warning("%s: cancel pending request %d:%s", self, channel, tag)
            _, result = self._pending_pop(channel, tag)
            result.cancel()

    def _queue_packet(self, P):
        sig = (P.channel, P.tag, P.is_request)
        if sig in self.__send_queued:
            raise RuntimeError("requeue of %s: %s" % (sig, P))
        self.__send_queued.add(sig)
        try:
            self._sendQ.put(P)
        except ClosedError as e:
            warning("%s: packet not sent: %s (P=%s)", self._sendQ, e, P)

    def _reject(self, channel, tag, payload=bytes(())):
        ''' Issue a rejection of the specified request.
    '''
        error("rejecting request: " + str(payload))
        if isinstance(payload, str):
            payload = payload.encode('utf-8')
        self._queue_packet(
            Packet(is_request=False,
                   channel=channel,
                   tag=tag,
                   flags=0,
                   payload=payload))

    def _respond(self, channel, tag, flags, payload):
        ''' Issue a valid response.
        Tack a 1 (ok) flag onto the flags and dispatch.
    '''
        assert isinstance(channel, int)
        assert isinstance(tag, int)
        assert isinstance(flags, int)
        assert isinstance(payload, bytes)
        flags = (flags << 1) | 1
        self._queue_packet(
            Packet(is_request=False,
                   channel=channel,
                   tag=tag,
                   flags=flags,
                   payload=payload))

    @not_closed
    # pylint: disable=too-many-arguments
    def request(self,
                rq_type,
                flags=0,
                payload=b'',
                decode_response=None,
                channel=0):
        ''' Compose and dispatch a new request; return a `Result`.

        Allocates a new tag, a Result to deliver the response, and
        records the response decode function for use when the
        response arrives.

        Parameters:
        * `rq_type`: request type code, an int
        * `flags`: optional flags to accompany the request, an int;
          default `0`.
        * `payload`: optional bytes-like object to accompany the request;
          default `b''`
        * `decode_response`: optional callable accepting (response_flags,
          response_payload_bytes) and returning the decoded response payload
          value; if unspecified, the response payload bytes are used

        The Result will yield an `(ok, flags, payload)` tuple, where:
        * `ok`: whether the request was successful
        * `flags`: the response flags
        * `payload`: the response payload, decoded by decode_response
          if specified
    '''
        if rq_type < 0:
            raise ValueError("rq_type may not be negative (%s)" % (rq_type, ))
        # reserve type 0 for end-of-requests
        rq_type += 1
        tag = self._new_tag()
        R = Result()
        self._pending_add(channel, tag, Request_State(decode_response, R))
        self._queue_packet(
            Packet(is_request=True,
                   channel=channel,
                   tag=tag,
                   flags=flags,
                   rq_type=rq_type,
                   payload=payload))
        return R

    @not_closed
    def do(self, *a, **kw):
        ''' Synchronous request.
        Submits the request, then calls the `Result` returned from the request.
    '''
        return self.request(*a, **kw)()

    @logexc
    # pylint: disable=too-many-arguments
    def _run_request(self, channel, tag, handler, rq_type, flags, payload):
        ''' Run a request and queue a response packet.
    '''
        with Pfx(
                "_run_request[channel=%d,tag=%d,rq_type=%d,flags=0x%02x,payload=%s",
                channel, tag, rq_type, flags,
                repr(payload) if len(payload) <= 32 else repr(payload[:32]) +
                '...'):
            result_flags = 0
            result_payload = b''
            try:
                result = handler(rq_type, flags, payload)
                if result is not None:
                    if isinstance(result, int):
                        result_flags = result
                    elif isinstance(result, bytes):
                        result_payload = result
                    elif isinstance(result, str):
                        result_payload = result.encode(
                            encoding='utf-8', errors='xmlcharrefreplace')
                    else:
                        result_flags, result_payload = result
            except Exception as e:  # pylint: disable=broad-except
                exception("exception: %s", e)
                self._reject(channel, tag, "exception during handler")
            else:
                self._respond(channel, tag, result_flags, result_payload)
            self._channel_request_tags[channel].remove(tag)

    # pylint: disable=too-many-branches,too-many-statements,too-many-locals
    def _receive_loop(self):
        ''' Receive packets from upstream, decode into requests and responses.
    '''
        XX = self.tick
        with PrePfx("_RECEIVE [%s]", self):
            with post_condition(("_recv is None", lambda: self._recv is None)):
                while True:
                    try:
                        XX(b'<')
                        packet = Packet.parse(self._recv)
                    except EOFError:
                        break
                    if packet == self.EOF_Packet:
                        break
                    channel = packet.channel
                    tag = packet.tag
                    flags = packet.flags
                    payload = packet.payload
                    if packet.is_request:
                        # request from upstream client
                        with Pfx("request[%d:%d]", channel, tag):
                            if self.closed:
                                debug("rejecting request: closed")
                                # NB: no rejection packet sent since sender also closed
                            elif self.request_handler is None:
                                self._reject(channel, tag,
                                             "no request handler")
                            else:
                                requests = self._channel_request_tags
                                if channel not in requests:
                                    # unknown channel
                                    self._reject(channel, tag,
                                                 "unknown channel %d")
                                elif tag in self._channel_request_tags[
                                        channel]:
                                    self._reject(
                                        channel, tag,
                                        "channel %d: tag already in use: %d" %
                                        (channel, tag))
                                else:
                                    # payload for requests is the request enum and data
                                    rq_type = packet.rq_type
                                    if rq_type == 0:
                                        # magic EOF rq_type - must be malformed (!=EOF_Packet)
                                        error(
                                            "malformed EOF packet received: %s",
                                            packet)
                                        break
                                    # normalise rq_type
                                    rq_type -= 1
                                    requests[channel].add(tag)
                                    # queue the work function and track it
                                    LF = self._later.defer(
                                        self._run_request, channel, tag,
                                        self.request_handler, rq_type, flags,
                                        payload)
                                    self._running.add(LF)
                                    LF.notify(self._running.remove)
                    else:
                        with Pfx("response[%d:%d]", channel, tag):
                            # response: get state of matching pending request, remove state
                            try:
                                rq_state = self._pending_pop(channel, tag)
                            except ValueError as e:
                                # no such pending pair - response to unknown request
                                error("%d.%d: response to unknown request: %s",
                                      channel, tag, e)
                            else:
                                decode_response, R = rq_state
                                # first flag is "ok"
                                ok = (flags & 0x01) != 0
                                flags >>= 1
                                payload = packet.payload
                                if ok:
                                    # successful reply
                                    # return (True, flags, decoded-response)
                                    if decode_response is None:
                                        # return payload bytes unchanged
                                        R.result = (True, flags, payload)
                                    else:
                                        # decode payload
                                        try:
                                            result = decode_response(
                                                flags, payload)
                                        except Exception:  # pylint: disable=broad-except
                                            R.exc_info = sys.exc_info()
                                        else:
                                            R.result = (True, flags, result)
                                else:
                                    # unsuccessful: return (False, other-flags, payload-bytes)
                                    R.result = (False, flags, payload)
                # end of received packets: cancel any outstanding requests
                self._pending_cancel()
                # alert any listeners of receive EOF
                for notify in self.notify_recv_eof:
                    notify(self)
                self._recv = None
                self.shutdown()

    # pylint: disable=too-many-branches
    def _send_loop(self):
        ''' Send packets upstream.
        Write every packet directly to self._send.
        Flush whenever the queue is empty.
    '''
        XX = self.tick
        ##with Pfx("%s._send", self):
        with PrePfx("_SEND [%s]", self):
            with post_condition(("_send is None", lambda: self._send is None)):
                fp = self._send
                Q = self._sendQ
                grace = self.packet_grace
                for P in Q:
                    sig = (P.channel, P.tag, P.is_request)
                    if sig in self.__sent:
                        raise RuntimeError("second send of %s" % (P, ))
                    self.__sent.add(sig)
                    try:
                        XX(b'>')
                        for bs in P.transcribe_flat():
                            fp.write(bs)
                        if Q.empty():
                            # no immediately ready further packets: flush the output buffer
                            if grace > 0:
                                # allow a little time for further Packets to queue
                                XX(b'Sg')
                                sleep(grace)
                                if Q.empty():
                                    # still nothing
                                    XX(b'F')
                                    fp.flush()
                            else:
                                XX(b'F')
                                fp.flush()
                    except OSError as e:
                        if e.errno == errno.EPIPE:
                            warning("remote end closed")
                            break
                        raise
                try:
                    XX(b'>EOF')
                    for bs in self.EOF_Packet.transcribe_flat():
                        fp.write(bs)
                    fp.close()
                except (OSError, IOError) as e:
                    if e.errno == errno.EPIPE:
                        debug("remote end closed: %s", e)
                    elif e.errno == errno.EBADF:
                        warning("local end closed: %s", e)
                    else:
                        raise
                except Exception as e:
                    error("(_SEND) UNEXPECTED EXCEPTION: %s %s", e,
                          e.__class__)
                    raise
                self._send = None
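
A hypothetical end-to-end exchange over a socketpair (POSIX only, since raw file descriptors are used for both ends); the request type and handler are illustrative:

# Illustrative only: echo server and a single synchronous request.
import socket

def echo_handler(rq_type, flags, payload):
    # echo the payload back, flags unchanged
    return flags, payload

sock_a, sock_b = socket.socketpair()
server = PacketConnection(
    sock_a.fileno(), sock_a.fileno(),
    request_handler=echo_handler, name="server")
client = PacketConnection(sock_b.fileno(), sock_b.fileno(), name="client")

ok, flags, payload = client.do(0, payload=b'hello')
print(ok, flags, payload)      # expected: True 0 b'hello'

client.shutdown(block=True)
server.join()                  # wait for the server's receive side to finish
server.shutdown()
sock_a.close()
sock_b.close()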