Example #1
def mp_buffer(df, buffer_dist, n_workers):

    # array to sharedmem
    array = df[['unique_id','geometry']].to_records(index=False)
    shape, dtype = array.shape, array.dtype
    shm = SharedMemory(name='arr',create=True, size=array.nbytes)
    shm_array = np.recarray(shape=shape, dtype=dtype, buf=shm.buf)
    np.copyto(shm_array, array)
    
    shm_spec = {'name':'arr','shape':shape,'dtype':dtype}
    
    # do multiprocess
    chunk = len(df)//n_workers +1
    
    args = [(shm_spec, range(len(df))[ii*chunk:(ii+1)*chunk], buffer_dist) for ii in range(n_workers)]
    
    with mp.Pool(n_workers) as pool:
        res = pool.starmap(buffer_worker, args)
        
    res = [item for sublist in res for item in sublist]
    
    shm.close()
    shm.unlink()
        
    return [r[1] for r in res]
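The buffer_worker that mp_buffer dispatches to is not shown above. A minimal sketch of what it could look like, assuming the record array stores shapely geometry objects and that each worker returns (unique_id, buffered geometry) tuples (both assumptions, not part of the original snippet):

import numpy as np
from multiprocessing.shared_memory import SharedMemory

def buffer_worker(shm_spec, indices, buffer_dist):
    # Attach to the shared record array created by mp_buffer.
    shm = SharedMemory(name=shm_spec['name'])
    try:
        arr = np.recarray(shape=shm_spec['shape'], dtype=shm_spec['dtype'],
                          buf=shm.buf)
        # Buffer only the rows assigned to this worker; mp_buffer keeps the
        # second element of each returned tuple (the buffered geometry).
        return [(arr.unique_id[i], arr.geometry[i].buffer(buffer_dist))
                for i in indices]
    finally:
        # Close the local handle; mp_buffer unlinks the block afterwards.
        shm.close()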
Example #2
    def find_adv_batch(self, model: nn.Module, inputs: torch.Tensor,
                       inputs_adv_ref: torch.Tensor, labels: torch.Tensor,
                       epsilon: float, max_nr_adv):

        self._start_workers()

        with ensure_training_state(model, False):
            model_outputs: torch.Tensor = model(inputs_adv_ref)
            eval_model = model.cvt_to_eval()

        correct_mask = torch.eq(model_outputs.argmax(dim=1), labels)
        idx_remap = np.arange(inputs.shape[0], dtype=np.int32)
        np.random.shuffle(idx_remap)
        assert inputs.dtype == torch.float32

        shm_size = BUFFER_COUNTER_SIZE + 4 * inputs.numel()
        shm = SharedMemory(size=shm_size, create=True)
        shm.buf[:BUFFER_COUNTER_SIZE] = b'\0' * BUFFER_COUNTER_SIZE
        shm_name = shm.name

        args = RemoteArgs(eval_model, inputs, inputs_adv_ref, labels,
                          correct_mask, epsilon, max_nr_adv, shm_name,
                          shm_size, idx_remap)

        try:
            return self._work(shm.buf, args)
        except:
            self.close()
            raise
        finally:
            shm.close()
            shm.unlink()
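BUFFER_COUNTER_SIZE and the worker-side counter handling come from code that is not shown; the sketch below only illustrates one plausible way a worker could split the block into a counter header followed by a float32 payload matching inputs (32-bit counters and this exact layout are assumptions):

import numpy as np

def map_shared_buffer(buf, counter_size, inputs_shape):
    # Assumed layout: counter_size bytes of int32 counters, then the tensor.
    counters = np.frombuffer(buf, dtype=np.int32, count=counter_size // 4)
    payload = np.frombuffer(buf, dtype=np.float32, offset=counter_size,
                            count=int(np.prod(inputs_shape)))
    return counters, payload.reshape(inputs_shape)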
Example #3
    def __init__(self, args: RemoteArgs, progress_cb, is_master):
        self._args = args
        self._progress_cb = progress_cb
        self._make_verifier()

        shm = SharedMemory(args.shm_name, False)
        try:
            self._setup_shm(shm.buf)
            self._process()
            self._done_workers.fetch_add(1)
        except:
            # set to a negative value to mark failure
            self._read_idx.fetch_add(-args.inputs.shape[0]**2)
            raise
        finally:
            self._read_idx = None
            self._done_workers = None
            self._nr_adv_verifier = None
            self._nr_adv_ref = None
            self._output = None
            shm.close()
            # manual cleanup due to a bug (https://bugs.python.org/issue39959 )
            if not is_master and sys.version_info <= (3, 8, 2):
                from multiprocessing import shared_memory
                if shared_memory._USE_POSIX:
                    from multiprocessing.resource_tracker import unregister
                    unregister(shm._name, "shared_memory")
Example #4
 def from_dict(self, classname: str, d: dict):
     """convert dict from `ndarray_to_dict` back to np.ndarray"""
     shm = SharedMemory(name=d["shm"], create=False)
     array = np.ndarray(d["shape"], dtype=d["dtype"], buffer=shm.buf).copy()
     shm.close()
     shm.unlink()
     return array
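The docstring refers to an ndarray_to_dict helper that is not included here. A minimal sketch of what the producing side could look like, assuming only the dict keys that from_dict reads back ("shm", "shape", "dtype"); everything else is illustrative:

import numpy as np
from multiprocessing.shared_memory import SharedMemory

def ndarray_to_dict(array: np.ndarray) -> dict:
    # Copy the array into a new shared memory block and describe it with the
    # same keys that from_dict reads back.
    shm = SharedMemory(create=True, size=array.nbytes)
    view = np.ndarray(array.shape, dtype=array.dtype, buffer=shm.buf)
    view[:] = array[:]
    del view     # drop the exported buffer so the local handle can be closed
    shm.close()  # from_dict unlinks the block after copying the data out
    return {"shm": shm.name, "shape": array.shape, "dtype": array.dtype}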
Example #5
File: shm_sample.py  Project: zhyj3038/SDBI
def producer(conn):
    # os.environ["PYTHONWARNINGS"] = "ignore"
    feed_shm_name = '{}_{}_{}'.format('test', os.getpid(),
                                      threading.current_thread().ident)
    print('input shm name : {}'.format(feed_shm_name))

    feed_shm = SharedMemory(name=feed_shm_name, create=True, size=2 * 4)

    feed_shm_arr = np.ndarray((1, 2), dtype=np.float32, buffer=feed_shm.buf)
    input_arr = np.random.random((1, 2)).astype(np.float32)
    feed_shm_arr[:] = input_arr[:]

    conn.send(feed_shm_name)
    result_shm_name = conn.recv()
    result_shm = SharedMemory(name=result_shm_name)
    result_shm_arr = np.ndarray((1, 2),
                                dtype=np.float32,
                                buffer=result_shm.buf)
    print('Output array : {}'.format(result_shm_arr))

    conn.send('exit')
    del result_shm_arr
    result_shm.close()

    conn.recv()
    del feed_shm_arr
    feed_shm.close()
    feed_shm.unlink()

    print('clean and exit')

    return
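The matching consumer end of the pipe protocol is not part of the file; a minimal sketch of what it could look like (it simply echoes the input back through a second block, and the result block name is illustrative):

import numpy as np
from multiprocessing.shared_memory import SharedMemory

def consumer(conn):
    feed_shm_name = conn.recv()
    feed_shm = SharedMemory(name=feed_shm_name)
    feed_arr = np.ndarray((1, 2), dtype=np.float32, buffer=feed_shm.buf)

    result_shm = SharedMemory(name=feed_shm_name + '_result', create=True,
                              size=2 * 4)
    result_arr = np.ndarray((1, 2), dtype=np.float32, buffer=result_shm.buf)
    result_arr[:] = feed_arr[:]  # stand-in for real work: copy the input

    conn.send(result_shm.name)
    conn.recv()                  # wait for the producer's 'exit' message

    del result_arr
    result_shm.close()
    result_shm.unlink()          # the producer only closes its handle

    conn.send('done')            # let the producer clean up the feed block
    del feed_arr
    feed_shm.close()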
Example #6
 async def delete(self, object_id):
     try:
         shm = SharedMemory(name=object_id)
         shm.unlink()
         shm.close()
     except FileNotFoundError:
         if sys.platform == 'win32':
             # skip file not found error for windows
             pass
         else:  # pragma: no cover
             raise
     try:
         self._object_ids.remove(object_id)
     except KeyError:  # pragma: no cover
         return
Example #7
    def load_file(self, *args, **kwargs):
        shm = SharedMemory(create=True, size=5 * np.dtype('<U64').itemsize)

        # get array metadata, shared memory buffer name, dtype, and shape
        array_metadata = np.ndarray(shape=(5,), dtype=np.dtype('<U64'), buffer=shm.buf)

        importer_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), '_inscopix_importer.py')

        file_path = self.widget.line_edit_path.text()

        cmd = ['python', importer_path, '--isx-path', file_path, '--shm-meta-array-name', shm.name]
        print(cmd)
        proc = Popen(
            cmd,
            env=os.environ.copy()
        )
        proc.wait()

        # read the metadata
        name = array_metadata[0]
        dtype = array_metadata[1]
        shape = array_metadata[2]

        # open the shared buffer
        existing_shm = SharedMemory(name=array_metadata[0])

        shape = tuple(map(int, shape.split(',')))

        _imgseq = np.ndarray(shape, dtype=np.dtype(dtype), buffer=existing_shm.buf)

        imgseq = np.zeros(shape=shape, dtype=np.dtype(dtype))
        imgseq[:] = _imgseq[:]

        d = \
            {
                'fps': float(array_metadata[3]),
                'origin': array_metadata[4],
                'date': '00000000_000000'
            }

        imgdata = ImgData(imgseq, d)

        self.vi.viewer.workEnv = ViewerWorkEnv(imgdata)
        self.vi.update_workEnv()

        existing_shm.close()

        # also release the metadata block created at the start of this method
        shm.close()
        shm.unlink()
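The _inscopix_importer.py child script is not shown; the sketch below is a hypothetical importer side, inferred purely from the fields read back above (block name, dtype, comma-separated shape, fps, origin written into the '<U64' metadata array). The movie itself is replaced by placeholder data:

import argparse
import numpy as np
from multiprocessing import shared_memory
from multiprocessing.shared_memory import SharedMemory

parser = argparse.ArgumentParser()
parser.add_argument('--isx-path')
parser.add_argument('--shm-meta-array-name')
opts = parser.parse_args()

imgseq = np.zeros((100, 64, 64), dtype=np.float32)  # stand-in for the real loader
fps, origin = 30.0, 'inscopix'                      # stand-in metadata

data_shm = SharedMemory(create=True, size=imgseq.nbytes)
data = np.ndarray(imgseq.shape, dtype=imgseq.dtype, buffer=data_shm.buf)
data[:] = imgseq[:]

meta_shm = SharedMemory(name=opts.shm_meta_array_name)
meta = np.ndarray(shape=(5,), dtype=np.dtype('<U64'), buffer=meta_shm.buf)
meta[0] = data_shm.name
meta[1] = str(imgseq.dtype)
meta[2] = ','.join(map(str, imgseq.shape))
meta[3] = str(fps)
meta[4] = origin

# On POSIX this process's resource tracker would unlink the data block when it
# exits, before the parent attaches; unregister it here as in Example #3.
if shared_memory._USE_POSIX:
    from multiprocessing.resource_tracker import unregister
    unregister(data_shm._name, 'shared_memory')

del data, meta
data_shm.close()
meta_shm.close()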
Example #8
def _transpile_circuit(
        circuit_config_tuple: Tuple[QuantumCircuit, str,
                                    Dict]) -> QuantumCircuit:
    """Select a PassManager and run a single circuit through it.
    Args:
        circuit_config_tuple (tuple):
            circuit (QuantumCircuit): circuit to transpile
            name (str): The name of the shared memory object containing a pickled dict of shared
                arguments between parallel workers
            unique_config (dict): configuration dictating unique arguments for transpile.
    Returns:
        The transpiled circuit
    Raises:
        TranspilerError: if transpile_config is not valid or transpilation incurs error
    """
    circuit, name, unique_config = circuit_config_tuple
    existing_shm = SharedMemory(name=name)
    try:
        with io.BytesIO(existing_shm.buf) as buf:
            shared_transpiler_args = pickle.load(buf)
    finally:
        existing_shm.close()

    transpile_config, pass_manager = _combine_args(shared_transpiler_args,
                                                   unique_config)
    pass_manager_config = transpile_config["pass_manager_config"]

    result = pass_manager.run(circuit,
                              callback=transpile_config["callback"],
                              output_name=transpile_config["output_name"])

    if transpile_config["faulty_qubits_map"]:
        return _remap_circuit_faulty_backend(
            result,
            transpile_config["backend_num_qubits"],
            pass_manager_config.backend_properties,
            transpile_config["faulty_qubits_map"],
        )

    return result
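The worker only reads the pickled dict out of the block; a minimal sketch of how the parent side could publish it before fanning out the circuits (this is an illustration, not Qiskit's actual implementation):

import pickle
from multiprocessing.shared_memory import SharedMemory

def publish_shared_args(shared_transpiler_args: dict) -> SharedMemory:
    # Pickle the shared arguments into a named block; each worker receives
    # only the block name and unpickles the dict from the buffer.
    payload = pickle.dumps(shared_transpiler_args)
    shm = SharedMemory(create=True, size=len(payload))
    shm.buf[:len(payload)] = payload
    return shm  # keep the handle open; close and unlink after the workers finish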
Example #9
from contextlib import contextmanager

@contextmanager
def open_memory(name, create=False, size=0, consume=False):
    """Open shared memory via a context manager

    The shared memory will automatically be closed when the
    context ends. The shared memory may also optionally be
    created and/or consumed. If the "consume" flag is True,
    the shared memory will be unlinked as well when the context
    ends.

    Args:
        name (str): the name of the shared memory
        create (bool): whether to create the shared memory or try
                       to open a pre-existing one. If this is True,
                       the size argument must be non-zero.
        size (int): the size in bytes of the shared memory block to create.
                    Only used if create == True.
        consume (bool): whether or not to unlink the shared memory
                        when the context ends.

    Yields:
        SharedMemory: the opened shared memory object
    """
    if create and size == 0:
        raise Exception('If create is True, size must be non-zero')

    kwargs = {
        'name': name,
        'create': create,
        'size': size,
    }
    shm = SharedMemory(**kwargs)
    try:
        yield shm
    finally:
        shm.close()
        if consume:
            shm.unlink()
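Typical usage, assuming the contextlib.contextmanager decorator shown above:

# Create a 1 KiB block; it is closed (but not unlinked) when the context ends.
with open_memory('example-blk', create=True, size=1024) as shm:
    shm.buf[:5] = b'hello'

# Attach to the same block elsewhere and unlink it once it has been consumed.
with open_memory('example-blk', consume=True) as shm:
    data = bytes(shm.buf[:5])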
Example #10
class Horse(Process):
    def __init__(
            self,
            points,
            iD,
            kD,
            vectors_UID,
            tree_UID,
            maxsize,
            pipe,
            lock,
            distance_function=(lambda v: np.sum(v**2)),
            vector_function=(lambda p, n: p - n),
            square_distances=True,
    ):
        super(Horse, self).__init__()

        self.distance_function = distance_function
        self.vector_function = vector_function

        self.points = points

        if square_distances:
            self.iD = iD**2
            self.kD = kD**2
        else:
            self.iD = iD
            self.kD = kD

        self.maxsize = maxsize

        self.reached_bool = np.zeros(len(points), dtype=bool)
        self.reached_points = 0

        self.closest = np.ones(len(points), dtype=np.int64)
        self.L = np.ones(len(points)) * np.inf
        self.dv = np.zeros((len(points), 3))

        self.vectors_sm = SharedMemory(name=vectors_UID)
        self.tree_sm = SharedMemory(name=tree_UID)

        self.nodes = as_numpy_arr((self.maxsize, 3), self.tree_sm)
        self.vectors = as_numpy_arr((self.maxsize, 3), self.vectors_sm)

        self.pipe = pipe
        self.lock = lock
        self.batch = None

    def run(self):
        if self.batch is None:
            self.batch = self.pipe.recv()

        while self.batch.run:
            res = self.compute(*self.batch.args)
            self.pipe.send(res)
            self.batch = self.pipe.recv()

        self.vectors_sm.close()
        self.tree_sm.close()

    def compute(self, start, end, trunk_mode):
        active_points = 0
        result = []
        for i, p in enumerate(self.points):
            if self.reached_bool[i]:
                pass
            else:
                for j in range(start, end):
                    n = self.nodes[j]
                    dv = self.vector_function(p, n)
                    L = self.distance_function(dv)

                    if L < self.kD:
                        self.reached_points += 1
                        self.reached_bool[i] = True
                        break

                    if L < self.L[i]:
                        self.closest[i] = j
                        self.L[i] = L
                        self.dv[i] = dv

                if not self.reached_bool[i]:
                    if self.L[i] < self.iD:
                        active_points += 1
                        result.append(self.closest[i])
                        self.protected_add(self.closest[i], self.dv[i],
                                           self.L[i])

                    elif trunk_mode:
                        result.append(self.closest[i])
                        self.protected_add(self.closest[i], self.dv[i],
                                           self.L[i])

        return active_points, self.reached_points, result

    def protected_add(self, node_idx, value, length):
        self.lock.acquire()
        if length < self.distance_function(self.vectors[node_idx]):
            self.vectors[node_idx] = value
        self.lock.release()
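Both Horse and SpaceColony below map their shared blocks through an as_numpy_arr helper that is not part of the snippet; a minimal sketch, assuming float64 (which matches the np.inf and np.ones arrays copied into the blocks):

import numpy as np

def as_numpy_arr(shape, shared_obj, dtype=np.float64):
    # Zero-copy view of the shared memory block as an ndarray.
    return np.ndarray(shape, dtype=dtype, buffer=shared_obj.buf)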
Example #11
class SpaceColony:
    def __init__(
            self,
            points,
            roots=np.zeros((1, 3)),
            parameters=Param(r=0.04, iD=0.5, kD=0.2, bias=np.zeros(3)),
            trunk_lim=1,
            min_activation=5,
            yeet_condition=5,
            maxsize=100000,
            ncpu=cpu_count(),
            grow_function=(lambda v: normalize(v)),
    ):

        # Static information
        self.par = parameters
        self.ncpu = ncpu
        self.min_activation = min_activation
        self.trunk_lim = trunk_lim
        self.maxsize = maxsize
        self.yeet_condition = yeet_condition
        self.grow_function = grow_function

        self.nroots = len(roots)

        # Dynamic information
        self.age = 0
        self.start = 0
        self.end = len(roots)
        self.done = False
        self.trunk_mode = True
        self.yeet_count = 0

        self.activation = 0
        self.reached_points = 0
        self.stats = []

        self.dirty = True

        # Local dynamics
        self.edges = []
        self.children = [[] for _ in range(maxsize)]
        self.w = []

        # This array is sliced at start:
        self.points = points

        # This is sparta.
        self.lock = Lock()
        A = np.inf * np.ones((self.maxsize, 3), dtype=roots.dtype)

        self.vectors_sm = SharedMemory(create=True, size=A.nbytes)
        self.tree_sm = SharedMemory(create=True, size=A.nbytes)

        self.vectors = as_numpy_arr(A.shape, shared_obj=self.vectors_sm)
        self.vectors[:] = A[:]

        self.nodes = as_numpy_arr(A.shape, self.tree_sm)
        self.nodes[:] = A[:]
        for i in range(len(roots)):
            self.nodes[i] = roots[i]

        # Explicit pool creation for better control
        point_slices = np.array_split(self.points, self.ncpu)
        self.workers = []
        self.pipes = []
        for i in range(self.ncpu):
            parent_pipe, child_pipe = Pipe()
            self.pipes.append(parent_pipe)
            args = self.pack(point_slices[i], child_pipe)
            self.workers.append(Horse(*args))
            self.workers[i].start()

        self.running = True

    def iterate(self, N):
        self.dirty = True
        log.info(f"START: {time()}\n{self.__str__()}")
        for i in range(N):
            self.update_stats()
            if self.done:
                break

            for pipe in self.pipes:
                pipe.send(
                    Batch(True, 1, (self.start, self.end, self.trunk_mode)))

            result_list = [pipe.recv() for pipe in self.pipes]
            res = self.collect(result_list)
            self.grow(res)
            self.age += 1
            self.done_yet()

        log.info(f"DONE: {time()}\n{self.__str__()}")

    def stop(self):
        if not self.running:
            return

        log.info("Horse shutdown.")
        for pipe in self.pipes:
            pipe.send(Batch(False, 1, (None, )))
            pipe.close()

        for w in self.workers:
            w.join(1)
            w.terminate()
        self.pipes = []
        self.workers = []
        self.running = False

    def collect(self, result_list):
        self.activation = 0
        self.reached_points = 0
        result = []
        for res in result_list:
            self.activation += res[0]
            self.reached_points += res[1]
            for i in res[2]:
                if i not in result:
                    result.append(i)
        return result

    def grow(self, res):
        self.start = self.end
        for i in res:
            if self.end >= self.maxsize:
                log.info("Halt condition: node vector full.")
                self.done = True
                return
            self.nodes[self.end] = (
                self.nodes[i] +
                (self.grow_function(self.vectors[i]) + self.par.bias) *
                self.par.r)
            self.children[i].append(self.end)
            self.vectors[i] = np.ones(3) * np.inf
            self.end += 1

    def done_yet(self):
        if self.done:
            return True

        if self.trunk_mode:
            self.trunk_mode = self.activation <= self.trunk_lim
            if self.trunk_mode:
                return False
            else:
                log.info(f"Trunk mode disabled at {self.age} iterations.")

        if self.activation < self.min_activation:
            log.info(f"Halt condition: activation < {self.min_activation}.")
            self.done = True
            return True

        # The yeet condition is basically to inhibit periodic behaviours from growing
        # the structure ad infinitum. It stops iterating if it detects that activation
        # levels are not changing any more. There are some obvious corner cases to this,
        # same numerical activation does not imply that the same set of attractors are
        # active, but in practice this method is fast and works well enough.
        if self.age > self.yeet_condition:
            if np.abs(self.activation - self.stats[self.age - 1].act) < 3:
                self.yeet_count += 1
                if self.yeet_count >= self.yeet_condition:
                    self.end = self.stats[self.age - self.yeet_count].sz

                    for i in range(self.end):
                        self.children[i] = [
                            c for c in self.children[i] if c <= self.end - 1
                        ]

                    self.age -= self.yeet_count
                    self.stats = self.stats[:self.age]
                    log.info(f"Halt condition: yeet count {self.yeet_count}.")
                    self.done = True
                    return True
            else:
                self.yeet_count = 0
                return False

        return False

    # Populate edge table
    def walk(self):
        self.w = np.ones(self.maxsize)
        self.edges = []

        for i in range(self.nroots):
            self._walk(i)

        self.dirty = False

    def _walk(self, i):
        w = self.w[i]
        for j in self.children[i]:
            self.edges.append((i, j))
            w += self._walk(j)**2

        w = np.sqrt(w)
        self.w[i] = w
        return w

    # Use explicit packing/unpacking
    def pack(self, points, pipe):
        return (
            points,
            self.par.iD,
            self.par.kD,
            self.vectors_sm.name,
            self.tree_sm.name,
            self.maxsize,
            pipe,
            self.lock,
        )

    def update_stats(self):
        self.stats.append(Stats(self.end, self.activation,
                                self.reached_points))

    def get_stats(self):
        return self.stats

    def __str__(self):
        nproc = 0
        for w in self.workers:
            nproc += 1 if w.is_alive() else 0

        leaves = 0
        for i in range(self.end):
            if len(self.children[i]) == 0:
                leaves += 1

        return f"{self.end} nodes, {self.age} iterations \n\
                {self.activation}/{len(self.points) - self.reached_points} active points \n\
                Total {len(self.points)} points on {nproc}/{self.ncpu} processes \n\
                avg. branching: {leaves/(self.end+1)} \n\
                {self.par}"

    def __del__(self):
        log.debug("Delete SpaceColony")
        self.stop()
        self.vectors_sm.close()
        self.vectors_sm.unlink()
        self.tree_sm.close()
        self.tree_sm.unlink()
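Param, Batch, Stats and normalize are referenced by Horse and SpaceColony but not defined in the snippet. A plausible minimal set of definitions, inferred from how they are used; the field names that are never read (the middle Batch field, the third Stats field) are guesses:

from collections import namedtuple
import numpy as np

# Batch(run, size, args): only .run and .args are read by Horse.
Batch = namedtuple('Batch', ['run', 'size', 'args'])
# Stats(sz, act, reached): SpaceColony reads .sz and .act.
Stats = namedtuple('Stats', ['sz', 'act', 'reached'])
# Param(r, iD, kD, bias): growth step, influence and kill distances, bias.
Param = namedtuple('Param', ['r', 'iD', 'kD', 'bias'])

def normalize(v):
    # Unit-length version of the growth vector.
    return v / np.linalg.norm(v)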
Example #12
    def _process(source: str, task_queue: Queue, config: Config,
                 progress_queue: Queue):
        """
        Calculating distance matrix
        :param modulus:
        :param config:
        :return:
        """
        # preparing access for shared memories
        embedding = config.embedding.embedding

        if source == "embedding":
            # Embedding to use (first pass)
            weights_mem = SharedMemory(embedding.embedding_memory_name)
            w = np.ndarray(shape=embedding.embedding_memory_shape,
                           dtype=embedding.embedding_memory_dtype,
                           buffer=weights_mem.buf)
            # Results will be saved here
            dist_mem = SharedMemory(config.data.distance_matrix)
            distance_matrix = np.ndarray(
                shape=config.data.distance_matrix_shape, buffer=dist_mem.buf)
        else:
            # Embedding to use (second pass)
            weights_mem = SharedMemory(config.data.transformed_space)
            w = np.ndarray(shape=config.data.transformed_space_shape,
                           buffer=weights_mem.buf)
            # Results will be saved here
            dist_mem = SharedMemory(
                config.data.transformed_space_distance_matrix)
            distance_matrix = np.ndarray(
                shape=config.data.transformed_space_distance_matrix_shape,
                buffer=dist_mem.buf)

        i2w_mem = SharedMemory(embedding.i2w_memory_name)
        i2w = embedding.buff_to_dict(i2w_mem, embedding.i2w_memory_size)

        semcat = config.semantic_categories.categories

        # Iterating over the dimensions of the embedding
        while True:
            try:
                task = task_queue.get(True, 0.5)
            except Empty:
                # config.logger.info(f"Task Queue is empty")
                break
            # Iterating over the dimensions of the embedding
            # for i in tqdm.trange(w.shape[1], unit='dim', desc=f'PID -> {os.getpid()}\t'):
            i = task[0]
            j = task[1]
            dimension = w[:, i]
            # Iterating over the semantic categories
            word_indexes = np.zeros(shape=(w.shape[0],), dtype=bool)
            # One-hot selection vector for in-category words
            for k in range(w.shape[0]):
                try:
                    if i2w[str(k)] == semcat.i2c[j]:
                        word_indexes[k] = True
                except KeyError:
                    continue
                except IndexError:
                    continue

            # Populate P with category word weights
            _p = dimension[word_indexes]
            # Populate Q with out of category word weights
            _q = dimension[~word_indexes]
            # calculating distance
            distance, sign = config.distance.function(_p, _q, config)
            distance_matrix[i, j, 0] = distance
            distance_matrix[i, j, 1] = sign
            progress_queue.put(0)
        weights_mem.close()
        dist_mem.close()
        i2w_mem.close()
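The worker expects the distance matrix block to exist already; a minimal sketch of how the parent could allocate it, with the shape inferred from how distance_matrix[i, j, 0] and [i, j, 1] are written (float64 is assumed, since the worker passes no dtype):

import numpy as np
from multiprocessing.shared_memory import SharedMemory

def allocate_distance_matrix(n_dimensions: int, n_categories: int):
    # One (distance, sign) pair per dimension/category combination.
    shape = (n_dimensions, n_categories, 2)
    shm = SharedMemory(create=True,
                       size=int(np.prod(shape)) * np.dtype(np.float64).itemsize)
    matrix = np.ndarray(shape, dtype=np.float64, buffer=shm.buf)
    matrix.fill(0.0)
    # config.data.distance_matrix would carry shm.name and the shape tuple.
    return shm, shape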
Example #13
class DLManager(Process):
    def __init__(self,
                 download_dir,
                 base_url,
                 cache_dir=None,
                 status_q=None,
                 max_jobs=100,
                 max_failures=5,
                 max_workers=0,
                 update_interval=1.0,
                 max_shared_memory=1024 * 1024 * 1024,
                 resume_file=None):
        super().__init__(name='DLManager')
        self.log = logging.getLogger('DLM')
        self.proc_debug = False

        self.base_url = base_url
        self.dl_dir = download_dir
        self.cache_dir = cache_dir if cache_dir else os.path.join(
            download_dir, '.cache')

        # All the queues!
        self.logging_queue = None
        self.dl_worker_queue = None
        self.writer_queue = None
        self.dl_result_q = None
        self.writer_result_q = None
        self.max_jobs = max_jobs
        self.max_workers = max_workers if max_workers else min(
            cpu_count() * 2, 16)

        # Analysis stuff
        self.analysis = None
        self.tasks = deque()
        self.chunks_to_dl = deque()
        self.chunk_data_list = None

        # shared memory stuff
        self.max_shared_memory = max_shared_memory  # 1 GiB by default
        self.sms = deque()
        self.shared_memory = None

        # Interval for log updates and pushing updates to the queue
        self.update_interval = update_interval
        self.status_queue = status_q  # queue used to relay status info back to GUI/CLI

        # behaviour settings
        self.max_failures = max_failures
        self.resume_file = resume_file

        # cross-thread runtime information
        self.running = True
        self.active_tasks = 0
        self.children = []
        self.threads = []
        self.conditions = []
        # bytes downloaded and decompressed since last report
        self.bytes_downloaded_since_last = 0
        self.bytes_decompressed_since_last = 0
        # bytes written since last report
        self.bytes_written_since_last = 0
        # bytes read since last report
        self.bytes_read_since_last = 0
        # chunks written since last report
        self.num_processed_since_last = 0
        self.num_tasks_processed_since_last = 0

    def download_job_manager(self, task_cond: Condition, shm_cond: Condition):
        while self.chunks_to_dl and self.running:
            while self.active_tasks < self.max_workers * 2 and self.chunks_to_dl:
                try:
                    sms = self.sms.popleft()
                    no_shm = False
                except IndexError:  # no free cache
                    no_shm = True
                    break

                c_guid = self.chunks_to_dl.popleft()
                chunk = self.chunk_data_list.get_chunk_by_guid(c_guid)
                self.log.debug(
                    f'Adding {chunk.guid_num} (active: {self.active_tasks})')
                try:
                    self.dl_worker_queue.put(DownloaderTask(url=self.base_url +
                                                            '/' + chunk.path,
                                                            chunk_guid=c_guid,
                                                            shm=sms),
                                             timeout=1.0)
                except Exception as e:
                    self.log.warning(f'Failed to add to download queue: {e!r}')
                    self.chunks_to_dl.appendleft(c_guid)
                    break

                self.active_tasks += 1
            else:
                # active tasks limit hit, wait for tasks to finish
                with task_cond:
                    self.log.debug('Waiting for download tasks to complete..')
                    task_cond.wait(timeout=1.0)
                    continue

            if no_shm:
                # if we break we ran out of shared memory, so wait for that.
                with shm_cond:
                    self.log.debug('Waiting for more shared memory...')
                    shm_cond.wait(timeout=1.0)

        self.log.info('Download Job Manager quitting...')

    def dl_results_handler(self, task_cond: Condition):
        in_buffer = dict()

        task = self.tasks.popleft()
        current_file = ''

        while task and self.running:
            if isinstance(task,
                          FileTask):  # this wasn't necessarily a good idea...
                try:
                    if task.empty:
                        self.writer_queue.put(WriterTask(task.filename,
                                                         empty=True),
                                              timeout=1.0)
                    elif task.rename:
                        self.writer_queue.put(WriterTask(
                            task.filename,
                            rename=True,
                            delete=task.delete,
                            old_filename=task.temporary_filename),
                                              timeout=1.0)
                    elif task.delete:
                        self.writer_queue.put(WriterTask(task.filename,
                                                         delete=True),
                                              timeout=1.0)
                    elif task.open:
                        self.writer_queue.put(WriterTask(task.filename,
                                                         fopen=True),
                                              timeout=1.0)
                        current_file = task.filename
                    elif task.close:
                        self.writer_queue.put(WriterTask(task.filename,
                                                         close=True),
                                              timeout=1.0)
                except Exception as e:
                    self.tasks.appendleft(task)
                    self.log.warning(f'Adding to queue failed: {e!r}')
                    continue

                try:
                    task = self.tasks.popleft()
                except IndexError:  # finished
                    break
                continue

            while (task.chunk_guid in in_buffer) or task.chunk_file:
                res_shm = None
                if not task.chunk_file:  # not re-using from an old file
                    res_shm = in_buffer[task.chunk_guid].shm

                try:
                    self.writer_queue.put(
                        WriterTask(
                            filename=current_file,
                            shared_memory=res_shm,
                            chunk_offset=task.chunk_offset,
                            chunk_size=task.chunk_size,
                            chunk_guid=task.chunk_guid,
                            release_memory=task.cleanup,
                            old_file=task.chunk_file  # todo on-disk cache
                        ),
                        timeout=1.0)
                except Exception as e:
                    self.log.warning(f'Adding to queue failed: {e!r}')
                    break

                if task.cleanup and not task.chunk_file:
                    del in_buffer[task.chunk_guid]

                try:
                    task = self.tasks.popleft()
                    if isinstance(task, FileTask):
                        break
                except IndexError:  # finished
                    task = None
                    break
            else:  # only enter blocking code if the loop did not break
                try:
                    res = self.dl_result_q.get(timeout=1)
                    self.active_tasks -= 1
                    with task_cond:
                        task_cond.notify()

                    if res.success:
                        in_buffer[res.guid] = res
                        self.bytes_downloaded_since_last += res.compressed_size
                        self.bytes_decompressed_since_last += res.size
                    else:
                        self.log.error(
                            f'Download for {res.guid} failed, retrying...')
                        try:
                            self.dl_worker_queue.put(DownloaderTask(
                                url=res.url, chunk_guid=res.guid, shm=res.shm),
                                                     timeout=1.0)
                            self.active_tasks += 1
                        except Exception as e:
                            self.log.warning(
                                f'Failed adding retry task to queue! {e!r}')
                            # If this failed for whatever reason, put the chunk at the front of the DL list
                            self.chunks_to_dl.appendleft(res.chunk_guid)
                except Empty:
                    pass
                except Exception as e:
                    self.log.warning(
                        f'Unhandled exception when trying to read download result queue: {e!r}'
                    )

        self.log.info('Download result handler quitting...')

    def fw_results_handler(self, shm_cond: Condition):
        while self.running:
            try:
                res = self.writer_result_q.get(timeout=1.0)
                self.num_tasks_processed_since_last += 1

                if res.closed and self.resume_file:
                    # write last completed file to super simple resume file
                    with open(self.resume_file, 'ab') as rf:
                        rf.write(f'{res.filename}\n'.encode('utf-8'))

                if res.kill:
                    self.log.info(
                        'Got termination command in FW result handler')
                    break

                if not res.success:
                    # todo make this kill the installation process or at least skip the file and mark it as failed
                    self.log.fatal(f'Writing for {res.filename} failed!')
                if res.release_memory:
                    self.sms.appendleft(res.shm)
                    with shm_cond:
                        shm_cond.notify()

                if res.chunk_guid:
                    self.bytes_written_since_last += res.size
                    # if there's no shared memory we must have read from disk.
                    if not res.shm:
                        self.bytes_read_since_last += res.size
                    self.num_processed_since_last += 1

            except Empty:
                continue
            except Exception as e:
                self.log.warning(
                    f'Exception when trying to read writer result queue: {e!r}'
                )
        self.log.info('Writer result handler quitting...')

    def run_analysis(self,
                     manifest: Manifest,
                     old_manifest: Manifest = None,
                     patch=True,
                     resume=True,
                     file_prefix_filter=None,
                     file_exclude_filter=None,
                     file_install_tag=None) -> AnalysisResult:
        """
        Run analysis on manifest and old manifest (if not None) and return a result
        with a summary resources required in order to install the provided manifest.

        :param manifest: Manifest to install
        :param old_manifest: Old manifest to patch from (if applicable)
        :param patch: Patch instead of redownloading the entire file
        :param resume: Continue based on resume file if it exists
        :param file_prefix_filter: Only download files that start with this prefix
        :param file_exclude_filter: Exclude files with this prefix from download
        :param file_install_tag: Only include files with this install tag
        :return: AnalysisResult
        """

        analysis_res = AnalysisResult()
        analysis_res.install_size = sum(
            fm.file_size for fm in manifest.file_manifest_list.elements)
        analysis_res.biggest_chunk = max(
            c.window_size for c in manifest.chunk_data_list.elements)
        analysis_res.biggest_file_size = max(
            f.file_size for f in manifest.file_manifest_list.elements)
        is_1mib = analysis_res.biggest_chunk == 1024 * 1024
        self.log.debug(
            f'Biggest chunk size: {analysis_res.biggest_chunk} bytes (== 1 MiB? {is_1mib})'
        )

        self.log.debug(f'Creating manifest comparison...')
        mc = ManifestComparison.create(manifest, old_manifest)
        analysis_res.manifest_comparison = mc

        if resume and self.resume_file and os.path.exists(self.resume_file):
            try:
                completed_files = set(
                    i.strip() for i in open(self.resume_file).readlines())
                # remove completed files from changed/added and move them to unchanged for the analysis.
                mc.added -= completed_files
                mc.changed -= completed_files
                mc.unchanged |= completed_files
                self.log.debug(
                    f'Skipped {len(completed_files)} files based on resume data!'
                )
            except Exception as e:
                self.log.warning(
                    f'Reading resume file failed: {e!r}, continuing as normal...'
                )

        # Not entirely sure what install tags are used for, only some titles have them.
        # Let's add it for testing anyway.
        if file_install_tag:
            files_to_skip = set(i.filename
                                for i in manifest.file_manifest_list.elements
                                if file_install_tag not in i.install_tags)
            self.log.info(
                f'Found {len(files_to_skip)} files to skip based on install tag.'
            )
            mc.added -= files_to_skip
            mc.changed -= files_to_skip
            mc.unchanged |= files_to_skip

        # if include/exclude prefix has been set: mark all files that are not to be downloaded as unchanged
        if file_exclude_filter:
            file_exclude_filter = file_exclude_filter.lower()
            files_to_skip = set(i for i in mc.added | mc.changed
                                if i.lower().startswith(file_exclude_filter))
            self.log.info(
                f'Found {len(files_to_skip)} files to skip based on exclude prefix.'
            )
            mc.added -= files_to_skip
            mc.changed -= files_to_skip
            mc.unchanged |= files_to_skip

        if file_prefix_filter:
            file_prefix_filter = file_prefix_filter.lower()
            files_to_skip = set(
                i for i in mc.added | mc.changed
                if not i.lower().startswith(file_prefix_filter))
            self.log.info(
                f'Found {len(files_to_skip)} files to skip based on include prefix.'
            )
            mc.added -= files_to_skip
            mc.changed -= files_to_skip
            mc.unchanged |= files_to_skip

        if file_prefix_filter or file_exclude_filter or file_install_tag:
            self.log.info(
                f'Remaining files after filtering: {len(mc.added) + len(mc.changed)}'
            )
            # correct install size after filtering
            analysis_res.install_size = sum(
                fm.file_size for fm in manifest.file_manifest_list.elements
                if fm.filename in mc.added)

        if mc.removed:
            analysis_res.removed = len(mc.removed)
            self.log.debug(f'{analysis_res.removed} removed files')
        if mc.added:
            analysis_res.added = len(mc.added)
            self.log.debug(f'{analysis_res.added} added files')
        if mc.changed:
            analysis_res.changed = len(mc.changed)
            self.log.debug(f'{analysis_res.changed} changed files')
        if mc.unchanged:
            analysis_res.unchanged = len(mc.unchanged)
            self.log.debug(f'{analysis_res.unchanged} unchanged files')

        # count references to chunks for determining runtime cache size later
        references = Counter()
        for fm in manifest.file_manifest_list.elements:
            # chunks of unchanged files are not downloaded so we can skip them
            if fm.filename in mc.unchanged:
                analysis_res.unchanged += fm.file_size
                continue

            for cp in fm.chunk_parts:
                references[cp.guid_num] += 1

        # determine reusable chunks and prepare lookup table for reusable ones
        re_usable = defaultdict(dict)
        if old_manifest and mc.changed and patch:
            self.log.debug('Analyzing manifests for re-usable chunks...')
            for changed in mc.changed:
                old_file = old_manifest.file_manifest_list.get_file_by_path(
                    changed)
                new_file = manifest.file_manifest_list.get_file_by_path(
                    changed)

                existing_chunks = dict()
                off = 0
                for cp in old_file.chunk_parts:
                    existing_chunks[(cp.guid_num, cp.offset, cp.size)] = off
                    off += cp.size

                for cp in new_file.chunk_parts:
                    key = (cp.guid_num, cp.offset, cp.size)
                    if key in existing_chunks:
                        references[cp.guid_num] -= 1
                        re_usable[changed][key] = existing_chunks[key]
                        analysis_res.reuse_size += cp.size

        last_cache_size = current_cache_size = 0
        # set to determine whether a file is currently cached or not
        cached = set()
        # Using this secondary set is orders of magnitude faster than checking the deque.
        chunks_in_dl_list = set()
        # This is just used to count all unique guids that have been cached
        dl_cache_guids = set()

        # run through the list of files and create the download jobs and also determine minimum
        # runtime cache requirement by simulating adding/removing from cache during download.
        self.log.debug('Creating filetasks and chunktasks...')
        for current_file in sorted(manifest.file_manifest_list.elements,
                                   key=lambda a: a.filename.lower()):
            # skip unchanged and empty files
            if current_file.filename in mc.unchanged:
                continue
            elif not current_file.chunk_parts:
                self.tasks.append(FileTask(current_file.filename, empty=True))
                continue

            existing_chunks = re_usable.get(current_file.filename, None)
            chunk_tasks = []
            reused = 0

            for cp in current_file.chunk_parts:
                ct = ChunkTask(cp.guid_num, cp.offset, cp.size)

                # re-use the chunk from the existing file if we can
                if existing_chunks and (cp.guid_num, cp.offset,
                                        cp.size) in existing_chunks:
                    reused += 1
                    ct.chunk_file = current_file.filename
                    ct.chunk_offset = existing_chunks[(cp.guid_num, cp.offset,
                                                       cp.size)]
                else:
                    # add to DL list if not already in it
                    if cp.guid_num not in chunks_in_dl_list:
                        self.chunks_to_dl.append(cp.guid_num)
                        chunks_in_dl_list.add(cp.guid_num)

                    # if chunk has more than one use or is already in cache,
                    # check if we need to add or remove it again.
                    if references[cp.guid_num] > 1 or cp.guid_num in cached:
                        references[cp.guid_num] -= 1

                        # delete from cache if no references left
                        if references[cp.guid_num] < 1:
                            current_cache_size -= analysis_res.biggest_chunk
                            cached.remove(cp.guid_num)
                            ct.cleanup = True
                        # add to cache if not already cached
                        elif cp.guid_num not in cached:
                            dl_cache_guids.add(cp.guid_num)
                            cached.add(cp.guid_num)
                            current_cache_size += analysis_res.biggest_chunk
                    else:
                        ct.cleanup = True

                chunk_tasks.append(ct)

            if reused:
                self.log.debug(
                    f' + Reusing {reused} chunks from: {current_file.filename}'
                )
                # open temporary file that will contain download + old file contents
                self.tasks.append(
                    FileTask(current_file.filename + u'.tmp', fopen=True))
                self.tasks.extend(chunk_tasks)
                self.tasks.append(
                    FileTask(current_file.filename + u'.tmp', close=True))
                # delete old file and rename temporary file
                self.tasks.append(
                    FileTask(current_file.filename,
                             delete=True,
                             rename=True,
                             temporary_filename=current_file.filename +
                             u'.tmp'))
            else:
                self.tasks.append(FileTask(current_file.filename, fopen=True))
                self.tasks.extend(chunk_tasks)
                self.tasks.append(FileTask(current_file.filename, close=True))

            # check if runtime cache size has changed
            if current_cache_size > last_cache_size:
                self.log.debug(
                    f' * New maximum cache size: {current_cache_size / 1024 / 1024:.02f} MiB'
                )
                last_cache_size = current_cache_size

        self.log.debug(
            f'Final cache size requirement: {last_cache_size / 1024 / 1024} MiB.'
        )
        analysis_res.min_memory = last_cache_size + (
            1024 * 1024 * 32)  # add some padding just to be safe

        # Todo implement on-disk caching to avoid this issue.
        if analysis_res.min_memory > self.max_shared_memory:
            shared_mib = f'{self.max_shared_memory / 1024 / 1024:.01f} MiB'
            required_mib = f'{analysis_res.min_memory / 1024 / 1024:.01f} MiB'
            raise MemoryError(
                f'Current shared memory cache is smaller than required! {shared_mib} < {required_mib}'
            )

        # calculate actual dl and patch write size.
        analysis_res.dl_size = \
            sum(c.file_size for c in manifest.chunk_data_list.elements if c.guid_num in chunks_in_dl_list)
        analysis_res.uncompressed_dl_size = \
            sum(c.window_size for c in manifest.chunk_data_list.elements if c.guid_num in chunks_in_dl_list)

        # add jobs to remove files
        for fname in mc.removed:
            self.tasks.append(FileTask(fname, delete=True))

        analysis_res.num_chunks_cache = len(dl_cache_guids)
        self.chunk_data_list = manifest.chunk_data_list
        self.analysis = analysis_res

        return analysis_res

    def run(self):
        if not self.analysis:
            raise ValueError(
                'Did not run analysis before trying to run download!')

        # Subprocess will use its own root logger that logs to a Queue instead
        _root = logging.getLogger()
        _root.setLevel(logging.DEBUG if self.proc_debug else logging.INFO)
        if self.logging_queue:
            _root.handlers = []
            _root.addHandler(QueueHandler(self.logging_queue))

        self.log = logging.getLogger('DLMProc')
        self.log.info(
            f'Download Manager running with process-id: {os.getpid()}')

        try:
            self.run_real()
        except KeyboardInterrupt:
            self.log.warning('Immediate exit requested!')
            self.running = False

            # send conditions to unlock threads if they aren't already
            for cond in self.conditions:
                with cond:
                    cond.notify()

            # make sure threads are dead.
            for t in self.threads:
                t.join(timeout=5.0)
                if t.is_alive():
                    self.log.warning(f'Thread did not terminate! {repr(t)}')

            # clean up all the queues, otherwise this process won't terminate properly
            for name, q in zip(('Download jobs', 'Writer jobs',
                                'Download results', 'Writer results'),
                               (self.dl_worker_queue, self.writer_queue,
                                self.dl_result_q, self.writer_result_q)):
                self.log.debug(f'Cleaning up queue "{name}"')
                try:
                    while True:
                        _ = q.get_nowait()
                except Empty:
                    q.close()
                    q.join_thread()

    def run_real(self):
        self.shared_memory = SharedMemory(create=True,
                                          size=self.max_shared_memory)
        self.log.debug(
            f'Created shared memory of size: {self.shared_memory.size / 1024 / 1024:.02f} MiB'
        )

        # create the shared memory segments and add them to their respective pools
        for i in range(
                int(self.shared_memory.size / self.analysis.biggest_chunk)):
            _sms = SharedMemorySegment(offset=i * self.analysis.biggest_chunk,
                                       end=i * self.analysis.biggest_chunk +
                                       self.analysis.biggest_chunk)
            self.sms.append(_sms)

        self.log.debug(f'Created {len(self.sms)} shared memory segments.')

        # Create queues
        self.dl_worker_queue = MPQueue(-1)
        self.writer_queue = MPQueue(-1)
        self.dl_result_q = MPQueue(-1)
        self.writer_result_q = MPQueue(-1)

        self.log.info(f'Starting download workers...')
        for i in range(self.max_workers):
            w = DLWorker(f'DLWorker {i + 1}',
                         self.dl_worker_queue,
                         self.dl_result_q,
                         self.shared_memory.name,
                         logging_queue=self.logging_queue)
            self.children.append(w)
            w.start()

        self.log.info('Starting file writing worker...')
        writer_p = FileWorker(self.writer_queue, self.writer_result_q,
                              self.dl_dir, self.shared_memory.name,
                              self.cache_dir, self.logging_queue)
        self.children.append(writer_p)
        writer_p.start()

        num_chunk_tasks = sum(isinstance(t, ChunkTask) for t in self.tasks)
        num_dl_tasks = len(self.chunks_to_dl)
        num_tasks = len(self.tasks)
        num_shared_memory_segments = len(self.sms)
        self.log.debug(
            f'Chunks to download: {num_dl_tasks}, File tasks: {num_tasks}, Chunk tasks: {num_chunk_tasks}'
        )

        # active downloader tasks
        self.active_tasks = 0
        processed_chunks = 0
        processed_tasks = 0
        total_dl = 0
        total_write = 0

        # synchronization conditions
        shm_cond = Condition()
        task_cond = Condition()
        self.conditions = [shm_cond, task_cond]

        # start threads
        s_time = time.time()
        self.threads.append(
            Thread(target=self.download_job_manager,
                   args=(task_cond, shm_cond)))
        self.threads.append(
            Thread(target=self.dl_results_handler, args=(task_cond, )))
        self.threads.append(
            Thread(target=self.fw_results_handler, args=(shm_cond, )))

        for t in self.threads:
            t.start()

        last_update = time.time()

        while processed_tasks < num_tasks:
            delta = time.time() - last_update
            if not delta:
                time.sleep(self.update_interval)
                continue

            # update all the things
            processed_chunks += self.num_processed_since_last
            processed_tasks += self.num_tasks_processed_since_last

            total_dl += self.bytes_downloaded_since_last
            total_write += self.bytes_written_since_last

            dl_speed = self.bytes_downloaded_since_last / delta
            dl_unc_speed = self.bytes_decompressed_since_last / delta
            w_speed = self.bytes_written_since_last / delta
            r_speed = self.bytes_read_since_last / delta
            c_speed = self.num_processed_since_last / delta

            # set temporary counters to 0
            self.bytes_read_since_last = self.bytes_written_since_last = 0
            self.bytes_downloaded_since_last = self.num_processed_since_last = 0
            self.bytes_decompressed_since_last = self.num_tasks_processed_since_last = 0
            last_update = time.time()

            perc = (processed_chunks / num_chunk_tasks) * 100
            self.log.info(
                f'\n============== {time.time() - s_time:.01f} seconds since start'
            )
            self.log.info(
                f'Progress: {processed_chunks}/{num_chunk_tasks} ({perc:.02f}%) chunk tasks processed.'
            )
            self.log.info(f'Downloaded: {total_dl / 1024 / 1024:.02f} MiB, '
                          f'Written: {total_write / 1024 / 1024:.02f} MiB')

            # speed meters
            self.log.info('Speeds:')
            self.log.info(
                f' + Download     - {dl_speed / 1024 / 1024:.02f} MiB/s (raw) '
                f'/ {dl_unc_speed / 1024 / 1024:.02f} MiB/s (decompressed)')
            self.log.info(
                f' + Write (disk) - {w_speed / 1024 / 1024:.02f} MiB/s')
            self.log.info(
                f' + Read (disk)  - {r_speed / 1024 / 1024:.02f} MiB/s')
            self.log.info(f' + Tasks        - {c_speed:.02f} Chunks/s')
            self.log.info(f'Active download tasks: {self.active_tasks}')

            # shared memory debugging
            total_avail = len(self.sms)
            total_used = (num_shared_memory_segments - total_avail) * (
                self.analysis.biggest_chunk / 1024 / 1024)
            self.log.info(
                f'Shared memory usage: {total_used} MiB, available: {total_avail}'
            )

            # send status update to back to instantiator (if queue exists)
            if self.status_queue:
                try:
                    self.status_queue.put(UIUpdate(progress=perc,
                                                   download_speed=dl_unc_speed,
                                                   write_speed=w_speed,
                                                   read_speed=r_speed,
                                                   memory_usage=total_used *
                                                   1024 * 1024),
                                          timeout=1.0)
                except Exception as e:
                    self.log.warning(
                        f'Failed to send status update to queue: {e!r}')

            time.sleep(self.update_interval)

        for i in range(self.max_workers):
            self.dl_worker_queue.put_nowait(DownloaderTask(kill=True))

        self.writer_queue.put_nowait(WriterTask('', kill=True))
        self.log.info('Waiting for writer process to finish...')

        writer_p.join(timeout=10.0)
        if writer_p.exitcode is None:
            self.log.warning('Writer process did not exit in time, terminating it...')
            writer_p.terminate()

        # forcibly kill DL workers that are not actually dead yet
        for child in self.children:
            if child.exitcode is None:
                child.terminate()

        # make sure all the threads are dead.
        for t in self.threads:
            t.join(timeout=5.0)
            if t.is_alive():
                self.log.warning(f'Thread did not terminate! {repr(t)}')

        # clean up resume file
        if self.resume_file:
            try:
                os.remove(self.resume_file)
            except OSError as e:
                self.log.warning(f'Failed to remove resume file: {e!r}')

        # close up shared memory
        self.shared_memory.close()
        self.shared_memory.unlink()
        self.shared_memory = None

        # finally, exit the process.
        exit(0)
Example #14
    def _process(source: str, modulus: int, config: Config):
        """
        Calculate the distance matrix for the dimensions assigned to this process.
        :param source: which space to use ("embedding" for the first pass, otherwise the transformed space)
        :param modulus: index of this worker; it processes dimensions where i % processes == modulus
        :param config: shared configuration object
        :return:
        """
        # preparing access for shared memories
        embedding = config.embedding.embedding
        # Setting random seed
        np.random.seed(config.semantic_categories.seed)

        if source == "embedding":
            # Embedding to use (first pass)
            weights_mem = SharedMemory(embedding.embedding_memory_name)
            w = np.ndarray(shape=embedding.embedding_memory_shape, dtype=embedding.embedding_memory_dtype,
                           buffer=weights_mem.buf)
            # Results will be saved here
            dist_mem = SharedMemory(config.data.distance_matrix)
            distance_matrix = np.ndarray(shape=config.data.distance_matrix_shape, buffer=dist_mem.buf)
        else:
            # Embedding to use (second pass)
            weights_mem = SharedMemory(config.data.transformed_space)
            w = np.ndarray(shape=config.data.transformed_space_shape, buffer=weights_mem.buf)
            # Results will be saved here
            dist_mem = SharedMemory(config.data.transformed_space_distance_matrix)
            distance_matrix = np.ndarray(shape=config.data.transformed_space_distance_matrix_shape, buffer=dist_mem.buf)

        w2i_mem = SharedMemory(embedding.w2i_memory_name)
        w2i = embedding.buff_to_dict(w2i_mem, embedding.w2i_memory_size)

        semcat = config.semantic_categories.categories

        # Iterating over the dimensions of the embedding
        for i in tqdm.trange(w.shape[1], unit='dim', desc=f'PID -> {os.getpid()}\t'):
            if i % config.project.processes == modulus:
                dimension = w[:, i]
                # Perturbation
                estimation = MCMCModel.mcmc(dimension, minimum_acceptance=config.model.mcmc_acceptance)
                # Iterating over the semantic categories
                for j in range(len(semcat.i2c)):
                    word_indexes = np.zeros(shape=[w.shape[0], ], dtype=bool)
                    # One-hot selection vector for in-category words
                    for word in semcat.vocab[semcat.i2c[j]]:
                        try:
                            word_indexes[w2i[word]] = True
                        except KeyError:
                            continue

                    # Populate P with category word weights
                    _p = dimension[word_indexes]
                    # Populate Q with out of category word weights
                    _q = dimension[~word_indexes]

                    samples = estimation.rvs(size=int(round(_p.shape[0]*config.model.mcmc_noise)))

                    _p = np.concatenate([_p, np.array(samples)])
                    # calculating distance
                    distance, sign = config.distance.function(_p, _q, config)
                    distance_matrix[i, j, 0] = distance
                    distance_matrix[i, j, 1] = sign
        weights_mem.close()
        dist_mem.close()
        w2i_mem.close()
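
# A minimal, self-contained sketch of the pattern used by _process above:
# attach to a named SharedMemory block in each worker, view it as an ndarray,
# and split work by dimension index modulo the number of processes. The name
# 'demo_weights' and the toy shapes below are assumptions for illustration only;
# the real Config/embedding objects are project-specific and not reproduced here.
import numpy as np
from multiprocessing import Process
from multiprocessing.shared_memory import SharedMemory


def demo_worker(modulus: int, processes: int, shm_name: str, shape, dtype):
    # Attach to the existing block by name and view it as an ndarray.
    shm = SharedMemory(name=shm_name)
    w = np.ndarray(shape, dtype=dtype, buffer=shm.buf)
    for i in range(shape[1]):
        if i % processes == modulus:
            # Each worker only touches "its" columns, so no locking is needed.
            w[:, i] = w[:, i].sum()
    shm.close()


if __name__ == '__main__':
    data = np.random.rand(8, 6)
    shm = SharedMemory(name='demo_weights', create=True, size=data.nbytes)
    shared = np.ndarray(data.shape, dtype=data.dtype, buffer=shm.buf)
    np.copyto(shared, data)

    n_proc = 2
    workers = [Process(target=demo_worker,
                       args=(m, n_proc, 'demo_weights', data.shape, data.dtype))
               for m in range(n_proc)]
    for p in workers:
        p.start()
    for p in workers:
        p.join()

    print(shared[0])  # every column entry now holds its original column sum
    shm.close()
    shm.unlink()
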
예제 #15
0
class DLManager(Process):
    def __init__(self, download_dir, base_url, cache_dir=None, status_q=None,
                 max_workers=0, update_interval=1.0, dl_timeout=10, resume_file=None,
                 max_shared_memory=1024 * 1024 * 1024):
        super().__init__(name='DLManager')
        self.log = logging.getLogger('DLM')
        self.proc_debug = False

        self.base_url = base_url
        self.dl_dir = download_dir
        self.cache_dir = cache_dir if cache_dir else os.path.join(download_dir, '.cache')

        # All the queues!
        self.logging_queue = None
        self.dl_worker_queue = None
        self.writer_queue = None
        self.dl_result_q = None
        self.writer_result_q = None
        self.max_workers = max_workers if max_workers else min(cpu_count() * 2, 16)
        self.dl_timeout = dl_timeout

        # Analysis stuff
        self.analysis = None
        self.tasks = deque()
        self.chunks_to_dl = deque()
        self.chunk_data_list = None

        # shared memory stuff
        self.max_shared_memory = max_shared_memory  # 1 GiB by default
        self.sms = deque()
        self.shared_memory = None

        # Interval for log updates and pushing updates to the queue
        self.update_interval = update_interval
        self.status_queue = status_q  # queue used to relay status info back to GUI/CLI

        # Resume file stuff
        self.resume_file = resume_file
        self.hash_map = dict()

        # cross-thread runtime information
        self.running = True
        self.active_tasks = 0
        self.children = []
        self.threads = []
        self.conditions = []
        # bytes downloaded and decompressed since last report
        self.bytes_downloaded_since_last = 0
        self.bytes_decompressed_since_last = 0
        # bytes written since last report
        self.bytes_written_since_last = 0
        # bytes read since last report
        self.bytes_read_since_last = 0
        # chunks written since last report
        self.num_processed_since_last = 0
        self.num_tasks_processed_since_last = 0

    def run_analysis(self, manifest: Manifest, old_manifest: Manifest = None,
                     patch=True, resume=True, file_prefix_filter=None,
                     file_exclude_filter=None, file_install_tag=None,
                     processing_optimization=False) -> AnalysisResult:
        """
        Run analysis on manifest and old manifest (if not None) and return a result
        with a summary of the resources required to install the provided manifest.

        :param manifest: Manifest to install
        :param old_manifest: Old manifest to patch from (if applicable)
        :param patch: Patch instead of redownloading the entire file
        :param resume: Continue based on resume file if it exists
        :param file_prefix_filter: Only download files that start with this prefix
        :param file_exclude_filter: Exclude files with this prefix from download
        :param file_install_tag: Only install files with the specified tag
        :param processing_optimization: Attempt to optimize processing order and RAM usage
        :return: AnalysisResult
        """

        analysis_res = AnalysisResult()
        analysis_res.install_size = sum(fm.file_size for fm in manifest.file_manifest_list.elements)
        analysis_res.biggest_chunk = max(c.window_size for c in manifest.chunk_data_list.elements)
        analysis_res.biggest_file_size = max(f.file_size for f in manifest.file_manifest_list.elements)
        is_1mib = analysis_res.biggest_chunk == 1024 * 1024
        self.log.debug(f'Biggest chunk size: {analysis_res.biggest_chunk} bytes (== 1 MiB? {is_1mib})')

        self.log.debug(f'Creating manifest comparison...')
        mc = ManifestComparison.create(manifest, old_manifest)
        analysis_res.manifest_comparison = mc

        if resume and self.resume_file and os.path.exists(self.resume_file):
            self.log.info('Found previously interrupted download. Download will be resumed if possible.')
            try:
                missing = 0
                mismatch = 0
                completed_files = set()

                for line in open(self.resume_file).readlines():
                    file_hash, _, filename = line.strip().partition(':')
                    _p = os.path.join(self.dl_dir, filename)
                    if not os.path.exists(_p):
                        self.log.debug(f'File does not exist but is in resume file: "{_p}"')
                        missing += 1
                    elif file_hash != manifest.file_manifest_list.get_file_by_path(filename).sha_hash.hex():
                        mismatch += 1
                    else:
                        completed_files.add(filename)

                if missing:
                    self.log.warning(f'{missing} previously completed file(s) are missing, they will be redownloaded.')
                if mismatch:
                    self.log.warning(f'{mismatch} existing file(s) have been changed and will be redownloaded.')

                # remove completed files from changed/added and move them to unchanged for the analysis.
                mc.added -= completed_files
                mc.changed -= completed_files
                mc.unchanged |= completed_files
                self.log.info(f'Skipping {len(completed_files)} files based on resume data.')
            except Exception as e:
                self.log.warning(f'Reading resume file failed: {e!r}, continuing as normal...')

        # Install tags are used for selective downloading, e.g. for language packs
        additional_deletion_tasks = []
        if file_install_tag is not None:
            if isinstance(file_install_tag, str):
                file_install_tag = [file_install_tag]

            files_to_skip = set(i.filename for i in manifest.file_manifest_list.elements
                                if not any((fit in i.install_tags) or (not fit and not i.install_tags)
                                           for fit in file_install_tag))
            self.log.info(f'Found {len(files_to_skip)} files to skip based on install tag.')
            mc.added -= files_to_skip
            mc.changed -= files_to_skip
            mc.unchanged |= files_to_skip
            for fname in sorted(files_to_skip):
                additional_deletion_tasks.append(FileTask(fname, delete=True, silent=True))

        # if include/exclude prefix has been set: mark all files that are not to be downloaded as unchanged
        if file_exclude_filter:
            if isinstance(file_exclude_filter, str):
                file_exclude_filter = [file_exclude_filter]

            file_exclude_filter = [f.lower() for f in file_exclude_filter]
            files_to_skip = set(i.filename for i in manifest.file_manifest_list.elements if
                                any(i.filename.lower().startswith(pfx) for pfx in file_exclude_filter))
            self.log.info(f'Found {len(files_to_skip)} files to skip based on exclude prefix.')
            mc.added -= files_to_skip
            mc.changed -= files_to_skip
            mc.unchanged |= files_to_skip

        if file_prefix_filter:
            if isinstance(file_prefix_filter, str):
                file_prefix_filter = [file_prefix_filter]

            file_prefix_filter = [f.lower() for f in file_prefix_filter]
            files_to_skip = set(i.filename for i in manifest.file_manifest_list.elements if not
                                any(i.filename.lower().startswith(pfx) for pfx in file_prefix_filter))
            self.log.info(f'Found {len(files_to_skip)} files to skip based on include prefix(es)')
            mc.added -= files_to_skip
            mc.changed -= files_to_skip
            mc.unchanged |= files_to_skip

        if file_prefix_filter or file_exclude_filter or file_install_tag:
            self.log.info(f'Remaining files after filtering: {len(mc.added) + len(mc.changed)}')
            # correct install size after filtering
            analysis_res.install_size = sum(fm.file_size for fm in manifest.file_manifest_list.elements
                                            if fm.filename in mc.added)

        if mc.removed:
            analysis_res.removed = len(mc.removed)
            self.log.debug(f'{analysis_res.removed} removed files')
        if mc.added:
            analysis_res.added = len(mc.added)
            self.log.debug(f'{analysis_res.added} added files')
        if mc.changed:
            analysis_res.changed = len(mc.changed)
            self.log.debug(f'{analysis_res.changed} changed files')
        if mc.unchanged:
            analysis_res.unchanged = len(mc.unchanged)
            self.log.debug(f'{analysis_res.unchanged} unchanged files')

        if processing_optimization and len(manifest.file_manifest_list.elements) > 100_000:
            self.log.warning('Manifest contains too many files, processing optimizations will be disabled.')
            processing_optimization = False
        elif processing_optimization:
            self.log.info('Processing order optimization is enabled, analysis may take a few seconds longer...')

        # count references to chunks for determining runtime cache size later
        references = Counter()
        fmlist = sorted(manifest.file_manifest_list.elements,
                        key=lambda a: a.filename.lower())

        for fm in fmlist:
            self.hash_map[fm.filename] = fm.sha_hash.hex()

            # chunks of unchanged files are not downloaded so we can skip them
            if fm.filename in mc.unchanged:
                analysis_res.unchanged += fm.file_size
                continue

            for cp in fm.chunk_parts:
                references[cp.guid_num] += 1

        if processing_optimization:
            s_time = time.time()
            # reorder the file manifest list to group files that share many chunks
            # 4 is mostly arbitrary but has shown in testing to be a good choice
            min_overlap = 4
            # ignore files with less than N chunk parts, this speeds things up dramatically
            cp_threshold = 5

            remaining_files = {fm.filename: {cp.guid_num for cp in fm.chunk_parts}
                               for fm in fmlist if fm.filename not in mc.unchanged}
            _fmlist = []

            # iterate over all files that will be downloaded and pair up those that share the most chunks
            for fm in fmlist:
                if fm.filename not in remaining_files:
                    continue

                _fmlist.append(fm)
                f_chunks = remaining_files.pop(fm.filename)
                if len(f_chunks) < cp_threshold:
                    continue

                best_overlap, match = 0, None
                for fname, chunks in remaining_files.items():
                    if len(chunks) < cp_threshold:
                        continue
                    overlap = len(f_chunks & chunks)
                    if overlap > min_overlap and overlap > best_overlap:
                        best_overlap, match = overlap, fname

                if match:
                    _fmlist.append(manifest.file_manifest_list.get_file_by_path(match))
                    remaining_files.pop(match)

            fmlist = _fmlist
            opt_delta = time.time() - s_time
            self.log.debug(f'Processing optimizations took {opt_delta:.01f} seconds.')

        # determine reusable chunks and prepare lookup table for reusable ones
        re_usable = defaultdict(dict)
        if old_manifest and mc.changed and patch:
            self.log.debug('Analyzing manifests for re-usable chunks...')
            for changed in mc.changed:
                old_file = old_manifest.file_manifest_list.get_file_by_path(changed)
                new_file = manifest.file_manifest_list.get_file_by_path(changed)

                existing_chunks = defaultdict(list)
                off = 0
                for cp in old_file.chunk_parts:
                    existing_chunks[cp.guid_num].append((off, cp.offset, cp.offset + cp.size))
                    off += cp.size

                for cp in new_file.chunk_parts:
                    key = (cp.guid_num, cp.offset, cp.size)
                    for file_o, cp_o, cp_end_o in existing_chunks[cp.guid_num]:
                        # check if new chunk part is wholly contained in the old chunk part
                        if cp_o <= cp.offset and (cp.offset + cp.size) <= cp_end_o:
                            references[cp.guid_num] -= 1
                            re_usable[changed][key] = file_o + (cp.offset - cp_o)
                            analysis_res.reuse_size += cp.size
                            break

        last_cache_size = current_cache_size = 0
        # set to determine whether a file is currently cached or not
        cached = set()
        # Using this secondary set is orders of magnitude faster than checking the deque.
        chunks_in_dl_list = set()
        # This is just used to count all unique guids that have been cached
        dl_cache_guids = set()

        # run through the list of files and create the download jobs and also determine minimum
        # runtime cache requirement by simulating adding/removing from cache during download.
        self.log.debug('Creating filetasks and chunktasks...')
        for current_file in fmlist:
            # skip unchanged and empty files
            if current_file.filename in mc.unchanged:
                continue
            elif not current_file.chunk_parts:
                self.tasks.append(FileTask(current_file.filename, empty=True))
                continue

            existing_chunks = re_usable.get(current_file.filename, None)
            chunk_tasks = []
            reused = 0

            for cp in current_file.chunk_parts:
                ct = ChunkTask(cp.guid_num, cp.offset, cp.size)

                # re-use the chunk from the existing file if we can
                if existing_chunks and (cp.guid_num, cp.offset, cp.size) in existing_chunks:
                    reused += 1
                    ct.chunk_file = current_file.filename
                    ct.chunk_offset = existing_chunks[(cp.guid_num, cp.offset, cp.size)]
                else:
                    # add to DL list if not already in it
                    if cp.guid_num not in chunks_in_dl_list:
                        self.chunks_to_dl.append(cp.guid_num)
                        chunks_in_dl_list.add(cp.guid_num)

                    # if chunk has more than one use or is already in cache,
                    # check if we need to add or remove it again.
                    if references[cp.guid_num] > 1 or cp.guid_num in cached:
                        references[cp.guid_num] -= 1

                        # delete from cache if no references left
                        if references[cp.guid_num] < 1:
                            current_cache_size -= analysis_res.biggest_chunk
                            cached.remove(cp.guid_num)
                            ct.cleanup = True
                        # add to cache if not already cached
                        elif cp.guid_num not in cached:
                            dl_cache_guids.add(cp.guid_num)
                            cached.add(cp.guid_num)
                            current_cache_size += analysis_res.biggest_chunk
                    else:
                        ct.cleanup = True

                chunk_tasks.append(ct)

            if reused:
                self.log.debug(f' + Reusing {reused} chunks from: {current_file.filename}')
                # open temporary file that will contain download + old file contents
                self.tasks.append(FileTask(current_file.filename + u'.tmp', fopen=True))
                self.tasks.extend(chunk_tasks)
                self.tasks.append(FileTask(current_file.filename + u'.tmp', close=True))
                # delete old file and rename temporary
                self.tasks.append(FileTask(current_file.filename, delete=True, rename=True,
                                           temporary_filename=current_file.filename + u'.tmp'))
            else:
                self.tasks.append(FileTask(current_file.filename, fopen=True))
                self.tasks.extend(chunk_tasks)
                self.tasks.append(FileTask(current_file.filename, close=True))

            # check if runtime cache size has changed
            if current_cache_size > last_cache_size:
                self.log.debug(f' * New maximum cache size: {current_cache_size / 1024 / 1024:.02f} MiB')
                last_cache_size = current_cache_size

        self.log.debug(f'Final cache size requirement: {last_cache_size / 1024 / 1024} MiB.')
        analysis_res.min_memory = last_cache_size + (1024 * 1024 * 32)  # add some padding just to be safe

        # Todo implement on-disk caching to avoid this issue.
        if analysis_res.min_memory > self.max_shared_memory:
            shared_mib = f'{self.max_shared_memory / 1024 / 1024:.01f} MiB'
            required_mib = f'{analysis_res.min_memory / 1024 / 1024:.01f} MiB'
            suggested_mib = round(self.max_shared_memory / 1024 / 1024 +
                                  (analysis_res.min_memory - self.max_shared_memory) / 1024 / 1024 + 32)

            if processing_optimization:
                message = f'Try running legendary with "--enable-reordering --max-shared-memory {suggested_mib:.0f}"'
            else:
                message = 'Try running legendary with "--enable-reordering" to reduce memory usage, ' \
                          f'or use "--max-shared-memory {suggested_mib:.0f}" to increase the limit.'

            raise MemoryError(f'Current shared memory cache is smaller than required: {shared_mib} < {required_mib}. '
                              + message)

        # calculate actual dl and patch write size.
        analysis_res.dl_size = \
            sum(c.file_size for c in manifest.chunk_data_list.elements if c.guid_num in chunks_in_dl_list)
        analysis_res.uncompressed_dl_size = \
            sum(c.window_size for c in manifest.chunk_data_list.elements if c.guid_num in chunks_in_dl_list)

        # add jobs to remove files
        for fname in mc.removed:
            self.tasks.append(FileTask(fname, delete=True))
        self.tasks.extend(additional_deletion_tasks)

        analysis_res.num_chunks_cache = len(dl_cache_guids)
        self.chunk_data_list = manifest.chunk_data_list
        self.analysis = analysis_res

        return analysis_res

    def download_job_manager(self, task_cond: Condition, shm_cond: Condition):
        while self.chunks_to_dl and self.running:
            while self.active_tasks < self.max_workers * 2 and self.chunks_to_dl:
                try:
                    sms = self.sms.popleft()
                    no_shm = False
                except IndexError:  # no free cache
                    no_shm = True
                    break

                c_guid = self.chunks_to_dl.popleft()
                chunk = self.chunk_data_list.get_chunk_by_guid(c_guid)
                self.log.debug(f'Adding {chunk.guid_num} (active: {self.active_tasks})')
                try:
                    self.dl_worker_queue.put(DownloaderTask(url=self.base_url + '/' + chunk.path,
                                                            chunk_guid=c_guid, shm=sms),
                                             timeout=1.0)
                except Exception as e:
                    self.log.warning(f'Failed to add to download queue: {e!r}')
                    self.chunks_to_dl.appendleft(c_guid)
                    break

                self.active_tasks += 1
            else:
                # active tasks limit hit, wait for tasks to finish
                with task_cond:
                    self.log.debug('Waiting for download tasks to complete..')
                    task_cond.wait(timeout=1.0)
                    continue

            if no_shm:
                # if we break we ran out of shared memory, so wait for that.
                with shm_cond:
                    self.log.debug('Waiting for more shared memory...')
                    shm_cond.wait(timeout=1.0)

        self.log.debug('Download Job Manager quitting...')

    def dl_results_handler(self, task_cond: Condition):
        in_buffer = dict()

        task = self.tasks.popleft()
        current_file = ''

        while task and self.running:
            if isinstance(task, FileTask):  # this wasn't necessarily a good idea...
                try:
                    if task.empty:
                        self.writer_queue.put(WriterTask(task.filename, empty=True), timeout=1.0)
                    elif task.rename:
                        self.writer_queue.put(WriterTask(task.filename, rename=True,
                                                         delete=task.delete,
                                                         old_filename=task.temporary_filename),
                                              timeout=1.0)
                    elif task.delete:
                        self.writer_queue.put(WriterTask(task.filename, delete=True, silent=task.silent), timeout=1.0)
                    elif task.open:
                        self.writer_queue.put(WriterTask(task.filename, fopen=True), timeout=1.0)
                        current_file = task.filename
                    elif task.close:
                        self.writer_queue.put(WriterTask(task.filename, close=True), timeout=1.0)
                except Exception as e:
                    self.tasks.appendleft(task)
                    self.log.warning(f'Adding to queue failed: {e!r}')
                    continue

                try:
                    task = self.tasks.popleft()
                except IndexError:  # finished
                    break
                continue

            while (task.chunk_guid in in_buffer) or task.chunk_file:
                res_shm = None
                if not task.chunk_file:  # not re-using from an old file
                    res_shm = in_buffer[task.chunk_guid].shm

                try:
                    self.log.debug(f'Adding {task.chunk_guid} to writer queue')
                    self.writer_queue.put(WriterTask(
                        filename=current_file, shared_memory=res_shm,
                        chunk_offset=task.chunk_offset, chunk_size=task.chunk_size,
                        chunk_guid=task.chunk_guid, release_memory=task.cleanup,
                        old_file=task.chunk_file  # todo on-disk cache
                    ), timeout=1.0)
                except Exception as e:
                    self.log.warning(f'Adding to queue failed: {e!r}')
                    break

                if task.cleanup and not task.chunk_file:
                    del in_buffer[task.chunk_guid]

                try:
                    task = self.tasks.popleft()
                    if isinstance(task, FileTask):
                        break
                except IndexError:  # finished
                    task = None
                    break
            else:  # only enter blocking code if the loop did not break
                try:
                    res = self.dl_result_q.get(timeout=1)
                    self.active_tasks -= 1
                    with task_cond:
                        task_cond.notify()

                    if res.success:
                        self.log.debug(f'Download for {res.guid} succeeded, adding to in_buffer...')
                        in_buffer[res.guid] = res
                        self.bytes_downloaded_since_last += res.compressed_size
                        self.bytes_decompressed_since_last += res.size
                    else:
                        self.log.error(f'Download for {res.guid} failed, retrying...')
                        try:
                            self.dl_worker_queue.put(DownloaderTask(
                                url=res.url, chunk_guid=res.guid, shm=res.shm
                            ), timeout=1.0)
                            self.active_tasks += 1
                        except Exception as e:
                            self.log.warning(f'Failed adding retry task to queue! {e!r}')
                            # If this failed for whatever reason, put the chunk at the front of the DL list
                            self.chunks_to_dl.appendleft(res.chunk_guid)
                except Empty:
                    pass
                except Exception as e:
                    self.log.warning(f'Unhandled exception when trying to read download result queue: {e!r}')

        self.log.debug('Download result handler quitting...')

    def fw_results_handler(self, shm_cond: Condition):
        while self.running:
            try:
                res = self.writer_result_q.get(timeout=1.0)
                self.num_tasks_processed_since_last += 1

                if res.closed and self.resume_file and res.success:
                    if res.filename.endswith('.tmp'):
                        res.filename = res.filename[:-4]

                    file_hash = self.hash_map[res.filename]
                    # write last completed file to super simple resume file
                    with open(self.resume_file, 'ab') as rf:
                        rf.write(f'{file_hash}:{res.filename}\n'.encode('utf-8'))

                if res.kill:
                    self.log.debug('Got termination command in FW result handler')
                    break

                if not res.success:
                    # todo make this kill the installation process or at least skip the file and mark it as failed
                    self.log.fatal(f'Writing for {res.filename} failed!')
                if res.release_memory:
                    self.sms.appendleft(res.shm)
                    with shm_cond:
                        shm_cond.notify()

                if res.chunk_guid:
                    self.bytes_written_since_last += res.size
                    # if there's no shared memory we must have read from disk.
                    if not res.shm:
                        self.bytes_read_since_last += res.size
                    self.num_processed_since_last += 1

            except Empty:
                continue
            except Exception as e:
                self.log.warning(f'Exception when trying to read writer result queue: {e!r}')
        self.log.debug('Writer result handler quitting...')

    def run(self):
        if not self.analysis:
            raise ValueError('Did not run analysis before trying to run download!')

        # Subprocess will use its own root logger that logs to a Queue instead
        _root = logging.getLogger()
        _root.setLevel(logging.DEBUG if self.proc_debug else logging.INFO)
        if self.logging_queue:
            _root.handlers = []
            _root.addHandler(QueueHandler(self.logging_queue))

        self.log = logging.getLogger('DLManager')
        self.log.info(f'Download Manager running with process-id: {os.getpid()}')

        try:
            self.run_real()
        except KeyboardInterrupt:
            self.log.warning('Immediate exit requested!')
            self.running = False

            # send conditions to unlock threads if they aren't already
            for cond in self.conditions:
                with cond:
                    cond.notify()

            # make sure threads are dead.
            for t in self.threads:
                t.join(timeout=5.0)
                if t.is_alive():
                    self.log.warning(f'Thread did not terminate! {repr(t)}')

            # clean up all the queues, otherwise this process won't terminate properly
            for name, q in zip(('Download jobs', 'Writer jobs', 'Download results', 'Writer results'),
                               (self.dl_worker_queue, self.writer_queue, self.dl_result_q, self.writer_result_q)):
                self.log.debug(f'Cleaning up queue "{name}"')
                try:
                    while True:
                        _ = q.get_nowait()
                except Empty:
                    q.close()
                    q.join_thread()

    def run_real(self):
        self.shared_memory = SharedMemory(create=True, size=self.max_shared_memory)
        self.log.debug(f'Created shared memory of size: {self.shared_memory.size / 1024 / 1024:.02f} MiB')

        # create the shared memory segments and add them to their respective pools
        for i in range(int(self.shared_memory.size / self.analysis.biggest_chunk)):
            _sms = SharedMemorySegment(offset=i * self.analysis.biggest_chunk,
                                       end=i * self.analysis.biggest_chunk + self.analysis.biggest_chunk)
            self.sms.append(_sms)

        self.log.debug(f'Created {len(self.sms)} shared memory segments.')

        # Create queues
        self.dl_worker_queue = MPQueue(-1)
        self.writer_queue = MPQueue(-1)
        self.dl_result_q = MPQueue(-1)
        self.writer_result_q = MPQueue(-1)

        self.log.info(f'Starting download workers...')
        for i in range(self.max_workers):
            w = DLWorker(f'DLWorker {i + 1}', self.dl_worker_queue, self.dl_result_q,
                         self.shared_memory.name, logging_queue=self.logging_queue,
                         dl_timeout=self.dl_timeout)
            self.children.append(w)
            w.start()

        self.log.info('Starting file writing worker...')
        writer_p = FileWorker(self.writer_queue, self.writer_result_q, self.dl_dir,
                              self.shared_memory.name, self.cache_dir, self.logging_queue)
        self.children.append(writer_p)
        writer_p.start()

        num_chunk_tasks = sum(isinstance(t, ChunkTask) for t in self.tasks)
        num_dl_tasks = len(self.chunks_to_dl)
        num_tasks = len(self.tasks)
        num_shared_memory_segments = len(self.sms)
        self.log.debug(f'Chunks to download: {num_dl_tasks}, File tasks: {num_tasks}, Chunk tasks: {num_chunk_tasks}')

        # active downloader tasks
        self.active_tasks = 0
        processed_chunks = 0
        processed_tasks = 0
        total_dl = 0
        total_write = 0

        # synchronization conditions
        shm_cond = Condition()
        task_cond = Condition()
        self.conditions = [shm_cond, task_cond]

        # start threads
        s_time = time.time()
        self.threads.append(Thread(target=self.download_job_manager, args=(task_cond, shm_cond)))
        self.threads.append(Thread(target=self.dl_results_handler, args=(task_cond,)))
        self.threads.append(Thread(target=self.fw_results_handler, args=(shm_cond,)))

        for t in self.threads:
            t.start()

        last_update = time.time()

        while processed_tasks < num_tasks:
            delta = time.time() - last_update
            if not delta:
                time.sleep(self.update_interval)
                continue

            # update all the things
            processed_chunks += self.num_processed_since_last
            processed_tasks += self.num_tasks_processed_since_last

            total_dl += self.bytes_downloaded_since_last
            total_write += self.bytes_written_since_last

            dl_speed = self.bytes_downloaded_since_last / delta
            dl_unc_speed = self.bytes_decompressed_since_last / delta
            w_speed = self.bytes_written_since_last / delta
            r_speed = self.bytes_read_since_last / delta
            # c_speed = self.num_processed_since_last / delta

            # set temporary counters to 0
            self.bytes_read_since_last = self.bytes_written_since_last = 0
            self.bytes_downloaded_since_last = self.num_processed_since_last = 0
            self.bytes_decompressed_since_last = self.num_tasks_processed_since_last = 0
            last_update = time.time()

            perc = (processed_chunks / num_chunk_tasks) * 100
            runtime = time.time() - s_time
            total_avail = len(self.sms)
            total_used = (num_shared_memory_segments - total_avail) * (self.analysis.biggest_chunk / 1024 / 1024)

            if runtime and processed_chunks:
                rt_hours, runtime = int(runtime // 3600), runtime % 3600
                rt_minutes, rt_seconds = int(runtime // 60), int(runtime % 60)

                average_speed = processed_chunks / runtime
                estimate = (num_chunk_tasks - processed_chunks) / average_speed
                hours, estimate = int(estimate // 3600), estimate % 3600
                minutes, seconds = int(estimate // 60), int(estimate % 60)
            else:
                hours = minutes = seconds = 0
                rt_hours = rt_minutes = rt_seconds = 0

            # NOTE: 'bar' is not defined anywhere in this example; it is presumably a
            # GUI progress-bar widget provided by the surrounding application
            # (the equivalent self.log.info progress output is commented out below).
            bar.set_fraction(perc)
            bar.set_text(f'{perc:.02f}% ({processed_chunks}/{num_chunk_tasks}), '
                         f'Elapsed: {rt_hours:02d}:{rt_minutes:02d}:{rt_seconds:02d}, '
                         f'ETA: {hours:02d}:{minutes:02d}:{seconds:02d}, '
                         f'{dl_speed / 1024 / 1024:.02f} MiB/s')

            #self.log.info(f'= Progress: {perc:.02f}% ({processed_chunks}/{num_chunk_tasks}), '
            #              f'Running for {rt_hours:02d}:{rt_minutes:02d}:{rt_seconds:02d}, '
            #              f'ETA: {hours:02d}:{minutes:02d}:{seconds:02d}')
            #self.log.info(f' - Downloaded: {total_dl / 1024 / 1024:.02f} MiB, '
            #              f'Written: {total_write / 1024 / 1024:.02f} MiB')
            #self.log.info(f' - Cache usage: {total_used} MiB, active tasks: {self.active_tasks}')
            #self.log.info(f' + Download\t- {dl_speed / 1024 / 1024:.02f} MiB/s (raw) '
            #              f'/ {dl_unc_speed / 1024 / 1024:.02f} MiB/s (decompressed)')
            #self.log.info(f' + Disk\t- {w_speed / 1024 / 1024:.02f} MiB/s (write) / '
            #              f'{r_speed / 1024 / 1024:.02f} MiB/s (read)')

            # send status update to back to instantiator (if queue exists)
            if self.status_queue:
                try:
                    self.status_queue.put(UIUpdate(
                        progress=perc, download_speed=dl_unc_speed, write_speed=w_speed, read_speed=r_speed,
                        memory_usage=total_used * 1024 * 1024
                    ), timeout=1.0)
                except Exception as e:
                    self.log.warning(f'Failed to send status update to queue: {e!r}')

            time.sleep(self.update_interval)

        for i in range(self.max_workers):
            self.dl_worker_queue.put_nowait(DownloaderTask(kill=True))

        self.log.info('Waiting for installation to finish...')
        self.writer_queue.put_nowait(WriterTask('', kill=True))

        writer_p.join(timeout=10.0)
        if writer_p.exitcode is None:
            self.log.warning(f'Terminating writer process, no exit code!')
            writer_p.terminate()

        # forcibly kill DL workers that are not actually dead yet
        for child in self.children:
            if child.exitcode is None:
                child.terminate()

        # make sure all the threads are dead.
        for t in self.threads:
            t.join(timeout=5.0)
            if t.is_alive():
                self.log.warning(f'Thread did not terminate! {repr(t)}')

        # clean up resume file
        if self.resume_file:
            try:
                os.remove(self.resume_file)
            except OSError as e:
                self.log.warning(f'Failed to remove resume file: {e!r}')

        # close up shared memory
        self.shared_memory.close()
        self.shared_memory.unlink()
        self.shared_memory = None

        self.log.info('All done! Download manager quitting...')
        # finally, exit the process.
        exit(0)
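
# A stand-alone sketch of the shared-memory pooling idea used by DLManager above:
# one large SharedMemory block is carved into fixed-size segments, segments are
# handed to workers instead of copying chunk data around, and they are returned
# to the pool once written out. SEG_SIZE/POOL_SIZE and the Segment tuple are
# invented stand-ins for the example's biggest_chunk/SharedMemorySegment.
from collections import deque, namedtuple
from multiprocessing.shared_memory import SharedMemory

Segment = namedtuple('Segment', 'offset end')

SEG_SIZE = 1024 * 1024              # stand-in for analysis.biggest_chunk
POOL_SIZE = 8 * SEG_SIZE            # stand-in for max_shared_memory

pool_shm = SharedMemory(create=True, size=POOL_SIZE)
free_segments = deque(Segment(i * SEG_SIZE, (i + 1) * SEG_SIZE)
                      for i in range(POOL_SIZE // SEG_SIZE))

# A downloader would pop a free segment, write the chunk into its slice, and
# pass only (pool_shm.name, segment) to the writer, which appends the segment
# back to the pool once the data has been flushed to disk.
seg = free_segments.popleft()
payload = b'chunk data'
pool_shm.buf[seg.offset:seg.offset + len(payload)] = payload
free_segments.append(seg)

pool_shm.close()
pool_shm.unlink()
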
예제 #16
0
class DLWorker(Process):
    def __init__(self, name, queue, out_queue, shm, max_retries=5,
                 logging_queue=None, dl_timeout=10):
        super().__init__(name=name)
        self.q = queue
        self.o_q = out_queue
        self.session = requests.session()
        self.session.headers.update({
            'User-Agent': 'EpicGamesLauncher/10.18.6-14188424+++Portal+Release-Live Windows/10.0.18363.1.256.64bit'
        })
        self.max_retries = max_retries
        self.shm = SharedMemory(name=shm)
        self.log_level = logging.getLogger().level
        self.logging_queue = logging_queue
        self.dl_timeout = float(dl_timeout) if dl_timeout else 10.0

    def run(self):
        # we have to fix up the logger before we can start
        _root = logging.getLogger()
        _root.handlers = []
        _root.addHandler(QueueHandler(self.logging_queue))

        logger = logging.getLogger(self.name)
        logger.setLevel(self.log_level)
        logger.debug(f'Download worker reporting for duty!')

        empty = False
        while True:
            try:
                job = self.q.get(timeout=10.0)
                empty = False
            except Empty:
                if not empty:
                    logger.debug(f'Queue Empty, waiting for more...')
                empty = True
                continue

            if job.kill:  # let worker die
                logger.debug(f'Worker received kill signal, shutting down...')
                break

            tries = 0
            dl_start = dl_end = 0
            compressed = 0
            chunk = None

            try:
                while tries < self.max_retries:
                    tries += 1  # count the attempt so max_retries is actually enforced
                    # print('Downloading', job.url)
                    logger.debug(f'Downloading {job.url}')
                    dl_start = time.time()

                    try:
                        r = self.session.get(job.url, timeout=self.dl_timeout)
                        r.raise_for_status()
                    except Exception as e:
                        logger.warning(f'Chunk download for {job.guid} failed: ({e!r}), retrying...')
                        continue

                    dl_end = time.time()
                    if r.status_code != 200:
                        logger.warning(f'Chunk download for {job.guid} failed: status {r.status_code}, retrying...')
                        continue
                    else:
                        compressed = len(r.content)
                        chunk = Chunk.read_buffer(r.content)
                        break
                else:
                    raise TimeoutError('Max retries reached')
            except Exception as e:
                logger.error(f'Job for {job.guid} failed with: {e!r}, fetching next one...')
                # add failed job to result queue to be requeued
                self.o_q.put(DownloaderTaskResult(success=False, chunk_guid=job.guid, shm=job.shm, url=job.url))
            except KeyboardInterrupt:
                logger.warning('Immediate exit requested, quitting...')
                break

            if not chunk:
                logger.warning(f'Chunk somehow None?')
                self.o_q.put(DownloaderTaskResult(success=False, chunk_guid=job.guid, shm=job.shm, url=job.url))
                continue

            # decompress stuff
            try:
                size = len(chunk.data)
                if size > job.shm.size:
                    logger.fatal(f'Downloaded chunk is longer than SharedMemorySegment!')

                self.shm.buf[job.shm.offset:job.shm.offset + size] = bytes(chunk.data)
                del chunk
                self.o_q.put(DownloaderTaskResult(success=True, chunk_guid=job.guid, shm=job.shm,
                                                  url=job.url, size=size, compressed_size=compressed,
                                                  time_delta=dl_end - dl_start))
            except Exception as e:
                logger.warning(f'Job for {job.guid} failed with: {e!r}, fetching next one...')
                self.o_q.put(DownloaderTaskResult(success=False, chunk_guid=job.guid, shm=job.shm, url=job.url))
                continue
            except KeyboardInterrupt:
                logger.warning('Immediate exit requested, quitting...')
                break

        self.shm.close()
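
# A minimal version of the bounded retry loop used in DLWorker.run above,
# without the shared-memory handoff. The URL below is hypothetical and
# requests is assumed to be available, as in the example itself.
import time
import requests


def fetch_with_retries(url: str, max_retries: int = 5, timeout: float = 10.0) -> bytes:
    for attempt in range(1, max_retries + 1):
        try:
            r = requests.get(url, timeout=timeout)
            r.raise_for_status()
            return r.content
        except requests.RequestException as e:
            print(f'Attempt {attempt}/{max_retries} failed: {e!r}, retrying...')
            time.sleep(1.0)
    raise TimeoutError('Max retries reached')


# Usage (hypothetical URL):
# data = fetch_with_retries('https://example.com/chunks/ABCD.chunk')
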
예제 #17
0
    def _execute_with_strace(self: ActionBase, cmd: str, *args,
                             **kwargs) -> Dict[str, Any]:
        """Execute commands with strace.

        This inner function modifies module commands to be run with strace
        before delegating to the original execute function. Ansible runs tasks
        through worker processes, which means that invocations of this function
        do not share memory/variables with the main process.
        """
        # Just execute the command if gathering facts
        if self._task.action == 'gather_facts':
            return action_base_execute(self, cmd, *args, **kwargs)

        # Get command parts
        parts = cmd.split()

        # Module metadata. This will be None if module is not traced, and
        # defined if the module is traced.
        metadata = None

        # If there are at least two parts, the command may be running a module
        if len(parts) >= 2:

            # Get the potential executable and module path
            executable = parts[0]
            module = parts[1]

            # Check for python and module
            python_match = R_PYTHON.match(executable)
            module_match = R_MODULE.match(module)

            # If it matches a python module that is not being excluded,
            # modify the command to run strace.
            if (python_match and module_match
                    and module_match.group('module') not in excluded_modules):

                # Get index shared memory, parse index, and increment the
                # value stored in shared memory
                index_shm = SharedMemory(name=INDEX_NAME)
                index = int.from_bytes(bytes=index_shm.buf.tobytes(),
                                       byteorder=sys.byteorder)
                index_shm.buf[:] = (index + 1).to_bytes(
                    byteorder=sys.byteorder, length=INDEX_BYTES)
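                # The read-increment-write above is not atomic, so concurrent
                # worker processes could in principle race on this shared index.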
                index_shm.close()

                # Create module output directory
                module_dir = output_dir / str(index)
                module_dir.mkdir(exist_ok=True, parents=True)

                # Read module source
                with open(module, 'r') as fd:
                    source_lines = fd.readlines()

                # Parse zip data
                zip_data = ''
                for line in source_lines:
                    line = line.strip()
                    if line.startswith(ZIPDATA):
                        zip_data = line[len(ZIPDATA) + 3:-3]
                        break

                # Extract zipped data for output
                with ExitStack() as stack:
                    t_fd = stack.enter_context(tempfile.NamedTemporaryFile())
                    t_fd.write(base64.b64decode(zip_data))
                    t_fd.flush()
                    z_fd = stack.enter_context(zipfile.ZipFile(t_fd.name))
                    z_fd.extractall(module_dir)

                # Copy module for output
                shutil.copy(module, module_dir)

                # Parse module arguments
                module_args = {}
                for line in source_lines:
                    line = line.strip()
                    if line.startswith(ANSIBALLZ_PARAMS):
                        start = len(ANSIBALLZ_PARAMS) + 1
                        try:
                            ansiballz_params_str = (
                                line[start:-1].encode('utf-8').decode(
                                    'unicode_escape', errors='ignore'))
                            ansiballz_params = json.loads(ansiballz_params_str)
                            module_args = {
                                key: value
                                for key, value in
                                ansiballz_params[ANSIBLE_MODULE_ARGS].items()
                                if not key.startswith('_ansible')
                            }
                        except (
                                UnicodeError,
                                JSONDecodeError,
                        ):
                            print('    Error parsing module params.')

                        break

                # Modify command and print info
                original_command = cmd
                cmd = (f'strace -DDD -f -y -yy -X raw -I 2 -o "| awk '
                       f'\'NR>{MAX_ROWS}{{print "\\""TRUNCATED"\\""; exit}}; '
                       f'{{print}}\' > {module_dir / "strace.txt"}" '
                       f'-e trace=!close {cmd}')
                print(f'    Modified Command: {cmd}')
                print(f'    Args: {module_args}')
                sys.stdout.flush()

                # Compute metadata
                metadata = {
                    'name': self._task.name,
                    'action': self._task.action,
                    'module': module_match.group('module'),
                    'index': index,
                    'original_cmd': original_command,
                    'modified_cmd': cmd,
                    'args': module_args,
                }

        # Don't worry about it. It's probably fine.
        self._task.ignore_errors = True

        # Delegate to the execute method.
        # The loader basedir replacement is to make sure the command is
        # executed in the correct working directory.
        loader_basedir = self._loader.get_basedir()
        self._loader.set_basedir(Path.cwd())
        execute_start_s = time()
        result = action_base_execute(self, cmd, *args, **kwargs)
        execute_end_s = time()
        self._loader.set_basedir(loader_basedir)
        execute_duration_s = execute_end_s - execute_start_s

        # Write metadata if the command was traced.
        # module_dir is defined iff metadata is.
        if metadata is not None:

            # Set execution duration and print info
            metadata['duration'] = execute_duration_s
            print(f'    Execution time: {execute_duration_s:.2f}s')

            # Deep copy result to be safe. This prevents us from accidentally
            # overriding anything when we parse stdout and stderr.
            metadata['result'] = copy.deepcopy(result)

            # Parse stdout as JSON if possible
            try:
                metadata['result']['stdout'] = json.loads(result['stdout'])
            except (TypeError, JSONDecodeError):
                pass

            # Parse stderr as JSON if possible
            try:
                metadata['result']['stderr'] = json.loads(result['stderr'])
            except (TypeError, JSONDecodeError):
                pass

            # Add result to metadata before writing
            with open(module_dir / 'metadata.json', 'w') as fd:
                json.dump(metadata, fd)

        # Return result
        return result
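
# A self-contained sketch of the shared-memory counter pattern used above:
# a named SharedMemory block holds a small integer index that each invocation
# reads and increments. COUNTER_NAME/COUNTER_BYTES are invented stand-ins for
# the example's INDEX_NAME/INDEX_BYTES constants.
import sys
from multiprocessing.shared_memory import SharedMemory

COUNTER_NAME = 'demo_counter'
COUNTER_BYTES = 8


def create_counter() -> SharedMemory:
    # Create the block and initialise it to zero.
    shm = SharedMemory(name=COUNTER_NAME, create=True, size=COUNTER_BYTES)
    shm.buf[:COUNTER_BYTES] = (0).to_bytes(COUNTER_BYTES, sys.byteorder)
    return shm


def next_index() -> int:
    """Return the current index and store index + 1 (not atomic across processes)."""
    shm = SharedMemory(name=COUNTER_NAME)
    value = int.from_bytes(bytes(shm.buf[:COUNTER_BYTES]), sys.byteorder)
    shm.buf[:COUNTER_BYTES] = (value + 1).to_bytes(COUNTER_BYTES, sys.byteorder)
    shm.close()
    return value


if __name__ == '__main__':
    owner = create_counter()
    print(next_index(), next_index(), next_index())  # 0 1 2
    owner.close()
    owner.unlink()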