Example #1
class PushThread(Thread):
    PUSH, PUT = range(2)

    def __init__(self, session, document):
        self.session = session
        self.document = document
        super(PushThread, self).__init__()
        self.queue = PriorityQueue()
        self.setDaemon(True)

    def put(self, obj, priority):
        self.queue.put(_WorkItem(priority, obj))

    def run(self):
        while True:
            priority, obj = self.queue.get()
            if priority == PushThread.PUT:
                self.session.store_objects(obj)
            elif priority == PushThread.PUSH:
                self.session.store_document(self.document)
                # delete queued objects when training has finished
                if obj == 'after_training':
                    with self.queue.mutex:
                        del self.queue.queue[:]
                    break
            self.queue.task_done()
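The excerpt never defines `_WorkItem`, yet `run()` unpacks each queue entry as a `(priority, obj)` pair. A minimal sketch of a compatible definition, assuming `_WorkItem` is simply a comparable pair (the helper is not shown in the original source), could be:

from collections import namedtuple

# Hypothetical stand-in: a named tuple compares field by field, so the
# PriorityQueue orders items by priority first, and it unpacks like a tuple.
_WorkItem = namedtuple('_WorkItem', ['priority', 'obj'])

Since PriorityQueue always retrieves the lowest value first and `PUSH, PUT = range(2)` makes PUSH equal to 0, a pending push is always handled before any queued store.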
Example #3
class ConnectionThreadPoolExecutor(ThreadPoolExecutor):
    """
    A wrapper class to maintain a pool of connections alongside the thread
    pool. We start by creating a priority queue of connections, and each job
    submitted takes one of those connections (initialising if necessary) and
    passes it as the first arg to the executed function.

    At the end of execution that connection is returned to the queue.

    By using a PriorityQueue we avoid creating more connections than required.
    We will only create as many connections as are required concurrently.
    """
    def __init__(self, create_connection, max_workers):
        self._connections = PriorityQueue()
        self._create_connection = create_connection
        for p in range(0, max_workers):
            self._connections.put((p, None))
        super(ConnectionThreadPoolExecutor, self).__init__(max_workers)

    def submit(self, fn, *args, **kwargs):
        def conn_fn():
            priority = None
            conn = None
            try:
                # If we get a connection we must put it back later
                (priority, conn) = self._connections.get()
                if conn is None:
                    conn = self._create_connection()
                conn_args = (conn, ) + args
                return fn(*conn_args, **kwargs)
            finally:
                if priority is not None:
                    self._connections.put((priority, conn))

        return super(ConnectionThreadPoolExecutor, self).submit(conn_fn)
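A usage sketch, with hypothetical `make_connection` and `fetch` stand-ins (neither appears in the original source); each submitted callable receives a pooled connection as its first argument:

# Hypothetical usage: the factory returns a cheap stand-in object here,
# but in practice it would open a real client connection.
def make_connection():
    return {'conn_id': object()}

def fetch(conn, key):
    # `conn` is injected by the executor; `key` comes from submit()
    return (conn['conn_id'], key)

pool = ConnectionThreadPoolExecutor(make_connection, max_workers=4)
futures = [pool.submit(fetch, key) for key in range(10)]
results = [f.result() for f in futures]
# At most 4 connections are ever created, no matter how many jobs run.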
Example #5
class PushThread(Thread):
    # Define priority constants
    PUSH = 1
    PUT = 2

    def __init__(self):
        super(PushThread, self).__init__()
        self.queue = PriorityQueue()
        self.setDaemon(True)

    def put(self, obj, priority):
        self.queue.put(_WorkItem(priority, obj))

    def run(self):
        while True:
            priority, obj = self.queue.get()
            if priority == PushThread.PUT:
                cursession().store_objects(obj)
            elif priority == PushThread.PUSH:
                push()
                # delete queued objects when training has finished
                if obj == "after_training":
                    with self.queue.mutex:
                        del self.queue.queue[:]
                    break
            self.queue.task_done()
Example #6
class PushThread(Thread):
    PUSH, PUT = range(2)

    def __init__(self, session, document):
        self.session = session
        self.document = document
        super(PushThread, self).__init__()
        self.queue = PriorityQueue()
        self.setDaemon(True)

    def put(self, obj, priority):
        self.queue.put(_WorkItem(priority, obj))

    def run(self):
        while True:
            # Does it even make sense to have a priority queue here,
            # instead of a simple FIFO? We have a single-producer,
            # single-consumer scenario.
            priority, obj = self.queue.get()
            if priority == PushThread.PUT:
                self.session.store_objects(obj)
            elif priority == PushThread.PUSH:
                self.session.store_document(self.document)
                # delete queued objects when training has finished
                if obj == 'after_training':
                    with self.queue.mutex:
                        del self.queue.queue[:]
                    break
            self.queue.task_done()
Example #7
    def __init__(self,
                 data,
                 buffer_size=8,
                 mode='threaded',
                 workers=None,
                 on_batch_loaded=None):
        valid = (
            'threaded',
            'multiprocessing',
        )
        utils.assert_raise(mode in valid, ValueError,
                           'mode must be one of: ' + ', '.join(valid))
        utils.assert_raise(buffer_size >= 2, ValueError,
                           'buffer_size must be greater or equal to 2')
        if mode == 'threaded':
            self._executor = C.ThreadPoolExecutor(workers)
        else:
            self._executor = C.ProcessPoolExecutor(workers)

        if on_batch_loaded is None:
            on_batch_loaded = _identity

        self._queue = PriorityQueue(buffer_size)
        self._data = data
        self._thread = None
        self._on_batch_loaded = on_batch_loaded
        self._cache_buffer = []
        self._caching = False
Example #9
class LayersApplier(object):
    """ Most layers replace content. We try to do this intelligently here,
    so that layers don't step over each other. """
    HTML_TAG_REGEX = re.compile(r'<[^>]*?>')

    def __init__(self):
        self.queue = PriorityQueue()
        self.text = None

    def enqueue_from_list(self, elements_list):
        for le in elements_list:
            self.enqueue(le)

    def enqueue(self, layer_element):
        original, replacement, locations = layer_element
        priority = len(original)
        item = (original, replacement, locations)
        self.queue.put((-priority, item))

    def location_replace(self, xml_node, original, replacement, locations):
        LocationReplace().location_replace(xml_node, original, replacement,
                                           locations)

    def replace_all(self, original, replacement):
        """ Replace all occurrences of original with replacement. This is HTML
        aware; it effectively looks at all of the text in between HTML tags"""
        text_chunks = []
        index = 0
        for match in self.HTML_TAG_REGEX.finditer(self.text):
            text = self.text[index:match.start()]
            text_chunks.append(text.replace(original, replacement))
            text_chunks.append(self.text[match.start():match.end()])  # tag
            index = match.end()
        text_chunks.append(self.text[index:])  # trailing text
        self.text = "".join(text_chunks)

    def replace_at(self, original, replacement, locations):
        """ Replace the occurrences of original at all the locations with
        replacement. """

        locations.sort()
        self.text = LocationReplace().location_replace_text(
            self.text, original, replacement, locations)

    def apply_layers(self, original_text):
        self.text = original_text

        while not self.queue.empty():
            priority, layer_element = self.queue.get()
            original, replacement, locations = layer_element

            if not locations:
                self.replace_all(original, replacement)
            else:
                self.replace_at(original, replacement, locations)

        return self.text
Example #10
def create_huffman_tree(word_counts):
    """Make a huffman tree from a dictionary containing word counts.

    This method creates a binary huffman tree, that is required for
    :class:`BinaryHierarchicalSoftmax`.
    For example, ``{0: 8, 1: 5, 2: 6, 3: 4}`` is converted to
    ``((3, 1), (2, 0))``.

    Args:
        word_counts (``dict`` of ``int`` key and ``int`` or ``float`` values.): 
            Dictionary representing counts of words.

    Returns:
        Binary huffman tree with tuples and keys of ``word_coutns``.
    """
    if len(word_counts) == 0:
        raise ValueError('Empty vocabulary')

    q = PriorityQueue()
    for w, c in iteritems(word_counts):
        q.put((c, w))

    while q.qsize() >= 2:
        (count1, word1) = q.get()
        (count2, word2) = q.get()
        count = count1 + count2
        tree = (word1, word2)
        q.put((count, tree))

    return q.get()[1]
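A quick check of the docstring example (note that ``iteritems`` comes from ``six``): the two lightest counts, 4 and 5, merge first into the subtree ``(3, 1)`` with weight 9; then 6 and 8 merge into ``(2, 0)`` with weight 14; the final merge yields the root.

# Usage sketch reproducing the docstring example.
tree = create_huffman_tree({0: 8, 1: 5, 2: 6, 3: 4})
print(tree)  # ((3, 1), (2, 0))

One caveat worth knowing: when two entries tie on count, the queue falls back to comparing the second tuple element, and under Python 3 comparing an ``int`` key against a tuple subtree raises ``TypeError``.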
Example #11
    def __init__(self, create_connection, max_workers):
        """
        Initializes a new ThreadPoolExecutor instance.

        :param create_connection: callable to use to create new connections
        :param max_workers: the maximum number of threads that can be used
        """
        self._connections = PriorityQueue()
        self._create_connection = create_connection
        for p in range(0, max_workers):
            self._connections.put((p, None))
        super(ConnectionThreadPoolExecutor, self).__init__(max_workers)
Example #13
class ConnectionThreadPoolExecutor(ThreadPoolExecutor):
    """
    A wrapper class to maintain a pool of connections alongside the thread
    pool. We start by creating a priority queue of connections, and each job
    submitted takes one of those connections (initialising if necessary) and
    passes it as the first arg to the executed function.

    At the end of execution that connection is returned to the queue.

    By using a PriorityQueue we avoid creating more connections than required.
    We will only create as many connections as are required concurrently.
    """
    def __init__(self, create_connection, max_workers):
        """
        Initializes a new ThreadPoolExecutor instance.

        :param create_connection: callable to use to create new connections
        :param max_workers: the maximum number of threads that can be used
        """
        self._connections = PriorityQueue()
        self._create_connection = create_connection
        for p in range(0, max_workers):
            self._connections.put((p, None))
        super(ConnectionThreadPoolExecutor, self).__init__(max_workers)

    def submit(self, fn, *args, **kwargs):
        """
        Schedules the callable, `fn`, to be executed

        :param fn: the callable to be invoked
        :param args: the positional arguments for the callable
        :param kwargs: the keyword arguments for the callable
        :returns: a Future object representing the execution of the callable
        """
        def conn_fn():
            priority = None
            conn = None
            try:
                # If we get a connection we must put it back later
                (priority, conn) = self._connections.get()
                if conn is None:
                    conn = self._create_connection()
                conn_args = (conn, ) + args
                return fn(*conn_args, **kwargs)
            finally:
                if priority is not None:
                    self._connections.put((priority, conn))

        return super(ConnectionThreadPoolExecutor, self).submit(conn_fn)
Example #14
    def _get_backfill_events(self, txn, room_id, event_list, limit):
        logger.debug(
            "_get_backfill_events: %s, %s, %s",
            room_id, repr(event_list), limit
        )

        event_results = set()

        # We want to make sure that we do a breadth-first, "depth" ordered
        # search.

        query = (
            "SELECT depth, prev_event_id FROM event_edges"
            " INNER JOIN events"
            " ON prev_event_id = events.event_id"
            " WHERE event_edges.event_id = ?"
            " AND event_edges.is_state = ?"
            " LIMIT ?"
        )

        queue = PriorityQueue()

        for event_id in event_list:
            depth = self._simple_select_one_onecol_txn(
                txn,
                table="events",
                keyvalues={
                    "event_id": event_id,
                    "room_id": room_id,
                },
                retcol="depth",
                allow_none=True,
            )

            if depth:
                queue.put((-depth, event_id))

        while not queue.empty() and len(event_results) < limit:
            try:
                _, event_id = queue.get_nowait()
            except Empty:
                break

            if event_id in event_results:
                continue

            event_results.add(event_id)

            txn.execute(
                query,
                (event_id, False, limit - len(event_results))
            )

            for row in txn:
                if row[1] not in event_results:
                    queue.put((-row[0], row[1]))

        return event_results
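``queue.PriorityQueue`` pops the smallest entry first, so storing ``(-depth, event_id)`` makes the deepest events come out first. A minimal demonstration of the trick:

from queue import PriorityQueue

q = PriorityQueue()
for depth, event_id in [(3, 'a'), (7, 'b'), (5, 'c')]:
    q.put((-depth, event_id))
print(q.get_nowait())  # (-7, 'b'): the deepest event comes out first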
Example #15
    def run(self, distributable):
        _JustCheckExists().input(distributable)

        priority_queue = PriorityQueue()
        thread_list = []
        shaped_distributable = _shape_to_desired_workcount(
            distributable, self.taskcount)
        for taskindex in range(self.taskcount):

            def _target(taskindex=taskindex):
                result_list = []
                for work in _work_sequence_for_one_index(
                        shaped_distributable, self.taskcount, taskindex):
                    result_list.append(_run_all_in_memory(work))
                priority_queue.put((taskindex, result_list))

            if not self.just_one_process:
                thread = threading.Thread(target=_target, name=str(taskindex))
                thread_list.append(thread)
                thread.start()
            else:
                thread_list.append(None)
                _target()

        result_sequence = self._result_sequence(thread_list, priority_queue,
                                                shaped_distributable)
        result = shaped_distributable.reduce(result_sequence)

        _JustCheckExists().output(distributable)
        return result
Example #16
class MockResponseResponseFuture():
    """
    This is a mock ResponseFuture. It is used to allow us to hook into the
    underlying session and invoke callbacks with various timings.
    """

    _query_trace = None
    _col_names = None
    _col_types = None

    # a list of pending callbacks; these will be prioritized in reverse or normal order
    pending_callbacks = PriorityQueue()

    def __init__(self, reverse):

        # if this is true, invoke callbacks in the reverse order from that in which they were inserted
        self.reverse = reverse
        # hardcoded to avoid paging logic
        self.has_more_pages = False

        if (reverse):
            self.priority = 100
        else:
            self.priority = 0

    def add_callback(self, fn, *args, **kwargs):
        """
        This is used to add a callback our pending list of callbacks.
        If reverse is specified we will invoke the callback in the opposite order that we added it
        """
        time_added = time.time()
        self.pending_callbacks.put(
            (self.priority, (fn, args, kwargs, time_added)))
        if not self.reverse:
            self.priority += 1
        else:
            self.priority -= 1

    def add_callbacks(self,
                      callback,
                      errback,
                      callback_args=(),
                      callback_kwargs=None,
                      errback_args=(),
                      errback_kwargs=None):

        self.add_callback(callback, *callback_args, **(callback_kwargs or {}))

    def get_next_callback(self):
        return self.pending_callbacks.get()

    def has_next_callback(self):
        return not self.pending_callbacks.empty()

    def has_more_pages(self):
        return False

    def clear_callbacks(self):
        return
Example #18
    def __init__(self, thread_manager, thread_count=10):
        """Initialization method

        :param thread_manager: the thread manager to use
        :param thread_count: the number of workers to instantiate
        """
        self.logger = logging.getLogger(
            'storj.downstream_farmer.utils.ThreadPool')
        self.tasks = PriorityQueue()
        self.thread_manager = thread_manager
        self.workers = list()
        self.workers_lock = threading.Lock()
        self.max_thread_count = 50
        self.load_minimum = 0.01
        self.load_maximum = 0.5
        # managed monitor thread
        self.monitor_thread = self.thread_manager.create_thread(
            name='MonitorThread', target=self._monitor)
        for i in range(0, thread_count):
            self._add_thread()
Example #19
    def _create_files_list(self):
        priorityQueue = PriorityQueue()
        for txt_file in self._txt_files:
            wav_file = os.path.splitext(txt_file)[0] + ".wav"
            wav_file_size = os.path.getsize(wav_file)
            priorityQueue.put((wav_file_size, (txt_file, wav_file)))
        files_list = []
        while not priorityQueue.empty():
            priority, (txt_file, wav_file) = priorityQueue.get()
            files_list.append((txt_file, wav_file))
        return files_list
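Since every item is enqueued before any is dequeued, the queue here is only acting as a sort by ``.wav`` file size. A plain ``sorted()`` call, sketched with a hypothetical ``txt_files`` list, produces the same ordering:

import os

txt_files = ['a.txt', 'b.txt']  # hypothetical input list
pairs = [(txt, os.path.splitext(txt)[0] + '.wav') for txt in txt_files]
files_list = sorted(pairs, key=lambda pair: os.path.getsize(pair[1]))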
Example #21
class AsyncDataLoader(object):
    """The AsyncDataLoader is a wrapper to asynchronous loading multiples batches of data.

    It keeps a buffer of batches, so when the model asks for a new batch, it's
    already in memory. After sending the batch to the model, it is removed from
    the buffer, and a new batch can be loaded.

    The buffer is filled using a separate thread. Then, each batch can be loaded
    using multiple processes or multiple threads.

    This async batch loader is designed for heavy IO or heavy CPU batch generation.

    .. warning:: When using the multiprocessing batch loader, watch the RAM usage
        and avoid a high number of processes. Multiprocessing can easily lead
        to memory overflow.

    - Should I use the Async Data Loader?

        Cogitare's DataHolder already provides data loading through multiple
        threads or multiple processes, so use this wrapper only when generating
        a whole batch is expensive.

    - Should I use threads or processes ?

        It's recommended to use threads; they are lightweight and fast.

        Multiprocessing will usually lead to worse performance and memory
        usage, due to the communication pipe between processes and the extra
        copying of memory it requires. However, it can be useful for
        CPU-expensive operations, because it will not suffer from the GIL.

        Threads, on the other hand, are lightweight and usually fast, but can
        suffer from the GIL. For tasks with heavy IO, they are a good choice.

    Args:
        data (DataSet, AbsDataHolder, SequentialDataSet, SequentialAbsDataHolder): data holder,
            or dataset instance.
        buffer_size (int): size of the batch buffer. The async data loader will keep around
            ``buffer_size`` batches in memory.
        mode (str): should be ``threaded`` or ``multiprocessing``, indicating how to fetch batches.
        workers (int): the number of threads/processes used to load the batches. If None,
            will use the number of cores in the CPU.
        on_batch_loaded (callable): if provided, this function will be called when a new batch is loaded. It must
            receive one argument (the batch data) and return the batch after applying some operation to it. This
            can be used to apply pre-processing functions to a batch of data (such as image filtering or moving
            the data to a CUDA device).

    Example::

        >>> mnist = fetch_mldata('MNIST original')
        >>> mnist.data = mnist.data / 255
        >>> data = DataSet([mnist.data, mnist.target.astype(int)], batch_size=64)
        >>> data_train, data_validation = data.split(0.8)

        >>> # wraps the data_train dataset with the async loader.
        >>> data_train = AsyncDataLoader(data_train)

        >>> model.learn(data_train, optimizer)
    """
    def __init__(self,
                 data,
                 buffer_size=8,
                 mode='threaded',
                 workers=None,
                 on_batch_loaded=None):
        valid = (
            'threaded',
            'multiprocessing',
        )
        utils.assert_raise(mode in valid, ValueError,
                           'mode must be one of: ' + ', '.join(valid))
        utils.assert_raise(buffer_size >= 2, ValueError,
                           'buffer_size must be greater or equal to 2')
        if mode == 'threaded':
            self._executor = C.ThreadPoolExecutor(workers)
        else:
            self._executor = C.ProcessPoolExecutor(workers)

        if on_batch_loaded is None:
            on_batch_loaded = _identity

        self._queue = PriorityQueue(buffer_size)
        self._data = data
        self._thread = None
        self._on_batch_loaded = on_batch_loaded
        self._cache_buffer = []
        self._caching = False

    def __repr__(self):
        return repr(self._data)

    def _start(self):
        if self._thread is None:
            self._thread = Thread(target=self._produce)
            self._thread.daemon = True
            self._thread.start()

    def cache(self):
        """Start to load batches to buffer, and wait the buffer be full.
        This can be used before start the model training to cache the samples
        and speed up the model execution.

        Example::

            >>> dh = CallableHolder(s.__next__, mode='sequential', total_samples=20000000, single=True)
            >>> dh = AsyncDataLoader(dh, buffer_size=64000, mode='threaded', workers=1)
            >>> print('caching ...')
            >>> dh.cache()
            >>> print('done')
        """

        self._caching = True
        self._cache_buffer = []

        self._start()
        while not self._queue.full():
            time.sleep(0.1)

        while not all(f.done() for f in self._cache_buffer):
            time.sleep(0.1)

        self._caching = False

    def _produce(self):
        idx = 0

        while True:
            future = self._executor.submit(_fetch, self._on_batch_loaded,
                                           self._data)
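            # The queue was built with maxsize=buffer_size, so this put()
            # blocks while the buffer is full, throttling the producer.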
            self._queue.put((idx, future))
            idx += 1

            if self._caching:
                self._cache_buffer.append(future)

    def __iter__(self):
        return self

    def __next__(self):
        self._start()
        return self._queue.get()[1].result()

    next = __next__

    def __len__(self):
        return len(self._data)
Example #22
    def __init__(self, session, document):
        self.session = session
        self.document = document
        super(PushThread, self).__init__()
        self.queue = PriorityQueue()
        self.setDaemon(True)
Example #23
    def __init__(self):
        super(PushThread, self).__init__()
        self.queue = PriorityQueue()
        self.setDaemon(True)
Example #24
class LayersApplier(object):
    """ Most layers replace content. We try to do this intelligently here,
    so that layers don't step over each other. """

    HTML_TAG_REGEX = re.compile(r"<[^>]*?>")

    def __init__(self):
        self.queue = PriorityQueue()
        self.text = None

    def enqueue_from_list(self, elements_list):
        for le in elements_list:
            self.enqueue(le)

    def enqueue(self, layer_element):
        original, replacement, locations = layer_element
        priority = len(original)
        item = (original, replacement, locations)
        self.queue.put((-priority, item))

    def location_replace(self, xml_node, original, replacement, locations):
        LocationReplace().location_replace(xml_node, original, replacement, locations)

    def unescape_text(self):
        """ Because of the way we do replace_all(), we need to unescape HTML
        entities.  """
        self.text = HTMLParser().unescape(self.text)

    def replace_all(self, original, replacement):
        """ Replace all occurrences of original with replacement. This is HTML
        aware; it effectively looks at all of the text in between HTML tags"""
        text_chunks = []
        index = 0
        for match in self.HTML_TAG_REGEX.finditer(self.text):
            text = self.text[index : match.start()]
            text_chunks.append(text.replace(original, replacement))
            text_chunks.append(self.text[match.start() : match.end()])  # tag
            index = match.end()
        text_chunks.append(self.text[index:])  # trailing text
        self.text = "".join(text_chunks)
        self.unescape_text()

    def replace_at(self, original, replacement, locations):
        """ Replace the occurrences of original at all the locations with
        replacement. """

        locations.sort()
        self.text = LocationReplace().location_replace_text(self.text, original, replacement, locations)
        self.unescape_text()

    def apply_layers(self, original_text):
        self.text = original_text

        while not self.queue.empty():
            priority, layer_element = self.queue.get()
            original, replacement, locations = layer_element

            if not locations:
                self.replace_all(original, replacement)
            else:
                self.replace_at(original, replacement, locations)

        return self.text
Example #25
class ThreadPool(object):
    def __init__(self, thread_manager, thread_count=10):
        """Initialization method

        :param thread_manager: the thread manager to use
        :param thread_count: the number of workers to instantiate
        """
        self.logger = logging.getLogger(
            'storj.downstream_farmer.utils.ThreadPool')
        self.tasks = PriorityQueue()
        self.thread_manager = thread_manager
        self.workers = list()
        self.workers_lock = threading.Lock()
        self.max_thread_count = 50
        self.load_minimum = 0.01
        self.load_maximum = 0.5
        # managed monitor thread
        self.monitor_thread = self.thread_manager.create_thread(
            name='MonitorThread', target=self._monitor)
        for i in range(0, thread_count):
            self._add_thread()

    def thread_count(self):
        with self.workers_lock:
            return len(self.workers)

    def _add_thread(self):
        # unmanaged worker threads
        if (len(self.workers) < self.max_thread_count):
            self.logger.debug('{0} : adding worker'.format(
                threading.current_thread()))
            worker = WorkerThread(self)
            with self.workers_lock:
                self.workers.append(worker)
            return worker
        else:
            return None

    def _remove_thread(self):
        with self.workers_lock:
            if (len(self.workers) > 1):
                self.logger.debug('{0} : removing worker'.format(
                    threading.current_thread()))
                # make sure to retain one worker
                thread = self.workers.pop()
                thread.stop()

    def calculate_loading(self):
        total_time = 0
        work_time = 0
        with self.workers_lock:
            for w in self.workers:
                total_time += w.load_tracker.total_time()
                work_time += w.load_tracker.work_time()
        if (total_time > 0):
            load = float(work_time) / float(total_time)
        else:
            load = 0
        return load

    def max_load(self):
        max = 0
        with self.workers_lock:
            for w in self.workers:
                load = w.load_tracker.load()
                if (load > max):
                    max = load
        return max

    def check_loading(self):
        self.monitor_thread.wake()

    def _monitor(self):
        """This runs until the thread manager wakes it up during
        shutdown, at which time it will wait for any unfinished work in the
        queue, and then finish, allowing the program to exit
        """
        # wait until shutdown is called
        while (self.thread_manager.running):
            # check loading periodically to see if we should add another
            # thread.
            load = self.calculate_loading()
            if (load > self.load_maximum):
                worker = self._add_thread()
                if (worker is not None):
                    worker.start()
            elif (load < self.load_minimum):
                self._remove_thread()
            self.thread_manager.sleep(10)
        # wait for any existing work to finish
        self.logger.debug('MonitorThread waiting for tasks to finish')
        self.tasks.join()
        self.logger.debug('MonitorThread finishing')
        # now, managed thread can exit so program can close cleanly

    def put_work(self, target, args=None, kwargs=None, priority=50):
        """Puts work in the work queue.

        :param target: the callable to run
        :param args: positional arguments for the callable
        :param kwargs: keyword arguments for the callable
        :param priority: the work priority (lower values are run first)
        """
        self.tasks.put(WorkItem(target, args or [], kwargs or {}, priority))

    def start(self):
        """Starts the thread pool and all its workers and the monitor thread
        """
        with self.workers_lock:
            for worker in self.workers:
                worker.start()
        self.monitor_thread.start()
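`WorkItem` is not shown in the excerpt; for the `PriorityQueue` ordering to work, it must compare on its priority. A hypothetical sketch of a compatible class:

import functools

# Hypothetical stand-in for the missing WorkItem: ordering on priority
# is what lets the PriorityQueue run lower-valued work first.
@functools.total_ordering
class WorkItem(object):
    def __init__(self, target, args, kwargs, priority):
        self.target = target
        self.args = args
        self.kwargs = kwargs
        self.priority = priority

    def __eq__(self, other):
        return self.priority == other.priority

    def __lt__(self, other):
        return self.priority < other.priority

    def __call__(self):
        return self.target(*self.args, **self.kwargs)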
Example #28
    def __init__(self, worker_count=1, maxsize=0):
        self.__worker_count = worker_count
        self.__workers = set([])
        self.__started = False
        self.__queue = PriorityQueue(maxsize)
        self.__lock = threading.Lock()
Example #30
    def __init__(self):
        self.queue = PriorityQueue()
        self.text = None
Example #32
    def __init__(self, create_connection, max_workers):
        self._connections = PriorityQueue()
        self._create_connection = create_connection
        for p in range(0, max_workers):
            self._connections.put((p, None))
        super(ConnectionThreadPoolExecutor, self).__init__(max_workers)
Example #33
class ThreadedExecutor(Executor):
    """\
    This executor provides a method of executing callables in a threaded worker
    pool. The number of outstanding requests can be limited by the ``maxsize``
    parameter, which has the same behavior as the parameter of the same name
    for the ``PriorityQueue`` constructor.

    All threads are daemon threads and will remain alive until the main thread
    exits. Any items remaining in the queue at this point may not be executed!
    """
    def __init__(self, worker_count=1, maxsize=0):
        self.__worker_count = worker_count
        self.__workers = set([])
        self.__started = False
        self.__queue = PriorityQueue(maxsize)
        self.__lock = threading.Lock()

    def __worker(self):
        queue = self.__queue
        while True:
            priority, (function, future) = queue.get(True)
            if not future.set_running_or_notify_cancel():
                continue
            try:
                result = function()
            except Exception as e:
                if six.PY3:
                    future.set_exception(e)
                else:
                    future.set_exception_info(*sys.exc_info()[1:])
            else:
                future.set_result(result)
            queue.task_done()

    def start(self):
        with self.__lock:
            if self.__started:
                return

            for i in range(self.__worker_count):
                t = threading.Thread(target=self.__worker)
                t.daemon = True
                t.start()
                self.__workers.add(t)

            self.__started = True

    def submit(self, callable, priority=0, block=True, timeout=None):
        """\
        Enqueue a task to be executed, returning a ``TimedFuture``.

        Tasks can be prioritized by providing a value for the ``priority``
        argument, which follows the same specification as the standard library
        ``Queue.PriorityQueue`` (lowest valued entries are retrieved first.)

        If the worker pool has not already been started, calling this method
        will cause all of the worker threads to start running.
        """
        if not self.__started:
            self.start()

        future = self.Future()
        task = (priority, (callable, future))
        try:
            self.__queue.put(task, block=block, timeout=timeout)
        except Full as error:
            if future.set_running_or_notify_cancel():
                future.set_exception(error)
        return future
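A usage sketch, assuming the surrounding `Executor` base class supplies the `Future` type referenced by `self.Future()`; priorities are kept distinct here because ties would make the queue compare the non-comparable `(function, future)` pairs:

executor = ThreadedExecutor(worker_count=2)
low = executor.submit(lambda: 'later', priority=10)
high = executor.submit(lambda: 'sooner', priority=0)
print(high.result(), low.result())  # lower priority values are executed first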