Example #1
import itertools
from queue import PriorityQueue

from six import iteritems


def create_huffman_tree(word_counts):
    """Make a huffman tree from a dictionary containing word counts.

    This method creates a binary Huffman tree, which is required for
    :class:`BinaryHierarchicalSoftmax`.
    For example, ``{0: 8, 1: 5, 2: 6, 3: 4}`` is converted to
    ``((3, 1), (2, 0))``.

    Args:
        word_counts (``dict`` with ``int`` keys and ``int`` or ``float``
            values): Dictionary representing counts of words.

    Returns:
        Binary Huffman tree of nested tuples whose leaves are keys of ``word_counts``.
    """
    if len(word_counts) == 0:
        raise ValueError('Empty vocabulary')

    q = PriorityQueue()
    # A monotonically increasing counter breaks ties between equal counts;
    # without it, Python 3 can end up comparing an ``int`` word id with a
    # tuple node and raise ``TypeError``.
    tiebreak = itertools.count()
    for w, c in iteritems(word_counts):
        q.put((c, next(tiebreak), w))

    while q.qsize() >= 2:
        (count1, _, word1) = q.get()
        (count2, _, word2) = q.get()
        count = count1 + count2
        tree = (word1, word2)
        q.put((count, next(tiebreak), tree))

    return q.get()[2]
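
A quick usage sketch (assuming the imports above), reproducing the docstring example:

tree = create_huffman_tree({0: 8, 1: 5, 2: 6, 3: 4})
print(tree)  # ((3, 1), (2, 0))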
Example #2
from queue import PriorityQueue
from threading import Thread


# _WorkItem is assumed to be an order-comparable (priority, obj) pair defined
# in the surrounding module; a sketch follows this example.
class PushThread(Thread):
    PUSH, PUT = range(2)

    def __init__(self, session, document):
        self.session = session
        self.document = document
        super(PushThread, self).__init__()
        self.queue = PriorityQueue()
        self.daemon = True

    def put(self, obj, priority):
        self.queue.put(_WorkItem(priority, obj))

    def run(self):
        while True:
            # Does it even make sense to have a priority queue here,
            # rather than a simple FIFO, given that this is a
            # single-producer, single-consumer scenario?
            priority, obj = self.queue.get()
            if priority == PushThread.PUT:
                self.session.store_objects(obj)
            elif priority == PushThread.PUSH:
                self.session.store_document(self.document)
                # delete queued objects when training has finished
                if obj == 'after_training':
                    with self.queue.mutex:
                        del self.queue.queue[:]
                    break
            self.queue.task_done()
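
The snippet leaves _WorkItem undefined; a minimal sketch of a compatible definition plus hypothetical usage (session, document, and batch are stand-ins):

from collections import namedtuple

# Sorts by priority first, so PUSH (0) outranks PUT (1) in the min-heap.
_WorkItem = namedtuple('_WorkItem', ['priority', 'obj'])

thread = PushThread(session, document)
thread.start()
thread.put(batch, PushThread.PUT)              # store a batch of objects
thread.put('after_training', PushThread.PUSH)  # final push, then the thread exits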
Example #3
from concurrent.futures import ThreadPoolExecutor
from queue import PriorityQueue


class ConnectionThreadPoolExecutor(ThreadPoolExecutor):
    """
    A wrapper class to maintain a pool of connections alongside the thread
    pool. We start by creating a priority queue of connections, and each job
    submitted takes one of those connections (initialising if necessary) and
    passes it as the first arg to the executed function.

    At the end of execution that connection is returned to the queue.

    By using a PriorityQueue we avoid creating more connections than required.
    We will only create as many connections as are required concurrently.
    """
    def __init__(self, create_connection, max_workers):
        self._connections = PriorityQueue()
        self._create_connection = create_connection
        for p in range(0, max_workers):
            self._connections.put((p, None))
        super(ConnectionThreadPoolExecutor, self).__init__(max_workers)

    def submit(self, fn, *args, **kwargs):
        def conn_fn():
            priority = None
            conn = None
            try:
                # If we get a connection we must put it back later
                (priority, conn) = self._connections.get()
                if conn is None:
                    conn = self._create_connection()
                conn_args = (conn,) + args
                return fn(*conn_args, **kwargs)
            finally:
                if priority is not None:
                    self._connections.put((priority, conn))

        return super(ConnectionThreadPoolExecutor, self).submit(conn_fn)
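
A hedged usage sketch; make_conn and fetch are hypothetical stand-ins for a real connection factory and a job that needs a connection:

def make_conn():
    return object()  # stand-in for an expensive connection object

def fetch(conn, url):
    return (conn, url)  # stand-in for real work performed over the connection

executor = ConnectionThreadPoolExecutor(make_conn, max_workers=4)
future = executor.submit(fetch, 'http://example.com')
print(future.result())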
Example #4
from queue import PriorityQueue
from threading import Thread


# cursession() and push() are assumed to come from the surrounding module
# (this matches the session API of older Bokeh releases), and _WorkItem is
# assumed to be an order-comparable (priority, obj) pair.
class PushThread(Thread):
    # Define priority constants
    PUSH = 1
    PUT = 2

    def __init__(self):
        super(PushThread, self).__init__()
        self.queue = PriorityQueue()
        self.daemon = True

    def put(self, obj, priority):
        self.queue.put(_WorkItem(priority, obj))

    def run(self):
        while True:
            priority, obj = self.queue.get()
            if priority == PushThread.PUT:
                cursession().store_objects(obj)
            elif priority == PushThread.PUSH:
                push()
                # delete queued objects when training has finished
                if obj == "after_training":
                    with self.queue.mutex:
                        del self.queue.queue[:]
                    break
            self.queue.task_done()
Example #5
import re
from queue import PriorityQueue


# LocationReplace is assumed to be provided by the surrounding module.
class LayersApplier(object):
    """ Most layers replace content. We try to do this intelligently here,
    so that layers don't step over each other. """
    HTML_TAG_REGEX = re.compile(r'<[^>]*?>')

    def __init__(self):
        self.queue = PriorityQueue()
        self.text = None

    def enqueue_from_list(self, elements_list):
        for le in elements_list:
            self.enqueue(le)

    def enqueue(self, layer_element):
        original, replacement, locations = layer_element
        priority = len(original)
        item = (original, replacement, locations)
        self.queue.put((-priority, item))

    def location_replace(self, xml_node, original, replacement, locations):
        LocationReplace().location_replace(xml_node, original, replacement,
                                           locations)

    def replace_all(self, original, replacement):
        """ Replace all occurrences of original with replacement. This is HTML
        aware; it effectively looks at all of the text in between HTML tags"""
        text_chunks = []
        index = 0
        for match in self.HTML_TAG_REGEX.finditer(self.text):
            text = self.text[index:match.start()]
            text_chunks.append(text.replace(original, replacement))
            text_chunks.append(self.text[match.start():match.end()])  # tag
            index = match.end()
        text_chunks.append(self.text[index:].replace(original, replacement))  # trailing text
        self.text = "".join(text_chunks)

    def replace_at(self, original, replacement, locations):
        """ Replace the occurrences of original at all the locations with
        replacement. """

        locations.sort()
        self.text = LocationReplace().location_replace_text(
            self.text, original, replacement, locations)

    def apply_layers(self, original_text):
        self.text = original_text

        while not self.queue.empty():
            priority, layer_element = self.queue.get()
            original, replacement, locations = layer_element

            if not locations:
                self.replace_all(original, replacement)
            else:
                self.replace_at(original, replacement, locations)

        return self.text
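
A minimal usage sketch with hypothetical layer elements; the negated-length priority makes the longer original win the first pass:

applier = LayersApplier()
applier.enqueue_from_list([
    ('part 433', '<a href="#p433">part 433</a>', []),
    ('agency', '<em>agency</em>', []),
])
print(applier.apply_layers('The agency updated part 433.'))
# -> The <em>agency</em> updated <a href="#p433">part 433</a>.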
Example #6
    # Requires: import os; from queue import PriorityQueue
    def _create_files_list(self):
        priorityQueue = PriorityQueue()
        for txt_file in self._txt_files:
            wav_file = os.path.splitext(txt_file)[0] + ".wav"
            wav_file_size = os.path.getsize(wav_file)
            priorityQueue.put((wav_file_size, (txt_file, wav_file)))
        files_list = []
        while not priorityQueue.empty():
            priority, (txt_file, wav_file) = priorityQueue.get()
            files_list.append((txt_file, wav_file))
        return files_list
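
Since nothing consumes this queue concurrently, the same size ordering can be achieved with a plain sorted() call; a sketch assuming the same self._txt_files attribute:

    def _create_files_list(self):
        pairs = [(txt, os.path.splitext(txt)[0] + ".wav") for txt in self._txt_files]
        # Sort pairs by the size of the .wav file, smallest first.
        return sorted(pairs, key=lambda p: os.path.getsize(p[1]))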
Example #7
from concurrent.futures import ThreadPoolExecutor
from queue import PriorityQueue


class ConnectionThreadPoolExecutor(ThreadPoolExecutor):
    """
    A wrapper class to maintain a pool of connections alongside the thread
    pool. We start by creating a priority queue of connections, and each job
    submitted takes one of those connections (initialising if necessary) and
    passes it as the first arg to the executed function.

    At the end of execution that connection is returned to the queue.

    By using a PriorityQueue we avoid creating more connections than required.
    We will only create as many connections as are required concurrently.
    """
    def __init__(self, create_connection, max_workers):
        """
        Initializes a new ThreadPoolExecutor instance.

        :param create_connection: callable to use to create new connections
        :param max_workers: the maximum number of threads that can be used
        """
        self._connections = PriorityQueue()
        self._create_connection = create_connection
        for p in range(0, max_workers):
            self._connections.put((p, None))
        super(ConnectionThreadPoolExecutor, self).__init__(max_workers)

    def submit(self, fn, *args, **kwargs):
        """
        Schedules the callable, `fn`, to be executed

        :param fn: the callable to be invoked
        :param args: the positional arguments for the callable
        :param kwargs: the keyword arguments for the callable
        :returns: a Future object representing the execution of the callable
        """
        def conn_fn():
            priority = None
            conn = None
            try:
                # If we get a connection we must put it back later
                (priority, conn) = self._connections.get()
                if conn is None:
                    conn = self._create_connection()
                conn_args = (conn,) + args
                return fn(*conn_args, **kwargs)
            finally:
                if priority is not None:
                    self._connections.put((priority, conn))

        return super(ConnectionThreadPoolExecutor, self).submit(conn_fn)
Example #8
import time
from queue import PriorityQueue
from threading import Thread

import concurrent.futures as C


# `utils.assert_raise`, `_identity`, and `_fetch` are module-local helpers in
# the original source; plausible stubs for the latter two are sketched after
# this example.
class AsyncDataLoader(object):
    """The AsyncDataLoader is a wrapper for asynchronously loading multiple batches of data.

    It keeps a buffer of batches, so when the model asks for a new batch, it's
    already in memory. After sending the batch to the model, it is removed from
    the buffer, and a new batch can be loaded.

    The buffer is filled using a separate thread. Each batch, in turn, can be
    loaded using multiple processes or multiple threads.

    This async batch loader is designed for heavy IO or heavy CPU batch generation.

    .. warning:: When using the multiprocessing batch loader, watch the RAM
        usage and avoid a high number of processes. Multiprocessing can easily
        lead to memory overflow.

    - Should I use the Async Data Loader?

        Cogitare's DataHolder already executes the data loading through
        multiple threads or multiple processes, so you should use this wrapper
        only if generating a whole batch is expensive.

    - Should I use threads or processes?

        It's recommended to use threads; they are lightweight and fast.

        Multiprocessing will usually lead to worse performance and higher
        memory usage, due to the communication pipe between processes and the
        duplication of memory across them. However, it can be useful for
        CPU-expensive operations, because it does not suffer from the GIL.

        Threads, on the other hand, are lightweight and usually fast, but can
        suffer from the GIL. For tasks with heavy IO, they are a good choice.

    Args:
        data (DataSet, AbsDataHolder, SequentialDataSet, SequentialAbsDataHolder): data holder,
            or dataset instance.
        buffer_size (int): size of the batch buffer. The async data loader will keep around
            ``buffer_size`` batches in memory.
        mode (str): should be ``threaded`` or ``multiprocessing``, indicating how to fetch batches.
        workers (int): the number of threads/processes used to load the batches. If None,
            will use the number of cores in the CPU.
        on_batch_loaded (callable): if provided, this function will be called when a new batch is loaded. It must
            receive one argument, the batch data, and return the batch after applying some operation to it. This
            can be used to apply pre-processing functions on a batch of data (such as image filtering, moving the
            data to CUDA, and so on).

    Example::

        >>> mnist = fetch_mldata('MNIST original')
        >>> mnist.data = mnist.data / 255
        >>> data = DataSet([mnist.data, mnist.target.astype(int)], batch_size=64)
        >>> data_train, data_validation = data.split(0.8)

        >>> # wraps the data_train dataset with the async loader.
        >>> data_train = AsyncDataLoader(data_train)

        >>> model.learn(data_train, optimizer)
    """
    def __init__(self,
                 data,
                 buffer_size=8,
                 mode='threaded',
                 workers=None,
                 on_batch_loaded=None):
        valid = (
            'threaded',
            'multiprocessing',
        )
        utils.assert_raise(mode in valid, ValueError,
                           'mode must be one of: ' + ', '.join(valid))
        utils.assert_raise(buffer_size >= 2, ValueError,
                           'buffer_size must be greater or equal to 2')
        if mode == 'threaded':
            self._executor = C.ThreadPoolExecutor(workers)
        else:
            self._executor = C.ProcessPoolExecutor(workers)

        if on_batch_loaded is None:
            on_batch_loaded = _identity

        self._queue = PriorityQueue(buffer_size)
        self._data = data
        self._thread = None
        self._on_batch_loaded = on_batch_loaded
        self._cache_buffer = []
        self._caching = False

    def __repr__(self):
        return repr(self._data)

    def _start(self):
        if self._thread is None:
            self._thread = Thread(target=self._produce)
            self._thread.daemon = True
            self._thread.start()

    def cache(self):
        """Start to load batches to buffer, and wait the buffer be full.
        This can be used before start the model training to cache the samples
        and speed up the model execution.

        Example::

            >>> dh = CallableHolder(s.__next__, mode='sequential', total_samples=20000000, single=True)
            >>> dh = AsyncDataLoader(dh, buffer_size=64000, mode='threaded', workers=1)
            >>> print('caching ...')
            >>> dh.cache()
            >>> print('done')
        """

        self._caching = True
        self._cache_buffer = []

        self._start()
        while not self._queue.full():
            time.sleep(0.1)

        while not all(f.done() for f in self._cache_buffer):
            time.sleep(0.1)

        self._caching = False

    def _produce(self):
        idx = 0

        while True:
            future = self._executor.submit(_fetch, self._on_batch_loaded,
                                           self._data)
            self._queue.put((idx, future))
            idx += 1

            if self._caching:
                self._cache_buffer.append(future)

    def __iter__(self):
        return self

    def __next__(self):
        self._start()
        return self._queue.get()[1].result()

    next = __next__

    def __len__(self):
        return len(self._data)
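
The class references the module-local helpers _identity and _fetch, which the snippet does not include; plausible stubs, written here as assumptions about the original source:

def _identity(batch):
    # Default on_batch_loaded hook: return the batch unchanged.
    return batch

def _fetch(on_batch_loaded, data):
    # Load the next batch from the data holder and apply the user hook.
    return on_batch_loaded(next(data))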
Example #9
import re
from queue import PriorityQueue

# HTMLParser().unescape() existed in Python 2 and early Python 3; it was
# removed in Python 3.9 (see the note after this example for the modern
# html.unescape equivalent).
from html.parser import HTMLParser


# LocationReplace is assumed to be provided by the surrounding module.
class LayersApplier(object):
    """ Most layers replace content. We try to do this intelligently here,
    so that layers don't step over each other. """

    HTML_TAG_REGEX = re.compile(r"<[^>]*?>")

    def __init__(self):
        self.queue = PriorityQueue()
        self.text = None

    def enqueue_from_list(self, elements_list):
        for le in elements_list:
            self.enqueue(le)

    def enqueue(self, layer_element):
        original, replacement, locations = layer_element
        priority = len(original)
        item = (original, replacement, locations)
        self.queue.put((-priority, item))

    def location_replace(self, xml_node, original, replacement, locations):
        LocationReplace().location_replace(xml_node, original, replacement, locations)

    def unescape_text(self):
        """ Because of the way we do replace_all(), we need to unescape HTML
        entities.  """
        self.text = HTMLParser().unescape(self.text)

    def replace_all(self, original, replacement):
        """ Replace all occurrences of original with replacement. This is HTML
        aware; it effectively looks at all of the text in between HTML tags"""
        text_chunks = []
        index = 0
        for match in self.HTML_TAG_REGEX.finditer(self.text):
            text = self.text[index : match.start()]
            text_chunks.append(text.replace(original, replacement))
            text_chunks.append(self.text[match.start() : match.end()])  # tag
            index = match.end()
        text_chunks.append(self.text[index:].replace(original, replacement))  # trailing text
        self.text = "".join(text_chunks)
        self.unescape_text()

    def replace_at(self, original, replacement, locations):
        """ Replace the occurrences of original at all the locations with
        replacement. """

        locations.sort()
        self.text = LocationReplace().location_replace_text(self.text, original, replacement, locations)
        self.unescape_text()

    def apply_layers(self, original_text):
        self.text = original_text

        while not self.queue.empty():
            priority, layer_element = self.queue.get()
            original, replacement, locations = layer_element

            if not locations:
                self.replace_all(original, replacement)
            else:
                self.replace_at(original, replacement, locations)

        return self.text
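
HTMLParser().unescape() was removed in Python 3.9; on modern Python the same method can be written with the standard html module:

import html

def unescape_text(self):
    """ Unescape HTML entities left over from replace_all(). """
    self.text = html.unescape(self.text)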