Example #1
    def _create_multi_th_gen(self, file_openers):
        """
        Creates a multi-threaded generator of data-chunks by reading data from
        data_paths. Works by spawning thread workers and assigning csv files to
        them. Threads populate a queue with raw data-chunks.

        :param file_openers: a list of function that return opened files
        """
        # the queue accumulates raw data-chunks produced by the worker threads
        chunk_queue = Queue(maxsize=self.buffer_size)
        # partial of the chunk-iterator factory; workers only need to supply a file opener
        parser_kwargs = self.adjust_kwargs_to_engine(self.parser_kwargs)
        iter_creator = fun_partial(self.get_data_chunk_iter,
                                   chunksize=self.chunk_size,
                                   **parser_kwargs)
        queue_populator = fun_partial(populate_queue_with_chunks,
                                      itr_creator=iter_creator,
                                      queue=chunk_queue)

        # creating a pool of threads, and assigning jobs to them
        pool = Pool(self.worker_threads_num)
        pool.map_async(queue_populator, file_openers)
        pool.close()  # no more work will be submitted to the pool

        # the loop terminates once a termination token has been
        # received for every file, i.e. all files have been read
        received_termin_tokens_count = 0
        while True:
            chunk = chunk_queue.get(timeout=self.timeout)
            if isinstance(chunk, Exception):
                raise chunk
            if chunk == TERMINATION_TOKEN:
                received_termin_tokens_count += 1
                if received_termin_tokens_count == len(file_openers):
                    pool.join()
                    break
            else:
                yield chunk
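
The worker job populate_queue_with_chunks and the TERMINATION_TOKEN sentinel are not shown above. Below is a minimal sketch of how such a worker could look, assuming the chunk-iterator factory accepts an opened file and that exceptions are forwarded through the queue so the consuming generator can re-raise them; the real helper may differ.

TERMINATION_TOKEN = "<<END_OF_FILE>>"  # assumed sentinel value; the actual one may differ


def populate_queue_with_chunks(file_opener, itr_creator, queue):
    """Read one file chunk-by-chunk and push the chunks into the shared queue.

    Any exception is put on the queue instead of being raised, so the consumer
    thread can re-raise it; a termination token always follows, marking the
    end of this file.
    """
    try:
        with file_opener() as f:
            for data_chunk in itr_creator(f):
                queue.put(data_chunk)
    except Exception as e:
        queue.put(e)
    finally:
        queue.put(TERMINATION_TOKEN)

Because map_async passes each element of file_openers as the first positional argument, the partial only needs to bind itr_creator and queue, which matches how queue_populator is built above.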
Example #2
def create_openers_of_valid_files(paths, ext='csv', encoding='utf-8'):
    """
    Returns a list of file opener functions, on call return a file object.
    In such a way we hide the details on how to open files of different types
    and avoid opening all files at once.
    """
    valid_file_openers = []
    for path in paths:
        fs = LocalFsAccessor()
        file_paths = fs.list_file_paths(path)
        valid_file_paths = filter_file_paths_by_extension(file_paths, ext=ext)
        valid_file_openers += [
            fun_partial(fs.open_file, path=p, mode='r', encoding=encoding)
            for p in valid_file_paths
        ]
    return valid_file_openers
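
Note that fun_partial (i.e. functools.partial) only binds the path, mode and encoding; nothing is opened until a worker actually calls the opener. A tiny illustration of the same lazy-opening pattern with the built-in open and made-up file names:

from functools import partial as fun_partial

paths = ['train_part_0.csv', 'train_part_1.csv']  # hypothetical file names
openers = [fun_partial(open, p, mode='r', encoding='utf-8') for p in paths]

# no file handles exist yet; each file is opened only when its opener is called
with openers[0]() as f:
    header_line = f.readline()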
Example #3
def create_openers_of_valid_files(paths, ext='.csv'):
    """
    Returns a list of (valid) file paths openers. Concretely, functions on call
    return opened file objects. In such a way we hide the details on how to open
    files of different types (s3 or local atm) and avoid opening all files at
    once.
    """
    valid_file_openers = []
    for path in paths:
        fs = fs_accessor_factory("s3" if is_s3_path(path) else "local")
        file_paths = fs.list_file_paths(path)
        valid_file_paths = filter_file_paths_by_extension(file_paths, ext=ext)
        valid_file_openers += [
            fun_partial(fs.open_file, path=p, mode='r')
            for p in valid_file_paths
        ]
    return valid_file_openers
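
The helpers is_s3_path, fs_accessor_factory and filter_file_paths_by_extension are not shown in the examples. Below is a rough sketch of what they might look like, with only the local backend implemented and the S3 one left as a placeholder; the real accessors may well differ.

import os


def is_s3_path(path):
    # treat anything with the s3:// scheme as an S3 location
    return str(path).lower().startswith("s3://")


def filter_file_paths_by_extension(file_paths, ext='.csv'):
    # keep only the paths whose extension matches (with or without a leading dot)
    ext = ext if ext.startswith('.') else '.' + ext
    return [p for p in file_paths if p.lower().endswith(ext.lower())]


class LocalFsAccessor:
    """Local-filesystem accessor with the two methods the examples rely on."""

    def list_file_paths(self, path):
        if os.path.isfile(path):
            return [path]
        return [os.path.join(root, name)
                for root, _dirs, names in os.walk(path)
                for name in names]

    def open_file(self, path, mode='r', **kwargs):
        return open(path, mode=mode, **kwargs)


def fs_accessor_factory(kind):
    # only the local backend is sketched here; an S3 accessor (e.g. built on
    # boto3 or s3fs) would expose the same list_file_paths/open_file interface
    if kind == "local":
        return LocalFsAccessor()
    raise NotImplementedError("the '%s' backend is not sketched here" % kind)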