def _create_multi_th_gen(self, file_openers): """ Creates a multi-threaded generator of data-chunks by reading data from data_paths. Works by spawning thread workers and assigning csv files to them. Threads populate a queue with raw data-chunks. :param file_openers: a list of function that return opened files """ # the queue will accum raw data-chunks produced by threads chunk_queue = Queue(maxsize=self.buffer_size) # function's partial that only will expect a file opener used by workers parser_kwargs = self.adjust_kwargs_to_engine(self.parser_kwargs) iter_creator = fun_partial(self.get_data_chunk_iter, chunksize=self.chunk_size, **parser_kwargs) queue_populator = fun_partial(populate_queue_with_chunks, itr_creator=iter_creator, queue=chunk_queue) # creating a pool of threads, and assigning jobs to them pool = Pool(self.worker_threads_num) pool.map_async(queue_populator, file_openers) pool.close() # indicating that never going to submit more work # the inf. while loop is broken when all files are read, # i.e. a termination token is received for each file received_termin_tokens_count = 0 while True: chunk = chunk_queue.get(timeout=self.timeout) if isinstance(chunk, Exception): raise chunk if chunk == TERMINATION_TOKEN: received_termin_tokens_count += 1 if received_termin_tokens_count == len(file_openers): pool.join() break else: yield chunk
def create_openers_of_valid_files(paths, ext='csv', encoding='utf-8'):
    """
    Build a list of zero-argument callables, each of which opens one
    file with a valid extension found under the given paths.

    Deferring the actual ``open`` call hides the details of how files
    of different types are opened, and avoids holding every file open
    at once.

    :param paths: locations to scan for files
    :param ext: extension a file must have to be included
    :param encoding: text encoding passed through to the file opener
    :return: list of partials that return opened file objects on call
    """
    openers = []
    for location in paths:
        accessor = LocalFsAccessor()
        candidates = accessor.list_file_paths(location)
        matching = filter_file_paths_by_extension(candidates, ext=ext)
        openers.extend(
            fun_partial(accessor.open_file, path=file_path, mode='r',
                        encoding=encoding)
            for file_path in matching
        )
    return openers
def create_openers_of_valid_files(paths, ext='.csv'):
    """
    Build a list of opener callables for the valid file paths found
    under the given locations. Each callable, when invoked, returns an
    opened file object.

    Deferring the actual ``open`` call hides how files of different
    types (s3 or local atm) are opened, and avoids holding every file
    open at once.

    :param paths: locations (s3 or local) to scan for files
    :param ext: extension a file must have to be included
    :return: list of partials that return opened file objects on call
    """
    openers = []
    for location in paths:
        # pick the accessor matching where this path lives
        scheme = "s3" if is_s3_path(location) else "local"
        accessor = fs_accessor_factory(scheme)
        candidates = accessor.list_file_paths(location)
        matching = filter_file_paths_by_extension(candidates, ext=ext)
        openers.extend(
            fun_partial(accessor.open_file, path=file_path, mode='r')
            for file_path in matching
        )
    return openers