def _parse_pack(self, data_source: str) -> Iterator[DataPack]:
    if data_source is None:
        raise ProcessExecutionException(
            "Data source is None, cannot deserialize.")

    pack: DataPack = DataPack.deserialize(data_source)

    if pack is None:
        raise ProcessExecutionException(
            f"Cannot recover pack from the following data source: \n"
            f"{data_source}")

    yield pack
def multipack_name(self, pack: MultiPack) -> str:
    name = pack.pack_name
    if name is None:
        raise ProcessExecutionException(
            'Cannot use the DocIdMultiPackWriter because the doc id is '
            'not assigned for the pack.')
    return name
def pack_name(self, pack: DataPack) -> str:
    name = pack.pack_name
    if name is None:
        raise ProcessExecutionException(
            'Cannot use the DocIdMultiPackWriter because the [pack_name] '
            'is not assigned for the pack.')
    return name
def _process_with_component(
        self, selector: Selector, component: PipelineComponent,
        raw_job: ProcessJob):
    for pack in selector.select(raw_job.pack):
        # First, perform the component action on the pack.
        try:
            if isinstance(component, Caster):
                # Replace the job pack with the casted version.
                raw_job.alter_pack(component.cast(pack))
            elif isinstance(component, BaseBatchProcessor):
                pack.set_control_component(component.name)
                component.process(pack)
            elif isinstance(component, Evaluator):
                pack.set_control_component(component.name)
                component.consume_next(
                    pack, self._predict_to_gold[raw_job.id]
                )
            elif isinstance(component, BaseProcessor):
                # Should be BasePackProcessor: all other processors are
                # considered streaming processors like this.
                pack.set_control_component(component.name)
                component.process(pack)
            # After the component action, make sure the entry is
            # added into the index.
            pack.add_all_remaining_entries()
        except ValueError as e:
            raise ProcessExecutionException(
                f'Exception occurred when running '
                f'{component.name}') from e
def pack_name(self, pack: DataPack) -> str:
    name = pack.meta.doc_id
    if name is None:
        raise ProcessExecutionException(
            'Cannot use the DocIdMultiPackWriter because the doc id '
            'is not assigned for pack %d.' % pack.meta.pack_id)
    return name
def _parse_pack(self, data_source: Any) -> Iterator[DataPack]:
    if data_source is None:
        raise ProcessExecutionException(
            "Data source is None, cannot deserialize.")

    pack: DataPack = DataPack.deserialize(
        data_source,
        serialize_method=self.configs.serialize_method,
        zip_pack=self.configs.zip_pack,
    )

    if pack is None:
        raise ProcessExecutionException(
            f"Cannot recover pack from the following data source: \n"
            f"{data_source}")

    yield pack
def parse_pack(self, collection: Any) -> Iterator[PackType]:
    r"""Calls :meth:`_parse_pack` to create packs from the collection.
    This internally sets up the component metadata. Users should
    implement the :meth:`_parse_pack` method.
    """
    if collection is None:
        raise ProcessExecutionException(
            "Got None collection, cannot parse as data pack.")

    for p in self._parse_pack(collection):
        p.add_all_remaining_entries(self.name)
        yield p
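# A minimal sketch of how a user-defined reader might implement
# `_parse_pack`, the method referenced above. Assumptions for illustration
# only: each collection item is the path of a plain-text file, and the
# reader builds a `DataPack` directly and fills it with `set_text`.
def _parse_pack(self, collection: str) -> Iterator[DataPack]:
    pack = DataPack()
    with open(collection, "r", encoding="utf-8") as f:
        # The raw file content becomes the text payload of the pack.
        pack.set_text(f.read())
    yield pack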
def _get_cache_location(self, collection: Any) -> str:
    r"""Gets the path to the cache file for a collection.

    Args:
        collection: Information used to compute the cache key.

    Returns:
        The file path to the cache file for a pack.
    """
    # pylint: disable=assignment-from-none
    file_path = self._cache_key_function(collection)
    if file_path is None:
        raise ProcessExecutionException(
            "Cache key is None. You probably set `from_cache` to True but "
            "failed to implement the `_cache_key_function`.")

    return os.path.join(str(self._cache_directory), file_path)
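# Illustrative only: a possible override of `_cache_key_function`, assuming
# the collection items are file paths. Any function that maps a collection
# to a unique relative path would work here.
def _cache_key_function(self, collection: str) -> str:
    # Derive a per-file cache name so each source file maps to one cache file.
    return os.path.basename(str(collection)) + ".cache"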
def set_text(
        self,
        text: str,
        replace_func: Optional[
            Callable[[str], ReplaceOperationsType]] = None):
    if len(text) < len(self._text):
        raise ProcessExecutionException(
            "The new text is shorter than the original text it "
            "overwrites, which might cause unexpected behavior.")

    if len(self._text):
        logging.warning(
            "Be cautious when changing the text of a data pack: "
            "existing entries may be affected.")

    span_ops = [] if replace_func is None else replace_func(text)

    # The spans should be mutually exclusive.
    (
        self._text,
        self.replace_back_operations,
        self.processed_original_spans,
        self.orig_text_len,
    ) = data_utils_io.modify_text_and_track_ops(text, span_ops)
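# A sketch of a `replace_func` that could be passed to `set_text`, assuming
# `ReplaceOperationsType` is a list of `(Span, replacement-string)` pairs.
# The function name is hypothetical; it replaces every tab character with a
# single space, and the resulting spans are mutually exclusive as required.
def replace_tabs(text: str) -> ReplaceOperationsType:
    return [
        (Span(i, i + 1), " ")
        for i, ch in enumerate(text)
        if ch == "\t"
    ]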
def _process_packs(
        self, data_iter: Iterator[PackType]) -> Iterator[PackType]:
    r"""Process the packs received from the reader by running them
    through the pipeline.

    Args:
        data_iter (iterator): Iterator yielding jobs that contain packs.

    Returns:
        Yields packs that are processed by the pipeline.
    """
    # pylint: disable=line-too-long

    # Here is the logic for the execution of the pipeline.

    # The basic idea is to yield a pack as soon as it gets processed by
    # all the processors, instead of waiting for later jobs to get
    # processed.

    # 1) A job can be in one of three statuses:
    #       - UNPROCESSED
    #       - QUEUED
    #       - PROCESSED
    #
    # 2) Each processor maintains a queue to hold the jobs to be
    #    executed next.
    #
    # 3) In case of a BatchProcessor, a job enters the QUEUED status if
    #    it does not satisfy the `batch_size` requirement of that
    #    processor. In that case, the pipeline requests additional jobs
    #    from the reader and starts the execution loop from the
    #    beginning.
    #
    # 4) At any point, while moving to the next processor, the pipeline
    #    ensures that all jobs are either in QUEUED or PROCESSED status.
    #    If they are PROCESSED, they will be moved to the next queue.
    #    This design ensures that, while processing the job at processor
    #    `i`, all the jobs in the previous queues are in QUEUED status.
    #    So whenever a new job is needed, the pipeline can directly
    #    request it from the reader instead of looking at previous
    #    queues for UNPROCESSED jobs.
    #
    # 5) When a processor receives a poison pack, it flushes all the
    #    remaining batches in its memory (this actually has no effect on
    #    PackProcessors) and moves the jobs, including the poison pack,
    #    to the next queue. If there is no next processor, the packs are
    #    yielded.
    #
    # 6) The loop terminates when the last queue contains only a poison
    #    pack.
    #
    # Here is a sample pipeline and its execution.
    #
    # Assume 1 pack corresponds to a batch of size 1.
    #
    # After the 1st step (iteration), reading from the reader:
    #
    #            batch_size = 2                             batch_size = 2
    # Reader -> B1 (BatchProcessor) -> P1 (PackProcessor) -> B2 (BatchProcessor)
    #
    #          |___________|
    #          |___________|
    #          |___________|
    #          |___________|
    #          |_J1:QUEUED_|
    #
    # B1 needs another pack to process job J1.
    #
    # After the 2nd step (iteration):
    #
    #            batch_size = 2                             batch_size = 2
    # Reader -> B1 (BatchProcessor) -> P1 (PackProcessor) -> B2 (BatchProcessor)
    #
    #          |___________|      |________________|
    #          |___________|      |________________|
    #          |___________|      |________________|
    #          |___________|      |_J2:UNPROCESSED_|
    #          |___________|      |_J1:UNPROCESSED_|
    #
    # B1 processes both packs; the jobs are moved to the next queue.
    #
    # After the 3rd step (iteration):
    #
    #            batch_size = 2                             batch_size = 2
    # Reader -> B1 (BatchProcessor) -> P1 (PackProcessor) -> B2 (BatchProcessor)
    #
    #          |___________|      |________________|      |________________|
    #          |___________|      |________________|      |________________|
    #          |___________|      |________________|      |________________|
    #          |___________|      |________________|      |________________|
    #          |___________|      |_J2:UNPROCESSED_|      |_J1:UNPROCESSED_|
    #
    # P1 processes the first job. However, there exists one UNPROCESSED
    # job, J2, in the queue.
    # The pipeline first processes this job before moving to the next
    # processor.
    #
    # After the 4th step (iteration):
    #
    #            batch_size = 2                             batch_size = 2
    # Reader -> B1 (BatchProcessor) -> P1 (PackProcessor) -> B2 (BatchProcessor)
    #
    #          |___________|      |________________|      |________________|
    #          |___________|      |________________|      |________________|
    #          |___________|      |________________|      |________________|
    #          |___________|      |________________|      |_J2:UNPROCESSED_|
    #          |___________|      |________________|      |_J1:UNPROCESSED_|
    #
    # After the 5th step (iteration):
    #
    #            batch_size = 2                             batch_size = 2
    # Reader -> B1 (BatchProcessor) -> P1 (PackProcessor) -> B2 (BatchProcessor)
    #
    #          |___________|      |________________|      |________________|
    #          |___________|      |________________|      |________________|
    #          |___________|      |________________|      |________________| --> Yield J1.pack and J2.pack
    #          |___________|      |________________|      |________________|
    #          |___________|      |________________|      |________________|

    if not self.initialized:
        raise ProcessFlowException(
            "Please call initialize before running the pipeline.")

    buffer = ProcessBuffer(self, data_iter)

    if len(self.components) == 0:
        yield from data_iter
        # Return here instead of using if..else to reduce indentation.
        return

    while not self.proc_mgr.exhausted():
        # The job has to be the first UNPROCESSED element;
        # its status at this point is UNPROCESSED.
        unprocessed_job: ProcessJob = next(buffer)

        processor_index = self.proc_mgr.current_processor_index
        processor = self.components[processor_index]
        selector = self._selectors[processor_index]
        current_queue_index = self.proc_mgr.current_queue_index
        current_queue = self.proc_mgr.current_queue
        pipeline_length = self.proc_mgr.pipeline_length
        unprocessed_queue_indices = \
            self.proc_mgr.unprocessed_queue_indices
        processed_queue_indices = \
            self.proc_mgr.processed_queue_indices
        next_queue_index = current_queue_index + 1
        should_yield = next_queue_index >= pipeline_length

        if not unprocessed_job.is_poison:
            for pack in selector.select(unprocessed_job.pack):
                # First, perform the component action on the pack.
                try:
                    if isinstance(processor, Caster):
                        # Replace the job pack with the casted version.
                        unprocessed_job.alter_pack(processor.cast(pack))
                    elif isinstance(processor, BaseProcessor):
                        processor.process(pack)
                    elif isinstance(processor, Evaluator):
                        processor.consume_next(
                            pack,
                            self._predict_to_gold[unprocessed_job.id])
                    # After the component action, make sure the entry is
                    # added into the index.
                    pack.add_all_remaining_entries()
                except Exception as e:
                    raise ProcessExecutionException(
                        f'Exception occurred when running '
                        f'{processor.name}') from e

            # Then, based on component type, handle the queue.
            if isinstance(processor, BaseBatchProcessor):
                index = unprocessed_queue_indices[current_queue_index]

                # Check the status of all the jobs up to "index".
                for i, job_i in enumerate(
                        itertools.islice(current_queue, 0, index + 1)):
                    if job_i.status == ProcessJobStatus.PROCESSED:
                        processed_queue_indices[current_queue_index] = i

                # There are UNPROCESSED jobs in the queue.
                if index < len(current_queue) - 1:
                    unprocessed_queue_indices[current_queue_index] += 1

                # Fetch more data from the reader to process the
                # first job.
                elif processed_queue_indices[current_queue_index] == -1:
                    unprocessed_queue_indices[current_queue_index] \
                        = len(current_queue)
                    self.proc_mgr.current_processor_index = 0
                    self.proc_mgr.current_queue_index = -1

                else:
                    processed_queue_index = \
                        processed_queue_indices[current_queue_index]
                    # Move or yield the pack.
                    c_queue = list(current_queue)
                    for job_i in c_queue[:processed_queue_index + 1]:
                        if should_yield:
                            if job_i.id in self._predict_to_gold:
                                self._predict_to_gold.pop(job_i.id)
                            yield job_i.pack
                        else:
                            self.proc_mgr.add_to_queue(
                                queue_index=next_queue_index, job=job_i)
                        current_queue.popleft()

                    # Set the UNPROCESSED and PROCESSED indices.
                    unprocessed_queue_indices[current_queue_index] \
                        = len(current_queue)

                    processed_queue_indices[current_queue_index] = -1

                    if should_yield:
                        self.proc_mgr.current_processor_index = 0
                        self.proc_mgr.current_queue_index = -1
                    else:
                        self.proc_mgr.current_processor_index \
                            = next_queue_index
                        self.proc_mgr.current_queue_index \
                            = next_queue_index

            # Besides batch processors, all other component types
            # (PackProcessor, Evaluator, Caster) only deal with one
            # pack at a time.
            # - Move them to the next queue.
            else:
                index = unprocessed_queue_indices[current_queue_index]

                # There are UNPROCESSED jobs in the queue.
                if index < len(current_queue) - 1:
                    unprocessed_queue_indices[current_queue_index] += 1
                else:
                    # current_queue is modified in this loop.
                    for job_i in list(current_queue):
                        if should_yield:
                            if job_i.id in self._predict_to_gold:
                                self._predict_to_gold.pop(job_i.id)
                            yield job_i.pack
                        else:
                            self.proc_mgr.add_to_queue(
                                queue_index=next_queue_index, job=job_i)
                        current_queue.popleft()

                    # Set the UNPROCESSED index. We do not use
                    # "processed_queue_indices" as the jobs get PROCESSED
                    # whenever they are passed into a PackProcessor.
                    unprocessed_queue_indices[current_queue_index] \
                        = len(current_queue)

                    # Update the current queue and processor only when
                    # all the jobs in the current queue are processed.
                    if should_yield:
                        self.proc_mgr.current_processor_index = 0
                        self.proc_mgr.current_queue_index = -1
                    else:
                        self.proc_mgr.current_processor_index \
                            = next_queue_index
                        self.proc_mgr.current_queue_index \
                            = next_queue_index
        else:
            processor.flush()

            # current_queue is modified in this loop.
            for job in list(current_queue):
                if (job.status != ProcessJobStatus.PROCESSED
                        and not job.is_poison):
                    raise ValueError(
                        "Job is neither PROCESSED nor a poison. "
                        "Something went wrong during execution.")

                if not job.is_poison and should_yield:
                    if job.id in self._predict_to_gold:
                        self._predict_to_gold.pop(job.id)
                    yield job.pack
                elif not should_yield:
                    self.proc_mgr.add_to_queue(
                        queue_index=next_queue_index, job=job)

                if not job.is_poison:
                    current_queue.popleft()

            if not should_yield:
                # Set the next processor and queue as current.
                self.proc_mgr.current_processor_index = next_queue_index
                self.proc_mgr.current_queue_index = next_queue_index
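# A usage sketch of the flow implemented by `_process_packs`, assuming the
# usual Pipeline entry points (set_reader, add, initialize, process_dataset).
# MyReader, MyBatchProcessor, and MyPackProcessor are hypothetical
# placeholders, not actual library classes.
#
#     pipeline = Pipeline[DataPack]()
#     pipeline.set_reader(MyReader())
#     pipeline.add(MyBatchProcessor())   # queues jobs until batch_size is met
#     pipeline.add(MyPackProcessor())    # processes one pack at a time
#     pipeline.initialize()
#     for pack in pipeline.process_dataset("path/to/data"):
#         ...  # each pack is yielded as soon as it clears the last queue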