Example #1
    def _parse_pack(self, data_source: str) -> Iterator[DataPack]:
        if data_source is None:
            raise ProcessExecutionException(
                "Data source is None, cannot deserialize.")

        pack: DataPack = DataPack.deserialize(data_source)

        if pack is None:
            raise ProcessExecutionException(
                f"Cannot recover pack from the following data source: \n"
                f"{data_source}")

        yield pack
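
A hedged usage sketch for the method above: `reader` is assumed to be an instance of the class defining `_parse_pack`, and `pack.json` is assumed to hold a pack previously written out by `DataPack.serialize` (a jsonpickle string in older Forte versions).

    # Read the serialized pack back from disk and recover it.
    with open("pack.json", encoding="utf-8") as f:
        source = f.read()
    for recovered in reader._parse_pack(source):
        print(recovered.text)  # DataPack exposes the raw text via .text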
Example #2
 def multipack_name(self, pack: MultiPack) -> str:
     name = pack.pack_name
     if name is None:
         raise ProcessExecutionException(
             'Cannot use the DocIdMultiPackWriter because the doc id is '
             'not assigned for the pack.')
     return name
Example #3
 def pack_name(self, pack: DataPack) -> str:
     name = pack.pack_name
     if name is None:
         raise ProcessExecutionException(
             'Cannot use the DocIdMultiPackWriter because the [pack_name] '
             'is not assigned for the pack.')
     return name
Example #4
 def _process_with_component(
         self, selector: Selector, component: PipelineComponent,
         raw_job: ProcessJob):
     for pack in selector.select(raw_job.pack):
         # First, perform the component action on the pack
         try:
             if isinstance(component, Caster):
                 # Replacing the job pack with the casted version.
                 raw_job.alter_pack(component.cast(pack))
             elif isinstance(component, BaseBatchProcessor):
                 pack.set_control_component(component.name)
                 component.process(pack)
             elif isinstance(component, Evaluator):
                 pack.set_control_component(component.name)
                 component.consume_next(
                     pack, self._predict_to_gold[raw_job.id]
                 )
             elif isinstance(component, BaseProcessor):
                 # Should be BasePackProcessor:
                 # All other processors are considered to be
                 # streaming processors, like this one.
                 pack.set_control_component(component.name)
                 component.process(pack)
             # After the component action, make sure the entries are
             # added into the index.
             pack.add_all_remaining_entries()
         except ValueError as e:
             raise ProcessExecutionException(
                 f'Exception occurred when running '
                 f'{component.name}') from e
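
`selector.select(...)` above decides which packs a component sees. A minimal sketch of such a selector, modeled on Forte's pass-through `DummySelector` (the exact base class and its import path are assumptions):

    from typing import Iterator

    class PassThroughSelector(Selector):
        """Hypothetical selector that hands every pack through unchanged."""

        def select(self, pack: DataPack) -> Iterator[DataPack]:
            # No filtering: the component processes the pack as-is.
            yield pack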
Example #5
 def pack_name(self, pack: DataPack) -> str:
     name = pack.meta.doc_id
     if name is None:
         raise ProcessExecutionException(
             'Cannot use the DocIdMultiPackWriter because the doc id '
             'is not assigned for the pack %d.' % pack.meta.pack_id)
     return name
Example #6
    def _parse_pack(self, data_source: Any) -> Iterator[DataPack]:
        if data_source is None:
            raise ProcessExecutionException(
                "Data source is None, cannot deserialize.")

        pack: DataPack = DataPack.deserialize(
            data_source,
            serialize_method=self.configs.serialize_method,
            zip_pack=self.configs.zip_pack,
        )

        if pack is None:
            raise ProcessExecutionException(
                f"Cannot recover pack from the following data source: \n"
                f"{data_source}")

        yield pack
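
The `serialize_method` and `zip_pack` options come from `self.configs`. A hedged sketch of the matching defaults hook, following Forte's `default_configs` convention (the concrete values are illustrative, not the library's documented defaults):

    @classmethod
    def default_configs(cls):
        config = super().default_configs()
        config.update({
            # Illustrative: how packs were serialized on disk, and whether
            # each serialized file is compressed.
            "serialize_method": "jsonpickle",
            "zip_pack": False,
        })
        return config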
Example #7
    def parse_pack(self, collection: Any) -> Iterator[PackType]:
        r"""Calls :meth:`_parse_pack` to create packs from the collection.
        This internally setup the component meta data. Users should implement
        the :meth:`_parse_pack` method.
        """
        if collection is None:
            raise ProcessExecutionException(
                "Got None collection, cannot parse as data pack.")

        for p in self._parse_pack(collection):
            p.add_all_remaining_entries(self.name)
            yield p
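
To make the `_parse_pack` contract concrete, a hedged sketch of an override for a reader whose collection is a raw string (illustrative only; Forte's own `StringReader` behaves similarly but may differ in detail):

    def _parse_pack(self, collection: str) -> Iterator[DataPack]:
        # Wrap a raw string into a single DataPack and hand it back.
        pack = DataPack()
        pack.set_text(collection)
        yield pack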
Example #8
    def _get_cache_location(self, collection: Any) -> str:
        r"""Gets the path to the cache file for a collection.

        Args:
            collection: information to compute cache key.

        Returns:
            str: The file path to the cache file for a Pack.
        """
        # pylint: disable=assignment-from-none
        file_path = self._cache_key_function(collection)
        if file_path is None:
            raise ProcessExecutionException(
                "Cache key is None. You probably set `from_cache` to true but "
                "fail to implement the _cache_key_function")

        return os.path.join(str(self._cache_directory), file_path)
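
A hedged sketch of a `_cache_key_function` override that would satisfy the check above, hashing the collection into a stable file name (the hashing scheme is an assumption, not one prescribed by the library):

    import hashlib

    def _cache_key_function(self, collection: Any) -> str:
        # Derive a deterministic cache file name from the collection.
        digest = hashlib.sha256(str(collection).encode("utf-8")).hexdigest()
        return digest + ".json"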
Example #9
    def set_text(
            self, text: str, replace_func:
            Optional[Callable[[str], ReplaceOperationsType]] = None):

        if len(text) < len(self._text):
            raise ProcessExecutionException(
                "The new text is overwriting the original one with shorter "
                "length, which might cause unexpected behavior.")

        if len(self._text):
            logging.warning("Need to be cautious when changing the text of a "
                            "data pack, existing entries may get affected. ")

        span_ops = [] if replace_func is None else replace_func(text)

        # The spans should be mutually exclusive
        (
            self._text, self.replace_back_operations,
            self.processed_original_spans, self.orig_text_len
        ) = data_utils_io.modify_text_and_track_ops(text, span_ops)
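
A usage sketch for `set_text` with a replacement function, assuming Forte's `ReplaceOperationsType` is a list of `(Span, replacement)` pairs and that `pack` is a freshly created `DataPack` (so the length check passes):

    from forte.data.span import Span

    def mask_digits(text):
        # Replace every digit with "#", recording the affected spans so the
        # original text can still be tracked.
        return [(Span(i, i + 1), "#")
                for i, ch in enumerate(text) if ch.isdigit()]

    pack.set_text("Order 42 shipped.", replace_func=mask_digits)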
Example #10
    def _process_packs(self,
                       data_iter: Iterator[PackType]) -> Iterator[PackType]:
        r"""Process the packs received from the reader by the running through
        the pipeline.

        Args:
             data_iter (iterator): Iterator yielding jobs that contain packs

        Returns:
            Yields packs that are processed by the pipeline.
        """

        # pylint: disable=line-too-long

        # Here is the logic for the execution of the pipeline.

        # The basic idea is to yield a pack as soon as it gets processed by all
        # the processors instead of waiting for later jobs to get processed.

        # 1) A job can be in one of three statuses:
        #  - UNPROCESSED
        #  - QUEUED
        #  - PROCESSED
        #
        # 2) Each processor maintains a queue to hold the jobs to be executed
        # next.
        #
        # 3) In the case of a BatchProcessor, a job enters QUEUED status if
        # the job does not satisfy the `batch_size` requirement of that
        # processor. In that case, the pipeline requests additional jobs from
        # the reader and restarts the execution loop from the beginning.
        #
        # 4) At any point, while moving to the next processor, the pipeline
        # ensures that all jobs are either in QUEUED or PROCESSED status. If
        # they are PROCESSED, they will be moved to the next queue. This design
        # ensures that at any point, while processing the job at processor `i`,
        # all the jobs in the previous queues are in QUEUED status. So whenever
        # a new job is needed, the pipeline can directly request it from the
        # reader instead of looking at previous queues for UNPROCESSED jobs.
        #
        # 5) When a processor receives a poison pack, it flushes all the
        # remaining batches in its memory (this actually has no effect for
        # PackProcessors) and moves the jobs, including the poison pack, to
        # the next queue. If there is no next processor, the packs are
        # yielded.
        #
        # 6) The loop terminates when the last queue contains only a poison
        # pack.
        #
        # Here is the sample pipeline and its execution
        #
        # Assume 1 pack corresponds to a batch of size 1
        #
        # After 1st step (iteration), reading from the reader,
        #
        #            batch_size = 2                               batch_size = 2
        #  Reader -> B1 (BatchProcessor) -> P1 (PackProcessor) -> B2(BatchProcessor)
        #
        #          |___________|
        #          |___________|
        #          |___________|
        #          |___________|
        #          |_J1:QUEUED_|
        #
        # B1 needs another pack to process job J1
        #
        # After 2nd step (iteration),
        #
        #           batch_size = 2                               batch_size = 2
        # Reader -> B1 (BatchProcessor) -> P1 (PackProcessor) -> B2(BatchProcessor)
        #
        #          |___________|       |_______________|
        #          |___________|       |_______________|
        #          |___________|       |_______________|
        #          |___________|       |_J2:UNPROCESSED_|
        #          |___________|       |_J1:UNPROCESSED_|
        #
        # B1 processes both the packs, the jobs are moved to the next queue.
        #
        # After 3rd step (iteration),
        #
        #           batch_size = 2                               batch_size = 2
        # Reader -> B1 (BatchProcessor) -> P1 (PackProcessor) -> B2(BatchProcessor)
        #
        #          |___________|       |_______________|     |_______________|
        #          |___________|       |_______________|     |_______________|
        #          |___________|       |_______________|     |_______________|
        #          |___________|       |_______________|     |_______________|
        #          |___________|       |_J2:UNPROCESSED_|     |_J1:UNPROCESSED_|
        #
        # P1 processes the first job. However, there is still one UNPROCESSED
        # job, J2, in the queue. The pipeline processes this job first before
        # moving to the next processor.
        #
        # After 4th step (iteration),
        #
        #           batch_size = 2                               batch_size = 2
        # Reader -> B1 (BatchProcessor) -> P1 (PackProcessor) -> B2(BatchProcessor)
        #
        #        |___________|       |_______________|     |_______________|
        #        |___________|       |_______________|     |_______________|
        #        |___________|       |_______________|     |_______________|
        #        |___________|       |_______________|     |_J2:UNPROCESSED_|
        #        |___________|       |_______________|     |_J1:UNPROCESSED_|
        #
        #
        # After 5th step (iteration),
        #
        #           batch_size = 2                               batch_size = 2
        # Reader -> B1 (BatchProcessor) -> P1 (PackProcessor) -> B2(BatchProcessor)
        #
        #        |___________|       |_______________|     |_______________|
        #        |___________|       |_______________|     |_______________|
        #        |___________|       |_______________|     |_______________|    --> Yield J1.pack and J2.pack
        #        |___________|       |_______________|     |_______________|
        #        |___________|       |_______________|     |_______________|

        if not self.initialized:
            raise ProcessFlowException(
                "Please call initialize before running the pipeline")

        buffer = ProcessBuffer(self, data_iter)

        if len(self.components) == 0:
            yield from data_iter
            # Write return here instead of using if..else to reduce indent.
            return

        while not self.proc_mgr.exhausted():
            # The job has to be the first UNPROCESSED element;
            # its status at this point is UNPROCESSED.
            unprocessed_job: ProcessJob = next(buffer)

            processor_index = self.proc_mgr.current_processor_index
            processor = self.components[processor_index]
            selector = self._selectors[processor_index]
            current_queue_index = self.proc_mgr.current_queue_index
            current_queue = self.proc_mgr.current_queue
            pipeline_length = self.proc_mgr.pipeline_length
            unprocessed_queue_indices = \
                self.proc_mgr.unprocessed_queue_indices
            processed_queue_indices = \
                self.proc_mgr.processed_queue_indices
            next_queue_index = current_queue_index + 1
            should_yield = next_queue_index >= pipeline_length

            if not unprocessed_job.is_poison:
                for pack in selector.select(unprocessed_job.pack):
                    # First, perform the component action on the pack
                    try:
                        if isinstance(processor, Caster):
                            # Replacing the job pack with the casted version.
                            unprocessed_job.alter_pack(processor.cast(pack))
                        elif isinstance(processor, BaseProcessor):
                            processor.process(pack)
                        elif isinstance(processor, Evaluator):
                            processor.consume_next(
                                pack,
                                self._predict_to_gold[unprocessed_job.id])

                        # After the component action, make sure the entries
                        # are added into the index.
                        pack.add_all_remaining_entries()
                    except Exception as e:
                        raise ProcessExecutionException(
                            f'Exception occurred when running '
                            f'{processor.name}') from e

                    # Then, based on component type, handle the queue.
                    if isinstance(processor, BaseBatchProcessor):
                        index = unprocessed_queue_indices[current_queue_index]

                        # check status of all the jobs up to "index"
                        for i, job_i in enumerate(
                                itertools.islice(current_queue, 0, index + 1)):

                            if job_i.status == ProcessJobStatus.PROCESSED:
                                processed_queue_indices[
                                    current_queue_index] = i

                        # there are UNPROCESSED jobs in the queue
                        if index < len(current_queue) - 1:
                            unprocessed_queue_indices[current_queue_index] \
                                += 1

                        # Fetch more data from the reader to process the
                        # first job
                        elif (processed_queue_indices[current_queue_index] ==
                              -1):

                            unprocessed_queue_indices[current_queue_index] \
                                = len(current_queue)

                            self.proc_mgr.current_processor_index = 0

                            self.proc_mgr.current_queue_index = -1

                        else:
                            processed_queue_index = \
                                processed_queue_indices[current_queue_index]

                            # move or yield the pack
                            c_queue = list(current_queue)
                            for job_i in \
                                    c_queue[:processed_queue_index + 1]:

                                if should_yield:
                                    if job_i.id in self._predict_to_gold:
                                        self._predict_to_gold.pop(job_i.id)
                                    yield job_i.pack
                                else:
                                    self.proc_mgr.add_to_queue(
                                        queue_index=next_queue_index,
                                        job=job_i)
                                current_queue.popleft()

                            # set the UNPROCESSED and PROCESSED indices
                            unprocessed_queue_indices[current_queue_index] \
                                = len(current_queue)

                            processed_queue_indices[current_queue_index] \
                                = -1

                            if should_yield:
                                self.proc_mgr.current_processor_index = 0
                                self.proc_mgr.current_queue_index = -1
                            else:
                                self.proc_mgr.current_processor_index \
                                    = next_queue_index
                                self.proc_mgr.current_queue_index \
                                    = next_queue_index

                    # Besides batch processors, the other component types
                    # deal with only one pack at a time; these include
                    # PackProcessor, Evaluator, and Caster.
                    # - Move their jobs to the next queue.
                    else:
                        index = unprocessed_queue_indices[current_queue_index]

                        # there are UNPROCESSED jobs in the queue
                        if index < len(current_queue) - 1:
                            unprocessed_queue_indices[current_queue_index] \
                                += 1
                        else:
                            # current_queue is modified in this loop
                            for job_i in list(current_queue):
                                if should_yield:
                                    if job_i.id in self._predict_to_gold:
                                        self._predict_to_gold.pop(job_i.id)
                                    yield job_i.pack
                                else:
                                    self.proc_mgr.add_to_queue(
                                        queue_index=next_queue_index,
                                        job=job_i)
                                current_queue.popleft()

                            # set the UNPROCESSED index
                            # we do not use "processed_queue_indices", as jobs
                            # become PROCESSED as soon as they are passed
                            # into a PackProcessor
                            unprocessed_queue_indices[current_queue_index] \
                                = len(current_queue)

                            # update the current queue and processor only
                            # when all the jobs are processed in the current
                            # queue
                            if should_yield:
                                self.proc_mgr.current_processor_index = 0
                                self.proc_mgr.current_queue_index = -1

                            else:
                                self.proc_mgr.current_processor_index \
                                    = next_queue_index
                                self.proc_mgr.current_queue_index \
                                    = next_queue_index
            else:
                processor.flush()

                # current queue is modified in the loop
                for job in list(current_queue):
                    if job.status != ProcessJobStatus.PROCESSED and \
                            not job.is_poison:
                        raise ValueError("Job is neither PROCESSED nor is "
                                         "a poison. Something went wrong "
                                         "during execution.")

                    if not job.is_poison and should_yield:
                        if job.id in self._predict_to_gold:
                            self._predict_to_gold.pop(job.id)
                        yield job.pack

                    elif not should_yield:
                        self.proc_mgr.add_to_queue(
                            queue_index=next_queue_index, job=job)

                    if not job.is_poison:
                        current_queue.popleft()

                if not should_yield:
                    # set next processor and queue as current
                    self.proc_mgr.current_processor_index = next_queue_index
                    self.proc_mgr.current_queue_index = next_queue_index
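
To make the three-status job lifecycle described in the comments above concrete, a self-contained toy sketch (deliberately simplified: every stage is a pass-through, and names such as `ToyJob` are invented for illustration):

    from collections import deque
    from enum import Enum

    class Status(Enum):
        UNPROCESSED = 0
        QUEUED = 1
        PROCESSED = 2

    class ToyJob:
        def __init__(self, name):
            self.name = name
            self.status = Status.UNPROCESSED

    # One queue per stage; a job waits QUEUED in the next stage's queue
    # until that stage runs it, and is yielded after the last stage.
    queues = [deque([ToyJob("J1"), ToyJob("J2")]), deque()]
    for stage in range(len(queues)):
        while queues[stage]:
            job = queues[stage].popleft()
            job.status = Status.PROCESSED       # this stage "ran" the job
            if stage + 1 < len(queues):
                job.status = Status.QUEUED      # now waiting at the next stage
                queues[stage + 1].append(job)
            else:
                print("yield", job.name)        # end of the pipeline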