Example #1
0
class ResourcesTest(unittest.TestCase):
    def setUp(self):
        self.kwargs = {
            '1': 'one',
            'dummy': DummyObject(1, 2)
        }
        self.resources = Resources(**self.kwargs)
        self.output_dir = tempfile.mkdtemp()

    def test_save_with_keys(self):
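        # Save the resources under the given keys, reload them into a fresh
        # Resources instance, and check that the restored state matches.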
        keys = list(self.kwargs.keys())
        self.resources.save(keys=keys, output_dir=self.output_dir)

        new_resources = Resources()
        new_resources.load(keys=keys, path=self.output_dir)

        self.assertEqual(new_resources._resources, self.resources._resources)

    def test_save_without_keys(self):
        self.resources.save(output_dir=self.output_dir)

        new_resources = Resources()
        keys = list(self.kwargs.keys())
        new_resources.load(keys=keys, path=self.output_dir)

        self.assertEqual(new_resources._resources, self.resources._resources)

    def tearDown(self) -> None:
        shutil.rmtree(self.output_dir)
Example #2
0
    def setUp(self):
        self.kwargs = {
            '1': 'one',
            'dummy': DummyObject(1, 2)
        }
        self.resources = Resources(**self.kwargs)
        self.output_dir = tempfile.mkdtemp()
Example #3
0
class ResourcesTest(unittest.TestCase):
    def setUp(self):
        self.kwargs = {
            '1': 'one',
            'dummy': DummyObject(1, 2)
        }
        self.resources = Resources(**self.kwargs)
        self.output_dir = './'

    def test_save_with_keys(self):
        keys = list(self.kwargs.keys())
        self.resources.save(keys=keys, output_dir=self.output_dir)

        new_resources = Resources()
        new_resources.load(keys=keys, path=self.output_dir)

        self.assertEqual(new_resources.resources, self.resources.resources)

    def test_save_without_keys(self):
        self.resources.save(output_dir=self.output_dir)

        new_resources = Resources()
        keys = list(self.kwargs.keys())
        new_resources.load(keys=keys, path=self.output_dir)

        self.assertEqual(new_resources.resources, self.resources.resources)
Example #4
0
    def initialize(self, resource: Resources, configs: HParams):

        self.resource = resource

        self.word_alphabet = resource.get("word_alphabet")
        self.char_alphabet = resource.get("char_alphabet")
        self.ner_alphabet = resource.get("ner_alphabet")

        word_embedding_table = resource.get('word_embedding_table')

        self.config_model = configs.config_model
        self.config_data = configs.config_data

        self.normalize_func = utils.normalize_digit_word

        self.device = torch.device("cuda") if torch.cuda.is_available() \
            else torch.device("cpu")

        utils.set_random_seed(self.config_model.random_seed)

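        # Build the BiRecurrentConvCRF model and move it to the selected device.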
        self.model = BiRecurrentConvCRF(
            word_embedding_table, self.char_alphabet.size(),
            self.ner_alphabet.size(), self.config_model).to(device=self.device)

        self.optim = SGD(self.model.parameters(),
                         lr=self.config_model.learning_rate,
                         momentum=self.config_model.momentum,
                         nesterov=True)

        self.trained_epochs = 0

        self.resource.update(model=self.model)
Example #5
0
    def test_save_without_keys(self):
        self.resources.save(output_dir=self.output_dir)

        new_resources = Resources()
        keys = list(self.kwargs.keys())
        new_resources.load(keys=keys, path=self.output_dir)

        self.assertEqual(new_resources._resources, self.resources._resources)
Example #6
0
    def initialize(self, resources: Resources, configs: Optional[Config]):
        """
        Args:
            resources: The resources shared in the pipeline.
            configs: A config with the following keys:
                * input_pack_name: specify the input pack name of the MultiPack
                  to be processed
                * output_pack_name: specify the output pack name of the
                  MultiPack to be processed
                * max_decoding_length: the maximum decoding length.
                * top_k
                * top_p
                * temperature

        Returns:
        """
        super().initialize(resources, configs)

        if configs is not None:
            self.input_pack_name = configs.input_pack_name
            self.output_pack_name = configs.output_pack_name

            self.max_decoding_length = configs.max_decoding_length
            self.temperature = configs.temperature
            self.top_k = configs.top_k
            self.top_p = configs.top_p
            self.model = tx.modules.GPT2Decoder(configs.pretrained_model_name)

        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu"
        )
        self.model.to(device=self.device)

        resources.update(model=self.model)
        self.word_processor = tx.data.GPT2Tokenizer(
            pretrained_model_name=configs.pretrained_model_name
        )

        end_token = self.word_processor.map_token_to_id("<|endoftext|>")

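        # Build a decoding helper on demand: top-p (nucleus) sampling when
        # `top_p` is set, otherwise top-k sampling.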
        def _get_helper(start_tokens):
            if self.top_p:
                helper = tx.modules.TopPSampleEmbeddingHelper(
                    start_tokens=start_tokens,
                    end_token=end_token,
                    p=self.top_p,
                    softmax_temperature=self.temperature,
                )
            else:
                helper = tx.modules.TopKSampleEmbeddingHelper(
                    start_tokens=start_tokens,
                    end_token=end_token,
                    top_k=self.top_k,
                    softmax_temperature=self.temperature,
                )
            return helper

        self._get_helper = _get_helper
Example #7
0
    def initialize(self, resources: Resources, configs: Config):
        """
        The training pipeline will run this initialization method during
        the initialization phase and send resources in as parameters.

        Args:
            resources: The resources shared in the pipeline.
            configs: configuration object for this trainer.

        Returns:

        """
        self.resource = resources

        self.word_alphabet = resources.get("word_alphabet")
        self.char_alphabet = resources.get("char_alphabet")
        self.ner_alphabet = resources.get("ner_alphabet")

        word_embedding_table = resources.get("word_embedding_table")

        self.config_model = configs.config_model
        self.config_data = configs.config_data

        self.normalize_func = utils.normalize_digit_word

        self.device = (
            torch.device("cuda")
            if torch.cuda.is_available()
            else torch.device("cpu")
        )

        utils.set_random_seed(self.config_model.random_seed)

        self.model = BiRecurrentConvCRF(
            word_embedding_table,
            self.char_alphabet.size(),
            self.ner_alphabet.size(),
            self.config_model,
        ).to(device=self.device)

        self.optim = SGD(
            self.model.parameters(),
            lr=self.config_model.learning_rate,
            momentum=self.config_model.momentum,
            nesterov=True,
        )

        self.trained_epochs = 0

        self.resource.update(model=self.model)
Example #8
0
    def __init__(self, resource: Optional[Resources] = None):
        self._reader: BaseReader
        self._reader_config: Optional[Config] = None

        self._components: List[PipelineComponent] = []
        self._selectors: List[Selector] = []

        self._processors_index: Dict = {'': -1}
        self._configs: List[Optional[Config]] = []

        # Will initialize at `initialize` because the processors length is
        # unknown.
        self._proc_mgr: ProcessManager = None  # type: ignore

        self.evaluator_indices: List[int] = []

        # needed for evaluator
        self._predict_to_gold: Dict[int, PackType] = {}

        if resource is None:
            self.resource = Resources()
        else:
            self.resource = resource

        self.initialized: bool = False
        self._check_type_consistency: bool = False
Example #9
0
    def __init__(self):
        self.resources: Resources = Resources()
        self.configs: Config = Config({}, {})
        # Determine whether to check the consistencies between components.
        self._check_type_consistency: bool = False
        # The flag indicating whether the component is initialized.
        self.__is_initialized: bool = False
Example #10
0
def setup(config: Config) -> Pipeline:
    resource = Resources()
    query_pipeline = Pipeline[MultiPack](resource=resource)
    query_pipeline.set_reader(
        reader=MultiPackTerminalReader(), config=config.reader)
    query_pipeline.add(
        component=MicrosoftBingTranslator(), config=config.translator)
    query_pipeline.add(
        component=BertBasedQueryCreator(), config=config.query_creator)
    query_pipeline.add(
        component=SearchProcessor(), config=config.searcher)

    top_response_pack_name = config.indexer.response_pack_name + '_0'

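    # Restrict the NLTK components and the SRL predictor to the top-ranked
    # response pack, selected by its pack name.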
    query_pipeline.add(
        component=NLTKSentenceSegmenter(),
        selector=NameMatchSelector(select_name=top_response_pack_name))
    query_pipeline.add(
        component=NLTKWordTokenizer(),
        selector=NameMatchSelector(select_name=top_response_pack_name))
    query_pipeline.add(
        component=NLTKPOSTagger(),
        selector=NameMatchSelector(select_name=top_response_pack_name))
    query_pipeline.add(
        component=SRLPredictor(), config=config.SRL,
        selector=NameMatchSelector(select_name=top_response_pack_name))
    query_pipeline.add(
        component=MicrosoftBingTranslator(), config=config.back_translator)

    query_pipeline.initialize()

    return query_pipeline
Example #11
0
    def __init__(self, resource: Optional[Resources] = None):
        self._reader: BaseReader
        self._reader_config: Optional[Config]

        self._components: List[PipelineComponent] = []
        self._selectors: List[Selector] = []

        self._processors_index: Dict = {'': -1}
        self._configs: List[Optional[Config]] = []

        # Will initialize at `initialize` because the processors length is
        # unknown.
        self.proc_mgr: _ProcessManager

        # This manager controls global pack access information
        self._pack_manager: PackManager = PackManager()
        self._pack_manager.reset()

        self.evaluator_indices: List[int] = []

        # needed for evaluator
        self._predict_to_gold: Dict[int, PackType] = {}

        if resource is None:
            self.resource = Resources()
        else:
            self.resource = resource

        self.initialized: bool = False
Example #12
0
    def __init__(
        self,
        train_reader: BaseReader,
        trainer: BaseTrainer,
        dev_reader: BaseReader,
        configs: Config,
        preprocessors: Optional[List[BaseProcessor]] = None,
        evaluator: Optional[Evaluator] = None,
        predictor: Optional[BaseProcessor] = None,
    ):
        self.resource = Resources()
        self.configs = configs

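        # Initialize the reader and any preprocessors with the shared resource
        # registry and their respective configs.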
        train_reader.initialize(self.resource, self.configs.reader)

        if preprocessors is not None:
            for p in preprocessors:
                p.initialize(resources=self.resource,
                             configs=configs.preprocessor)
            self.preprocessors = preprocessors
        else:
            self.preprocessors = []

        self.train_reader = train_reader
        self.dev_reader = dev_reader
        self.trainer = trainer

        if predictor is not None:
            self.predictor = predictor

        if evaluator is not None:
            self.evaluator = evaluator
            self.evaluator.initialize(self.resource, self.configs.evaluator)
Example #13
0
    def finish(self, resources: Resources):  # pylint: disable=unused-argument
        """
        Releasing resources and saving models.

        Args:
            resources: The resources used by the training process.

        Returns:

        """
        if self.resource:
            keys_to_serializers = {}
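            # Serialize the model via its state_dict; pickle every other
            # resource directly.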
            for key in resources.keys():
                # pylint: disable=consider-using-with
                if key == "model":
                    keys_to_serializers[key] = lambda x, y: pickle.dump(
                        x.state_dict(), open(y, "wb"))
                else:
                    keys_to_serializers[key] = lambda x, y: pickle.dump(
                        x, open(y, "wb"))

            self.resource.save(keys_to_serializers,
                               output_dir=self.config_model.resource_dir)

        self.__save_model_checkpoint()
Example #14
0
    def initialize(self, resources: Resources, configs: Config):
        super().initialize(resources, configs)

        if resources.get("device"):
            self.device = resources.get("device")
        else:
            self.device = torch.device('cuda') if torch.cuda.is_available() \
                else torch.device('cpu')

        self.resources = resources
        self.ft_configs = configs

        model_path = self.ft_configs.model_path
        self.model_config = AutoConfig.from_pretrained(model_path)
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.model = AutoModelForTokenClassification.from_pretrained(
            model_path,
            from_tf=bool(".ckpt" in model_path),
            config=self.model_config)
        self.model.to(self.device)
Example #15
0
def main(nif_context: str, nif_page_structure: str, mapping_literals: str,
         mapping_objects: str, nif_text_links: str, redirects: str,
         info_boxs_properties: str, base_output_path: str):
    # The datasets are read in a few steps.
    # 0. Load redirects between wikipedia pages.
    print_progress('Loading redirects', '\n')
    redirect_pickle = os.path.join(base_output_path, 'redirects.pickle')

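    # Reuse a cached pickle of the redirect map when available; otherwise parse
    # the redirect dump and cache it for later runs.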
    redirect_map: Dict[str, str]
    if os.path.exists(redirect_pickle):
        redirect_map = pickle.load(open(redirect_pickle, 'rb'))
    else:
        redirect_map = load_redirects(redirects)
        with open(redirect_pickle, 'wb') as pickle_f:
            pickle.dump(redirect_map, pickle_f)

    resources: Resources = Resources()
    resources.update(redirects=redirect_map)
    print_progress("Done loading.", '\n')

    # 1. Read the wiki text.
    raw_pack_dir = os.path.join(base_output_path, 'nif_raw')
    read_wiki_text(nif_context, raw_pack_dir, resources, True)
    print_progress("Done reading wikipedia text.", '\n')

    # 2. Add the rest of wiki page structures:
    struct_dir = raw_pack_dir + '_struct'
    add_wiki_info(WikiStructReader(), resources, nif_page_structure,
                  raw_pack_dir, struct_dir, 'page_structures', True)
    print_progress("Done reading wikipedia structures.", '\n')

    link_dir = struct_dir + '_links'
    add_wiki_info(WikiAnchorReader(), resources, nif_text_links, struct_dir,
                  link_dir, 'anchor_links', True)
    print_progress("Done reading wikipedia anchors.", '\n')

    property_dir = link_dir + '_property'
    add_wiki_info(WikiPropertyReader(), resources, info_boxs_properties,
                  link_dir, property_dir, 'info_box_properties', True)
    print_progress("Done reading wikipedia info-boxes.", '\n')

    literal_dir = property_dir + '_literals'
    add_wiki_info(WikiInfoBoxReader(), resources, mapping_literals,
                  property_dir, literal_dir, 'literals', True)
    print_progress("Done reading wikipedia info-boxes literals.", '\n')

    mapping_dir = literal_dir + '_objects'
    add_wiki_info(WikiInfoBoxReader(), resources, mapping_objects, literal_dir,
                  mapping_dir, 'objects', True)
    print_progress("Done reading wikipedia info-boxes objects.", '\n')
Example #16
0
    def __init__(self, resource: Optional[Resources] = None):
        self._reader: BaseReader
        self._reader_config: Optional[HParams]

        self._processors: List[BaseProcessor] = []
        self._selectors: List[Selector] = []

        self._processors_index: Dict = {'': -1}
        self._configs: List[Optional[HParams]] = []

        if resource is None:
            self.resource = Resources()
        else:
            self.resource = resource
Example #17
0
    def finish(self, resources: Resources):  # pylint: disable=unused-argument
        if self.resource:
            keys_to_serializers = {}
            for key in resources.keys():
                if key == "model":
                    keys_to_serializers[key] = \
                        lambda x, y: pickle.dump(x.state_dict(), open(y, "wb"))
                else:
                    keys_to_serializers[key] = \
                        lambda x, y: pickle.dump(x, open(y, "wb"))

            self.resource.save(keys_to_serializers)

        self.save_model_checkpoint()
Example #18
0
    def initialize(self, resources: Resources, configs: Config):
        super().initialize(resources, configs)

        self.resource = resources
        self.config_model = configs.config_model
        self.config_data = configs.config_data

        resource_path = configs.config_model.resource_dir

        keys = {
            "word_alphabet",
            "char_alphabet",
            "ner_alphabet",
            "word_embedding_table",
        }

        missing_keys = list(keys.difference(self.resource.keys()))

        self.resource.load(keys=missing_keys, path=resource_path)

        self.word_alphabet = resources.get("word_alphabet")
        self.char_alphabet = resources.get("char_alphabet")
        self.ner_alphabet = resources.get("ner_alphabet")
        word_embedding_table = resources.get("word_embedding_table")

        if resources.get("device"):
            self.device = resources.get("device")
        else:
            self.device = (torch.device("cuda") if torch.cuda.is_available()
                           else torch.device("cpu"))

        self.normalize_func = utils.normalize_digit_word

        if "model" not in self.resource.keys():

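            # Lazily build the model and restore saved weights if a checkpoint
            # exists at the given path.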
            def load_model(path):
                model = BiRecurrentConvCRF(
                    word_embedding_table,
                    self.char_alphabet.size(),
                    self.ner_alphabet.size(),
                    self.config_model,
                )

                if os.path.exists(path):
                    with open(path, "rb") as f:
                        weights = torch.load(f, map_location=self.device)
                        model.load_state_dict(weights)
                return model

            self.resource.load(keys={"model": load_model}, path=resource_path)

        self.model = resources.get("model")
        self.model.to(self.device)
        self.model.eval()

        utils.set_random_seed(self.config_model.random_seed)
Example #19
0
    def initialize(self, resource: Resources, configs: HParams):

        self.define_batcher()

        self.resource = resource
        self.config_model = configs.config_model
        self.config_data = configs.config_data

        resource_path = configs.config_model.resource_dir

        keys = {
            "word_alphabet", "char_alphabet", "ner_alphabet",
            "word_embedding_table"
        }

        missing_keys = list(keys.difference(self.resource.keys()))

        self.resource.load(keys=missing_keys, path=resource_path)

        self.word_alphabet = resource.get("word_alphabet")
        self.char_alphabet = resource.get("char_alphabet")
        self.ner_alphabet = resource.get("ner_alphabet")
        word_embedding_table = resource.get("word_embedding_table")

        if resource.get("device"):
            self.device = resource.get("device")
        else:
            self.device = torch.device('cuda') if torch.cuda.is_available() \
                else torch.device('cpu')

        self.normalize_func = utils.normalize_digit_word

        if "model" not in self.resource.keys():

            def load_model(path):
                model = BiRecurrentConvCRF(word_embedding_table,
                                           self.char_alphabet.size(),
                                           self.ner_alphabet.size(),
                                           self.config_model)

                if os.path.exists(path):
                    with open(path, "rb") as f:
                        weights = pickle.load(f)
                        model.load_state_dict(weights)
                return model

            self.resource.load(keys={"model": load_model})

        self.model = resource.get("model")
        self.model.to(self.device)
        self.model.eval()

        utils.set_random_seed(self.config_model.random_seed)
Example #20
0
    def __init__(self, resource: Optional[Resources] = None):
        self._reader: BaseReader
        self._reader_config: Optional[HParams]

        self._processors: List[BaseProcessor] = []
        self._selectors: List[Selector] = []

        self._processors_index: Dict = {'': -1}
        self._configs: List[Optional[HParams]] = []

        self._evaluator: Optional[Evaluator] = None
        self._evaluator_config: Optional[HParams] = None

        # needed for evaluator
        self._predict_to_gold: Dict[int, PackType] = {}

        if resource is None:
            self.resource = Resources()
        else:
            self.resource = resource
Example #21
0
class Pipeline(Generic[PackType]):
    r"""This controls the main inference flow of the system. A pipeline is
    consisted of a set of Components (readers and processors). The data flows
    in the pipeline as data packs, and each component will use or add
    information to the data packs.
    """
    def __init__(
        self,
        resource: Optional[Resources] = None,
        ontology_file: Optional[str] = None,
        enforce_consistency: bool = False,
        do_init_type_check: bool = False,
    ):
        r"""

        Args:
            resource: The ``Resources`` object, which is a global registry used
                in the pipeline. Objects defined as ``Resources`` will be
                passed on to the processors in the
                pipeline for initialization.
            ontology_file: The path to the input ontology specification file,
                which should be a JSON file containing all the entries inline,
                without any `import` key.
            enforce_consistency: This boolean determines whether the
                pipeline will check the content expectations specified in each
                pipeline component. Each component will check whether the input
                pack contains the expected data by inspecting its meta-data,
                and raises a
                :class:`~forte.common.exception.ExpectedEntryNotFound` if the
                check fails. When this is set to ``True``, every pipeline
                component that implements ``expected_types_and_attributes``
                will check whether the input datapack record matches the
                expected types and attributes. For example, if processor A
                requires the entry type ``ft.onto.base_ontology.Sentence`` and
                processor B produces this type in its output datapack, then the
                ``record`` function of processor B writes this type into the
                datapack record, and processor A declares it in
                ``expected_types_and_attributes``. When the pipeline runs with
                `enforce_consistency=True`, processor A then checks that this
                type exists in the record of the previous component's output.
            do_init_type_check: Determine whether to check records types and
                attributes during pipeline initialization. Default to `False`.
                If this boolean is set to `True`, each component in the
                pipeline will be validated by comparing its
                ``expected_types_and_attributes`` with the accumulated
                ``records`` from all the downstream components.
        """
        self._reader: BaseReader
        self._reader_config: Optional[Config] = None

        # These variables defines the units in the pipeline, they should be
        # of the same length
        self._components: List[PipelineComponent] = []
        self._selectors: List[Selector] = []
        self._configs: List[Optional[Config]] = []

        # Maintain a set of the pipeline components to fast check whether
        # the component is already there.
        self.__component_set: Set[PipelineComponent] = set()

        # Will initialize at `initialize` because the processors length is
        # unknown.
        self._proc_mgr: ProcessManager = None  # type: ignore

        self.evaluator_indices: List[int] = []

        # needed for evaluator
        self._predict_to_gold: Dict[int, PackType] = {}

        if resource is None:
            self.resource = Resources()
        else:
            self.resource = resource

        if ontology_file is None:
            with resources.path("forte.ontology_specs",
                                "base_ontology.json") as data_path:
                ontology_file = str(data_path)

        if ontology_file is not None:
            with open(ontology_file, "r") as f:
                spec_dict = json.load(f)
                self.resource.update(onto_specs_path=ontology_file)
                self.resource.update(onto_specs_dict=spec_dict)

        # The flag indicating whether this pipeline is initialized.
        self._initialized: bool = False
        # The flag indicating whether we want to enforce type consistency
        #  between the processors.
        self._check_type_consistency: bool = False

        # Create one copy of the dummy selector to reduce class creation.
        self.__default_selector: Selector = DummySelector()

        # needed for time profiling of pipeline
        self._enable_profiling: bool = False
        self._profiler: List[float] = []

        self._check_type_consistency = enforce_consistency

        # Indicate whether do type checking during pipeline initialization
        self._do_init_type_check: bool = do_init_type_check

    def enforce_consistency(self, enforce: bool = True):
        r"""This function determines whether the pipeline will check
        the content expectations specified in each pipeline component. This
        function works with :meth:`~forte.pipeline.Pipeline.initialize` called
        after itself. Each component will check whether the input pack contains
        the expected data via checking the meta-data, and throws a
        :class:`~forte.common.exception.ExpectedEntryNotFound` if the check
        fails. The example of implementation is mentioned in the docstrings of
        :meth:`~forte.pipeline.Pipeline.__init__`.

        Args:
            enforce: A boolean of whether to enable consistency checking
                for the pipeline or not.
        """
        self._check_type_consistency = enforce

    def init_from_config_path(self, config_path):
        r"""Read the configurations from the given path ``config_path``
        and build the pipeline with the config.

        Args:
            config_path: A string of the configuration path, which is
                a YAML file that specifies the structure and parameters of the
                pipeline.
        """
        configs = yaml.safe_load(open(config_path))
        self.init_from_config(configs)

    def init_from_config(self, configs: List):
        r"""Initialized the pipeline (ontology and processors) from the
        given configurations.

        Args:
            configs: The configs used to initialize the pipeline.
        """

        is_first: bool = True
        for component_config in configs:
            component = create_class_with_kwargs(
                class_name=component_config["type"],
                class_args=component_config.get("kwargs", {}),
            )

            if is_first:
                if not isinstance(component, BaseReader):
                    raise ProcessorConfigError(
                        "The first component of a pipeline must be a reader.")
                self.set_reader(component, component_config.get("configs", {}))
                is_first = False
            else:
                # Can be processor, caster, or evaluator
                self.add(component, component_config.get("configs", {}))

    def _dump_to_config(self):
        r"""Serialize the pipeline to an IR(intermediate representation).
        The returned IR can be passed to `init_from_config` to initialize
        a pipeline.

        Returns:
            dict: A dictionary storing IR.
        """
        configs: List[Dict] = []
        configs.append({
            "type":
            ".".join([self._reader.__module__,
                      type(self._reader).__name__]),
            "configs":
            self._reader_config.todict(),
        })
        for component, config in zip(self.components, self.component_configs):
            configs.append({
                "type":
                ".".join([component.__module__,
                          type(component).__name__]),
                "configs":
                config.todict(),
            })
        return configs

    def save(self, path: str):
        r"""Store the pipeline as an IR(intermediate representation) in yaml.
        The path can then be passed to ``init_from_config_path`` to initialize
        a pipeline. Note that calling ``init_from_config`` from a different
        Python environment may not work for some self-defined component classes
        because their module name is `__main__`.

        Args:
            path: The file path to save configurations.
        """
        with open(path, "w") as f:
            yaml.safe_dump(self._dump_to_config(), f)

    def _remote_service_app(self,
                            service_name: str = "",
                            input_format: str = "string"):
        r"""Return a FastAPI app that can be used to serve the pipeline.

        Args:
            service_name: Assign a name to the pipeline service for validation.
                This will appear in the `service_name` field on default page
                and can be queried and validated against the expected service
                name set by user. Default to `''`.
            input_format: Specify format of the input for validation. It can be
                `"string"` or `"DataPack"`. This will appear in the
                `input_format` field on default page and can be queried and
                validated against the expected input format set by user.
                Default to `"string"`.

        Returns:
            FastAPI: A FastAPI app for remote service.
        """
        # TODO: Currently we only support the `process` function, but it can
        # be extended by adding new interfaces that wrap up any Pipeline
        # method. Refer to https://fastapi.tiangolo.com for more info.
        app = FastAPI()
        records: Optional[Dict[str, Set[str]]] = None

        class RequestBody(BaseModel):
            args: str = "[]"
            kwargs: str = "{}"

        # pylint: disable=unused-variable
        @app.get("/")
        def default_page():
            return {
                "status": "OK",
                "service_name": service_name,
                "input_format": input_format,
                "pipeline": self._dump_to_config(),
            }

        @app.get("/records")
        def get_records():
            nonlocal records
            if records is None:
                # Collect records of each pipeline component for validation
                records = {}
                for component in [self._reader] + self.components:
                    component.record(records)
            return {"status": "OK", "records": records}

        @app.get("/expectation")
        def get_expectation():
            expectation: Dict[str, Set[str]] = {}
            if len(self.components) > 0:
                expectation = self.components[0].expected_types_and_attributes(
                )
            return {"status": "OK", "expectation": expectation}

        @app.post("/process")
        def run_pipeline(body: RequestBody):
            args = json.loads(body.args)
            kwargs = json.loads(body.kwargs)
            result = self.process(*args, **kwargs)
            return {"status": "OK", "result": result.serialize()}

        # pylint: enable=unused-variable

        return app

    def serve(
        self,
        host: str = "localhost",
        port: int = 8008,
        service_name: str = "",
        input_format: str = "string",
    ):
        r"""Start a service of the current pipeline at a specified host
        and port.

        Args:
            host: Host name of the pipeline service.
            port: Port number of the pipeline service.
            service_name: Assign a name to the pipeline service for validation.
                This will appear in the `service_name` field on default page
                and can be queried and validated against the expected service
                name set by user. Default to `''`.
            input_format: Specify format of the input for validation. It can be
                `"string"` or `"DataPack"`. This will appear in the
                `input_format` field on default page and can be queried and
                validated against the expected input format set by user.
                Default to `"string"`.
        """
        self.initialize()
        uvicorn.run(
            self._remote_service_app(service_name=service_name,
                                     input_format=input_format),
            host=host,
            port=port,
            log_level="info",
        )

    def set_profiling(self, enable_profiling: bool = True):
        r"""Set profiling option.

        Args:
            enable_profiling: A boolean of whether to enable profiling
                for the pipeline or not (the default is True).
        """
        self._enable_profiling = enable_profiling

    def initialize(self) -> "Pipeline":
        """
        This function should be called before the pipeline can be used to
        process the actual data. This function will call the `initialize` of
        all the components inside this pipeline.

        Returns:

        """
        # Create an `EntryTree` object `merged_entry_tree` to store the entry
        # tree parsed from the ontology specification file passed in as part of
        # the resource, and add the result to the resource under the key
        # `merged_entry_tree`.
        merged_entry_tree = EntryTree()
        if self.resource.get("onto_specs_path"):
            OntologyCodeGenerator().parse_schema_for_no_import_onto_specs_file(
                ontology_path=self.resource.get("onto_specs_path"),
                ontology_dict=self.resource.get("onto_specs_dict"),
                merged_entry_tree=merged_entry_tree,
            )
            self.resource.update(merged_entry_tree=merged_entry_tree)

        # The process manager need to be assigned first.
        self._proc_mgr = ProcessManager(len(self._components))

        if self._initialized:
            # The pipeline has already been initialized, so we are doing
            # re-initialization here.
            logging.info("Re-initializing the Pipeline.")

        # Reset the flags of the components before initializing them.
        self._reader.reset_flags()
        for c in self._components:
            c.reset_flags()

        # Handle the reader.
        if not self._reader.is_initialized:
            self._reader.initialize(self.resource, self._reader_config)
        else:
            logging.info(
                "The reader [%s] has already initialized, "
                "will skip its initialization.",
                self._reader.name,
            )

        if self._check_type_consistency:
            self.reader.enforce_consistency(enforce=True)
        else:
            self.reader.enforce_consistency(enforce=False)

        # Handle other components.
        self.initialize_components()
        self._initialized = True

        # Create profiler
        if self._enable_profiling:
            self.reader.set_profiling(True)
            self._profiler = [0.0] * len(self.components)

        # Check record types and attributes of each pipeline component
        if self._do_init_type_check:
            current_records: Dict[str, Set[str]] = {}
            self._reader.record(current_records)
            for component in self.components:
                if hasattr(component, "expected_types_and_attributes"):
                    record_types_and_attributes_check(
                        component.expected_types_and_attributes(
                        ),  # type: ignore
                        current_records,
                    )
                if hasattr(component, "record"):
                    component.record(current_records)  # type: ignore

        return self

    def initialize_components(self):
        """
        This function will initialize all the components in this pipeline,
        except the reader. The components are initialized in a FIFO manner
        based on the order of insertion.

        During initialization, the component will be configured based on its
        corresponding configuration. However, if the component is already
        initialized (for example, being initialized manually or used twice
        in the same pipeline), the new configuration will be ignored.

        The pipeline will check for type dependencies between the components
        inside this pipeline, see
        :func:`~forte.pipeline_component.PipelineComponent.enforce_consistency`
        for more details.

        """
        for component, config in zip(self.components, self.component_configs):
            try:
                if not component.is_initialized:
                    component.initialize(self.resource, config)
                else:
                    logging.info(
                        "The component [%s] has already initialized, "
                        "will skip its initialization.",
                        component.name,
                    )
            except ProcessorConfigError as e:
                logging.error(
                    "Exception occur when initializing "
                    "processor %s",
                    component.name,
                )
                raise e

            component.enforce_consistency(enforce=self._check_type_consistency)

    def set_reader(
        self,
        reader: BaseReader,
        config: Optional[Union[Config, Dict[str, Any]]] = None,
    ) -> "Pipeline":
        """
        Set the reader of the pipeline. A reader is the entry point of
        this pipeline: data flowing into the reader will be converted to the
        data pack format and passed on to the other components for
        processing.

        Args:
            reader: The reader to be used by the pipeline.
            config: The custom configuration to be passed to the reader. If
              the config is not provided, the default config defined by the
              reader class will be used.

        Returns:
            The pipeline itself, which allows you to directly chain other
            pipeline construction code afterwards, i.e., you can do:

            .. code-block:: python

                Pipeline().set_reader(your_reader()).add(your_processor())

        """
        self._reader = reader
        self._reader_config = reader.make_configs(config)
        return self

    @property
    def reader(self) -> BaseReader:
        return self._reader

    @property
    def components(self) -> List[PipelineComponent]:
        """
        Return all the components in this pipeline, except the reader.

        Returns: A list containing the components.

        """
        return self._components

    @property
    def component_configs(self) -> List[Optional[Config]]:
        """
        Return the configs related to the components, except the reader.

        Returns: A list containing the components configs.

        """
        return self._configs

    def add(
        self,
        component: PipelineComponent,
        config: Optional[Union[Config, Dict[str, Any]]] = None,
        selector: Optional[Selector] = None,
    ) -> "Pipeline":
        """
        Adds a pipeline component to the pipeline. The pipeline components
        will form a chain based on the insertion order. The customized
        `config` and `selector` (:class:`~forte.data.selector.Selector`)
        will be associated with this particular component. If the `config`
        or the `selector` is not provided, the default ones will be used.

        Here, note that the same component instance can be added multiple
        times to the pipeline. In such cases, the instance will only be
        set up at the first insertion (i.e. its `initialize` function will
        only be called once). Subsequent insertions of the same component
        instance will not change the behavior nor the states of the instance.
        Thus, a different `config` cannot be provided (it should be `None`)
        when the component is added again, otherwise a `ProcessorConfigError`
        will be thrown. If one wants the components to behave differently,
        a different instance should be used.

        Args:
            component (PipelineComponent): The component to be added next
              in the pipeline.
            config (Union[Config, Dict[str, Any]]): The custom configuration
              to be used for the added component. Default None, which means
              the `default_configs()` of the component will be used.
            selector (Selector): The selector used to pick the corresponding
              data pack to be consumed by the component. Default None, which
              means the whole pack will be used.

        Returns:
            The pipeline itself, which enables one to chain the creation of
            the pipeline, i.e., you can do:

            .. code-block:: python

                Pipeline().set_reader(your_reader()).add(
                    your_processor()).add(another_processor())
        """
        if isinstance(component, BaseReader):
            raise ProcessFlowException(
                "Reader need to be set via set_reader()")

        if isinstance(component, Evaluator):
            # This will ask the job to keep a copy of the gold standard.
            self.evaluator_indices.append(len(self.components))

        if component not in self.__component_set:
            # The case where the component is not found.
            self._components.append(component)
            self.__component_set.add(component)
            self.component_configs.append(component.make_configs(config))
        else:
            if config is None:
                self._components.append(component)
                # We insert a `None` value here just to make the config list
                # match the component list, but this config should not be
                # used.
                self.component_configs.append(None)
            else:
                raise ProcessorConfigError(
                    f"The same instance of a component named {component.name} "
                    f" has already been added to"
                    f" the pipeline, we do not accept a different configuration"
                    f" for it. If you would like to use a differently"
                    f" configured component, please create another instance."
                    f" If you intend to re-use the component instance, please"
                    f" do not provide the `config` (or provide a `None`).")

        if selector is None:
            self._selectors.append(self.__default_selector)
        else:
            self._selectors.append(selector)

        return self

    def add_gold_packs(self, pack):
        r"""Add gold packs to a internal dictionary used for evaluation.
        This dictionary is used by the evaluator while calling
        `consume_next(...)`

        Args:
            pack (Dict): A key, value pair containing job.id -> gold_pack
                mapping
        """
        self._predict_to_gold.update(pack)

    def process(self, *args, **kwargs) -> PackType:
        r"""Alias for :meth:`process_one`.

        Args:
            args: The positional arguments used to get the initial data.
            kwargs: The keyword arguments used to get the initial data.
        """
        return self.process_one(*args, **kwargs)

    def run(self, *args, **kwargs):
        r"""Run the whole pipeline and ignore all returned DataPack. This is
        mostly used when you need to run the pipeline and do not require the
        output but rely on the side-effect. For example, if the pipeline
        writes some data to disk.

        Calling this function will automatically call the :meth:`initialize`
        at the beginning, and call the :meth:`finish` at the end.

        Args:
            args: The positional arguments used to get the initial data.
            kwargs: The keyword arguments used to get the initial data.
        """
        self.initialize()
        for _ in self.process_dataset(*args, **kwargs):
            # Process the whole dataset ignoring the return values.
            # This essentially expect the processors have side effects.
            pass
        self.finish()

    def process_one(self, *args, **kwargs) -> PackType:
        r"""Process one single data pack. This is done by only reading and
        processing the first pack in the reader.

        Args:
            kwargs: the information needed to load the data. For example, if
                :attr:`_reader` is :class:`StringReader`, this should contain a
                single piece of text in the form of a string variable. If
                :attr:`_reader` is a file reader, this can point to the file
                path.
        """
        if not self._initialized:
            raise ProcessFlowException(
                "Please call initialize before running the pipeline")

        first_pack = []

        for p in self._reader.iter(*args, **kwargs):
            first_pack.append(p)
            break

        if len(first_pack) == 1:
            results = list(self._process_packs(iter(first_pack)))
            return results[0]
        else:
            raise ValueError("Input data source contains no packs.")

    def process_dataset(self, *args, **kwargs) -> Iterator[PackType]:
        r"""Process the documents in the data source(s) and return an
        iterator or list of DataPacks. The arguments are directly passed
        to the reader to take data from the source.
        """
        if not self._initialized:
            raise ProcessFlowException(
                "Please call initialize before running the pipeline")

        data_iter = self._reader.iter(*args, **kwargs)
        return self._process_packs(data_iter)

    def finish(self):
        """
        Call the finish method of all pipeline components. This needs to be
        called explicitly to release all resources.
        """

        # Report time profiling of readers and processors
        if self._enable_profiling:
            out_header: str = "Pipeline Time Profile\n"
            out_reader: str = (f"- Reader: {self.reader.component_name}, " +
                               f"{self.reader.time_profile} s\n")
            out_processor: str = "\n".join([
                f"- Component [{i}]: {self.components[i].name}, {t} s"
                for i, t in enumerate(self._profiler)
            ])
            logger.info("%s%s%s", out_header, out_reader, out_processor)

        self.reader.finish(self.resource)
        for p in self.components:
            p.finish(self.resource)
        self._initialized = False

    def __update_stream_job_status(self):
        q_index = self._proc_mgr.current_queue_index
        u_index = self._proc_mgr.unprocessed_queue_indices[q_index]
        current_queue = self._proc_mgr.current_queue

        for job_i in itertools.islice(current_queue, 0, u_index + 1):
            if job_i.status == ProcessJobStatus.UNPROCESSED:
                job_i.set_status(ProcessJobStatus.PROCESSED)

    def __update_batch_job_status(self, component: BaseBatchProcessor):
        # update the status of the jobs. The jobs which were removed from
        # data_pack_pool will have status "PROCESSED" else they are "QUEUED"
        q_index = self._proc_mgr.current_queue_index
        u_index = self._proc_mgr.unprocessed_queue_indices[q_index]
        current_queue = self._proc_mgr.current_queue

        data_pool_length = len(component.batcher.data_pack_pool)

        for i, job_i in enumerate(
                itertools.islice(current_queue, 0, u_index + 1)):
            if i <= u_index - data_pool_length:
                job_i.set_status(ProcessJobStatus.PROCESSED)
            else:
                job_i.set_status(ProcessJobStatus.QUEUED)

    def __flush_batch_job_status(self):
        current_queue = self._proc_mgr.current_queue
        for job in current_queue:
            job.set_status(ProcessJobStatus.PROCESSED)

    def _process_with_component(
        self,
        selector: Selector,
        component: PipelineComponent,
        raw_job: ProcessJob,
    ):
        for pack in selector.select(raw_job.pack):
            # First, perform the component action on the pack
            try:
                if isinstance(component, Caster):
                    # Replacing the job pack with the casted version.
                    raw_job.alter_pack(component.cast(pack))
                elif isinstance(component, BaseBatchProcessor):
                    pack.set_control_component(component.name)
                    component.process(pack)
                elif isinstance(component, Evaluator):
                    pack.set_control_component(component.name)
                    component.consume_next(pack,
                                           self._predict_to_gold[raw_job.id])
                elif isinstance(component, BaseProcessor):
                    # Should be BasePackProcessor:
                    # All other processor are considered to be
                    # streaming processor like this.
                    pack.set_control_component(component.name)
                    component.process(pack)
                # After the component action, make sure the entry is
                # added into the index.
                pack.add_all_remaining_entries()
            except ValueError as e:
                raise ProcessExecutionException(
                    f"Exception occurred when running "
                    f"{component.name}") from e

    def _process_packs(self,
                       data_iter: Iterator[PackType]) -> Iterator[PackType]:
        r"""Process the packs received from the reader by the running through
        the pipeline.

        Args:
             data_iter (iterator): Iterator yielding jobs that contain packs

        Returns:
            Yields packs that are processed by the pipeline.
        """

        # pylint: disable=line-too-long

        # Here is the logic for the execution of the pipeline.

        # The basic idea is to yield a pack as soon as it gets processed by all
        # the processors instead of waiting for later jobs to get processed.

        # 1) A job can be in one of three statuses:
        #  - UNPROCESSED
        #  - QUEUED
        #  - PROCESSED
        #
        # 2) Each processor maintains a queue to hold the jobs to be executed
        # next.
        #
        # 3) In case of a BatchProcessor, a job enters into QUEUED status if the
        # batch is not full according to the batcher of that processor.
        # In that case, the pipeline requests additional jobs from the
        # reader and starts the execution loop from the beginning.
        #
        # 4) At any point, while moving to the next processor, the pipeline
        # ensures that all jobs are either in QUEUED or PROCESSED status. If
        # they are PROCESSED, they will be moved to the next queue. This design
        # ensures that at any point, while processing the job at processor `i`,
        # all the jobs in the previous queues are in QUEUED status. So whenever
        # a new job is needed, the pipeline can directly request it from the
        # reader instead of looking at previous queues for UNPROCESSED jobs.
        #
        # 5) When a processor receives a poison pack, it flushes all the
        # remaining batches in its memory (this actually has no effect in
        # PackProcessors) and moves the jobs including the poison pack to the
        # next queue. If there is no next processor, the packs are yielded.
        #
        # 6) The loop terminates when the last queue contains only a poison pack
        #
        # Here is the sample pipeline and its execution
        #
        # Assume 1 pack corresponds to a batch of size 1
        #
        # After 1st step (iteration), reading from the reader,
        #
        #            batch_size = 2                               batch_size = 2
        #  Reader -> B1 (BatchProcessor) -> P1 (PackProcessor) -> B2(BatchProcessor)
        #
        #          |___________|
        #          |___________|
        #          |___________|
        #          |___________|
        #          |_J1:QUEUED_|
        #
        # B1 needs another pack to process job J1
        #
        # After 2nd step (iteration),
        #
        #           batch_size = 2                               batch_size = 2
        # Reader -> B1 (BatchProcessor) -> P1 (PackProcessor) -> B2(BatchProcessor)
        #
        #          |___________|       |_______________|
        #          |___________|       |_______________|
        #          |___________|       |_______________|
        #          |___________|       |_J2:UNPROCESSED_|
        #          |___________|       |_J1:UNPROCESSED_|
        #
        # B1 processes both the packs, the jobs are moved to the next queue.
        #
        # After 3rd step (iteration),
        #
        #           batch_size = 2                               batch_size = 2
        # Reader -> B1 (BatchProcessor) -> P1 (PackProcessor) -> B2(BatchProcessor)
        #
        #          |___________|       |_______________|     |_______________|
        #          |___________|       |_______________|     |_______________|
        #          |___________|       |_______________|     |_______________|
        #          |___________|       |_______________|     |_______________|
        #          |___________|       |_J2:UNPROCESSED_|     |_J1:UNPROCESSED_|
        #
        # P1 processes the first job. However, there exists one UNPROCESSED job
        # J2 in the queue. Pipeline first processes this job before moving to the
        # next processor
        #
        # After 4th step (iteration),
        #
        #           batch_size = 2                               batch_size = 2
        # Reader -> B1 (BatchProcessor) -> P1 (PackProcessor) -> B2(BatchProcessor)
        #
        #        |___________|       |_______________|     |_______________|
        #        |___________|       |_______________|     |_______________|
        #        |___________|       |_______________|     |_______________|
        #        |___________|       |_______________|     |_J2:UNPROCESSED_|
        #        |___________|       |_______________|     |_J1:UNPROCESSED_|
        #
        #
        # After 5th step (iteration),
        #
        #           batch_size = 2                               batch_size = 2
        # Reader -> B1 (BatchProcessor) -> P1 (PackProcessor) -> B2(BatchProcessor)
        #
        #        |___________|       |_______________|     |_______________|
        #        |___________|       |_______________|     |_______________|
        #        |___________|       |_______________|     |_______________|    --> Yield J1.pack and J2.pack
        #        |___________|       |_______________|     |_______________|
        #        |___________|       |_______________|     |_______________|

        if not self._initialized:
            raise ProcessFlowException(
                "Please call initialize before running the pipeline")

        buffer = ProcessBuffer(self, data_iter)

        if len(self.components) == 0:
            yield from data_iter
            # Write return here instead of using if..else to reduce indent.
            return

        while not self._proc_mgr.exhausted():
            # Take the raw job from the buffer, the job status now should
            # be UNPROCESSED.
            raw_job: ProcessJob = next(buffer)

            component_index = self._proc_mgr.current_processor_index
            component = self.components[component_index]
            selector: Selector = self._selectors[component_index]
            current_queue_index = self._proc_mgr.current_queue_index
            current_queue: Deque[ProcessJob] = self._proc_mgr.current_queue
            pipeline_length = self._proc_mgr.pipeline_length
            unprocessed_queue_indices = self._proc_mgr.unprocessed_queue_indices
            processed_queue_indices = self._proc_mgr.processed_queue_indices
            next_queue_index = current_queue_index + 1
            should_yield = next_queue_index >= pipeline_length

            if not raw_job.is_poison:
                # Start timer
                if self._enable_profiling:
                    start_time: float = time()

                self._process_with_component(selector, component, raw_job)

                # Stop timer and add to time profiler
                if self._enable_profiling:
                    self._profiler[component_index] += time() - start_time

                # Then, based on component type, handle the queue.
                if isinstance(component, BaseBatchProcessor):
                    self.__update_batch_job_status(component)
                    index = unprocessed_queue_indices[current_queue_index]

                    # Check status of all the jobs up to "index".
                    for i, job_i in enumerate(
                            itertools.islice(current_queue, 0, index + 1)):
                        if job_i.status == ProcessJobStatus.PROCESSED:
                            processed_queue_indices[current_queue_index] = i

                    # There are UNPROCESSED jobs in the queue.
                    if index < len(current_queue) - 1:
                        unprocessed_queue_indices[current_queue_index] += 1
                    elif processed_queue_indices[current_queue_index] == -1:
                        # Fetch more data from the reader to process the
                        # first job.
                        unprocessed_queue_indices[current_queue_index] = len(
                            current_queue)
                        self._proc_mgr.current_processor_index = 0
                        self._proc_mgr.current_queue_index = -1
                    else:
                        processed_queue_index = processed_queue_indices[
                            current_queue_index]
                        # Move or yield the pack.
                        c_queue = list(current_queue)
                        for job_i in c_queue[:processed_queue_index + 1]:
                            if job_i.status == ProcessJobStatus.PROCESSED:
                                if should_yield:
                                    if job_i.id in self._predict_to_gold:
                                        self._predict_to_gold.pop(job_i.id)
                                    # TODO: I don't know why these are
                                    #  marked as incompatible type by mypy.
                                    #  the same happens 3 times on every yield.
                                    #  It is observed that the pack returned
                                    #  from the `ProcessJob` is considered to
                                    #  be different from `PackType`.
                                    yield job_i.pack  # type: ignore
                                else:
                                    self._proc_mgr.add_to_queue(
                                        queue_index=next_queue_index,
                                        job=job_i)
                            else:
                                raise ProcessFlowException(
                                    f"The job status should be "
                                    f"{ProcessJobStatus.PROCESSED} "
                                    f"at this point.")
                            current_queue.popleft()

                        # Set the UNPROCESSED and PROCESSED indices.
                        unprocessed_queue_indices[current_queue_index] = len(
                            current_queue)

                        processed_queue_indices[current_queue_index] = -1

                        if should_yield:
                            self._proc_mgr.current_processor_index = 0
                            self._proc_mgr.current_queue_index = -1
                        else:
                            self._proc_mgr.current_processor_index = (
                                next_queue_index)
                            self._proc_mgr.current_queue_index = (
                                next_queue_index)
                # Besides batch processors, the other component types deal
                # with one pack at a time; these include PackProcessor,
                # Evaluator, and Caster.
                # - Move the job to the next queue.
                else:
                    self.__update_stream_job_status()
                    index = unprocessed_queue_indices[current_queue_index]

                    # there are UNPROCESSED jobs in the queue
                    if index < len(current_queue) - 1:
                        unprocessed_queue_indices[current_queue_index] += 1
                    else:
                        # current_queue is modified within this loop
                        for job_i in list(current_queue):
                            if job_i.status == ProcessJobStatus.PROCESSED:
                                if should_yield:
                                    if job_i.id in self._predict_to_gold:
                                        self._predict_to_gold.pop(job_i.id)
                                    yield job_i.pack  # type: ignore
                                else:
                                    self._proc_mgr.add_to_queue(
                                        queue_index=next_queue_index,
                                        job=job_i)
                                current_queue.popleft()
                            else:
                                raise ProcessFlowException(
                                    f"The job status should be "
                                    f"{ProcessJobStatus.PROCESSED} "
                                    f"at this point.")

                        # set the UNPROCESSED index
                        # we do not use "processed_queue_indices" as the
                        # jobs get PROCESSED whenever they are passed
                        # into a PackProcessor
                        unprocessed_queue_indices[current_queue_index] = len(
                            current_queue)

                        # update the current queue and processor only
                        # when all the jobs are processed in the current
                        # queue
                        if should_yield:
                            self._proc_mgr.current_processor_index = 0
                            self._proc_mgr.current_queue_index = -1

                        else:
                            self._proc_mgr.current_processor_index = (
                                next_queue_index)
                            self._proc_mgr.current_queue_index = (
                                next_queue_index)
            else:
                component.flush()
                self.__flush_batch_job_status()

                # current queue is modified in the loop
                for job in list(current_queue):
                    if (job.status != ProcessJobStatus.PROCESSED
                            and not job.is_poison):
                        raise ValueError("Job is neither PROCESSED nor is "
                                         "a poison. Something went wrong "
                                         "during execution.")

                    if not job.is_poison and should_yield:
                        if job.id in self._predict_to_gold:
                            self._predict_to_gold.pop(job.id)
                        yield job.pack  # type: ignore

                    elif not should_yield:
                        self._proc_mgr.add_to_queue(
                            queue_index=next_queue_index, job=job)

                    if not job.is_poison:
                        current_queue.popleft()

                if not should_yield:
                    # set next processor and queue as current
                    self._proc_mgr.current_processor_index = next_queue_index
                    self._proc_mgr.current_queue_index = next_queue_index

        self._proc_mgr.reset()

    def evaluate(self) -> Iterator[Tuple[str, Any]]:
        """
        Call the evaluators in the pipeline to collect their results.

        Returns:
            Iterator of the evaluator results. Each element is a tuple, where
            the first one is the name of the evaluator, and the second one
            is the output of the evaluator (see
            :func:`~forte.evaluation.base.evaluator.get_result`).
        """
        for i in self.evaluator_indices:
            p = self.components[i]
            assert isinstance(p, Evaluator)
            yield p.name, p.get_result()
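
The queue bookkeeping traced in the comments above is what drives the pack generator, and `evaluate()` simply walks the registered evaluators afterwards. Below is a minimal usage sketch; the reader, processor, and evaluator names (`MyReader`, `MyPredictor`, `MyEvaluator`) are placeholders rather than classes from this document, and it assumes evaluators are attached with the same `add_processor` call used in the later examples.

from forte.pipeline import Pipeline

pipe = Pipeline()
pipe.set_reader(MyReader())                  # placeholder reader
pipe.add_processor(processor=MyPredictor())  # placeholder processor
pipe.add_processor(processor=MyEvaluator())  # placeholder evaluator (assumed
                                             # to be added like a processor)
pipe.initialize()

# Consume the generator shown above: packs are yielded only after they have
# passed through every queue in the pipeline.
for pack in pipe.process_dataset():
    pass

# Collect the (name, result) tuples yielded by evaluate().
for name, result in pipe.evaluate():
    print(name, result)
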
Example #22
    def __init__(
        self,
        resource: Optional[Resources] = None,
        ontology_file: Optional[str] = None,
        enforce_consistency: bool = False,
        do_init_type_check: bool = False,
    ):
        r"""

        Args:
            resource: The ``Resources`` object, which is a global registry used
                in the pipeline. Objects defined as ``Resources`` will be
                passed on to the processors in the
                pipeline for initialization.
            ontology_file: The path to the input ontology specification file,
                which should be a JSON file containing all the entries inline,
                without using `import` as a key.
            enforce_consistency: This boolean determines whether the pipeline
                will check the content expectations specified in each pipeline
                component. Each component checks whether the input pack
                contains the expected data by inspecting its meta-data, and
                raises a
                :class:`~forte.common.exception.ExpectedEntryNotFound` if the
                check fails. When this flag is ``True``, every pipeline
                component that implements ``expected_types_and_attributes``
                checks whether the record of the input datapack matches the
                expected types and attributes. For example, if processor A
                requires the entry type ``ft.onto.base_ontology.Sentence`` and
                processor B produces this type in its output datapack, then
                the ``record`` function of processor B writes this type into
                the datapack record and processor A adds the type in
                ``expected_types_and_attributes``. When the pipeline runs with
                `enforce_consistency=True`, processor A checks that this type
                exists in the record of the output of the previous pipeline
                component.
            do_init_type_check: Determines whether to check record types and
                attributes during pipeline initialization. Defaults to
                `False`. If set to `True`, each component in the pipeline is
                validated by comparing its ``expected_types_and_attributes``
                with the accumulated ``records`` from all the downstream
                components.
        """
        self._reader: BaseReader
        self._reader_config: Optional[Config] = None

        # These variables define the units in the pipeline; they should all
        # have the same length.
        self._components: List[PipelineComponent] = []
        self._selectors: List[Selector] = []
        self._configs: List[Optional[Config]] = []

        # Maintain a set of the pipeline components for quickly checking
        # whether a component has already been added.
        self.__component_set: Set[PipelineComponent] = set()

        # Will be initialized in `initialize` because the number of
        # processors is not known yet.
        self._proc_mgr: ProcessManager = None  # type: ignore

        self.evaluator_indices: List[int] = []

        # needed for evaluator
        self._predict_to_gold: Dict[int, PackType] = {}

        if resource is None:
            self.resource = Resources()
        else:
            self.resource = resource

        if ontology_file is None:
            with resources.path("forte.ontology_specs",
                                "base_ontology.json") as data_path:
                ontology_file = str(data_path)

        if ontology_file is not None:
            with open(ontology_file, "r") as f:
                spec_dict = json.load(f)
                self.resource.update(onto_specs_path=ontology_file)
                self.resource.update(onto_specs_dict=spec_dict)

        # The flag indicating whether this pipeline is initialized.
        self._initialized: bool = False
        # The flag indicating whether we want to enforce type consistency
        #  between the processors.
        self._check_type_consistency: bool = False

        # Create one copy of the dummy selector to reduce class creation.
        self.__default_selector: Selector = DummySelector()

        # needed for time profiling of pipeline
        self._enable_profiling: bool = False
        self._profiler: List[float] = []

        self._check_type_consistency = enforce_consistency

        # Indicates whether to do type checking during pipeline initialization.
        self._do_init_type_check: bool = do_init_type_check
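
A minimal construction sketch based on the arguments documented above; the ontology path and the objects placed into `Resources` are illustrative assumptions, and the import paths are the usual `forte` ones.

from forte.common.resources import Resources
from forte.pipeline import Pipeline

# A shared registry whose contents are handed to processors at initialize().
resource = Resources()
resource.update(my_lookup_table={})  # illustrative entry, not from this document

pipe = Pipeline(
    resource=resource,
    ontology_file="my_ontology_spec.json",  # assumed path; omit to fall back
                                            # to the bundled base_ontology.json
    enforce_consistency=True,   # components verify records at process time
    do_init_type_check=True,    # expectations also checked at initialize()
)
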
Example #23
 def setUp(self):
     self.kwargs = {"1": "one", "dummy": DummyObject(1, 2)}
     self.resources = Resources(**self.kwargs)
     self.output_dir = tempfile.mkdtemp()
Example #24
def main(nif_context: str, nif_page_structure: str, mapping_literals: str,
         mapping_objects: str, nif_text_links: str, redirects: str,
         info_boxs_properties: str, base_output_path: str):
    # The datasets are read in a few steps.
    # 0. Load redirects between wikipedia pages.
    print_progress('Loading redirects', '\n')
    redirect_pickle = os.path.join(base_output_path, 'redirects.pickle')

    redirect_map: Dict[str, str]
    if os.path.exists(redirect_pickle):
        with open(redirect_pickle, 'rb') as pickle_f:
            redirect_map = pickle.load(pickle_f)
    else:
        redirect_map = load_redirects(redirects)
        with open(redirect_pickle, 'wb') as pickle_f:
            pickle.dump(redirect_map, pickle_f)

    resources: Resources = Resources()
    resources.update(redirects=redirect_map)
    print_progress("Done loading.", '\n')

    # 1. Read the wiki text.
    raw_pack_dir = os.path.join(base_output_path, 'nif_raw')
    read_wiki_text(nif_context, raw_pack_dir, resources, True)
    print_progress("Done reading wikipedia text.", '\n')

    # 2. Add wiki page structures, create a new directory for it.
    struct_dir = raw_pack_dir + '_struct'
    add_wiki_info(WikiStructReader(), resources, nif_page_structure,
                  raw_pack_dir, struct_dir, 'page_structures', True)
    print_progress("Done reading wikipedia structures.", '\n')

    # 3. Add wiki links, create a new directory for it.
    link_dir = struct_dir + '_links'
    add_wiki_info(WikiAnchorReader(), resources, nif_text_links, struct_dir,
                  link_dir, 'anchor_links', True)
    print_progress("Done reading wikipedia anchors.", '\n')

    # 4. The following steps add info boxes:
    # 4.1 Add un-mapped info-box properties; write directly to the previous
    # directory.
    property_dir = link_dir
    add_wiki_info(WikiPropertyReader(),
                  resources,
                  info_boxs_properties,
                  link_dir,
                  property_dir,
                  'info_box_properties',
                  skip_existing=True,
                  overwrite=True,
                  output_index_file_name='properties.idx')
    print_progress("Done reading wikipedia info-boxes properties.", '\n')

    # 4.2 Add mapped literals; write directly to the previous directory.
    literal_dir = property_dir
    add_wiki_info(WikiInfoBoxReader(),
                  resources,
                  mapping_literals,
                  property_dir,
                  literal_dir,
                  'literals',
                  skip_existing=True,
                  overwrite=True,
                  output_index_file_name='literals.idx')
    print_progress("Done reading wikipedia info-boxes literals.", '\n')

    # 4.3 Add mapped objects; write directly to the previous directory.
    mapping_dir = literal_dir
    add_wiki_info(WikiInfoBoxReader(),
                  resources,
                  mapping_objects,
                  literal_dir,
                  mapping_dir,
                  'objects',
                  skip_existing=True,
                  overwrite=True,
                  output_index_file_name='objects.idx')
    print_progress("Done reading wikipedia info-boxes objects.", '\n')
Example #25
def main(
    nif_context: str,
    nif_page_structure: str,
    mapping_literals: str,
    mapping_objects: str,
    nif_text_links: str,
    redirects: str,
    info_boxs_properties: str,
    categories: str,
    base_output_path: str,
    resume_existing: bool,
):
    # Whether to skip the whole step.
    skip_existing = not resume_existing

    # The datasets are read in a few steps.
    # 0. Load redirects between wikipedia pages.
    print_progress("Loading redirects", "\n")

    redirect_map: Dict[str, str] = cache_redirects(base_output_path, redirects)

    resources: Resources = Resources()
    resources.update(redirects=redirect_map)
    print_progress("Done loading.", "\n")

    # 1. Read the wiki text.
    raw_pack_dir = os.path.join(base_output_path, "nif_raw")
    read_wiki_text(nif_context, raw_pack_dir, resources, True)
    print_progress("Done reading wikipedia text.", "\n")

    # Use the same index structure for all writers.
    main_index = os.path.join(raw_pack_dir, "article.idx")

    # 2. Add wiki page structures, create a new directory for it.
    struct_dir = raw_pack_dir + "_struct"
    add_wiki_info(
        WikiStructReader(),
        resources,
        nif_page_structure,
        raw_pack_dir,
        struct_dir,
        "page_structures",
        use_input_index=True,
        skip_existing=skip_existing,
        resume_from_last=resume_existing,
        input_index_file_path=main_index,
    )
    print_progress("Done reading wikipedia structures.", "\n")

    # 3. Add wiki links, create a new directory for it.
    link_dir = struct_dir + "_links"
    add_wiki_info(
        WikiAnchorReader(),
        resources,
        nif_text_links,
        struct_dir,
        link_dir,
        "anchor_links",
        use_input_index=True,
        skip_existing=True,
        resume_from_last=resume_existing,
        input_index_file_path=main_index,
    )
    print_progress("Done reading wikipedia anchors.", "\n")

    # 4. The following steps add info boxes:
    # 4.1 Add un-mapped info-box properties; write directly to the previous
    # directory.
    property_dir = link_dir
    add_wiki_info(
        WikiPropertyReader(),
        resources,
        info_boxs_properties,
        link_dir,
        property_dir,
        "info_box_properties",
        use_input_index=True,
        skip_existing=True,
        resume_from_last=resume_existing,
        output_index_file_name="properties.idx",
        input_index_file_path=main_index,
    )
    print_progress("Done reading wikipedia info-boxes properties.", "\n")

    # 4.2 Add mapped literals; write directly to the previous directory.
    literal_dir = property_dir
    add_wiki_info(
        WikiInfoBoxReader(),
        resources,
        mapping_literals,
        property_dir,
        literal_dir,
        "literals",
        use_input_index=True,
        skip_existing=True,
        resume_from_last=resume_existing,
        output_index_file_name="literals.idx",
        input_index_file_path=main_index,
    )
    print_progress("Done reading wikipedia info-boxes literals.", "\n")

    # 4.3 Add mapped objects; write directly to the previous directory.
    mapping_dir = literal_dir
    add_wiki_info(
        WikiInfoBoxReader(),
        resources,
        mapping_objects,
        literal_dir,
        mapping_dir,
        "objects",
        use_input_index=True,
        skip_existing=True,
        resume_from_last=resume_existing,
        output_index_file_name="objects.idx",
        input_index_file_path=main_index,
    )
    print_progress("Done reading wikipedia info-boxes objects.", "\n")

    # 4.4 Add categories; write directly to the previous directory.
    category_dir = mapping_dir
    add_wiki_info(
        WikiCategoryReader(),
        resources,
        categories,
        mapping_dir,
        category_dir,
        "categories",
        use_input_index=True,
        skip_existing=True,
        resume_from_last=resume_existing,
        output_index_file_name="categories.idx",
        input_index_file_path=main_index,
    )
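
For completeness, a hedged sketch of a command-line entry point for this `main`; the argument names simply mirror the function parameters, and the real project may wire this up differently.

import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Read wikipedia NIF dumps into DataPacks.")
    for name in ("nif_context", "nif_page_structure", "mapping_literals",
                 "mapping_objects", "nif_text_links", "redirects",
                 "info_boxs_properties", "categories", "base_output_path"):
        parser.add_argument(name)
    parser.add_argument("--resume-existing", action="store_true",
                        help="Resume partially finished steps instead of "
                             "skipping steps whose output already exists.")
    args = parser.parse_args()
    main(args.nif_context, args.nif_page_structure, args.mapping_literals,
         args.mapping_objects, args.nif_text_links, args.redirects,
         args.info_boxs_properties, args.categories, args.base_output_path,
         args.resume_existing)
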
Example #26
def main():
    with open("config.yml", "r") as config_file:
        config = yaml.safe_load(config_file)
    config = HParams(config, default_hparams=None)

    resource = Resources()
    query_pipeline = Pipeline(resource=resource)
    query_pipeline.set_reader(reader=MultiPackTerminalReader(),
                              config=config.reader)

    query_pipeline.add_processor(processor=MicrosoftBingTranslator(),
                                 config=config.translator)
    query_pipeline.add_processor(processor=BertBasedQueryCreator(),
                                 config=config.query_creator)
    query_pipeline.add_processor(processor=SearchProcessor(),
                                 config=config.indexer)
    query_pipeline.add_processor(
        processor=NLTKSentenceSegmenter(),
        selector=NameMatchSelector(
            select_name=config.indexer.response_pack_name[0]))
    query_pipeline.add_processor(
        processor=NLTKWordTokenizer(),
        selector=NameMatchSelector(
            select_name=config.indexer.response_pack_name[0]))
    query_pipeline.add_processor(
        processor=NLTKPOSTagger(),
        selector=NameMatchSelector(
            select_name=config.indexer.response_pack_name[0]))
    query_pipeline.add_processor(
        processor=SRLPredictor(),
        config=config.SRL,
        selector=NameMatchSelector(
            select_name=config.indexer.response_pack_name[0]))
    query_pipeline.add_processor(processor=MicrosoftBingTranslator(),
                                 config=config.back_translator)

    query_pipeline.initialize()

    for m_pack in query_pipeline.process_dataset():

        # update resource to be used in the next conversation
        query_pack = m_pack.get_pack(config.translator.in_pack_name)
        if resource.get("user_utterance"):
            resource.get("user_utterance").append(query_pack)
        else:
            resource.update(user_utterance=[query_pack])

        response_pack = m_pack.get_pack(config.back_translator.in_pack_name)

        if resource.get("bot_utterance"):
            resource.get("bot_utterance").append(response_pack)
        else:
            resource.update(bot_utterance=[response_pack])

        english_pack = m_pack.get_pack("pack")
        print(colored("English Translation of the query: ", "green"),
              english_pack.text, "\n")
        pack = m_pack.get_pack(config.indexer.response_pack_name[0])
        print(colored("Retrieved Document", "green"), pack.text, "\n")
        print(colored("German Translation", "green"),
              m_pack.get_pack("response").text, "\n")
        for sentence in pack.get(Sentence):
            sent_text = sentence.text
            print(colored("Sentence:", 'red'), sent_text, "\n")

            print(colored("Semantic role labels:", 'red'))
            for link in pack.get(PredicateLink, sentence):
                parent = link.get_parent()
                child = link.get_child()
                print(f"  - \"{child.text}\" is role {link.arg_type} of "
                      f"predicate \"{parent.text}\"")
            print()

            input(colored("Press ENTER to continue...\n", 'green'))
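
The two `resource.get(...)` / `resource.update(...)` blocks in the loop above follow the same append-or-create pattern; a small helper sketch using only the `Resources` calls already shown (the helper name is an assumption, not part of this document):

def append_history(resource, key, pack):
    # Append the pack to the list stored under `key`, creating the list on
    # the first conversation turn.
    history = resource.get(key)
    if history:
        history.append(pack)
    else:
        resource.update(**{key: [pack]})


# Inside the conversation loop above, the two branches become:
#     append_history(resource, "user_utterance", query_pack)
#     append_history(resource, "bot_utterance", response_pack)
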
Example #27
 def __init__(self):
     self.resources: Resources = Resources()
     self.configs: Config = Config({}, {})
     self._check_type_consistency = False
Example #28
def main():
    with open("config.yml", "r") as config_file:
        config = yaml.safe_load(config_file)
    config = HParams(config, default_hparams=None)

    if not os.path.exists(config.indexer.model_dir):
        print(f"Creating a new index...")
        encoder = BERTEncoder(pretrained_model_name="bert-base-uncased")
        encoder.to(device)

        feature_original_types = {
            "id": ["int64", "FixedLenFeature"],
            "input_ids": ["int64", "FixedLenFeature",
                          config.indexer.max_seq_length],
            "segment_ids": ["int64", "FixedLenFeature",
                            config.indexer.max_seq_length],
            "text": ["str", "FixedLenFeature"]
        }

        hparam = {
            "allow_smaller_final_batch": True,
            "batch_size": config.indexer.batch_size,
            "dataset": {
                "data_name": "data",
                "feature_original_types": feature_original_types,
                "files": config.indexer.pickle_data_dir
            },
            "shuffle": False
        }

        print(f"Embedding the text using BERTEncoder...")
        record_data = RecordData(hparams=hparam, device=device)
        data_iterator = DataIterator(record_data)
        index = EmbeddingBasedIndexer(hparams={
            "index_type": "GpuIndexFlatIP",
            "dim": 768,
            "device": "gpu0"
        })

        for idx, batch in enumerate(data_iterator):
            ids = batch["id"]
            input_ids = batch["input_ids"]
            segment_ids = batch["segment_ids"]
            text = batch["text"]
            _, pooled_output = get_embeddings(encoder, input_ids, segment_ids)
            index.add(vectors=pooled_output,
                      meta_data={k.item(): v for k, v in zip(ids, text)})

            if (idx + 1) % 50 == 0:
                print(f"Completed {idx+1} batches of size "
                      f"{config.indexer.batch_size}")

        index.save(path=config.indexer.model_dir)

    resource = Resources()
    query_pipeline = Pipeline(resource=resource)
    query_pipeline.set_reader(MultiPackTerminalReader())

    query_pipeline.add_processor(
        processor=MachineTranslationProcessor(), config=config.translator)
    query_pipeline.add_processor(
        processor=QueryCreator(), config=config.query_creator)
    query_pipeline.add_processor(
        processor=SearchProcessor(), config=config.indexer)
    query_pipeline.add_processor(
        processor=NLTKSentenceSegmenter(),
        selector=NameMatchSelector(select_name="doc_0"))
    query_pipeline.add_processor(
        processor=NLTKWordTokenizer(),
        selector=NameMatchSelector(select_name="doc_0"))
    query_pipeline.add_processor(
        processor=NLTKPOSTagger(),
        selector=NameMatchSelector(select_name="doc_0"))
    query_pipeline.add_processor(
        processor=SRLPredictor(), config=config.SRL,
        selector=NameMatchSelector(select_name="doc_0"))
    # query_pipeline.add_processor(
    #    processor=CoNLLNERPredictor(), config=config.NER,
    #    selector=NameMatchSelector(select_name="doc_0"))
    query_pipeline.add_processor(
        processor=MachineTranslationProcessor(), config=config.back_translator)

    query_pipeline.initialize()

    for m_pack in query_pipeline.process_dataset():

        # update resource to be used in the next conversation
        query_pack = m_pack.get_pack("query")
        if resource.get("user_utterance"):
            resource.get("user_utterance").append(query_pack)
        else:
            resource.update(user_utterance=[query_pack])

        response_pack = m_pack.get_pack("response")

        if resource.get("bot_utterance"):
            resource.get("bot_utterance").append(response_pack)
        else:
            resource.update(bot_utterance=[response_pack])

        english_pack = m_pack.get_pack("pack")
        print(colored("English Translation of the query: ", "green"),
              english_pack.text, "\n")
        pack = m_pack.get_pack("doc_0")
        print(colored("Retrieved Document", "green"), pack.text, "\n")
        print(colored("German Translation", "green"),
              m_pack.get_pack("response").text, "\n")
        for sentence in pack.get(Sentence):
            sent_text = sentence.text
            print(colored("Sentence:", 'red'), sent_text, "\n")

            print(colored("Semantic role labels:", 'red'))
            for link in pack.get(PredicateLink, sentence):
                parent = link.get_parent()
                child = link.get_child()
                print(f"  - \"{child.text}\" is role {link.arg_type} of "
                      f"predicate \"{parent.text}\"")
            print()

            input(colored("Press ENTER to continue...\n", 'green'))
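
`get_embeddings` is called in the indexing loop above but not defined in this example. A minimal sketch of what it might look like, assuming the Texar-PyTorch `BERTEncoder` interface, which returns a sequence output together with a pooled output:

import torch


def get_embeddings(encoder, input_ids, segment_ids):
    # Run the encoder without tracking gradients; BERTEncoder returns
    # (sequence_output, pooled_output), matching the tuple unpacking used in
    # the indexing loop above.
    with torch.no_grad():
        output, pooled_output = encoder(inputs=input_ids,
                                        segment_ids=segment_ids)
    return output, pooled_output
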