class NLTKSentenceSegmenter(PackProcessor):
    r"""A wrapper of NLTK sentence tokenizer."""

    def __init__(self):
        super().__init__()
        # One Punkt tokenizer instance, reused for every pack this
        # processor handles.
        self.sent_splitter = PunktSentenceTokenizer()

    def _process(self, input_pack: DataPack):
        # Attach one Sentence annotation to the pack for every sentence
        # span Punkt finds in the pack's raw text.
        for span_start, span_stop in self.sent_splitter.span_tokenize(
            input_pack.text
        ):
            Sentence(input_pack, span_start, span_stop)
class NLTKSentenceSegmenter(PackProcessor):
    r"""A wrapper of NLTK sentence tokenizer."""

    def __init__(self):
        super().__init__()
        # Single Punkt tokenizer instance shared across all packs.
        self.sent_splitter = PunktSentenceTokenizer()

    def initialize(self, resources: Resources, configs: Config):
        super().initialize(resources, configs)
        # Fetch the Punkt sentence model; nltk skips the download when the
        # resource is already present locally.
        nltk.download("punkt")

    def _process(self, input_pack: DataPack):
        # Create a Sentence annotation on the pack for each span that
        # Punkt detects in the pack's text.
        for span_start, span_stop in self.sent_splitter.span_tokenize(
            input_pack.text
        ):
            Sentence(input_pack, span_start, span_stop)

    def record(self, record_meta: Dict[str, Set[str]]):
        r"""Declare the output type produced by `NLTKSentenceSegmenter`,
        namely `ft.onto.base_ontology.Sentence`, in
        :attr:`forte.data.data_pack.Meta.record`.

        Args:
            record_meta: the field in the datapack for type record that need
                to fill in for consistency checking.
        """
        record_meta["ft.onto.base_ontology.Sentence"] = set()